<a href="https://colab.research.google.com/github/MarigoldJ/ygl2/blob/main/class/20210615_nlp_day4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSA

## LSA 실습하기

In [25]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer

import pandas as pd
import numpy as np

In [26]:
def get_dtm(sentence):
    '''
    dtm을 만드는 함수
    '''
    vectorizer = CountVectorizer(min_df=1, stop_words='english', dtype=np.float32) # 불용어 한국어 지원 X...
    dtm = vectorizer.fit_transform(sentence)    # dtm : document-term matrix

    return dtm, vectorizer

def print_dtm_matrix(dtm, vectorizer, sentence):
    return pd.DataFrame(dtm.toarray(), index=sentence, columns=vectorizer.get_feature_names())

def lsa_tsvd(n_components, dtm, vectorizer, sentence):
    
    # dtm에 truncated SVD 적용
    lsa = TruncatedSVD(n_components, algorithm='arpack')    # arpack : 기본적인 알고리즘
    dtm_lsa = lsa.fit_transform(dtm)

    # dtm에 normalizer 적용
    dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

    # 출력
    components = pd.DataFrame(lsa.components_, index=['components-1', 'components-2'], columns=vectorizer.get_feature_names())
    lsa = pd.DataFrame(dtm_lsa, index=sentence, columns=['components-1', 'components-2'])

    # dtm과 dtm_lsa의 유사도 확인
    similarity = np.asarray(np.asmatrix(dtm_lsa) * np.asmatrix(dtm_lsa).T)
    similarity = pd.DataFrame(similarity, index=sentence, columns=sentence)

    return components, lsa, similarity

def main():
    sentences = ['중앙방역대책본부는 오늘 0시 기준 코로나19 확진 환자가 44명 추가 확인돼 모두 13,417명으로 늘었다고 밝혔습니다.', 
                 '신규 확진 환자 가운데 해외 유입 사례는 23명, 국내 발생은 21명입니다.', 
                 '국내 발생 환자는 지역별로 서울 7명, 경기 8명 등 수도권에서 15명이 확인됐고, 광주에서도 5명, 대전에서도 1명이 확진됐습니다.', 
                 '또한 코로나19로 1명이 추가 사망해 누적 사망자는 모두 289명으로 늘었습니다.']
    dtm, vectorizer = get_dtm(sentences)
    d = print_dtm_matrix(dtm, vectorizer, sentences)

    c, l, s = lsa_tsvd(2, dtm, vectorizer, sentences)

    return d, c, l, s


In [27]:
d, c, l, s = main()

In [32]:
d.head(2)

Unnamed: 0,0시,13,15명이,1명이,21명입니다,23명,289명으로,417명으로,44명,5명,7명,8명,가운데,경기,광주에서도,국내,기준,누적,늘었다고,늘었습니다,대전에서도,또한,모두,발생,발생은,밝혔습니다,사례는,사망자는,사망해,서울,수도권에서,신규,오늘,유입,중앙방역대책본부는,지역별로,추가,코로나19,코로나19로,해외,확인돼,확인됐고,확진,확진됐습니다,환자,환자가,환자는
"중앙방역대책본부는 오늘 0시 기준 코로나19 확진 환자가 44명 추가 확인돼 모두 13,417명으로 늘었다고 밝혔습니다.",1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
"신규 확진 환자 가운데 해외 유입 사례는 23명, 국내 발생은 21명입니다.",0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [29]:
c

Unnamed: 0,0시,13,15명이,1명이,21명입니다,23명,289명으로,417명으로,44명,5명,7명,8명,가운데,경기,광주에서도,국내,기준,누적,늘었다고,늘었습니다,대전에서도,또한,모두,발생,발생은,밝혔습니다,사례는,사망자는,사망해,서울,수도권에서,신규,오늘,유입,중앙방역대책본부는,지역별로,추가,코로나19,코로나19로,해외,확인돼,확인됐고,확진,확진됐습니다,환자,환자가,환자는
components-1,0.115642,0.115642,0.198653,0.263701,0.056029,0.056029,0.065048,0.115642,0.115642,0.198653,0.198653,0.198653,0.056029,0.198653,0.198653,0.254682,0.115642,0.065048,0.115642,0.065048,0.198653,0.065048,0.180691,0.198653,0.056029,0.115642,0.056029,0.065048,0.065048,0.198653,0.198653,0.056029,0.115642,0.056029,0.115642,0.198653,0.180691,0.115642,0.065048,0.056029,0.115642,0.198653,0.171671,0.198653,0.056029,0.115642,0.198653
components-2,0.20545,0.20545,-0.139691,-0.090792,0.014464,0.014464,0.048898,0.20545,0.20545,-0.139691,-0.139691,-0.139691,0.014464,-0.139691,-0.139691,-0.125227,0.20545,0.048898,0.20545,0.048898,-0.139691,0.048898,0.254349,-0.139691,0.014464,0.20545,0.014464,0.048898,0.048898,-0.139691,-0.139691,0.014464,0.20545,0.014464,0.20545,-0.139691,0.254349,0.20545,0.048898,0.014464,0.20545,-0.139691,0.219914,-0.139691,0.014464,0.20545,-0.139691


In [30]:
l

Unnamed: 0,components-1,components-2
"중앙방역대책본부는 오늘 0시 기준 코로나19 확진 환자가 44명 추가 확인돼 모두 13,417명으로 늘었다고 밝혔습니다.",0.515354,0.856977
"신규 확진 환자 가운데 해외 유입 사례는 23명, 국내 발생은 21명입니다.",0.972027,0.23487
"국내 발생 환자는 지역별로 서울 7명, 경기 8명 등 수도권에서 15명이 확인됐고, 광주에서도 5명, 대전에서도 1명이 확진됐습니다.",0.835306,-0.549785
또한 코로나19로 1명이 추가 사망해 누적 사망자는 모두 289명으로 늘었습니다.,0.817843,0.575441


In [31]:
s

Unnamed: 0,"중앙방역대책본부는 오늘 0시 기준 코로나19 확진 환자가 44명 추가 확인돼 모두 13,417명으로 늘었다고 밝혔습니다.","신규 확진 환자 가운데 해외 유입 사례는 23명, 국내 발생은 21명입니다.","국내 발생 환자는 지역별로 서울 7명, 경기 8명 등 수도권에서 15명이 확인됐고, 광주에서도 5명, 대전에서도 1명이 확진됐습니다.",또한 코로나19로 1명이 추가 사망해 누적 사망자는 모두 289명으로 늘었습니다.
"중앙방역대책본부는 오늘 0시 기준 코로나19 확진 환자가 44명 추가 확인돼 모두 13,417명으로 늘었다고 밝혔습니다.",1.0,0.702216,-0.040675,0.914619
"신규 확진 환자 가운데 해외 유입 사례는 23명, 국내 발생은 21명입니다.",0.702216,1.0,0.682812,0.930119
"국내 발생 환자는 지역별로 서울 7명, 경기 8명 등 수도권에서 15명이 확인됐고, 광주에서도 5명, 대전에서도 1명이 확진됐습니다.",-0.040675,0.682812,1.0,0.366781
또한 코로나19로 1명이 추가 사망해 누적 사망자는 모두 289명으로 늘었습니다.,0.914619,0.930119,0.366781,1.0


## LSA 실습한거 뜯어보기

In [None]:
# 함수 사용하지 않고 코드 써보고, 변수 각각 확인해보기

## Sklearn dataset *fetch_20newsgroups* 를 활용한 실습

In [33]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [34]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [39]:
new_df = pd.DataFrame({'document': documents})
# new_df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})

new_df['clean_doc'] = new_df['document'].str.replace('[^a-zA-Z]', ' ')   # 영어 외에 제거
new_df['clean_doc'] = new_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
new_df['clean_doc'] = new_df['clean_doc'].apply(lambda x: x.lower())
new_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [40]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [42]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# tokenized_doc = new_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = new_df['clean_doc'].copy().apply(lambda x: [item for item in x.split() if item not in stop_words])


In [43]:
detokenized_doc = []
for i in range(len(new_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

new_df['clean_doc'] = detokenized_doc

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, max_df=0.5, smooth_idf=True)
X = vectorizer.fit_transform(new_df['clean_doc'])

In [48]:
from sklearn.decomposition import TruncatedSVD

svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
             random_state=122, tol=0.0)

In [53]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print(f'Topic {idx+1}: {[(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]]}')

get_topics(svd_model.components_, terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10

# LDA

## LDA 실습하기

In [54]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [55]:
dataset = fetch_20newsgroups(shuffle=True, random_state=2, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [56]:
news_df = pd.DataFrame({'document': documents})
news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ')
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
news_df['clean_doc'] = news_df['document'].apply(lambda x: x.lower())

In [57]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

True

In [59]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].copy().apply(lambda x: [item for item in x.split() if item not in stop_words])

In [60]:
from gensim import corpora

# 사전 만들기
dictionary = corpora.Dictionary(tokenized_doc)  # 사전 완성
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

In [68]:
import gensim

num_topic = 20
k = 20

lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topic, id2word=dictionary, passes=15)
topics = lda_model.print_topics(num_words=4)    # 상위 4개 topic만 가져오기

In [69]:
for topic in topics:
    print(topic)

(0, '0.074*"x" + 0.029*"*" + 0.012*"=" + 0.010*"*/"')
(1, '0.010*"would" + 0.008*"people" + 0.006*"one" + 0.004*"get"')
(2, '0.008*"-" + 0.004*"gun" + 0.003*"jews" + 0.003*"state"')
(3, '0.014*"|>" + 0.003*"water" + 0.003*"mpc" + 0.002*"\\_"')
(4, '0.014*"would" + 0.011*"like" + 0.009*"i\'m" + 0.009*"one"')
(5, '0.010*"key" + 0.007*"chip" + 0.006*"clipper" + 0.004*"escrow"')
(6, '0.011*"turkish" + 0.010*"armenian" + 0.006*"armenians" + 0.005*"government"')
(7, '0.048*">" + 0.006*"->" + 0.005*"*" + 0.005*"|"')
(8, '0.008*"use" + 0.007*"available" + 0.006*"file" + 0.005*"files"')
(9, '0.022*"." + 0.006*"one" + 0.005*"went" + 0.005*"people"')
(10, '0.013*"$" + 0.010*"dos" + 0.006*"$1" + 0.005*"$2"')
(11, '0.048*"1" + 0.040*"-" + 0.029*"0" + 0.028*"2"')
(12, '0.118*":" + 0.007*"----" + 0.005*"]" + 0.003*">>"')
(13, '0.007*"one" + 0.007*"god" + 0.007*"people" + 0.006*"would"')
(14, '0.011*"file:" + 0.006*"=" + 0.003*"char" + 0.002*"blues"')
(15, '0.037*"|" + 0.005*"-" + 0.004*"image" + 0.00

In [70]:
print('Perplexity: ', lda_model.log_perplexity(corpus))
# 내부 평가 지표, 숫자가 낮을수록 좋은 성능

Perplexity:  -10.011532400570404


In [72]:
from gensim.models.coherencemodel import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_doc, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('Coherence Score :', coherence_lda)

Coherence Score : 0.5290906225372973


In [76]:
print(lda_model.show_topics(formatted=False))

[(14, [('file:', 0.010953427), ('=', 0.005838026), ('char', 0.0030708397), ('blues', 0.0016404018), ('0)', 0.0012122155), ('//', 0.0011949205), ('sweden', 0.0011436974), ('m"`@("`@("`@("`@("`@("`@("`@("`@("`@("`@("`@("`@("`@("`@("`@(', 0.0010421346), ('from:', 0.00089708366), ('fantasy', 0.00086196506)]), (5, [('key', 0.010301148), ('chip', 0.006540466), ('clipper', 0.006037982), ('escrow', 0.004380645), ('keys', 0.0032950423), ('two', 0.0028900139), ('nsa', 0.0025661862), ('serial', 0.0025258944), ('algorithm', 0.0024922108), ('chips', 0.002368585)]), (8, [('use', 0.008141652), ('available', 0.0066474658), ('file', 0.006230068), ('files', 0.0049159797), ('program', 0.004712736), ('using', 0.004707946), ('window', 0.004661933), ('get', 0.0043690996), ('information', 0.004360211), ('-', 0.0043093674)]), (15, [('|', 0.03652437), ('-', 0.005066953), ('image', 0.0039183125), ('--', 0.003251351), ('university', 0.0031427792), ('package', 0.0029396883), ('+', 0.002754297), ('graphics', 0.002

In [77]:
import pprint
pprint.pprint(lda_model.show_topics(formatted=False))

[(9,
  [('.', 0.021603972),
   ('one', 0.006144928),
   ('went', 0.0049036318),
   ('people', 0.0048954394),
   ('?', 0.004437625),
   ('israel', 0.0042950185),
   ('know', 0.0036393795),
   ('came', 0.0034864473),
   ('go', 0.0034649998),
   ('said', 0.0033517561)]),
 (6,
  [('turkish', 0.010839144),
   ('armenian', 0.010156543),
   ('armenians', 0.0061457525),
   ('government', 0.005091),
   ('greek', 0.00497608),
   ('_/', 0.0049044485),
   ('encryption', 0.0039843107),
   ('turks', 0.0036263117),
   ('turkey', 0.0035953084),
   ('armenia', 0.0032019778)]),
 (15,
  [('|', 0.03652437),
   ('-', 0.005066953),
   ('image', 0.0039183125),
   ('--', 0.003251351),
   ('university', 0.0031427792),
   ('package', 0.0029396883),
   ('+', 0.002754297),
   ('graphics', 0.0026411896),
   ('grounding', 0.002384958),
   ('ftp', 0.0022975092)]),
 (4,
  [('would', 0.014043168),
   ('like', 0.01138232),
   ("i'm", 0.009460373),
   ('one', 0.009412077),
   ('get', 0.00927401),
   ('know', 0.008348506