In [2]:
# LDA(Latent Dirichlet Allocation, 잠재 디리클레 할당)
# 주제별 단어 분포를 바탕으로 주어진 문서에서 발견된 단어수 분포를 분석
# 해당 문서의 주제들을 예측하는 기업
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

#영어 단어의 어근만 추출
stm = PorterStemmer()

#영어 단어의 불용어 집합
stopwords = set(stopwords.words('english'))

#특수문자를 제거하기 위한 정규식
# 첫 글자는 알파벳으로 시작하고 그 뒤에 영문자 대소문자, 숫자, -,_,.만 허용
pattern = re.compile('[a-zA-Z][-_a-zA-Z0-9.]*')

#문장을 단어 단위로 분리하고 불용어 및 특수문자 제거 후 어근만 추출하여 list로 반환
def tokenize(sentence):
    def stem(w):
        try: return stm.stem(w)
        except: return w
        #소문자로 바꾼 후 단어 구분, 불용어 제거, 패턴에 맞는 단어만 선택
    return [stem(w) for w in word_tokenize(sentence.lower())
    if w not in stopwords and pattern.match(w)]

In [3]:
import tomotopy as tp

#LDAModel 생성
# 잠재 디클레리 할당
# 주어진 문서에 대해 각 문서에 어떤 주자게 존재하는가를 서술하는 확률적 토픽 모델 기법
model = tp.LDAModel(k=20, min_cf=5)

#파일에서 한 줄씩 읽어와서 model에 추가
for i, line in enumerate(open('c:/data/text/trumph.txt',
encoding='ms949')):
    model.add_doc(tokenize(line)) #공백을 기준으로 단어를 나누어 model에 추가

#train(0): 0회 학습, model의 num_words, num_vocabs 값을 확인하기 위해
#실제로 학습은 하지 않고 학습 준비만 하는 상태

model.train(0)
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print('Vocab size:', model.num_vocabs)
model.train(200) # 200회 학습
for i in range(model.k):
    res = model.get_topic_words(i, top_n=10) #토픽 별 상위 10개 단어
    print('Topic #{0}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

Total docs: 1
Total words: 162
Vocab size: 21
Topic #0	everi, back, mani, america, american, nation, countri, peopl, one, protect
Topic #1	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #2	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #3	world, across, god, america, american, nation, countri, peopl, one, everi
Topic #4	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #5	make, america, american, nation, countri, peopl, one, everi, protect, world
Topic #6	nation, peopl, one, america, american, countri, everi, protect, world, great
Topic #7	countri, protect, america, american, nation, peopl, one, everi, world, great
Topic #8	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #9	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #10	america, american, nation, countri, peopl, one, everi, protect, world, great
Topic #11	ame

In [5]:
import nltk
emma_raw = nltk.corpus.gutenberg.raw('austen-emma.txt')

In [6]:
tokenize(emma_raw)[:100]

['emma',
 'jane',
 'austen',
 'volum',
 'chapter',
 'emma',
 'woodhous',
 'handsom',
 'clever',
 'rich',
 'comfort',
 'home',
 'happi',
 'disposit',
 'seem',
 'unit',
 'best',
 'bless',
 'exist',
 'live',
 'nearli',
 'twenty-on',
 'year',
 'world',
 'littl',
 'distress',
 'vex',
 'youngest',
 'two',
 'daughter',
 'affection',
 'indulg',
 'father',
 'consequ',
 'sister',
 'marriag',
 'mistress',
 'hous',
 'earli',
 'period',
 'mother',
 'die',
 'long',
 'ago',
 'indistinct',
 'remembr',
 'caress',
 'place',
 'suppli',
 'excel',
 'woman',
 'gover',
 'fallen',
 'littl',
 'short',
 'mother',
 'affect',
 'sixteen',
 'year',
 'miss',
 'taylor',
 'mr.',
 'woodhous',
 'famili',
 'less',
 'gover',
 'friend',
 'fond',
 'daughter',
 'particularli',
 'emma',
 'intimaci',
 'sister',
 'even',
 'miss',
 'taylor',
 'ceas',
 'hold',
 'nomin',
 'offic',
 'gover',
 'mild',
 'temper',
 'hardli',
 'allow',
 'impos',
 'restraint',
 'shadow',
 'author',
 'long',
 'pass',
 'away',
 'live',
 'togeth',
 'friend

In [7]:
model = tp.LDAModel(k=5, min_cf=5) # 토픽 모델링함수
model.add_doc(tokenize(emma_raw))
model.train(0)
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print('Vocab size:', model.num_vocabs)

Total docs: 1
Total words: 67219
Vocab size: 1775


In [8]:
model.train(100)
for i in range(model.k):
    res = model.get_topic_words(i, top_n=2)
    print(res)
    print('Topic #{0}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

[('weston', 0.024195490404963493), ('elton', 0.024074817076325417)]
Topic #0	weston, elton
[('emma', 0.05767618119716644), ('must', 0.04423421993851662)]
Topic #1	emma, must
[('mr.', 0.05470478534698486), ('think', 0.027563318610191345)]
Topic #2	mr., think
[('mr.', 0.033242397010326385), ('would', 0.02983599342405796)]
Topic #3	mr., would
[('much', 0.03445718437433243), ('know', 0.03428183123469353)]
Topic #4	much, know


In [10]:
#LDA(한글뉴스)
import tomotopy as tp
import re
from konlpy.tag import Hannanum

han = Hannanum()

model = tp.LDAModel(k = 10, min_cf=2)

#파일에서 한 줄씩 읽어와서 model 추가
for i, line in enumerate(open('c:/data/text/news1.txt', encoding='utf-8')):
    sentence = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', line)
    a=sentence.strip()
    n = han.nouns(a)
    n2 = [x for x in n if len(x) > 1]
    if len(n2) > 0:
        model.add_doc(n2)

model.train(0)
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print('Vocab size:', model.num_vocabs)

model.train(200)
for i in range(model.k):
    res = model.get_topic_words(i, top_n=10)
    print('Topic #{0}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

Total docs: 7
Total words: 62
Vocab size: 18
Topic #0	민정수석, 논란, 수석, 청와대, 사의, 수용, 대통령, 아들, 입사지원서, 이날
Topic #1	입사지원서, 문재, 이날, 수석, 청와대, 사의, 수용, 민정수석, 대통령, 아들
Topic #2	사의, 표명, 출근, 수석, 청와대, 수용, 민정수석, 대통령, 아들, 입사지원서
Topic #3	아들, 김진국, 수석, 청와대, 사의, 수용, 민정수석, 대통령, 입사지원서, 논란
Topic #4	수용, 이날, 대통령의, 수석, 청와대, 사의, 민정수석, 대통령, 아들, 입사지원서
Topic #5	수석, 청와대, 대통령, 참석, 사의, 수용, 민정수석, 아들, 입사지원서, 논란
Topic #6	수석, 청와대, 사의, 수용, 민정수석, 대통령, 아들, 입사지원서, 논란, 이날
Topic #7	수석, 청와대, 사의, 수용, 민정수석, 대통령, 아들, 입사지원서, 논란, 이날
Topic #8	기자들, 관계자, 수석, 청와대, 사의, 수용, 민정수석, 대통령, 아들, 입사지원서
Topic #9	수석, 청와대, 사의, 수용, 민정수석, 대통령, 아들, 입사지원서, 논란, 이날


In [12]:
#감성분석 : 텍스트에 나타난 주간적 요소를 분석하여 긍정, 부정의 요소 및 그 정도를 판별하여 정량화 하는 기법
import glob
from afinn import Afinn

pos_review = (glob.glob('c:/data/imdb/train/pos/*.txt'))[20]

f = open(pos_review, 'r')
lines1 = f.readlines()[0]
f.close()

afinn = Afinn()
afinn.score(lines1)

8.0

In [13]:
files = list(glob.glob('c:/data/imdb/train/pos/*.txt')[:10])
files

['c:/data/imdb/train/pos\\0_9.txt',
 'c:/data/imdb/train/pos\\10000_8.txt',
 'c:/data/imdb/train/pos\\10001_10.txt',
 'c:/data/imdb/train/pos\\10002_7.txt',
 'c:/data/imdb/train/pos\\10003_8.txt',
 'c:/data/imdb/train/pos\\10004_8.txt',
 'c:/data/imdb/train/pos\\10005_7.txt',
 'c:/data/imdb/train/pos\\10006_7.txt',
 'c:/data/imdb/train/pos\\10007_7.txt',
 'c:/data/imdb/train/pos\\10008_7.txt']

In [16]:
#학습용 긍정리뷰 10개 파일만 테스트
afinn = Afinn() #감성분석 함수
for i in files:
    f = open(i)
    lines1=f.readlines()[0]
    print(afinn.score(lines1))
    f.close()

-1.0
2.0
19.0
3.0
14.0
8.0
22.0
28.0
13.0
5.0


In [17]:
#부정리뷰데이터 20번째 내용
neg_review = (glob.glob('c:/data/imdb/train/neg/*.txt')[20])

f = open(neg_review, 'r')
lines2 = f.readlines()[0]
f.close()

afinn.score(lines2)

-4.0

In [18]:
files = list(glob.glob('c:/data/imdb/train/neg/*.txt')[:10])
files

['c:/data/imdb/train/neg\\0_3.txt',
 'c:/data/imdb/train/neg\\10000_4.txt',
 'c:/data/imdb/train/neg\\10001_4.txt',
 'c:/data/imdb/train/neg\\10002_1.txt',
 'c:/data/imdb/train/neg\\10003_1.txt',
 'c:/data/imdb/train/neg\\10004_3.txt',
 'c:/data/imdb/train/neg\\10005_3.txt',
 'c:/data/imdb/train/neg\\10006_4.txt',
 'c:/data/imdb/train/neg\\10007_1.txt',
 'c:/data/imdb/train/neg\\10008_2.txt']

In [19]:
afinn = Afinn()
for i in files:
    f = open(i)
    lines1 = f.readlines()[0]
    print(afinn.score(lines1))
    f.close()

6.0
-4.0
9.0
5.0
-7.0
1.0
13.0
4.0
7.0
6.0


In [20]:
#기계학습으로 감성분석
#긍정 텍스트 로딩
pos_review=(glob.glob('c:/data/imdb/train/pos/*.txt')[:100])

lines_pos = []
for i in pos_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_pos.append(temp)
        f.close()
    except Exception as e:
        continue

len(lines_pos)

100

In [21]:
#부정 텍스트 로딩
neg_review=(glob.glob('c:/data/imdb/train/neg/*.txt')[:100])

lines_neg=[]
for i in neg_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_neg.append(temp)
        f.close()
    except Exception as e:
        continue

len(lines_neg)
#긍정 부정 리뷰를 합침
total_text = lines_pos + lines_neg

len(total_text)

200

In [22]:
import numpy as np
from nltk.corpus import stopwords
x = np.array(['pos', 'neg'])
class_Index = np.repeat(x, [len(lines_pos), len(lines_neg)], axis=0)

stop_words = stopwords.words('english')

In [23]:
# 단어들이 tfidf 가중치를 부여한 후 문서-단어 매트릭스로 바꿈
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(stop_words=stop_words).fit(total_text)
X_train_vectorized = vect.transform(total_text)
X_train_vectorized.index = class_Index

In [24]:
import pandas as pd
#데이터프레임으로 변환
df = pd.DataFrame(X_train_vectorized.toarray(), columns=vect.vocabulary_.keys())
df.head()

Unnamed: 0,bromwell,high,cartoon,comedy,ran,time,programs,school,life,teachers,...,zombified,auteur,ample,opportunities,golden,geist,uttered,downloading,midget,tricking
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
#로지스틱 회귀 모형
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(random_state=10)
logit.fit(X_train_vectorized, class_Index)

LogisticRegression(random_state=10)

In [26]:
#긍정 리뷰들을 하나씩 불러와서 실험
def pos_review(model):
    count_all = 0
    count = 0
    num = 100
    tests1 = []
    for idx in range(0, num):
        pos_review_test = (glob.glob(
            'c:/data/imdb/test/pos/*.txt'
        ))[idx]

        f = open(pos_review_test, 'r', encoding='utf-8')
        tests1.append(f.readlines())
        f.close()
    
    for test in tests1:
        preds = model.predict(vect.transform(test))
        result = preds[0]
        if result == 'pos':
            count += 1
        count_all += 1
    
    rate = count*100 / count_all
    print('예측정확도:{0:.1f}%'.format(rate))

In [31]:
#부정 리뷰들을 하나씩 불러와서 실험
def neg_review(model):
    count_all = 0
    count = 0
    num = 100
    tests2 = []
    for idx in range(0, num):
        neg_review_test = (glob.glob(
            'c:/data/imdb/test/neg/*.txt'
        ))[idx]

        f = open(neg_review_test, 'r', encoding='utf-8')
        tests2.append(f.readlines())
        f.close()
    
    for test in tests2:
        preds = model.predict(vect.transform(test))
        result = preds[0]
        if result == 'neg':
            count += 1
        count_all += 1
    
    rate = count*100 / count_all
    print('예측정확도:{0:.1f}%'.format(rate))

In [32]:
pos_review(logit)
neg_review(logit)

예측정확도:66.0%
예측정확도:81.0%


In [33]:
#의사결정나무
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=10)
tree.fit(X_train_vectorized, class_Index)

DecisionTreeClassifier(random_state=10)

In [34]:
pos_review(tree)
neg_review(tree)

예측정확도:39.0%
예측정확도:66.0%


In [35]:
#랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
#10개의 트리로 구성된 랜덤 포레스트
forest = RandomForestClassifier(n_estimators=10, random_state=10)
forest.fit(X_train_vectorized, class_Index)

RandomForestClassifier(n_estimators=10, random_state=10)

In [36]:
pos_review(forest)
neg_review(forest)

예측정확도:44.0%
예측정확도:75.0%


In [37]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_vectorized, class_Index)

KNeighborsClassifier(n_neighbors=2)

In [38]:
pos_review(knn)
neg_review(knn)

예측정확도:34.0%
예측정확도:85.0%


In [39]:
#인공신경망
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=10)
mlp.fit(X_train_vectorized, class_Index)

MLPClassifier(random_state=10)

In [40]:
pos_review(mlp)
neg_review(mlp)

예측정확도:58.0%
예측정확도:79.0%


In [41]:
#SVM
from sklearn.svm import SVC
svm = SVC(random_state=10)
svm.fit(X_train_vectorized, class_Index)

SVC(random_state=10)

In [42]:
pos_review(svm)
neg_review(svm)

예측정확도:63.0%
예측정확도:87.0%


In [44]:
from textblob import TextBlob

#언어탐지
a = TextBlob('파이썬은 재미있다.')
a.detect_language()

'ko'

In [45]:
b = TextBlob('I like python.')
b.detect_language()

'en'

In [46]:
#언어 번역 기능
c = TextBlob('파이썬은 재미있다.')
c.translate(to='en')

TextBlob("Python is fun.")

In [48]:
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this sandwich', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ('What an awesome view.', 'pos'),
    ('I do not like this restaurant.', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this.", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg')
]

test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job.', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

In [49]:
cl = NaiveBayesClassifier(train)

In [50]:
print(cl.classify("Their burgers are amazing"))
print(cl.classify("I don't like their pizza."))

pos
neg


In [52]:
#여러 문장을 종합하여 부정으로 분류
blob = TextBlob('The beer was amazing. But the hangover was horrible. My boss was not happy.', classifier=cl)
blob.classify()

'neg'

In [53]:
#개별 문장으로 분류
for sentence in blob.sentences:
    print(sentence, '==>', sentence.classify()
    )

The beer was amazing. ==> pos
But the hangover was horrible. ==> neg
My boss was not happy. ==> neg


In [54]:
for row in test:
    print(row[0], '==>', sentence.classify())

The beer was good. ==> neg
I do not enjoy my job. ==> neg
I ain't feeling dandy today. ==> neg
I feel amazing! ==> neg
Gary is a friend of mine. ==> neg
I can't believe I'm doing this. ==> neg


In [55]:
cl.accuracy(test)

0.8333333333333334

In [56]:
cl.show_informative_features(5)

Most Informative Features
          contains(this) = True              neg : pos    =      2.3 : 1.0
          contains(this) = False             pos : neg    =      1.8 : 1.0
          contains(This) = False             neg : pos    =      1.6 : 1.0
            contains(an) = False             neg : pos    =      1.6 : 1.0
             contains(I) = False             pos : neg    =      1.4 : 1.0


In [62]:
#연관어 분석
#동시출현빈도 기반
import glob
# 긍정리뷰 100개 로딩
pos_review = (glob.glob('c:/data/imdb/train/pos/*.txt'))[0:100]
lines_pos=[]
for i in pos_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_pos.append(temp)
        f.close()
    except Exception as e:
        continue

len(lines_pos)

100

In [63]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd

tokenizer = RegexpTokenizer('[\w]+')
stop_words = stopwords.words('english')

#동시출현 단어 계산
count = {} #동시출현 빈도가 저장될 dict
for line in lines_pos:
    words = line.lower()
    tokens = tokenizer.tokenize(words)
    #불용어 제거, 불용어에 br추가
    stopped_tokens = [i for i in list(set(tokens)) if i not in stop_words+['br']]
    #글자수가 1인 단어 제외
    stopped_tokens2 = [i for i in stopped_tokens if len(i) > 1]
    for i, a in enumerate(stopped_tokens2):
        for b in stopped_tokens2[i+1:]:
            if a > b:
                count[b,a] = count.get((b,a),0)+1
            else:
                count[a,b] = count.get((a,b),0)+1

df = pd.DataFrame.from_dict(count, orient='index')
#리스트 구성
list1 = []
for i in range(len(df)):
    list1.append([df.index[i][0], df.index[i][1], df[0][i]])

df2 = pd.DataFrame(list1, columns=['terms', 'terms2', 'freq'])
df3 = df2.sort_values(by=['freq'], ascending=False)
df3_pos = df3.reset_index(drop=True)

#동시출현 단어 페어 빈도 상위 20개 출력
df3_pos.head(20)

Unnamed: 0,terms,terms2,freq
0,film,story,41
1,movie,one,41
2,film,movie,35
3,movie,story,35
4,one,story,33
5,good,movie,32
6,film,one,31
7,movie,see,30
8,one,see,27
9,film,like,27


In [64]:
#부정 리뷰에 적용
neg_review=(glob.glob('c:/data/imdb/train/neg/*.txt'))[0:100]

lines_neg=[]
for i in neg_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_neg.append(temp)
        f.close()

    except Exception as e:
        continue

len(lines_neg)

100

In [66]:
count = {} #동시출현 빈도가 저장될 dict
for line in lines_neg:
    words = line.lower()
    tokens = tokenizer.tokenize(words)
    stopped_tokens = [i for i in list(set(tokens)) if i not in stop_words+['br']]
    stopped_tokens2 = [i for i in stopped_tokens if len(i) > 1]
    for i, a in enumerate(stopped_tokens2):
        for b in stopped_tokens2[i+1:]:
            if a > b:
                count[b,a] = count.get((b,a),0)+1
            else:
                count[a,b] = count.get((a,b),0)+1

df = pd.DataFrame.from_dict(count, orient='index')
#리스트 구성
list1 = []
for i in range(len(df)):
    list1.append([df.index[i][0], df.index[i][1], df[0][i]])

df2 = pd.DataFrame(list1, columns=['terms', 'terms2', 'freq'])
df3 = df2.sort_values(by=['freq'], ascending=False)
df3_neg = df3.reset_index(drop=True)

#동시출현 단어 페어 빈도 상위 20개 출력
df3_neg.head(20)

Unnamed: 0,terms,terms2,freq
0,film,movie,42
1,like,movie,40
2,movie,one,38
3,film,one,35
4,like,one,33
5,even,movie,32
6,good,movie,32
7,even,like,31
8,good,one,30
9,film,like,29


In [80]:
#통계적 가중치 기반
import glob

#긍정리뷰 100개
pos_review=(glob.glob('c:/data/imdb/train/pos/*.txt'))[0:100]

lines_pos = []
for i in pos_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_pos.append(temp)
        f.close()
    except Exception as e:
        continue

len(lines_pos)

100

In [81]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

#전처리
tokenizer = RegexpTokenizer('[\w]+')
stop_words = stopwords.words('english')

#TF-IDF가중치 할당
vec = TfidfVectorizer(stop_words=stop_words)
vector_lines_pos = vec.fit_transform(lines_pos)
A = vector_lines_pos.toarray()
print(A.shape)
print(A)
# x축 단어, y축 문서

(100, 4001)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.06538462 0.         ... 0.         0.         0.        ]
 [0.         0.23078109 0.         ... 0.         0.         0.        ]]


In [82]:
#현재 : 100개의 문서의 유사도
#목적 : 단어간 유사도를 구하는 것
#단어-문서 행렬로 변경 >>> x축 문서, y축 단어
A =A.transpose()
print(A.shape)
print(A)

(4001, 100)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.06538462 0.23078109]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [71]:
import numpy as np
from scipy import sparse
#밀집행렬(모든 값이 다 채워져있음) <--> 희소행렬
a = np.array([[0.5,0,0],[0,1,0],[0.7,0,1.5]])

b = sparse.csr_matrix(a)
print('{}'.format(b))
c = b.toarray() #다시 밀집행렬로 변환
print(c)

  (0, 0)	0.5
  (1, 1)	1.0
  (2, 0)	0.7
  (2, 2)	1.5
[[0.5 0.  0. ]
 [0.  1.  0. ]
 [0.7 0.  1.5]]


In [83]:
from sklearn.metrics.pairwise import cosine_similarity

A_sparse = sparse.csr_matrix(A)
#코사인 유사도 계산
similarities_sparse = cosine_similarity(A_sparse, dense_output=False)
#todok() 행렬을 딕셔너리 형태로 변환
list(similarities_sparse.todok().items())[35000:35010]

#에러는 아닙니다. 위에서 코사인 유사도를 계산한 후에 빈 리스트에 입력해서 출력되어야 하는데,
# 값들이 입력이 안되서 단어의 인덱스를 확인하지 못하고 있습니다.

[((1469, 108), 0.37803585968894865),
 ((1470, 108), 0.2189685434746738),
 ((1476, 108), 0.06407477897013734),
 ((1477, 108), 0.185189577514238),
 ((1480, 108), 0.20111036876169444),
 ((1489, 108), 0.06995711757772019),
 ((1496, 108), 0.10714874067068783),
 ((1503, 108), 0.30487333830091773),
 ((1504, 108), 0.30487333830091773),
 ((1512, 108), 0.30487333830091773)]

In [75]:
vec.get_feature_names()[1469]

'fraud'

In [77]:
vec.get_feature_names()[108]

'actual'

In [78]:
import pandas as pd
import numpy as np

df = pd.DataFrame(list(similarities_sparse.todok().items()), columns=['words', 'weight'])
df2 = df.sort_values(by=['weight'], ascending=False)
df2 = df2.reset_index(drop=True)

df3 = df2.loc[np.round(df2['weight']) < 1]
df3.head(10)

Unnamed: 0,words,weight
100,"(49, 46)",0.364635
101,"(46, 49)",0.364635
102,"(50, 53)",0.327337
103,"(53, 50)",0.327337
104,"(7, 16)",0.325747
105,"(16, 7)",0.325747
106,"(8, 7)",0.323691
107,"(7, 8)",0.323691
108,"(53, 42)",0.32294
109,"(42, 53)",0.32294


In [84]:
#word2vec 기반
# 단어의 의미는 그 단어 주변 단어의 분포로 이해될 수 있음

import glob
pos_review=(glob.glob('c:/data/imdb/train/pos/*.txt')[0:100])

lines_pos=[]
for i in pos_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_pos.append(temp)
        f.close()
    except Exception as e:
        continue

len(lines_pos)

100

In [86]:
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import RegexpTokenizer

stop_words = stopwords.words('english')
tokenizer = RegexpTokenizer('[\w]+')

#단어 추출
text=[]
for line in lines_pos:
    words = line.lower()
    tokens = tokenizer.tokenize(words)
    stopped_tokens = [i for i in list(set(tokens)) if i not in stop_words+['br']]
    stopped_tokens2 = [i for i in stopped_tokens if len(i) > 1]
    text.append(stopped_tokens2)

#word2vec 모형 생성, sg = 1 skip-gram을 적용, window=2 중심 단어로부터 좌우 2개의 단어까지 학습에 적용
# min_count=3 최소 3회 이상 출현한 단어들을 대상으로 학습
model = Word2Vec(text, vector_size=10, sg=1, window=2, min_count=3)
#불필요한 메모리 unload
model.wv.similarity('film', 'movie')

0.9341292

In [87]:
#good과 가장 유사한 단어 5개
model.wv.most_similar('good', topn=5)

[('use', 0.9330725073814392),
 ('character', 0.9153137803077698),
 ('think', 0.9129804968833923),
 ('every', 0.8964542150497437),
 ('never', 0.8929697871208191)]

In [88]:
#모델에 저장된 단어의 갯수
len(model.wv.index_to_key)

895

In [89]:
#모델에 저장된 단어 목록
model.wv.index_to_key[:10]

['movie',
 'film',
 'story',
 'one',
 'see',
 'good',
 'like',
 'well',
 'time',
 'much']