# WordExtractor in soynlp

가지고 있는 데이터를 활용하여 soynlp에서 제공하는 WordExtractor tutorial을 진행하였다.

source: https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb

In [1]:
import pandas as pd

In [2]:
!pwd

/notebooks/soynlp


In [3]:
!ls

README.md  description.py  setup.py  soynlp_wordextractor.ipynb
data	   notes	   soynlp    tutorials


In [4]:
# The data was crawled from the internet.
# NOTE(review): decoded as CP949, so the CSV was presumably saved with that
# encoding — confirm. header=0 takes the first row as the column names.
data = pd.read_csv('data/sample.csv', header=0, encoding='CP949')

In [5]:
# Preview the first rows to check that the CSV was parsed as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,movie_nm,score,reg_date,comments
0,1,프로메테우스,6.0,2014.11.17,정리가 안되는 에일리언 시리즈를 리부트한 영화 그러나 외계인에 대한 공포감은 전작들...
1,2,지옥화,9.0,2014.11.17,배우들 연기도 좋고 시나리오도 좋고 여배우 진짜 고생했을듯 아쉬운점은 영화 전개가 ...
2,3,드라큘라: 전설의 시작,9.0,2014.11.17,어디서 낮이 있다 했는데 원조 드라큘라하고 아들은 왕좌의게임에 나온 사람이였네 ㅎㅎ...
3,4,인터스텔라,10.0,2014.11.17,대단한영화다ㅡ최근몇년간흥미와감동을준것은ㅡ아바타이후로처음 ㅡ아바타보다오락성은떨어지나심...
4,5,인터스텔라,10.0,2014.11.17,내 생애 다시 이런 영화을 만나 볼수 있을까다시 만나 보고 싶다


In [6]:
# Preview only the review-text column, which is the column exported as the corpus.
data['comments'].head()

0    정리가 안되는 에일리언 시리즈를 리부트한 영화 그러나 외계인에 대한 공포감은 전작들...
1    배우들 연기도 좋고 시나리오도 좋고 여배우 진짜 고생했을듯 아쉬운점은 영화 전개가 ...
2    어디서 낮이 있다 했는데 원조 드라큘라하고 아들은 왕좌의게임에 나온 사람이였네 ㅎㅎ...
3    대단한영화다ㅡ최근몇년간흥미와감동을준것은ㅡ아바타이후로처음 ㅡ아바타보다오락성은떨어지나심...
4                 내 생애 다시 이런 영화을 만나 볼수 있을까다시 만나 보고 싶다 
Name: comments, dtype: object

In [7]:
# Write only the review text (no index column, no header row) to a text file
# that is later read back line by line as the corpus.
# NOTE(review): to_csv still applies CSV quoting, so comments containing commas
# or quote characters are presumably written wrapped in quotes — confirm this
# is acceptable for the corpus.
data['comments'].to_csv('data/sample.txt', index=False, header=False, encoding='CP949')

In [8]:
class Sentences:
    """Re-iterable stream of sentences read from a CP949-encoded text file.

    Every non-blank line of the file is treated as one document, and each
    document is split into sentences on a double space ('  ').  The file is
    re-opened on every iteration, so the object can be iterated many times
    without holding the corpus in memory.

    Args:
        fname: Path of the corpus file.

    Attributes:
        fname: Path of the corpus file.
        length: Cached sentence count (0 until ``len()`` is first called).
    """

    def __init__(self, fname):
        self.fname = fname
        self.length = 0
        # 0 is a legitimate count (empty corpus), so it cannot also mean
        # "not counted yet"; track that separately.
        self._length_known = False

    def _docs(self):
        """Yield each non-blank line of the file with surrounding whitespace removed."""
        with open(self.fname, encoding='CP949') as f:
            for doc in f:
                doc = doc.strip()
                if doc:
                    yield doc

    def __iter__(self):
        """Yield every sentence of every document, in file order."""
        for doc in self._docs():
            yield from doc.split('  ')

    def __len__(self):
        """Return the number of sentences, scanning the file only once."""
        if self.length == 0 and not self._length_known:
            self.length = sum(len(doc.split('  ')) for doc in self._docs())
            self._length_known = True
        return self.length
    
# Wrap the exported sample corpus in a Sentences stream and report how many
# sentences it contains.
corpus_fname = 'data/sample.txt'
sentences = Sentences(corpus_fname)
print('num sentences = %d' % len(sentences))

num sentences = 44714


In [9]:
%%time
from soynlp.word import WordExtractor

# NOTE(review): the three arguments are presumably lower bounds (frequency,
# forward cohesion, right branching entropy) a candidate must reach to be
# kept — confirm against the soynlp documentation.
word_extractor = WordExtractor(min_count=100,
                               min_cohesion_forward=0.05, 
                               min_right_branching_entropy=0.0)

# Train on the sentence stream, then pull out the scored candidates.
# `words` is indexed by word string and holds one Scores record per word.
word_extractor.train(sentences)
words = word_extractor.extract()

training was done. used memory 0.306 Gbory 0.090 Gb
all cohesion probabilities was computed. # words = 2081
all branching entropies was computed # words = 49532
all accessor variety was computed # words = 49532
CPU times: user 7.24 s, sys: 170 ms, total: 7.41 s
Wall time: 7.44 s


In [10]:
# Number of word candidates returned by the extractor.
len(words)

2396

In [11]:
# Inspect the score record stored for a single extracted word.
print('type: %s\n' % type(words['킬링타임']))
print(words['킬링타임'])

type: <class 'soynlp.word._word.Scores'>

Scores(cohesion_forward=0.9197908301199416, cohesion_backward=0, left_branching_entropy=3.408236216765657, right_branching_entropy=0.34804303823242544, left_accessor_variety=38, right_accessor_variety=4, leftside_frequency=228, rightside_frequency=0)


In [12]:
def word_score(score):
    """Return a single ranking value for a word's score record.

    The value is ``cohesion_forward * exp(right_branching_entropy)``.

    Args:
        score: Object exposing ``cohesion_forward`` and
            ``right_branching_entropy`` attributes.

    Returns:
        The combined score as a float.
    """
    from math import exp

    cohesion = score.cohesion_forward
    entropy = score.right_branching_entropy
    return cohesion * exp(entropy)

# Header row, then the 30 candidates with the highest word_score, best first.
print('단어   (빈도수, cohesion, branching entropy)\n')
for word, score in sorted(
        words.items(),
        key=lambda entry: word_score(entry[1]),
        reverse=True,
)[:30]:
    # Columns: word, left-side frequency, forward cohesion, right branching entropy.
    print(
        '%s     (%d, %.3f, %.3f)'
        % (word, score.leftside_frequency, score.cohesion_forward, score.right_branching_entropy)
    )

단어   (빈도수, cohesion, branching entropy)

영화     (20970, 0.946, 4.587)
합니다     (249, 0.841, 4.613)
너무     (3871, 0.967, 4.411)
ㅠㅠ     (425, 0.924, 4.367)
입니다     (128, 0.530, 4.726)
ㅎㅎ     (523, 0.903, 4.016)
ㅋㅋ     (1199, 0.968, 3.915)
봤는데     (1155, 0.556, 4.459)
뭔가     (492, 0.768, 4.059)
된다     (248, 0.768, 4.058)
진짜     (1437, 0.413, 4.678)
정말     (2703, 0.536, 4.368)
봤습니다     (492, 0.508, 4.385)
역시     (741, 0.475, 4.420)
때문에     (340, 0.645, 4.054)
솔직히     (350, 0.882, 3.734)
근데     (299, 0.584, 4.086)
ㅜㅜ     (143, 0.941, 3.593)
하지만     (726, 0.347, 4.567)
그리고     (852, 0.294, 4.714)
준다     (164, 0.588, 3.950)
완전     (800, 0.638, 3.848)
간만에     (275, 0.674, 3.713)
ㅋㅋㅋ     (627, 0.711, 3.624)
라는     (144, 0.372, 4.271)
얼마나     (289, 0.712, 3.619)
특히     (359, 0.438, 4.093)
든다     (125, 0.874, 3.397)
영화입니다     (475, 0.383, 4.181)
함께     (470, 0.836, 3.391)


## 더 많은 데이터를 가지고 해보자

In [13]:
!ls data/moviereview

commentafter_12001to15000.csv  commentafter_39001to42000.csv
commentafter_15001to18000.csv  commentafter_42001to45000.csv
commentafter_18001to21000.csv  commentafter_45001to48000.csv
commentafter_1to3000.csv       commentafter_48001to51000.csv
commentafter_21001to24000.csv  commentafter_51001to54000.csv
commentafter_24001to27000.csv  commentafter_54001to57000.csv
commentafter_27001to30000.csv  commentafter_57001to60000.csv
commentafter_30001to33000.csv  commentafter_60001to63000.csv
commentafter_3001to6000.csv    commentafter_6001to9000.csv
commentafter_33001to36000.csv  commentafter_63001to65000.csv
commentafter_36001to39000.csv  commentafter_9001to12000.csv


In [14]:
import os

In [15]:
dirpath = 'data/moviereview/'
# Sorted so the row order of the combined frame does not depend on the
# arbitrary order in which os.listdir returns directory entries.
filelist = sorted(os.listdir(dirpath))

# Collect the per-file frames and concatenate once at the end.
# DataFrame.append re-copied the accumulated frame on every call and was
# removed in pandas 2.0.
frames = []
for file in filelist:
    try:
        frames.append(pd.read_csv(os.path.join(dirpath, file), header=0, encoding='CP949'))
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Still best effort: an unreadable or unparsable file is skipped,
        # but reported so that missing rows do not go unnoticed.
        print('skipped %s: %s' % (file, err))

if frames:
    data = pd.concat(frames, ignore_index=True)
else:
    # As before, `data` keeps its previous value when nothing could be
    # loaded; say so explicitly instead of continuing silently.
    print('no readable CSV files found in %s' % dirpath)

In [16]:
# Total number of rows in the combined frame.
print(len(data))

930000


In [17]:
# Write only the review text (no index column, no header row) to a text file
# that is later read back line by line as the corpus.
# NOTE(review): to_csv still applies CSV quoting, so comments containing commas
# or quote characters are presumably written wrapped in quotes — confirm this
# is acceptable for the corpus.
data['comments'].to_csv('data/moviereview.txt', index=False, header=False, encoding='CP949')

In [18]:
# Point the sentence stream at the larger exported corpus and report how many
# sentences it contains.
corpus_fname = 'data/moviereview.txt'
sentences = Sentences(corpus_fname)
print('num sentences = %d' % len(sentences))

num sentences = 925337


In [19]:
%%time
from soynlp.word import WordExtractor

# Same settings as the run on the small sample, so the two results are comparable.
# NOTE(review): the three arguments are presumably lower bounds (frequency,
# forward cohesion, right branching entropy) a candidate must reach to be
# kept — confirm against the soynlp documentation.
word_extractor = WordExtractor(min_count=100,
                               min_cohesion_forward=0.05, 
                               min_right_branching_entropy=0.0)

# Train on the sentence stream, then pull out the scored candidates.
# `words` is indexed by word string and holds one Scores record per word.
word_extractor.train(sentences)
words = word_extractor.extract()

training was done. used memory 2.525 Gbse memory 3.313 Gb
all cohesion probabilities was computed. # words = 32821
all branching entropies was computed # words = 492181
all accessor variety was computed # words = 492181
CPU times: user 2min 22s, sys: 1.87 s, total: 2min 24s
Wall time: 2min 24s


In [20]:
from sys import getsizeof

# Number of extracted words.
print(len(words))
# NOTE: sys.getsizeof is shallow — this is the size of the dict object itself,
# not of the keys and Scores records it refers to.
print(getsizeof(words)/1024, ' KB')

24995
1536.09375  KB


In [21]:
# Inspect the score record stored for a single extracted word.
print('type: %s\n' % type(words['킬링타임']))
print(words['킬링타임'])

type: <class 'soynlp.word._word.Scores'>

Scores(cohesion_forward=0.9069446565911313, cohesion_backward=0, left_branching_entropy=3.977275638579379, right_branching_entropy=0.6093567670846984, left_accessor_variety=159, right_accessor_variety=27, leftside_frequency=3689, rightside_frequency=0)


In [22]:
def word_score(score):
    """Combine cohesion and branching entropy into one ranking value.

    Args:
        score: Object exposing ``cohesion_forward`` and
            ``right_branching_entropy`` attributes.

    Returns:
        ``cohesion_forward * exp(right_branching_entropy)`` as a float.
    """
    from math import exp

    weight = score.cohesion_forward
    exponent = score.right_branching_entropy
    return weight * exp(exponent)

# Header row, then the 30 candidates with the highest word_score, best first.
print('단어   (빈도수, cohesion, branching entropy)\n')
for word, score in sorted(
        words.items(),
        key=lambda entry: word_score(entry[1]),
        reverse=True,
)[:30]:
    # Columns: word, left-side frequency, forward cohesion, right branching entropy.
    print(
        '%s     (%d, %.3f, %.3f)'
        % (word, score.leftside_frequency, score.cohesion_forward, score.right_branching_entropy)
    )

단어   (빈도수, cohesion, branching entropy)

됩니다     (1161, 0.973, 5.043)
합니다     (5247, 0.854, 5.157)
든다     (1945, 0.908, 5.093)
ㅡㅡ     (4441, 0.926, 4.971)
영화     (452008, 0.948, 4.944)
항상     (2060, 0.805, 5.025)
ㅠㅠ     (14371, 0.944, 4.857)
봅니다     (1527, 0.923, 4.858)
ㅜㅜ     (4067, 0.914, 4.866)
갑자기     (1871, 0.693, 5.100)
ㅎㅎ     (13420, 0.930, 4.760)
입니다     (3260, 0.598, 5.183)
줍니다     (1071, 0.974, 4.679)
텐데     (484, 0.883, 4.748)
듭니다     (1037, 0.995, 4.625)
근데     (8118, 0.680, 4.991)
솔직히     (9106, 0.897, 4.702)
뭔가     (10888, 0.803, 4.807)
였습니다     (759, 0.619, 5.065)
쵝오     (1902, 0.975, 4.604)
너무     (98389, 0.981, 4.568)
준다     (2653, 0.552, 5.136)
차라리     (2928, 0.486, 5.254)
역시     (20103, 0.566, 5.083)
특히     (8344, 0.522, 5.127)
된다     (3545, 0.672, 4.861)
봤습니다     (12438, 0.520, 5.117)
왔습니다     (2226, 0.631, 4.913)
까지     (1578, 0.411, 5.343)
갠적으로     (849, 0.845, 4.619)


### 생성한 words dictionary를 파일로 저장하기

In [23]:
import numpy as np

# Persist the extracted scores so the training step does not have to be rerun.
# NOTE(review): `words` is not a numeric array, so NumPy presumably stores it
# as a pickled object — confirm; such a file should only be loaded from a
# trusted source.
np.save('data/moviereview.npy', words)

In [24]:
# Discard the in-memory result so that reading the file back can be seen to
# restore it from disk.
words = 0
print(words)

0


### 파일에서 words dictionary 불러오기

In [25]:
# The saved file holds a pickled Python object (the words dict wrapped in a
# 0-d array).  np.load rejects pickled data unless allow_pickle=True is passed
# (the default has been False since NumPy 1.16.3), so it must be set here.
# .item() unwraps the 0-d array back into the dict.
# WARNING: unpickling can execute arbitrary code — only load files you
# created yourself.
words = np.load('data/moviereview.npy', allow_pickle=True).item()

In [26]:
from sys import getsizeof

# Number of words restored from the file.
print(len(words))
# NOTE: sys.getsizeof is shallow — this is the size of the dict object itself,
# not of the keys and Scores records it refers to.
print(getsizeof(words)/1024, ' KB')

24995
1536.09375  KB


In [27]:
# Check that a restored entry has the same type and values as before saving.
print('type: %s\n' % type(words['킬링타임']))
print(words['킬링타임'])

type: <class 'soynlp.word._word.Scores'>

Scores(cohesion_forward=0.9069446565911313, cohesion_backward=0, left_branching_entropy=3.977275638579379, right_branching_entropy=0.6093567670846984, left_accessor_variety=159, right_accessor_variety=27, leftside_frequency=3689, rightside_frequency=0)
