In [42]:
import konlpy
import gensim
from pprint import pprint

In [43]:
def read_data(filename, encoding='utf-8'):
    """Load a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one entry per line after the first; each entry is the
        list of fields obtained by splitting that line on tab characters.
        The first line (the header) is dropped. An empty file yields [].
    """
    with open(filename, 'r', encoding=encoding) as f:
        rows = [line.split('\t') for line in f.read().splitlines()]
    return rows[1:]   # skip the header row
# Load both corpus splits; read_data() has already dropped each header row.
train_data, test_data = [
    read_data(path) for path in ('ratings_train.txt', 'ratings_test.txt')
]

In [3]:
from konlpy.tag import Twitter
# Shared tagger instance; tokenize() calls pos_tagger.pos() on it.
# NOTE(review): later KoNLPy releases appear to expose this tagger under the
# name Okt -- confirm against the installed version.
pos_tagger = Twitter()

In [47]:
pos_tagger.pos('나는 밥을 먹는다.')

[('나', 'Noun'),
 ('는', 'Josa'),
 ('밥', 'Noun'),
 ('을', 'Josa'),
 ('먹는', 'Verb'),
 ('다', 'Eomi'),
 ('.', 'Punctuation')]

In [4]:
def tokenize(doc):
    """Split a document into 'morpheme/POS' tokens.

    The text is run through the module-level ``pos_tagger`` with ``norm=True``
    and ``stem=True`` (both flags are optional for the tagger), and every
    tagged pair it returns is joined with a slash, e.g. '더빙/Noun'.
    """
    tagged_pairs = pos_tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged_pairs))

In [28]:
# Pair each row's tokenized column 1 with its column 2 value
# (the sample output shows a token list followed by a label such as '0').
train_docs = [(tokenize(fields[1]), fields[2]) for fields in train_data]
test_docs = [(tokenize(fields[1]), fields[2]) for fields in test_data]

In [49]:
train_docs[0]

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증/Noun',
  '나다/Verb',
  '목소리/Noun'],
 '0')

In [53]:
# Number of distinct tokens across the tokenized training documents.
# Computed directly from train_docs so this cell does not rely on `tokens`,
# which is only defined in a later cell (NameError on Restart & Run All).
len({token for doc_tokens, _label in train_docs for token in doc_tokens})

48765개 columns
15만개 rows

48765

In [51]:
# Total number of tokens across the tokenized training documents.
# Computed directly from train_docs so this cell does not rely on `tokens`,
# which is only defined in a later cell (NameError on Restart & Run All).
sum(len(doc_tokens) for doc_tokens, _label in train_docs)

2194536

In [36]:
# Flatten the per-document token lists of train_docs into one long list.
tokens = [token for doc_tokens, _label in train_docs for token in doc_tokens]

In [37]:
import nltk
# Wrap the flat token list in an nltk.Text named 'NMSC'; later cells call
# text.vocab() and text.concordance() on it.
text = nltk.Text(tokens, name='NMSC')
print(text)

<Text: NMSC>


In [9]:
pprint(text.vocab().most_common(10))

[('./Punctuation', 68630),
 ('영화/Noun', 51365),
 ('하다/Verb', 50281),
 ('이/Josa', 39123),
 ('보다/Verb', 34764),
 ('의/Josa', 30480),
 ('../Punctuation', 29055),
 ('에/Josa', 27108),
 ('가/Josa', 26696),
 ('을/Josa', 23481)]


In [56]:
# Feature vocabulary: the 2000 most frequent tokens in `text`, counts dropped.
selected_words = [word for word, _count in text.vocab().most_common(2000)]

def term_exists(doc, words=None):
    """Build a presence/absence feature dict for one document.

    Args:
        doc: Iterable of tokens making up the document.
        words: Vocabulary to test for. When omitted, the module-level
            ``selected_words`` list is used.

    Returns:
        A dict mapping ``'exists(<word>)'`` to True or False for every word
        in the vocabulary; True means the word occurs in ``doc``.
    """
    if words is None:
        words = selected_words
    # Build the lookup set once. Rebuilding it for every vocabulary word cost
    # O(len(words) * len(doc)) per call and silently exhausted iterator inputs
    # after the first word.
    doc_tokens = set(doc)
    return {'exists({})'.format(word): (word in doc_tokens) for word in words}

In [60]:
term_exists(train_docs[0][0])

{'exists(아/Exclamation)': True,
 'exists(일본/Noun)': False,
 'exists(반개/Noun)': False,
 'exists(저/Noun)': False,
 'exists(이정/Noun)': False,
 'exists(전문가/Noun)': False,
 'exists(곳/Noun)': False,
 'exists(이쁘다/Adjective)': False,
 'exists(아무리/Adverb)': False,
 'exists(^^/Punctuation)': False,
 'exists(죽음/Noun)': False,
 'exists(전쟁영화/Noun)': False,
 'exists(~!/Punctuation)': False,
 'exists(~^^/Punctuation)': False,
 'exists(하지만/Conjunction)': False,
 'exists(화면/Noun)': False,
 'exists(먹다/Verb)': False,
 'exists(커플/Noun)': False,
 'exists(이름/Noun)': False,
 'exists(짓다/Verb)': False,
 'exists(버리다/Verb)': False,
 'exists(굉장하다/Adjective)': False,
 'exists(는걸/Noun)': False,
 'exists(비디오/Noun)': False,
 'exists(정서/Noun)': False,
 'exists(내/Determiner)': False,
 'exists(</Punctuation)': False,
 'exists(이도/Noun)': False,
 'exists(15/Number)': False,
 'exists(거리/Noun)': False,
 'exists(와/Josa)': False,
 'exists(혼자/Noun)': False,
 'exists(장난/Noun)': False,
 'exists(척/Noun)': False,
 'exists(예/Noun)'

In [14]:
term_exists(train_docs[0][0])

{'exists(든/Josa)': False,
 'exists(소장/Noun)': False,
 'exists(수고/Noun)': False,
 'exists(힐링/Noun)': False,
 'exists(방식/Noun)': False,
 'exists(성룡/Noun)': False,
 'exists(인지/Josa)': False,
 'exists(조잡/Noun)': False,
 'exists(예능/Noun)': False,
 'exists(시키다/Verb)': False,
 'exists(굉장하다/Adjective)': False,
 'exists(등/Noun)': False,
 'exists(그렇다고/Conjunction)': False,
 'exists(남/Noun)': False,
 'exists(보단/Josa)': False,
 'exists(CG/Alpha)': False,
 'exists(가수/Noun)': False,
 'exists(ㅠㅜ/KoreanParticle)': False,
 'exists(야동/Noun)': False,
 'exists(괜히/Adverb)': False,
 'exists(임/Noun)': False,
 'exists(시나리오/Noun)': False,
 'exists(끼리/Noun)': False,
 'exists(!!/Punctuation)': False,
 'exists(독립영화/Noun)': False,
 'exists(D/Alpha)': False,
 'exists(이라니/Josa)': False,
 'exists(이해/Noun)': False,
 'exists(궁금하다/Adjective)': False,
 'exists(형사/Noun)': False,
 'exists(장/Suffix)': False,
 'exists(스릴/Noun)': False,
 'exists(별점/Noun)': False,
 'exists(지루하다/Adjective)': False,
 'exists(여/Josa)': False,
 'e

In [15]:
# Build (feature dict, label) pairs for the NLTK classifier.
# Only the first 30,000 training documents are featurized; every test
# document is used.
#
# The documents are read by iteration instead of being removed with pop(0).
# Popping left train_docs and test_docs empty, although later cells (the
# doc2vec section) still read them, and it made this cell give empty results
# when re-run. list.pop(0) also shifts the whole list on every call.
train_xy = [(term_exists(d), c) for d, c in train_docs[:30000]]
test_xy = [(term_exists(d), c) for d, c in test_docs]

In [16]:
# Fit a Naive Bayes classifier on the training pairs, print its accuracy on
# the test pairs, then list the 10 most informative features.
classifier = nltk.NaiveBayesClassifier.train(train_xy)
test_accuracy = nltk.classify.accuracy(classifier, test_xy)
print(test_accuracy)

classifier.show_most_informative_features(10)

0.81282
Most Informative Features
         exists(최악/Noun) = True                0 : 1      =     50.5 : 1.0
         exists(낭비/Noun) = True                0 : 1      =     35.2 : 1.0
          exists(똥/Noun) = True                0 : 1      =     34.1 : 1.0
         exists(노잼/Noun) = True                0 : 1      =     28.0 : 1.0
        exists(최고다/Noun) = True                1 : 0      =     26.0 : 1.0
         exists(졸작/Noun) = True                0 : 1      =     25.7 : 1.0
         exists(반개/Noun) = True                0 : 1      =     25.5 : 1.0
       exists(♥/Foreign) = True                1 : 0      =     24.0 : 1.0
        exists(찡/Adverb) = True                1 : 0      =     23.1 : 1.0
          exists(굿/Noun) = True                1 : 0      =     22.7 : 1.0


In [29]:
from collections import namedtuple
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags'])
# Wrap every (tokens, label) pair as a document whose only tag is its label.
# All 150k training documents are meant to be used here, so train_docs must
# still hold the full tokenized set when this cell runs.
tagged_train_docs = [TaggedDocument(words=d, tags=[c]) for d, c in train_docs]
tagged_test_docs = [TaggedDocument(words=d, tags=[c]) for d, c in test_docs]

In [30]:
from gensim.models import doc2vec
# Build the vocabulary
doc_vectorizer = doc2vec.Doc2Vec(size=300, alpha=0.025, min_alpha=0.025, seed=1234) # try varying size, number of epochs, etc.
doc_vectorizer.build_vocab(tagged_train_docs)
# Train document vectors!
# The learning rate is managed by hand: alpha and min_alpha start equal
# (0.025); after each of the 10 passes alpha is lowered by 0.002 and min_alpha
# is reset to the same value.
# NOTE(review): this relies on train() accepting only the corpus. Newer gensim
# releases may require extra arguments (e.g. total_examples / epochs) and may
# rename `size` -- confirm against the installed gensim version.
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs)
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

In [31]:
pprint(doc_vectorizer.most_similar('공포/Noun'))

[('공포영화/Noun', 0.4244290888309479),
 ('호러/Noun', 0.3459432125091553),
 ('서스펜스/Noun', 0.3282715678215027),
 ('긴박/Noun', 0.29170575737953186),
 ('미스터리/Noun', 0.28643351793289185),
 ('스릴러/Noun', 0.26772546768188477),
 ('무섭다/Adjective', 0.2608919143676758),
 ('반전/Noun', 0.25883162021636963),
 ('귀신/Noun', 0.2580486536026001),
 ('당혹/Noun', 0.2504921853542328)]


In [32]:
pprint(doc_vectorizer.most_similar('ㅋㅋ/KoreanParticle'))

[('ㅋㅋㄱ/KoreanParticle', 0.33676469326019287),
 ('ㅋ/KoreanParticle', 0.3310341238975525),
 ('!!!!!!!!!!!!!!/Punctuation', 0.2943721413612366),
 ('농부/Noun', 0.2838119864463806),
 ('ㅉㅉ/KoreanParticle', 0.27554744482040405),
 ('^^^/Punctuation', 0.272133469581604),
 ('ㅎㅎ/KoreanParticle', 0.26229777932167053),
 ('-_-;/Punctuation', 0.2622581124305725),
 ('송재림/Noun', 0.2533813714981079),
 ('박형식/Noun', 0.25229310989379883)]


In [33]:
pprint(doc_vectorizer.most_similar(positive=['여자/Noun', '왕/Noun'], negative=['남자/Noun']))

[('압/Noun', 0.2870830297470093),
 ('증인/Noun', 0.23717403411865234),
 ('백치/Noun', 0.23699885606765747),
 ('패전/Noun', 0.23230470716953278),
 ('김삼순/Noun', 0.23064997792243958),
 ('거두다/Verb', 0.22163476049900055),
 ('전도연/Noun', 0.21829411387443542),
 ('궁합/Noun', 0.21358992159366608),
 ('총집/Noun', 0.21237078309059143),
 ('베일/Noun', 0.21168667078018188)]


In [38]:
text.concordance('왕/Noun', lines=10)

Displaying 10 of 145 matches:
Josa 로맨스/Noun 냐/Josa ,,/Punctuation 왕/Noun 짜증/Noun ...../Punctuation 아주/Noun 전
/Noun 함/Noun ../Punctuation 결말/Noun 왕/Noun 실망/Noun 임/Noun 전작/Noun 에/Josa 비/Nou
nction 얼굴/Noun 만/Josa 예쁘다/Adjective 왕/Noun 되다/Verb 맞다/Verb 드라마/Noun 라도/Josa 도덕
/Noun 스릴러/Noun 임/Noun ?/Punctuation 왕/Noun 실망/Noun ./Punctuation 연기/Noun 대본/No
b 금/Noun 사인방/Noun ㅠㅠ/KoreanParticle 왕/Noun 잼/Noun 없다/Adjective ./Punctuation 정
osa 서유기/Noun 보다/Josa 희극/Noun 지/Josa 왕/Noun 이/Josa 더/Noun 최고/Noun 라/Josa 생각/Nou
접/Noun 한/Josa 걸작/Noun ./Punctuation 왕/Noun ,/Punctuation 너무/Noun 감동/Noun 적/Suf
Josa 온/Noun 거/Noun 처럼/Noun 제나라/Noun 왕/Noun 과/Josa 군사/Noun 들/Suffix 을/Josa 속이다/
다/Verb ./Punctuation 기대하다/Adjective 왕/Noun 지루/Noun .../Punctuation 제니퍼/Noun 틸리
tive 움/Noun 짜증/Noun .../Punctuation 왕/Noun 짜증/Noun ../Punctuation 사람/Noun 마다/J


In [39]:
# Infer one vector per document from its words; the label is the document's
# first (and only) tag. Train documents are processed before test documents.
train_x = [doc_vectorizer.infer_vector(words) for words, _tags in tagged_train_docs]
train_y = [tags[0] for _words, tags in tagged_train_docs]

test_x = [doc_vectorizer.infer_vector(words) for words, _tags in tagged_test_docs]
test_y = [tags[0] for _words, tags in tagged_test_docs]

In [40]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the inferred training vectors, then display its
# score on the test vectors.
classifier = LogisticRegression(random_state=0).fit(train_x, train_y)
classifier.score(test_x, test_y)

0.63173999999999997

---

# 변형

In [23]:
train_xy[0]

({'exists(든/Josa)': False,
  'exists(소장/Noun)': False,
  'exists(수고/Noun)': False,
  'exists(힐링/Noun)': False,
  'exists(방식/Noun)': False,
  'exists(성룡/Noun)': False,
  'exists(인지/Josa)': False,
  'exists(조잡/Noun)': False,
  'exists(예능/Noun)': False,
  'exists(시키다/Verb)': False,
  'exists(굉장하다/Adjective)': False,
  'exists(등/Noun)': False,
  'exists(그렇다고/Conjunction)': False,
  'exists(남/Noun)': False,
  'exists(보단/Josa)': False,
  'exists(CG/Alpha)': False,
  'exists(가수/Noun)': False,
  'exists(ㅠㅜ/KoreanParticle)': False,
  'exists(야동/Noun)': False,
  'exists(괜히/Adverb)': False,
  'exists(임/Noun)': False,
  'exists(시나리오/Noun)': False,
  'exists(끼리/Noun)': False,
  'exists(!!/Punctuation)': False,
  'exists(독립영화/Noun)': False,
  'exists(D/Alpha)': False,
  'exists(이라니/Josa)': False,
  'exists(이해/Noun)': False,
  'exists(궁금하다/Adjective)': False,
  'exists(형사/Noun)': False,
  'exists(장/Suffix)': False,
  'exists(스릴/Noun)': False,
  'exists(별점/Noun)': False,
  'exists(지루하다/Adjective)': Fa

In [1]:
import konlpy
import gensim
from pprint import pprint

In [2]:
def read_data(filename, encoding='utf-8'):
    """Load a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one entry per line after the first; each entry is the
        list of fields obtained by splitting that line on tab characters.
        The first line (the header) is dropped. An empty file yields [].
    """
    with open(filename, 'r', encoding=encoding) as f:
        rows = [line.split('\t') for line in f.read().splitlines()]
    return rows[1:]   # skip the header row
# Load both corpus splits; read_data() has already dropped each header row.
train_data, test_data = [
    read_data(path) for path in ('ratings_train.txt', 'ratings_test.txt')
]

In [3]:
from konlpy.tag import Twitter
# Shared tagger instance; tokenize() calls pos_tagger.pos() on it.
# NOTE(review): later KoNLPy releases appear to expose this tagger under the
# name Okt -- confirm against the installed version.
pos_tagger = Twitter()

In [4]:
def tokenize(doc):
    """Split a document into 'morpheme/POS' tokens.

    The text is run through the module-level ``pos_tagger`` with ``norm=True``
    and ``stem=True`` (both flags are optional for the tagger), and every
    tagged pair it returns is joined with a slash.
    """
    tagged_pairs = pos_tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged_pairs))

In [5]:
# Pair each row's tokenized column 1 with its column 2 value.
train_docs = [(tokenize(fields[1]), fields[2]) for fields in train_data]
test_docs = [(tokenize(fields[1]), fields[2]) for fields in test_data]

In [18]:
# Gather the token list of every training document followed by every test
# document (the second element of each pair is dropped), then report the total.
all_docs = [doc_tokens for doc_tokens, _label in train_docs]
all_docs += [doc_tokens for doc_tokens, _label in test_docs]

print(len(all_docs))

200000


In [20]:
# Build a gensim Dictionary from the token lists; a later cell calls
# dics.doc2bow() on it.
dics = gensim.corpora.Dictionary(all_docs)

In [21]:
dics

<gensim.corpora.dictionary.Dictionary at 0x145af2c88>

In [23]:
from gensim import models

# Convert every token list to the form returned by dics.doc2bow(), keeping
# document order.
#
# The list is iterated instead of being emptied with pop(0). Popping left
# all_docs empty, so re-running this cell produced an empty tf_ko, and each
# pop(0) shifts the remaining ~200k entries.
tf_ko = [dics.doc2bow(doc_tokens) for doc_tokens in all_docs]

In [63]:
tf_ko[41831]

[(37, 1), (624, 1), (1067, 1), (1308, 1)]

In [41]:
len(tf_ko)

200000