In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
def tf_extractor(corpus, min_df=1, ngram_range=(1, 1)):
    """Build a term-frequency (raw count) document-term matrix.

    Args:
        corpus: Iterable of raw document strings to fit on and transform.
        min_df: Forwarded to ``CountVectorizer(min_df=...)``. Defaults to 1,
            the value that was previously hard-coded.
        ngram_range: Forwarded to ``CountVectorizer(ngram_range=...)``.
            Defaults to ``(1, 1)``, the value that was previously hard-coded.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer`` and
        the matrix returned by its ``fit_transform(corpus)``.
    """
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    # Learn the vocabulary and convert the texts to a count matrix in one pass.
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [3]:
def tfidf_extractor(corpus, min_df=1, norm='l2', ngram_range=(1, 1)):
    """Build a tf-idf weighted document-term matrix.

    Args:
        corpus: Iterable of raw document strings to fit on and transform.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``. Defaults to 1,
            the value that was previously hard-coded.
        norm: Forwarded to ``TfidfVectorizer(norm=...)``. Defaults to
            ``'l2'``, the value that was previously hard-coded.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.
            Defaults to ``(1, 1)``, the value that was previously hard-coded.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer`` and
        the matrix returned by its ``fit_transform(corpus)``.
    """
    vectorizer = TfidfVectorizer(min_df=min_df,
                                 norm=norm,
                                 ngram_range=ngram_range)
    # Learn vocabulary and idf weights, then transform the texts, in one pass.
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [5]:
# Parse the review file line by line. Each line is split on a double tab;
# only lines that yield exactly three fields are kept. Field 1 is used as the
# review text and field 2 is converted to an integer score.
docs = []
with open('2016_filtered_review.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t\t')
        if len(fields) == 3:
            docs.append((fields[1], int(fields[2])))

# Separate the (text, score) pairs into two parallel tuples.
texts, scores = zip(*docs)

In [6]:
# Assign a binary sentiment label from the rating and drop the middle band:
#   score <= 4      -> negative, label 0
#   score >= 8      -> positive, label 1
#   4 < score < 8   -> skipped (neither clearly positive nor negative)
labeled_pairs = [
    (text, 1 if score >= 8 else 0)
    for text, score in zip(texts, scores)
    if not (4 < score < 8)
]

filtered_texts = [text for text, _ in labeled_pairs]
filtered_labels = [label for _, label in labeled_pairs]

In [7]:
# Hold out 20% of the labelled reviews as a test set.
# random_state=0 pins the shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    filtered_texts,
    filtered_labels,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Vectorize the training texts using raw term counts (input: a list of documents).
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
# Transform the test texts with the vocabulary learned from the training set.
test_tf_features = tf_vectorizer.transform(test_texts)
# vocabulary_ maps each term to its COLUMN INDEX in the feature matrix (it does
# not hold frequencies). Sorting the terms by that index yields a list in which
# vocablist[i] is the term represented by column i.
vocablist = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)

In [9]:
# Model trained on the term-frequency (count) matrix.
# This is L2-regularized logistic regression (a classifier, not ridge regression).
# C is the inverse of the regularization strength: the smaller C is, the
# stronger the penalty. A stronger L1 penalty drives more feature weights to
# exactly zero; a stronger L2 penalty shrinks the weights closer to zero.
# The 'sag' solver is stochastic, so random_state is fixed (same seed as the
# train/test split) to make the fitted model reproducible across runs.
lr2 = LogisticRegression(C=10, penalty='l2', solver='sag', random_state=0)
lr2.fit(train_tf_features, train_labels)  # train
pred_labels = lr2.predict(test_tf_features)

Misclassified samples: 11760 out of 164523
Accuracy: 0.93




In [None]:
from sklearn.metrics import accuracy_score

# Count the test documents whose predicted label differs from the true label.
n_misclassified = (pred_labels != test_labels).sum()
n_test = len(test_labels)
print('Misclassified samples: {} out of {}'.format(n_misclassified, n_test))

# Fraction of test documents labelled correctly, shown to two decimals.
test_accuracy = accuracy_score(test_labels, pred_labels)
print('Accuracy: %.2f' % test_accuracy)

In [11]:
# Model trained on the tf-idf matrix.
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
# Transform the test texts with the vocabulary/idf learned from the training set.
test_tfidf_features = tfidf_vectorizer.transform(test_texts)
# This is L1-regularized logistic regression (a classifier, not lasso regression).
# The 'saga' solver is stochastic, so random_state is fixed (same seed as the
# train/test split) to make the fitted model reproducible across runs.
lr = LogisticRegression(C=0.1, penalty='l1', solver='saga', random_state=0)
lr.fit(train_tfidf_features, train_labels)  # train
pred_labels = lr.predict(test_tfidf_features)


Misclassified samples: 14074 out of 164523
Accuracy: 0.91


In [None]:
# Count the test documents whose predicted label differs from the true label.
n_misclassified = (pred_labels != test_labels).sum()
n_test = len(test_labels)
print('Misclassified samples: {} out of {}'.format(n_misclassified, n_test))

# Fraction of test documents labelled correctly, shown to two decimals.
test_accuracy = accuracy_score(test_labels, pred_labels)
print('Accuracy: %.2f' % test_accuracy)

In [10]:
# Get the coefficients of the tf-idf model `lr`.
coefficients = lr.coef_.tolist()

# `lr` was fitted on features produced by tfidf_vectorizer, so coefficient
# column i must be translated with THAT vectorizer's vocabulary (term -> column
# index) rather than the count vectorizer's. The two only coincide while both
# vectorizers are configured identically.
tfidf_vocablist = sorted(tfidf_vectorizer.vocabulary_,
                         key=tfidf_vectorizer.vocabulary_.get)

# Every term used in training has one coefficient (weight). Pair each weight
# with its column index and sort from the largest weight to the smallest.
# coefficients[0] is the single weight row read from this model.
sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x: x[1], reverse=True)

print(sorted_coefficients[:5])
# Print the 50 terms with the largest (most positive) weights.
for index, coef in sorted_coefficients[:50]:
    print('{0:} ({1:.3f})'.format(tfidf_vocablist[index], coef))
# Print the 50 terms with the smallest (most negative) weights.
for index, coef in sorted_coefficients[-50:]:
    print('{0:} ({1:.3f})'.format(tfidf_vocablist[index], coef))

[(8402, 3.2834260607516965), (50619, 2.8758307801962313), (49991, 2.793400720353563), (50012, 2.7417887272975565), (50561, 2.731604141685505)]
꿀잼 (3.283)
재밌었 (2.876)
재미있게 (2.793)
재미있었 (2.742)
재밌게 (2.732)
존잼 (2.517)
재미있네 (2.500)
여운 (2.481)
재밌어 (2.443)
재밌네 (2.430)
재미있어 (2.422)
강추 (2.414)
굿굿 (2.363)
재밌고 (2.356)
즐겁 (2.171)
최고다 (2.169)
지루할 (2.167)
지루하지 (2.111)
유쾌 (2.110)
개꿀잼 (2.065)
재미있 (2.053)
재밌던 (2.028)
테러 (1.980)
심장 (1.956)
졸잼 (1.954)
재밌 (1.940)
재밋어 (1.938)
수작 (1.937)
재미있고 (1.934)
울었 (1.925)
사랑해 (1.918)
최고 (1.875)
흥미진진 (1.867)
감사합 (1.858)
재밋 (1.855)
빠져 (1.851)
대박 (1.845)
낮아 (1.827)
감사 (1.791)
감탄 (1.772)
충분히 (1.763)
탄탄 (1.762)
가슴 (1.755)
재밋음 (1.751)
멋지 (1.745)
만점 (1.721)
전문가 (1.685)
재미나 (1.665)
웃다 (1.656)
다만 (1.649)
없고 (-1.821)
미화 (-1.837)
알바 (-1.847)
댓글알바 (-1.862)
표절 (-1.866)
짜증 (-1.869)
높아 (-1.885)
신파극 (-1.887)
왜곡 (-1.913)
삼류 (-1.919)
디워 (-1.919)
자다 (-1.921)
별로 (-1.939)
억지 (-1.940)
아까웠 (-1.959)
팔이 (-1.983)
망쳐 (-2.010)
제로 (-2.029)
하품 (-2.045)
아까운 (-2.053)
재미없 (-2.071)
거품 (-2.105)
졸다 (-2