In [1]:
import codecs

import numpy as np
# Load the training file as a list of tab-separated rows.
with codecs.open("ratings_train.txt", encoding='utf-8') as f:
    data = [
        line.split('\t')
        for line in f.read().splitlines()[1:]   # [1:] skips the header line
    ]

In [2]:
from pprint import pprint
# Show the first parsed row to check how the tab-separated fields were split.
pprint(data[0])

['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']


In [3]:
# Field 1 of each row is the review text and field 2 is the label (stored as
# a string in the file, so it is converted to an integer array here).
# Reading the fields row by row avoids building the full zip(*data) transpose
# twice. X stays a tuple, as before.
X = tuple(row[1] for row in data)
y = np.array([row[2] for row in data], dtype=int)

---

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Baseline: raw token counts fed into a multinomial naive Bayes classifier.
model1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('mb', MultinomialNB()),
])

In [5]:
%%time 
# Fit the count-vectorizer + naive Bayes pipeline on the training texts and labels.
model1.fit(X, y)

CPU times: user 4.57 s, sys: 88.3 ms, total: 4.66 s
Wall time: 4.67 s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [6]:
import codecs
# Load the test file as a list of tab-separated rows.
with codecs.open("ratings_test.txt", encoding='utf-8') as f:
    data_test = [
        line.split('\t')
        for line in f.read().splitlines()[1:]   # [1:] skips the header line
    ]

In [7]:
# Field 1 of each row is the review text and field 2 is the label (stored as
# a string in the file, so it is converted to an integer array here).
# Reading the fields row by row avoids building the full zip(*data_test)
# transpose twice. X_test stays a tuple, as before.
X_test = tuple(row[1] for row in data_test)
y_test = np.array([row[2] for row in data_test], dtype=int)

# classification_report returns a plain string, so print() is the right display.
print(classification_report(y_test, model1.predict(X_test)))

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.82     25173

avg / total       0.83      0.83      0.83     50000



### Tf-idf 방법 사용

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same classifier as the baseline, but with TF-IDF weighted features.
model2 = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('mb', MultinomialNB()),
])

In [10]:
%%time 
# Fit the TF-IDF + naive Bayes pipeline on the training texts and labels.
model2.fit(X, y)

CPU times: user 4.77 s, sys: 40.1 ms, total: 4.81 s
Wall time: 4.82 s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [11]:
# Score the TF-IDF model on the test set.
print(
    classification_report(
        y_true=y_test,
        y_pred=model2.predict(X_test),
    )
)

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.83     25173

avg / total       0.83      0.83      0.83     50000



### 형태소 분석기 사용

In [12]:
from konlpy.tag import Twitter

# One tagger instance, created once and reused by every tokenize_pos call.
# NOTE(review): newer KoNLPy releases appear to rename Twitter to Okt —
# confirm against the installed version before upgrading.
pos_tagger = Twitter()

def tokenize_pos(doc):
    """Tokenize ``doc`` with the shared tagger and return '/'-joined items.

    Calls ``pos_tagger.pos`` with ``norm=True`` and ``stem=True`` and joins
    each returned item with '/'. Presumably each item is a (token, POS tag)
    pair, giving strings of the form 'token/Tag' — TODO confirm.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]

In [13]:
# Count features built from tokenize_pos output instead of the default
# regex tokenizer, followed by multinomial naive Bayes.
model3 = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
])

In [14]:
%%time 
# Fit the tokenize_pos-based pipeline; the recorded run below took several
# minutes, far longer than the earlier models.
model3.fit(X, y)

CPU times: user 5min 55s, sys: 980 ms, total: 5min 56s
Wall time: 5min 40s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f28>,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [15]:
# Score the tokenize_pos-based model on the test set.
print(
    classification_report(
        y_true=y_test,
        y_pred=model3.predict(X_test),
    )
)

             precision    recall  f1-score   support

          0       0.83      0.85      0.84     24827
          1       0.85      0.83      0.84     25173

avg / total       0.84      0.84      0.84     50000



### (1,2)-gram 사용

In [None]:
# TF-IDF over tokenize_pos tokens, using both unigrams and bigrams,
# followed by multinomial naive Bayes.
model4 = Pipeline(steps=[
    ('vect', TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1, 2))),
    ('mb', MultinomialNB()),
])