## Load modules

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS

from scipy import sparse
import pickle

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

## Dataset Preparation

In [2]:
data_url = "../dataset/IMDB_Dataset.csv"
df = pd.read_csv(data_url)

In [3]:
# 학습데이터와 검증데이터 set나누기
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['review'], df['sentiment'])

In [4]:
# 결과데이터들을 문자에서 0과1로 바꿔주기
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [5]:
train_x.shape, train_y.shape, valid_x.shape, valid_y.shape

((37500,), (37500,), (12500,), (12500,))

In [6]:
test_x = ["The film has clever ideas about literalizing black fear of the pathology of whiteness, but it doesn't go far enough in its indictment of white supremacy"]

## Vectorizer

- analyer : 단어단위로 자르기 때문에 word로 설정
- stop_words : review는 영어임으로 english로, 의미없는 숫자 feature가 많아 숫자도 포함하여 설정
- ngram_range : 일반적으로 (1,3)으로 설정
- max_features : 실험을 통해 100,000개가 가장 적당하다고 판단

#### Count Vectors as features

In [7]:
stop_words = ['0','1','2','3','4','5','6','7','8','9'] + list(ENGLISH_STOP_WORDS)

In [86]:
count_vect = CountVectorizer(analyzer='word', stop_words=stop_words, ngram_range=(1,3), max_features=50000)
count_vect.fit(df['review'])

xtrain_count = count_vect.transform(train_x)
xvaild_count = count_vect.transform(valid_x)

xtest_count = count_vect.transform(test_x)

#### TF-IDF Vectors as features

In [90]:
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words=stop_words, ngram_range=(1,3), max_features=50000)
tfidf_vect.fit(df['review'])

xtrain_tfidf = tfidf_vect.transform(train_x)
xvaild_tfidf = tfidf_vect.transform(valid_x)

xtest_tfidf = tfidf_vect.transform(test_x)

In [134]:
# with open('train_y.pkl', 'wb')as f:
#     pickle.dump(train_y, f)

In [135]:
# with open('valid_y.pkl', 'wb')as f:
#     pickle.dump(valid_y, f)

In [136]:
# sparse.save_npz("xtrain_count.npz", xtrain_count)
# sparse.save_npz("xvaild_count.npz", xvaild_count)
# sparse.save_npz("xtest_count.npz", xtest_count)

# sparse.save_npz("xtrain_tfidf.npz", xtrain_tfidf)
# sparse.save_npz("xvaild_tfidf.npz", xvaild_tfidf)
# sparse.save_npz("xtest_tfidf.npz", xtest_tfidf)


In [2]:
# with open('train_y.pkl', 'rb')as f:
#     train_y = pickle.load(f)

In [3]:
# with open('valid_y.pkl', 'rb')as f:
#     valid_y = pickle.load(f)

In [4]:
# xtrain_count = sparse.load_npz("xtrain_count.npz")
# xvaild_count = sparse.load_npz("xvaild_count.npz")
# xtest_count = sparse.load_npz("xtest_count.npz")

# xtrain_tfidf = sparse.load_npz("xtrain_tfidf.npz")
# xvaild_tfidf = sparse.load_npz("xvaild_tfidf.npz")
# xtest_tfidf = sparse.load_npz("xtest_tfidf.npz")

## Model

#### Bernoulli NB

In [92]:
clf = BernoulliNB(binarize=0)

In [93]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
sum(clf.predict(xvaild_count)==valid_y)/len(valid_y)

0.87824

In [94]:
clf.predict(xtest_count)

array([1])

In [95]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
sum(clf.predict(xvaild_tfidf)==valid_y)/len(valid_y)

0.87824

In [96]:
clf.predict(xtest_tfidf)

array([1])

#### Multinomial NB

In [97]:
clf = MultinomialNB(alpha=1)

In [98]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
sum(clf.predict(xvaild_count)==valid_y)/len(valid_y)

0.874

In [99]:
clf.predict(xtest_count)

array([1])

In [100]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
sum(clf.predict(xvaild_tfidf)==valid_y)/len(valid_y)

0.8828

In [101]:
clf.predict(xtest_tfidf)

array([1])

#### Gaussian NB

In [102]:
clf = GaussianNB()

In [103]:
# Use Count Vectors
clf.fit(xtrain_count.toarray(), train_y)
sum(clf.predict(xvaild_count.toarray())==valid_y)/len(valid_y)

0.82856

In [104]:
clf.predict(xtest_count.toarray())

array([1])

In [105]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf.toarray(), train_y)
sum(clf.predict(xvaild_tfidf.toarray())==valid_y)/len(valid_y)

0.82896

In [106]:
clf.predict(xtest_tfidf.toarray())

array([1])