## Load modules

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from scipy import sparse
import pickle

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

## Dataset Preparation

In [2]:
data_url = "../dataset/IMDB_Dataset.csv"
df = pd.read_csv(data_url)

In [3]:
# 학습데이터와 검증데이터 set나누기
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['review'], df['sentiment'])

In [4]:
# 결과데이터들을 문자에서 0과1로 바꿔주기
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [5]:
train_x.shape, train_y.shape, valid_x.shape, valid_y.shape

((37500,), (37500,), (12500,), (12500,))

In [6]:
test_x = ["The film has clever ideas about literalizing black fear of the pathology of whiteness, but it doesn't go far enough in its indictment of white supremacy"]

## Vectorizer

#### Count Vectors as features

In [149]:
# 단어단위로 자르기 때문에 analyzer는 word로 설정한다
# review가 영어임으로 stop_words를 english로 설정해준다
count_vect = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(1,3), max_features=4000)
count_vect.fit(df['review'])

xtrain_count = count_vect.transform(train_x)
xvaild_count = count_vect.transform(valid_x)

xtest_count = count_vect.transform(test_x)

#### TF-IDF Vectors as features

In [160]:
# 단어단위로 자르기 때문에 analyzer는 word로 설정한다
# review가 영어임으로 stop_words를 english로 설정해준다
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,3), max_features=4000)
tfidf_vect.fit(df['review'])

xtrain_tfidf = tfidf_vect.transform(train_x)
xvaild_tfidf = tfidf_vect.transform(valid_x)

xtest_tfidf = tfidf_vect.transform(test_x)

## Model

#### Bernoulli NB

In [161]:
clf = BernoulliNB(binarize=0)

In [162]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
sum(clf.predict(xvaild_count)==valid_y)/len(valid_y)

0.86128

In [163]:
clf.predict(xtest_count)

array([1])

In [164]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
sum(clf.predict(xvaild_tfidf)==valid_y)/len(valid_y)

0.86128

In [165]:
clf.predict(xtest_tfidf)

array([1])

#### Multinomial NB

In [166]:
clf = MultinomialNB(alpha=1)

In [167]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
sum(clf.predict(xvaild_count)==valid_y)/len(valid_y)

0.85416

In [168]:
clf.predict(xtest_count)

array([0])

In [169]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
sum(clf.predict(xvaild_tfidf)==valid_y)/len(valid_y)

0.86304

In [170]:
clf.predict(xtest_tfidf)

array([1])

#### Gaussian NB

In [171]:
clf = GaussianNB()

In [172]:
# Use Count Vectors
clf.fit(xtrain_count.toarray(), train_y)
sum(clf.predict(xvaild_count.toarray())==valid_y)/len(valid_y)

0.80608

In [173]:
clf.predict(xtest_count.toarray())

array([0])

In [174]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf.toarray(), train_y)
sum(clf.predict(xvaild_tfidf.toarray())==valid_y)/len(valid_y)

0.828

In [175]:
clf.predict(xtest_tfidf.toarray())

array([1])

## Save Variable

In [9]:
with open('train_y.pkl', 'wb')as f:
    pickle.dump(train_y, f)

In [10]:
sparse.save_npz("xtrain_count.npz", xtrain_count)
sparse.save_npz("xtest_count.npz", xtest_count)
sparse.save_npz("xtrain_tfidf.npz", xtrain_tfidf)
sparse.save_npz("xtest_tfidf.npz", xtest_tfidf)

## Load Variable

In [2]:
with open('train_y.pkl', 'rb')as f:
    train_y = pickle.load(f)

In [3]:
xtrain_count = sparse.load_npz("xtrain_count.npz")
xtest_count = sparse.load_npz("xtest_count.npz")
xtrain_tfidf = sparse.load_npz("xtrain_tfidf.npz")
xtest_tfidf = sparse.load_npz("xtest_tfidf.npz")

## Model

#### Bernoulli NB

In [4]:
clf = BernoulliNB(binarize=0)

In [5]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
clf.predict(xtest_count)

array([1])

In [6]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
clf.predict(xtest_tfidf)

array([1])

#### Multinomial NB

In [7]:
clf = MultinomialNB(alpha=1)

In [8]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
clf.predict(xtest_count)

array([1])

In [9]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
clf.predict(xtest_tfidf)

array([1])

#### Gaussian NB

In [10]:
clf = GaussianNB()

In [11]:
# Use Count Vectors
clf.fit(xtrain_count[:20000].toarray(), train_y[:20000])
clf.predict(xtest_count.toarray())

array([1])

In [12]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf[:20000].toarray(), train_y[:20000])
clf.predict(xtest_tfidf.toarray())

array([1])