## Load modules

In [21]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS

from scipy import sparse
import pickle

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

## Dataset Preparation

In [2]:
data_url = "../dataset/IMDB_Dataset.csv"
df = pd.read_csv(data_url)

In [3]:
# 학습데이터와 검증데이터 set나누기
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['review'], df['sentiment'])

In [4]:
# 결과데이터들을 문자에서 0과1로 바꿔주기
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [5]:
train_x.shape, train_y.shape, valid_x.shape, valid_y.shape

((37500,), (37500,), (12500,), (12500,))

In [6]:
test_x = ["The film has clever ideas about literalizing black fear of the pathology of whiteness, but it doesn't go far enough in its indictment of white supremacy"]

## Vectorizer

- analyer : 단어단위로 자르기 때문에 word로 설정
- stop_words : review는 영어임으로 english로 설정
- ngram_range : 일반적으로 (1,3)으로 설정

#### Count Vectors as features

In [22]:
stop_words = [0,1,2,3,4,5,6,7,8,9] + list(ENGLISH_STOP_WORDS)

In [None]:
count_vect = CountVectorizer(analyzer='word', stop_words=stop_words, ngram_range=(1,3), max_features=5000)
count_vect.fit(df['review'])

xtrain_count = count_vect.transform(train_x)
xvaild_count = count_vect.transform(valid_x)

xtest_count = count_vect.transform(test_x)

In [None]:
count_vect.get_feature_names()

In [320]:
xtrain_count

<37500x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 6217223 stored elements in Compressed Sparse Row format>

In [321]:
xvaild_count

<12500x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 2076586 stored elements in Compressed Sparse Row format>

In [322]:
xtest_count

<1x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

#### TF-IDF Vectors as features

In [323]:
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english'and
            [0,1,2,3,4,5,6,7,8,9], ngram_range=(1,3), max_features=5000)
tfidf_vect.fit(df['review'])

xtrain_tfidf = tfidf_vect.transform(train_x)
xvaild_tfidf = tfidf_vect.transform(valid_x)

xtest_tfidf = tfidf_vect.transform(test_x)

In [324]:
xtrain_tfidf

<37500x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 6217223 stored elements in Compressed Sparse Row format>

## Model

#### Bernoulli NB

In [325]:
clf = BernoulliNB(binarize=0)

In [326]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
sum(clf.predict(xvaild_count)==valid_y)/len(valid_y)

0.83352

In [327]:
clf.predict(xtest_count)

array([1])

In [328]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
sum(clf.predict(xvaild_tfidf)==valid_y)/len(valid_y)

0.83352

In [329]:
clf.predict(xtest_tfidf)

array([1])

#### Multinomial NB

In [330]:
clf = MultinomialNB(alpha=1)

In [331]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
sum(clf.predict(xvaild_count)==valid_y)/len(valid_y)

0.83656

In [332]:
clf.predict(xtest_count)

array([1])

In [333]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
sum(clf.predict(xvaild_tfidf)==valid_y)/len(valid_y)

0.856

In [334]:
clf.predict(xtest_tfidf)

array([1])

#### Gaussian NB

In [339]:
clf = GaussianNB()

In [340]:
# Use Count Vectors
clf.fit(xtrain_count.toarray(), train_y)
sum(clf.predict(xvaild_count.toarray())==valid_y)/len(valid_y)

MemoryError: 

In [None]:
clf.predict(xtest_count.toarray())

In [None]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf.toarray(), train_y)
sum(clf.predict(xvaild_tfidf.toarray())==valid_y)/len(valid_y)

In [None]:
clf.predict(xtest_tfidf.toarray())

## Save Variable

In [9]:
with open('train_y.pkl', 'wb')as f:
    pickle.dump(train_y, f)

In [10]:
sparse.save_npz("xtrain_count.npz", xtrain_count)
sparse.save_npz("xtest_count.npz", xtest_count)
sparse.save_npz("xtrain_tfidf.npz", xtrain_tfidf)
sparse.save_npz("xtest_tfidf.npz", xtest_tfidf)

## Load Variable

In [2]:
with open('train_y.pkl', 'rb')as f:
    train_y = pickle.load(f)

In [3]:
xtrain_count = sparse.load_npz("xtrain_count.npz")
xtest_count = sparse.load_npz("xtest_count.npz")
xtrain_tfidf = sparse.load_npz("xtrain_tfidf.npz")
xtest_tfidf = sparse.load_npz("xtest_tfidf.npz")

## Model

#### Bernoulli NB

In [4]:
clf = BernoulliNB(binarize=0)

In [5]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
clf.predict(xtest_count)

array([1])

In [6]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
clf.predict(xtest_tfidf)

array([1])

#### Multinomial NB

In [7]:
clf = MultinomialNB(alpha=1)

In [8]:
# Use Count Vectors
clf.fit(xtrain_count, train_y)
clf.predict(xtest_count)

array([1])

In [9]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf, train_y)
clf.predict(xtest_tfidf)

array([1])

#### Gaussian NB

In [10]:
clf = GaussianNB()

In [11]:
# Use Count Vectors
clf.fit(xtrain_count[:20000].toarray(), train_y[:20000])
clf.predict(xtest_count.toarray())

array([1])

In [12]:
# Use TF-IDF Vectors
clf.fit(xtrain_tfidf[:20000].toarray(), train_y[:20000])
clf.predict(xtest_tfidf.toarray())

array([1])