## Load modules

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

## Dataset Preparation

In [2]:
# Location of the IMDB reviews CSV, relative to the notebook's working directory.
data_path = "../dataset/IMDB_Dataset.csv"
# Raw frame; the 'review' and 'sentiment' columns are read from it in the next cell.
df = pd.read_csv(data_path)

In [3]:
# Split the frame into model inputs (raw review text) and targets (sentiment labels).
# Each selection is a 1-D Series, so no transposition is involved.
train_x = df["review"]
train_y = df["sentiment"]

In [4]:
# Convert the sentiment labels from strings to the integers 0 and 1.
# The fitted encoder is kept so the mapping can be inspected later via encoder.classes_.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)

In [5]:
# Sanity check: the reviews and the encoded labels should have the same number of rows.
train_x.shape, train_y.shape

((50000,), (50000,))

In [6]:
# A single hand-written review used as the only prediction input in this notebook.
# It carries no label, so the predictions below are a smoke test rather than an evaluation.
test_x = ["The film has clever ideas about literalizing black fear of the pathology of whiteness, but it doesn't go far enough in its indictment of white supremacy"]

## Vectorizer

- analyzer : 단어 단위로 자르기 때문에 word로 설정
- stop_words : review는 영어이므로 english 불용어(ENGLISH_STOP_WORDS)에 숫자 0~9를 더해 설정
- ngram_range : 일반적으로 (1,3)으로 설정
- max_features : 실험을 통해 50,000개가 가장 적당하다고 판단

In [7]:
# Stop-word list shared by both vectorizers: the ten single digits followed by
# scikit-learn's built-in English stop words.
# NOTE(review): the vectorizers below keep the default token_pattern, which per the
# scikit-learn docs only emits tokens of two or more characters, so the single-digit
# entries may never match anything -- confirm before relying on them.
stop_words = [*'0123456789', *ENGLISH_STOP_WORDS]

#### Count Vectors as features

In [8]:
# Bag-of-words features: word-level 1- to 3-grams, capped at the 50,000 most frequent terms.
count_vect = CountVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    stop_words=stop_words,
    analyzer='word',
)

# The vocabulary is learned from the training reviews only; the test review is
# projected onto that same vocabulary.
xtrain_count = count_vect.fit_transform(train_x)
xtest_count = count_vect.transform(test_x)

#### TF-IDF Vectors as features

In [9]:
# TF-IDF features, configured the same way as the count vectorizer:
# word-level 1- to 3-grams, capped at the 50,000 most frequent terms.
tfidf_vect = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    stop_words=stop_words,
    analyzer='word',
)

# Vocabulary and IDF weights are learned from the training reviews only; the test
# review is transformed with those fitted statistics.
xtrain_tfidf = tfidf_vect.fit_transform(train_x)
xtest_tfidf = tfidf_vect.transform(test_x)

## Variable

In [2]:
# Directory holding every cached preprocessing artifact. Defined once so the
# location can be changed in a single place.
PREPROCESSED_DIR = "../preprocessing_dataset"

# Encoded labels (pickle) and the four sparse feature matrices (.npz).
train_y_path = PREPROCESSED_DIR + "/train_y.pkl"
xtrain_count_path = PREPROCESSED_DIR + "/xtrain_count.npz"
xtest_count_path = PREPROCESSED_DIR + "/xtest_count.npz"
xtrain_tfidf_path = PREPROCESSED_DIR + "/xtrain_tfidf.npz"
xtest_tfidf_path = PREPROCESSED_DIR + "/xtest_tfidf.npz"

#### Save

In [11]:
# Persist the encoded labels so the modelling cells can run without redoing the encoding.
# open(..., 'wb') does not create missing parent directories, so make sure the
# target directory exists first ("." covers a path with no directory part).
os.makedirs(os.path.dirname(train_y_path) or ".", exist_ok=True)
with open(train_y_path, 'wb') as f:
    pickle.dump(train_y, f)

In [12]:
# Persist the four sparse feature matrices in .npz format.
# The target directory is created when missing, since saving into a directory
# that does not exist fails with FileNotFoundError.
for npz_path, matrix in (
    (xtrain_count_path, xtrain_count),
    (xtest_count_path, xtest_count),
    (xtrain_tfidf_path, xtrain_tfidf),
    (xtest_tfidf_path, xtest_tfidf),
):
    os.makedirs(os.path.dirname(npz_path) or ".", exist_ok=True)
    sparse.save_npz(npz_path, matrix)

#### Load

In [3]:
# Restore the encoded labels from the pickle written in the Save step.
# pickle.load can execute arbitrary code, so only load files produced by this notebook.
with open(train_y_path, mode='rb') as label_file:
    train_y = pickle.load(label_file)

In [4]:
# Restore the four sparse feature matrices, in the same order they were saved.
xtrain_count, xtest_count, xtrain_tfidf, xtest_tfidf = map(
    sparse.load_npz,
    (xtrain_count_path, xtest_count_path, xtrain_tfidf_path, xtest_tfidf_path),
)

## Model

#### Bernoulli NB

In [5]:
# Bernoulli naive Bayes models each feature as present/absent; binarize=0 is the
# threshold above which a feature value counts as present.
clf = BernoulliNB(binarize=0)

In [6]:
# Bernoulli NB on count vectors: train on every review, then classify the sample review.
# fit() returns the estimator itself, so the prediction is chained and is the cell's output.
clf.fit(xtrain_count, train_y).predict(xtest_count)

array([1])

In [7]:
# Bernoulli NB on TF-IDF vectors. Refitting the same clf object replaces the model
# that was trained on count vectors.
clf.fit(xtrain_tfidf, train_y).predict(xtest_tfidf)

array([1])

#### Multinomial NB

In [8]:
# Multinomial naive Bayes; alpha=1 is the additive (Laplace) smoothing strength.
clf = MultinomialNB(alpha=1)

In [9]:
# Multinomial NB on count vectors: train on every review, then classify the sample review.
# fit() returns the estimator itself, so the prediction is chained and is the cell's output.
clf.fit(xtrain_count, train_y).predict(xtest_count)

array([1])

In [10]:
# Multinomial NB on TF-IDF vectors. Refitting the same clf object replaces the model
# that was trained on count vectors.
clf.fit(xtrain_tfidf, train_y).predict(xtest_tfidf)

array([1])

#### Gaussian NB

In [11]:
# Gaussian naive Bayes with default settings; the cells that use it convert the
# sparse feature matrices to dense arrays before fitting and predicting.
clf = GaussianNB()

In [12]:
# Gaussian NB on count vectors.
# The sparse matrices are densified with toarray() before being passed in. With 50,000
# reviews and up to 50,000 features that is up to 2.5 billion values in memory, so the
# arrays are passed inline rather than bound to names that would keep them alive.
clf.fit(xtrain_count.toarray(), train_y).predict(xtest_count.toarray())

array([1])

In [13]:
# Gaussian NB on TF-IDF vectors. Refitting the same clf object replaces the model
# that was trained on count vectors.
# toarray() builds dense arrays of up to 50,000 x 50,000 values for the training set,
# so the arrays are passed inline rather than bound to names that would keep them alive.
clf.fit(xtrain_tfidf.toarray(), train_y).predict(xtest_tfidf.toarray())

array([1])