In [7]:
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


## Dataset Preparation

In [8]:
data_url = "../dataset/IMDB_Dataset.csv"
df = pd.read_csv(data_url)

In [139]:
# 학습데이터와 검증데이터 set나누기
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['review'], df['sentiment'])

In [140]:
train_x

27885    This is a wonderful new movie currently still ...
16777    Surface is one of the best shows that I have e...
35710    A wonderful film in the best Scandinavian eldr...
36449    I know I've already added a comment but I just...
37038    No wonder a lot of us hate classical music; an...
16558    Now, I've seen many many B-grade films in my 1...
14247    This movie has beautiful scenery. Unfortunatel...
11726    This movie is surprisingly good. The ninja fig...
47664    Dirty Sanchez is the more extreme, British ver...
3053     Watching "Kroko" I would have liked to leave t...
35299    During the 1990's, several attempts have been ...
32038    The first time I had the window of opportunity...
12094    This film, originally released at Christmas, 1...
46951    I admit to liking a lot of the so-called "frat...
23047    The fact that after 50 years, it is still a hi...
7749     For anyone craving a remake of 1989's Slaves o...
44048    On more than one level, I can relate to what h.

In [23]:
# 결과데이터들을 문자에서 0과1로 바꿔주기
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [38]:
train_x.shape, train_y.shape, valid_x.shape, valid_y.shape
# 총 데이터 50,000개에서
# train_data는 37,500개
# valid_data는 12,500개로 나누었음

((37500,), (37500,), (12500,), (12500,))

In [95]:
test_x = ["The film has clever ideas about literalizing black fear of the pathology of whiteness, but it doesn't go far enough in its indictment of white supremacy"]

#### Count Vectors as features

In [104]:
# 단어단위로 자르기 때문에 analyzer는 word로 설정한다
# review가 영어임으로 stop_words를 english로 설정해준다
count_vect = CountVectorizer(analyzer='word', stop_words='english')
count_vect.fit(df['review'])

# train_x와 valid_x데이터를 전체 x데이터로 fit 되어진 CountVectorizer를 이용해 transform시켜준다
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [105]:
xtest_count = count_vect.transform(test_x)

#### TF-IDF Vectors as features

In [84]:
# 단어단위로 자르기 때문에 analyzer는 word로 설정한다
# review가 영어임으로 stop_words를 english로 설정해준다
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english', )
tfidf_vect.fit(df['review'])

# train_x와 valid_x데이터를 전체 x데이터로 fit 되어진  TfidfVectorizer이용해 transform시켜준다
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [110]:
xtest_tfidf = tfidf_vect.transform(test_x)

In [142]:
xtrain_tfidf

<37500x101583 sparse matrix of type '<class 'numpy.float64'>'
	with 3319338 stored elements in Compressed Sparse Row format>

In [143]:
xvalid_tfidf

<12500x101583 sparse matrix of type '<class 'numpy.float64'>'
	with 1115162 stored elements in Compressed Sparse Row format>

In [141]:
xtest_tfidf

<1x101583 sparse matrix of type '<class 'numpy.float64'>'
	with 13 stored elements in Compressed Sparse Row format>

## Model

In [111]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

#### Bernoulli NB

In [134]:
clf = BernoulliNB(binarize=0)
# clf.fit(xtrain_count, train_y)
# clf.fit(xvalid_count, valid_y)
# clf.fit(xtrain_tfidf, train_y)
clf.fit(xvalid_tfidf, valid_y)

BernoulliNB(alpha=1.0, binarize=0, class_prior=None, fit_prior=True)

In [135]:
clf.predict(xtest_count)

array([0])

#### Multinomial NB

In [137]:
clf = MultinomialNB(alpha=1)
# clf.fit(xtrain_count, train_y)
# clf.fit(xvalid_count, valid_y)
# clf.fit(xtrain_tfidf, train_y)
clf.fit(xvalid_tfidf, valid_y)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [138]:
clf.predict(xtest_count)

array([1])

#### Gaussian NB

In [144]:
clf = GaussianNB()
clf.fit(xtrain_count.toarray(), train_y)

MemoryError: 