# Movie sentiment prediction using naive bayes

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

### read data

In [6]:
train_data = pd.read_csv('./data/movie_sentiment_nb2/labeledTrainData.csv', sep='\t')
test_data = pd.read_csv('./data/movie_sentiment_nb2/testData.csv', sep='\t')
print(train_data.head(2))
print(test_data.head(2))

       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
         id                                             review
0  12311_10  Naturally in a film who's main themes are of m...
1    8348_2  This movie is a disaster within a disaster fil...


### process review into words

In [39]:
def keep_valid_words(reviews):
    wordslist = []
    for review in reviews:
        review2 = review.lower()
        words = re.findall(r'\w+', review2)        
        wordslist.append(" ".join(words).lower())
    return np.array(wordslist)

x_data = keep_valid_words(train_data['review'])
x_test = keep_valid_words(test_data['review'])
print(x_data[:2])


['with all this stuff going down at the moment with mj i ve started listening to his music watching the odd documentary here and there watched the wiz and watched moonwalker again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent moonwalker is part biography part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay br br visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him br br the actual feature film bit when it finally starts is only on for

### train test split

In [40]:
x_train, x_valid, y_train, y_valid = train_test_split(x_data, train_data['sentiment'], test_size=0.2, random_state=0)
print(x_train.shape)
print(x_valid.shape)


(20000,)
(5000,)


### word count processing. TF

In [57]:
vectorizer = TfidfVectorizer(stop_words= 'english')
x_train_count = vectorizer.fit_transform(x_train)
x_valid_count = vectorizer.transform(x_valid)
print(vectorizer.vocabulary_)



### train model

In [58]:
model = MultinomialNB()
model.fit(x_train_count, y_train)
y_predict = model.predict(x_valid_count)
print(y_predict.shape)
print(y_predict.dtype)
print(y_valid.shape)
print(y_valid.dtype)
accuracy_score = accuracy_score(y_valid., y_predict)
# print(accuracy_score)

(5000,)


TypeError: 'numpy.float64' object is not callable




### train model
### predict
### output