In [1]:
import numpy as np 
import pandas as pd 
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 

### Data Preparation

In [2]:
train_df = pd.read_csv("Naive train data.csv")
test_df = pd.read_csv("Naive test data.csv")

In [3]:
print (train_df.head(10))
print (train_df.shape)

                                              review label
0  mature intelligent and highly charged melodram...   pos
1  http://video.google.com/videoplay?docid=211772...   pos
2  Title: Opera (1987) Director: Dario Argento Ca...   pos
3  I think a lot of people just wrote this off as...   pos
4  This is a story of two dogs and a cat looking ...   pos
5  Steve Carell comes into his own in his first s...   pos
6  I'm only going to write more because it's requ...   neg
7  OK, it was a "risky" move to rent this flick, ...   neg
8  Cannibalism, a pair of cinematic references to...   pos
9  This is one of the great modern kung fu films....   pos
(40000, 2)


In [4]:
print (test_df.head(10))
print (test_df.shape)

                                              review
0  Remember those old kung fu movies we used to w...
1  This movie is another one on my List of Movies...
2  How in the world does a thing like this get in...
3  "Queen of the Damned" is one of the best vampi...
4  The Caprica episode (S01E01) is well done as a...
5  I usually really enjoy Steven Seagal movies. T...
6  JiÃ¸Ã­ Trnka made his last animated short an i...
7  This is so bad it will be my contribution to t...
8  Watching this hilariously retro but very enter...
9  Excellent political thriller, played much quie...
(10000, 1)


In [5]:
X_train = train_df['review'].astype(str).values.tolist()

Y_train = np.zeros(train_df.shape[0])
Y_train[train_df.values[:, 1] == "pos"] = 1

print (Y_train.shape)
print (Y_train)

X_test = test_df['review'].astype(str).values.tolist()


(40000,)
[1. 1. 1. ... 0. 1. 1.]


### Data cleaning (Tokenization, stopwords removal, Stemming)

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
eng_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [7]:
def getCleanedReview (review):
    
    review = review.lower()
    review = review.replace("<br /><br />", " ")
    
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in eng_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_reviews = ' '.join(stemmed_tokens)
    
    return cleaned_reviews

In [8]:
X_train_clean = [getCleanedReview(i) for i in X_train]
X_test_clean = [getCleanedReview(i) for i in X_test]
print(len(X_test_clean))

10000


In [9]:
print (len(X_test_clean))
print (len(X_train_clean))

10000
40000


### Vectorization

In [10]:
tv = TfidfVectorizer()

In [11]:
X_train_vec = tv.fit_transform(X_train_clean).toarray()

In [12]:
print (type (X_train_vec))

<class 'numpy.ndarray'>


In [13]:
print (X_train_vec.shape)

(40000, 65742)


In [14]:
X_test_vec = tv.transform(X_test_clean).toarray()

In [15]:
print (X_test_vec.shape)

(10000, 65742)


In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
mnb = MultinomialNB()

In [18]:
mnb.fit(X_train_vec, Y_train)

MultinomialNB()

In [19]:
ypred = mnb.predict(X_test_vec)

In [127]:
final_ypred = []
n = ypred.shape[0]
print (n)

10000


In [128]:
for i in range (n):
    if ypred[i] == 0:
        final_ypred.append("neg")
    else:
        final_ypred.append("pos")

In [129]:
##print (final_ypred)

In [130]:
final_pred = np.array(final_ypred)

In [131]:
final_pred.shape[0]

10000

In [132]:
final_pred = final_pred.reshape((10000,1))

In [133]:
indices = np.arange(n)
indices = indices.reshape((10000, 1))


In [134]:
final_pred = np.hstack((indices, final_pred))

print (final_pred.shape)

(10000, 2)


In [135]:
new_df = pd.DataFrame(final_pred,columns = ["Id", "label"])

In [136]:
new_df

Unnamed: 0,Id,label
0,0,neg
1,1,neg
2,2,neg
3,3,pos
4,4,pos
...,...,...
9995,9995,neg
9996,9996,pos
9997,9997,pos
9998,9998,pos


In [138]:
new_df.to_csv("Movie_review.csv", index = False)