In [32]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('popular')
import numpy
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]  

In [33]:
# Load the review dataset from the working directory; later cells read its
# 'review' (text) and 'label' columns.
df = pd.read_csv('data.csv')

In [34]:
# Remove, in place, every column whose name contains "unnamed" (any case).
unnamed_mask = df.columns.str.contains('unnamed', case=False)
df.drop(columns=df.columns[unnamed_mask], inplace=True)

In [35]:
# Delete newline characters inside string cells. The replacement is the empty
# string, so text on either side of a newline is joined without a space.
df = df.replace(to_replace='\n', value='', regex=True)

In [36]:
# Preview the first rows after column cleanup and newline removal.
df.head()

Unnamed: 0,review,label
0,films adapted from comic books have had plenty...,positive
1,every now and then a movie comes along from a ...,positive
2,you've got mail works alot better than it dese...,positive
3,""" jaws "" is a rare film that grabs your atten...",positive
4,moviemaking is a lot like being the general ma...,positive


In [37]:
# (rows, columns) of the cleaned frame.
df.shape

(2000, 2)

In [38]:
# VADER sentiment analyzer; relies on the 'vader_lexicon' resource fetched
# with nltk.download in the import cell.
sid = SentimentIntensityAnalyzer()

In [None]:
# Score every review with VADER; each entry of 'scores' is the dict returned
# by polarity_scores (keys 'neg', 'neu', 'pos', 'compound').
df['scores'] = df['review'].map(sid.polarity_scores)

df.head()

In [11]:
# Pull the 'compound' value out of each VADER score dict into its own column.
df['compound'] = [score_dict['compound'] for score_dict in df['scores']]

df.head()

Unnamed: 0,review,label,scores,compound
0,films adapted from comic books have had plenty...,positive,"{'neg': 0.12, 'neu': 0.753, 'pos': 0.127, 'com...",-0.5887
1,every now and then a movie comes along from a ...,positive,"{'neg': 0.08, 'neu': 0.82, 'pos': 0.1, 'compou...",0.8825
2,you've got mail works alot better than it dese...,positive,"{'neg': 0.083, 'neu': 0.706, 'pos': 0.211, 'co...",0.9964
3,""" jaws "" is a rare film that grabs your atten...",positive,"{'neg': 0.078, 'neu': 0.815, 'pos': 0.106, 'co...",0.9868
4,moviemaking is a lot like being the general ma...,positive,"{'neg': 0.101, 'neu': 0.792, 'pos': 0.108, 'co...",-0.3525


In [12]:
# Binarise the compound score: strictly above 0.05 is 'pos', everything else
# (including scores in the neutral band) is 'neg'.
is_positive = df['compound'] > 0.05
df['vader_label'] = numpy.where(is_positive, 'pos', 'neg')

df.head()

Unnamed: 0,review,label,scores,compound,vader_label
0,films adapted from comic books have had plenty...,positive,"{'neg': 0.12, 'neu': 0.753, 'pos': 0.127, 'com...",-0.5887,neg
1,every now and then a movie comes along from a ...,positive,"{'neg': 0.08, 'neu': 0.82, 'pos': 0.1, 'compou...",0.8825,pos
2,you've got mail works alot better than it dese...,positive,"{'neg': 0.083, 'neu': 0.706, 'pos': 0.211, 'co...",0.9964,pos
3,""" jaws "" is a rare film that grabs your atten...",positive,"{'neg': 0.078, 'neu': 0.815, 'pos': 0.106, 'co...",0.9868,pos
4,moviemaking is a lot like being the general ma...,positive,"{'neg': 0.101, 'neu': 0.792, 'pos': 0.108, 'co...",-0.3525,neg


In [20]:
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def noiseRemoval(sentences):
    """Normalise a block of text for bag-of-words modelling.

    The text is split into sentences; each sentence is lower-cased, has
    non-word characters and digits replaced by spaces, has whitespace runs
    collapsed, is word-tokenised, lemmatised as verbs and stripped of English
    stop words.

    Parameters
    ----------
    sentences : str
        Raw text, possibly containing several sentences.

    Returns
    -------
    str
        The cleaned sentences joined with ',' (no space); within a sentence
        the tokens are separated by single spaces.
    """
    lem = WordNetLemmatizer()
    # Looked up once per call (and cached across calls) instead of re-reading
    # the stop-word corpus for every single token.
    stop_words = _english_stopwords()

    cleaned = []
    for sentence in sent_tokenize(sentences):
        sentence = sentence.lower()
        sentence = re.sub(r'\W', ' ', sentence)   # punctuation / symbols -> space
        sentence = re.sub(r'\d', ' ', sentence)   # digits -> space
        sentence = re.sub(r'\s+', ' ', sentence)  # collapse whitespace runs
        lemmas = (lem.lemmatize(word, pos='v')
                  for word in nltk.word_tokenize(sentence))
        # Stop words are filtered after lemmatisation, so forms whose lemma is
        # a stop word are dropped as well.
        cleaned.append(' '.join(word for word in lemmas if word not in stop_words))
    return ",".join(cleaned)

# Overwrite the raw review text with its cleaned form. The VADER columns are
# computed in earlier cells from the raw text, before this cell runs.
df['review'] = df['review'].map(noiseRemoval)

In [21]:
# Class balance of the labels shipped with the dataset.
df['label'].value_counts()

negative    1000
positive    1000
Name: label, dtype: int64

In [22]:
# Class balance of the VADER-derived labels, for comparison with 'label'.
df['vader_label'].value_counts()

pos    1381
neg     619
Name: vader_label, dtype: int64

In [25]:
# Features are the cleaned reviews; two alternative targets: the dataset's
# own labels and the VADER-derived labels.
x, y, y_vad = df['review'], df['label'], df['vader_label']

In [26]:
# Split the features and both label sets in ONE call so that all three stay
# row-aligned by construction. Two separate calls would overwrite
# x_train/x_test and keep the labels aligned only while both calls repeat
# exactly the same seed and test size.
(x_train, x_test,
 y_train, y_test,
 y_vad_train, y_vad_test) = train_test_split(
    x, y, y_vad, test_size=0.3, random_state=40)

In [27]:
# TF-IDF + multinomial Naive Bayes trained on the dataset's own labels.
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(x_train, y_train)

# Hold-out accuracy against the true labels.
pred = model.predict(x_test)
print(accuracy_score(y_test, pred))

0.8133333333333334


In [28]:
# Same pipeline, but trained on the VADER-derived labels.
model_vad = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(x_train, y_vad_train)

# This score is measured against y_vad_test (the VADER labels), i.e. it is
# agreement with VADER, not accuracy against the dataset's own labels.
pred = model_vad.predict(x_test)
print(accuracy_score(y_vad_test, pred))

0.6916666666666667


In [29]:
# The pipeline was fitted on noiseRemoval-processed text, so new text must go
# through the same cleaning (lemmatisation etc.) to hit the learned vocabulary.
model.predict([noiseRemoval("Bad worst dirty senseless")])

array(['negative'], dtype='<U8')

In [30]:
# The pipeline was fitted on noiseRemoval-processed text, so new text must go
# through the same cleaning (lemmatisation etc.) to hit the learned vocabulary.
model_vad.predict([noiseRemoval("MIB is an amazing movie")])

array(['pos'], dtype='<U3')

In [31]:
# The pipeline was fitted on noiseRemoval-processed text, so new text must go
# through the same cleaning (lemmatisation etc.) to hit the learned vocabulary.
model.predict([noiseRemoval("Speed 2 falls far short of its predecessor, thanks to laughable dialogue, thin characterization, unsurprisingly familiar plot devices, and action sequences that fail to generate any excitement.")])

array(['negative'], dtype='<U8')