# Sentiment Analysis using IMDB Dataset

In [8]:
import pandas as pd
df = pd.read_csv('C:/Users/USER/Desktop/impc/ML/movie_data.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
5,Leave it to Braik to put on a good show. Final...,1
6,Nathan Detroit (Frank Sinatra) is the manager ...,1
7,"To understand ""Crash Course"" in the right cont...",1
8,I've been impressed with Chavez's stance again...,1
9,This movie is directed by Renny Harlin the fin...,1


In [13]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
d = np.array(['The sun is shining','The weather is sweet','The sun is shining, weather is sweet, and one and one is two'])
bag = count.fit_transform(d)
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,norm='l2',smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.52 0.46 0.52 0.2  0.2  0.2  0.15 0.26 0.2 ]]


In [20]:
import re
def pre(text):
    text = re.sub('<[^>]*>','',text)
    emo = re.findall('(?::|;|=)(?:-)?(?:\)|\|D|P)',text)
    text = re.sub('[\W]+',' ',text.lower()) +\
    ' '.join(emo).replace('-','')
    return(text)

In [22]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

In [23]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [26]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [28]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a running like running and runs a lot') if w not in stop]

['run', 'like', 'run', 'run', 'lot']

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None,
                       tokenizer=tokenizer_porter,
                       use_idf=True,
                       norm='l2',
                       smooth_idf=True)
y = df.sentiment.values
x = tfidf.fit_transform(df.review)

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,random_state=1,test_size=0.5,shuffle=False)

In [31]:
import pickle
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5,
                          scoring='accuracy',
                          random_state=0,
                          n_jobs=-1,
                          verbose=4,
                          max_iter=300).fit(X_train,y_train)

saved_model = open('saved_model.sav','wb')
pickle.dump(clf,saved_model)
saved_model.close()

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  7.0min remaining: 10.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.9min finished


In [33]:
filename = 'saved_model.sav'
saved_clf = pickle.load(open(filename,'rb'))

In [34]:
saved_clf.score(X_test,y_test)

0.88924