# Sentiment Analysis

In [1]:
# Load the IMDB review dataset from a CSV located relative to the working directory
import pandas as pd
df=pd.read_csv('./IMDB_Dataset.csv')

In [2]:
# Preview the first rows of the loaded DataFrame
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
#2 Data preparation
# Remove HTML tags (and emojis, if required)
#Kaggle dataset is already cleaned, hence not recommended
#Use a dataset and pre-process it

In [4]:
#Stemming the Documents
from nltk.stem.porter import PorterStemmer
# Single module-level stemmer instance; stemmer_tokenize reads it as a global
porter=PorterStemmer()

In [5]:
def stemmer_tokenize(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Tokens are produced by ``str.split()`` only, so punctuation stays attached
    to the neighbouring word (e.g. "Hello!" is stemmed as a single token).
    Relies on the module-level ``porter`` stemmer instance.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems
stemmer_tokenize("Hello! How are you doing")

['hello!', 'how', 'are', 'you', 'do']

In [6]:
import nltk
# Fetch NLTK's stopword corpus (no-op if it is already present locally).
# NOTE(review): no visible cell in this notebook uses the stopword list - confirm it is still needed
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\GAURANG
[nltk_data]     RASTOGI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### TF-IDF Vectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

#4 Vectorization of Documents
# Tokenization (and stemming) is delegated to stemmer_tokenize; the
# vectorizer's own lowercasing and accent stripping are switched off.
tfidf = TfidfVectorizer(
    tokenizer=stemmer_tokenize,
    lowercase=False,
    strip_accents=None,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)

# Labels: the raw values of the "sentiment" column.
Y = df['sentiment'].values

# Features: fit the vocabulary / IDF weights on every review and transform them.
# NOTE(review): the vectorizer is fit on the full dataset before the train/test
# split, so IDF statistics include the test documents - confirm this is intended.
X = tfidf.fit_transform(df['review'])

In [9]:
# Dump the TF-IDF feature matrix X and the label array Y for inspection
print(X,Y)

  (0, 298135)	0.06919594172775405
  (0, 122704)	0.06925466972673622
  (0, 356809)	0.02889561950368614
  (0, 329126)	0.04506530341289699
  (0, 98199)	0.022790243371972132
  (0, 187709)	0.01985988948388115
  (0, 342256)	0.11598635017472783
  (0, 335518)	0.06625581649735467
  (0, 111180)	0.06047287691356779
  (0, 82740)	0.03458453787077541
  (0, 222098)	0.034042229778458326
  (0, 356474)	0.03531231317519018
  (0, 147037)	0.10292730815064878
  (0, 300322)	0.05462847016241417
  (0, 312063)	0.05122494987494821
  (0, 205334)	0.03976236431956519
  (0, 322705)	0.023830442710942767
  (0, 86620)	0.07630496856021392
  (0, 193073)	0.02484043308612746
  (0, 332977)	0.03268927572351406
  (0, 107464)	0.050626218572446736
  (0, 226456)	0.04817660020052929
  (0, 219313)	0.10292730815064878
  (0, 347641)	0.027966952698494967
  (0, 77199)	0.03780803605571362
  :	:
  (49999, 66627)	0.03813544516853703
  (49999, 327217)	0.06882007047956079
  (49999, 240304)	0.0477338810346856
  (49999, 157582)	0.02917512396

### Document Classification Using Logistic Regression

In [10]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for testing; random_state=1 keeps the split reproducible
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5,random_state=1)

In [11]:
#5 Document Classification using logistic regression

import pickle
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression: 5 folds, scored by accuracy,
# run sequentially (n_jobs=1) with progress logging (verbose=2).
clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=1,
    verbose=2,
    max_iter=300,
)
# fit() trains in place and returns the estimator itself, so clf is the fitted model.
clf.fit(X_train, Y_train)

# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is closed even if pickle.dump raises part-way through.
with open('saved_model.sav','wb') as saved_model:
    pickle.dump(clf,saved_model)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.1min finished


In [12]:
# Predict sentiment labels for the held-out test features
pred=clf.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
# Compare the test labels against the predictions
accuracy=accuracy_score(Y_test,pred)

In [14]:
# Display the test-set accuracy computed above
accuracy

0.89