# Author: Sandesh Basnet

# Review Sentiment

In [1]:
import numpy as np
import pandas as pd
import nltk
import joblib

In [2]:
df = pd.read_csv('../dataset/dataset/imdb_labelled.txt', sep= '\t', header=None, names=['Review', 'Sentiment'])
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
df.sample(5)

Unnamed: 0,Review,Sentiment
595,This mostly routine fact-based TV drama gets a...,1
42,"It was too predictable, even for a chick flick.",0
429,"I saw this film over Christmas, and what a gre...",1
40,"Frankly, after Cotton club and Unfaithful, it ...",0
649,This one just fails to create any real suspens...,0


In [4]:
X = df['Review']
y= df['Sentiment']

In [5]:
df['Sentiment'].value_counts()

1    386
0    362
Name: Sentiment, dtype: int64

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  #inverse document frequency log(N/(1+n)) N= no of doc n= count the repeated word form doc
                                                                              # tf(countvectorizer) * idf

In [7]:
tfidf = TfidfVectorizer(tokenizer= nltk.word_tokenize,
                       stop_words= 'english',
                       lowercase = True,
                       ngram_range=(1, 1)  #preserve the word meaning taking only one word (min, max)
                       )

count_vec = CountVectorizer(tokenizer= nltk.word_tokenize,
                           stop_words= 'english',
                           lowercase= True,
                           ngram_range= (1, 1)
                           )

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size= 0.15, stratify=y, random_state=666)

In [10]:
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)



In [11]:
X_train[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [12]:
tfidf.get_feature_names_out()

array(['!', '$', '%', ..., 'zillion', 'zombiez', '\x96'], dtype=object)

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [14]:
gauss = GaussianNB()
rand_forest = RandomForestClassifier(n_jobs= -1)
svc = SVC()

In [15]:
gauss.fit(X_train.toarray(), y_train)
rand_forest.fit(X_train, y_train)
svc.fit(X_train, y_train)

In [16]:
pred_gauss = gauss.predict(X_test.toarray())
pred_rand_forest = rand_forest.predict(X_test)
pred_svc = svc.predict(X_test)

In [17]:
from sklearn.metrics import classification_report

In [18]:
print(classification_report(y_test, pred_gauss))
print(classification_report(y_test, pred_rand_forest))
print(classification_report(y_test, pred_svc))

              precision    recall  f1-score   support

           0       0.61      0.71      0.66        55
           1       0.67      0.57      0.62        58

    accuracy                           0.64       113
   macro avg       0.64      0.64      0.64       113
weighted avg       0.64      0.64      0.64       113

              precision    recall  f1-score   support

           0       0.67      0.76      0.71        55
           1       0.74      0.64      0.69        58

    accuracy                           0.70       113
   macro avg       0.70      0.70      0.70       113
weighted avg       0.70      0.70      0.70       113

              precision    recall  f1-score   support

           0       0.76      0.69      0.72        55
           1       0.73      0.79      0.76        58

    accuracy                           0.74       113
   macro avg       0.75      0.74      0.74       113
weighted avg       0.74      0.74      0.74       113



In [19]:
sent = 'I love One Piece but the episode run time is too slow to watch'

In [20]:
vectorised = tfidf.transform([sent])
svc.predict(vectorised)

array([0], dtype=int64)

In [21]:
import joblib

In [22]:
joblib.dump(tfidf, 'Inverse document.pkl')
joblib.dump(svc, 'snetiment.pkl')

['snetiment.pkl']