In [1]:
import pickle
import string

import nltk
import pandas as pd
import spacy
from joblib import parallel_backend
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC,LinearSVC,NuSVC #support vector classifier

In [2]:
# Load the IMDB review dataset into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on a machine where this file exists at this location.
csv_path = 'C:/Users/Home/Documents/Dataset/IMDB-Dataset.csv'
file = pd.read_csv(csv_path)
file

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
# Split the reviews into two frames by their sentiment label and show the
# positive subset.
is_positive = file["sentiment"].eq('positive')
is_negative = file["sentiment"].eq('negative')
pos = file[is_positive]
neg = file[is_negative]
pos

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
...,...,...
49983,"I loved it, having been a fan of the original ...",positive
49985,Imaginary Heroes is clearly the best film of t...,positive
49989,I got this one a few weeks ago and love it! It...,positive
49992,John Garfield plays a Marine who is blinded by...,positive


In [4]:
# ASCII punctuation characters as a single string; cleaning() tests tokens
# against it to filter punctuation out of the token list.
# (`string` is imported with the other imports in the first cell.)
punct = string.punctuation

In [5]:
# spaCy pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("en_core_web_sm")
# NLTK English stopwords as a set for O(1) membership tests.
stopword = set(stopwords.words("english"))


def cleaning(sentence):
    """Tokenise and lemmatise ``sentence``, dropping stopwords and punctuation.

    The text is run through the spaCy pipeline ``nlp``. Each token is
    replaced by its lower-cased, whitespace-stripped lemma; tokens whose
    lemma is the placeholder ``"-PRON-"`` fall back to their lower-cased
    surface form instead.

    Parameters
    ----------
    sentence : str
        Raw text to process.

    Returns
    -------
    list of str
        The normalised tokens, in document order, that are neither in
        ``stopword`` nor matched by the punctuation filter.
    """
    normalised = []
    for tok in nlp(sentence):
        if tok.lemma_ != "-PRON-":
            text = tok.lemma_
        else:
            # Use `lower_` (the string form). `Token.lower` is an integer
            # hash ID, so the previous `tokens.lower()` raised TypeError
            # whenever this branch ran.
            text = tok.lower_
        normalised.append(text.lower().strip())

    # `punct` is a plain string, so `t not in punct` is a substring test: it
    # removes single punctuation characters and also the empty string left
    # behind when a whitespace-only token is stripped.
    return [t for t in normalised if t not in stopword and t not in punct]

In [6]:
# Quick sanity check of the tokenizer on a short sample.
sample_text = "Hello how are you. Like this video"
cleaning(sample_text)

['hello', 'like', 'video']

In [7]:
# TF-IDF features built from the custom spaCy-based tokenizer defined above.
tfid = TfidfVectorizer(tokenizer=cleaning)
# Linear SVM classifier. The solver is randomized, so the seed is fixed
# (same value as the train/test split) to make the reported metrics
# reproducible across kernel restarts.
classifier = LinearSVC(random_state=0)

In [8]:
# Features are the raw review texts; targets are the sentiment labels.
X, y = file['review'], file['sentiment']

In [9]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Inspect the training texts (pandas truncates the display of the Series).
X_train

20330    That's about the only redeeming quality in a m...
17532    Even if I had not read Anne Rice's "Queen of t...
45819    I sort of liked this Columbo movie its atmosph...
34807    "Zabriskie Point" (1970): This was especially ...
31888    Quite one of the worst films I have ever seen....
                               ...                        
21243    I did not set very high expectations for this ...
45891    THE BLOB is a great horror movie, not merely b...
42613    After too many years of waiting, Anne Rivers S...
43567    I am a massive fan of the LoG. I thought the f...
2732     AG was an excellent presentation of drama, sus...
Name: review, Length: 40000, dtype: object

In [11]:
# Chain vectorisation and classification so raw strings can be passed
# directly to fit() / predict().
pipeline_steps = [
    ('tfid', tfid),
    ('clf', classifier),
]
clf = Pipeline(pipeline_steps)

In [13]:
# Fit the whole pipeline (tokenise + TF-IDF + LinearSVC) on the training split.
# `parallel_backend` is imported with the other imports in the first cell.
# NOTE(review): parallel_backend only affects code that dispatches work
# through joblib; neither pipeline step is given an n_jobs setting here, so
# this context manager may have no effect on training time - TODO confirm.
with parallel_backend('threading', n_jobs=2):
    clf.fit(X_train, y_train)

In [14]:
# Persist the fitted pipeline to disk.
# NOTE: pickle stores the `cleaning` tokenizer by reference (module + name),
# not its code, so `cleaning` and the globals it uses must be defined under
# the same name in whatever process later unpickles this file. Only unpickle
# files from a trusted source.
filename = 'sentimentanalysis.sav'
# The context manager guarantees the file handle is flushed and closed; the
# previous bare open() left it open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [15]:
# Predict sentiment labels for the held-out reviews.
y_pred = clf.predict(X_test)

In [16]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# pre-formatted string, so print() is needed to render its line breaks.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.90      0.89      0.89      5035
    positive       0.89      0.90      0.89      4965

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [17]:
# Confusion matrix for the held-out split. In the recorded output the row
# sums (5035, 4965) match the negative/positive supports in the report above,
# i.e. rows are true labels and columns are predicted labels, negative first.
confusion_matrix(y_test, y_pred)

array([[4464,  571],
       [ 500, 4465]], dtype=int64)

In [18]:
# Spot-check the fitted pipeline on an obviously positive sentence.
clf.predict(["Wow, this is amazing movie"])

array(['positive'], dtype=object)

In [19]:
# Spot-check on a negated sentence. The recorded run predicts 'positive',
# which is wrong for this input.
# NOTE(review): likely cause - cleaning() removes NLTK English stopwords, and
# that list includes "not", so the negation is presumably dropped before
# vectorisation and the model mostly sees "movie" and "good". Verify before
# relying on this model for negated text.
clf.predict(["The movie is not good"])

array(['positive'], dtype=object)

In [20]:
# Show the vectorizer's configuration; the repr confirms the custom
# `cleaning` function is installed as its tokenizer.
print(tfid)

TfidfVectorizer(tokenizer=<function cleaning at 0x000001ED23445AF0>)
