In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
dataset = pd.read_csv('IMDBDataset.csv')

In [3]:
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
dataset['positive'] = dataset['sentiment'].apply(lambda x: 1 if x=='positive' else 0)

In [6]:
dataset.head()

Unnamed: 0,review,sentiment,positive
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [7]:
dataset.shape

(50000, 3)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(dataset.review, dataset.positive)

In [9]:
X_train

13131    Melissa Joan Hart shines! This show is amazing...
3799     A hard to find film which coasts on the still ...
2689     Steve Martin should quit trying to do remakes ...
31006    like i'm sure other people have said this guy ...
17245    What a great Barbara Stanwyck film that I happ...
                               ...                        
40514    I have recently seen this movie due to Jake's ...
39347    I was initially interested in this film after ...
18300    Not as bad a film as i thought it would be.<br...
40694    An excellent family movie... gives a lot to th...
3411     OK, just what the HELL is all this supposed to...
Name: review, Length: 37500, dtype: object

In [10]:
y_train

13131    1
3799     0
2689     0
31006    0
17245    1
        ..
40514    1
39347    0
18300    1
40694    1
3411     0
Name: positive, Length: 37500, dtype: int64

In [11]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', RandomForestClassifier(n_estimators=50, criterion='entropy'))
])

In [12]:
clf.fit(X_train, y_train)

In [13]:
y_pred = clf.predict(X_test)

In [14]:
y_pred

array([0, 0, 1, ..., 1, 1, 1])

In [15]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      6333
           1       0.83      0.85      0.84      6167

    accuracy                           0.84     12500
   macro avg       0.84      0.84      0.84     12500
weighted avg       0.84      0.84      0.84     12500



In [24]:
clf = Pipeline([                
     ('vectorizer', CountVectorizer()),   
      ('KNN', (KNeighborsClassifier(n_neighbors=10, metric = 'euclidean')))   #using the KNN classifier with 10 neighbors 
])

In [25]:
clf.fit(X_train, y_train)

In [26]:
y_pred = clf.predict(X_test)

In [27]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.67      0.66      0.66      6291
           1       0.66      0.67      0.66      6209

    accuracy                           0.66     12500
   macro avg       0.66      0.66      0.66     12500
weighted avg       0.66      0.66      0.66     12500



In [28]:
clf = Pipeline([
                
     ('vectorizer', CountVectorizer()),   
      ('Multi NB', MultinomialNB())   #using the Multinomial Naive Bayes classifier 
])

In [29]:
clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)

In [31]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85      6670
           1       0.81      0.88      0.84      5830

    accuracy                           0.85     12500
   macro avg       0.85      0.85      0.85     12500
weighted avg       0.85      0.85      0.85     12500

