In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [15]:
# Load the IMDB reviews CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")


In [16]:
# Class balance: how many reviews carry each sentiment label.
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [17]:
# (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [18]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:
# Derive a numeric target column from the sentiment label:
# 1 where the label is exactly 'positive', 0 for everything else.
df['Category'] = df['sentiment'].eq('positive').astype('int64')

In [21]:
# Check that the new Category column lines up with the sentiment label.
df.head()

Unnamed: 0,review,sentiment,Category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [23]:
# Hold out 20% of the reviews for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same metrics); without it every run draws a
# different train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    df.review, df.Category, test_size=0.2, random_state=42
)

In [24]:
# Number of reviews in the training split.
X_train.shape

(40000,)

In [25]:
# Number of reviews in the held-out test split.
X_test.shape

(10000,)

In [26]:
# The split returns the review column as a pandas object, not a plain list.
type(X_train)

pandas.core.series.Series

In [27]:
# Peek at the first five training reviews.
X_train[:5]

44584    Kalifornia is disturbing. I believe there is n...
42396    About as hilarious as 50s British comedy can g...
41542    This episode introduced the Holodeck to the TN...
25173    One of several musicals about sailors on leave...
2995     To experience Head you really need to understa...
Name: review, dtype: object

In [29]:
# Bag-of-words representation: learn the vocabulary from the training reviews
# and encode them as a sparse document-term count matrix in one pass.
vec = CountVectorizer()
X_train_cv = vec.fit_transform(X_train.to_numpy())
X_train_cv

<40000x92900 sparse matrix of type '<class 'numpy.int64'>'
	with 5466234 stored elements in Compressed Sparse Row format>

In [38]:
# Spot-check one vocabulary entry by its column index.
vec.get_feature_names_out()[999]


'57th'

In [42]:
# Train a multinomial Naive Bayes classifier on the training count matrix.
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the estimator as the cell's display value.
model = MultinomialNB().fit(X_train_cv, y_train)
model

In [43]:
# Encode the held-out reviews with the vocabulary already fitted on the
# training reviews (transform only; the vectorizer is not refitted here).
X_test_cv = vec.transform(X_test.to_numpy())


In [44]:
# Score the Naive Bayes model on the held-out reviews and print
# per-class precision / recall / F1.
y_pred = model.predict(X_test_cv)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      4981
           1       0.88      0.82      0.85      5019

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [48]:
# Sanity check: classify two hand-picked reviews with the fitted vectorizer and model.
# NOTE(review): the first review looks like row 0 of the dataset (see the earlier
# df.head() preview); if that row landed in the training split it is not an unseen
# example — confirm, or swap in a review that is known to be held out.
reviews = [
    "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare. Forget pretty pictures painted for mainstream audiences, forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence. Not just violence, but injustice (crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience) Watching Oz, you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side.",
    "With several name actors (Lance Henrikson, David Warner, Joe Don Baker), why was Jeffery Combs given the lead? Henrikson would have been a perfect fit for the lead, as would Warner, Baker or even others in the movie such as Charles Napier. Combs was miscast in this, and did a poor job of it. Everything he did seemed fake or contrived.<br /><br />The script is poor. Meaning that if Lance Henrikson (or another) had the lead role, he might have saved the film (removed it from my waste of time category), but it still would have been a bad movie. The screen play was completely lacking. The director should have recognized this and helped the movie along."
]
# Reuse the vocabulary fitted on the training reviews; the predicted values
# use the Category encoding (1 = positive, 0 = negative).
reviews_count = vec.transform(reviews)
model.predict(reviews_count)


array([1, 0], dtype=int64)

In [49]:
# References:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#
# Random forest on bag-of-words counts. The pipeline takes raw review text and
# vectorizes it internally, so fit/predict are called with the text Series.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('random_forest', RandomForestClassifier(
        n_estimators=50,
        criterion='entropy',
        random_state=42,  # seed bootstrap/feature sampling so the reported metrics are reproducible
        n_jobs=-1,        # fit the trees in parallel; does not change the fitted model
    )),
])

# Fit the vectorizer and the forest on the training reviews.
clf.fit(X_train, y_train)

# Predict sentiment for the held-out reviews.
# Note: this rebinds y_pred, which previously held the Naive Bayes predictions.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.84      0.84      4981
           1       0.84      0.84      0.84      5019

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [50]:
# k-nearest-neighbours on the same bag-of-words features
# (10 neighbours, Euclidean distance).
# NOTE(review): Euclidean distance on raw term counts is presumably sensitive to
# review length, which may explain the weaker scores here — worth confirming
# against a cosine metric or TF-IDF features.
clk = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('KNN', KNeighborsClassifier(metric='euclidean', n_neighbors=10)),
])
clk.fit(X_train, y_train)

# Predict on the held-out reviews and report per-class metrics.
y_knn_pred = clk.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_knn_pred))

              precision    recall  f1-score   support

           0       0.65      0.65      0.65      4981
           1       0.65      0.65      0.65      5019

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



In [52]:
# Multinomial Naive Bayes wrapped in a pipeline: the same vectorizer + classifier
# combination as the manual steps earlier, but fed raw review text directly.
# Pipeline.fit returns the pipeline itself, so build and fit are chained.
clm = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('Multi NB', MultinomialNB()),
]).fit(X_train, y_train)

# Predict on the held-out reviews and report per-class metrics.
y_nv_pred = clm.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_nv_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      4981
           1       0.88      0.82      0.85      5019

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

