In [1]:
# Mount Google Drive so the dataset and model directory under
# /content/drive/MyDrive are readable and writable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/3.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/3.4 MB[0m [31m11.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.4/3.4 MB[0m [31m35.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [3]:
from spellchecker import SpellChecker
import pandas as pd
import numpy as np
import warnings

In [10]:
import sys

# Directory on Drive that holds the project's helper module (imported below as `utils`).
MODEL_DIR = '/content/drive/MyDrive/IMDB_Model'

# Guard against duplicates so re-running this cell does not keep growing sys.path.
if MODEL_DIR not in sys.path:
  sys.path.append(MODEL_DIR)

In [11]:
# NOTE(review): star import hides which names come from `utils`; the only
# otherwise-undefined name used in this notebook is `clean_text`, so it
# presumably comes from here — consider `from utils import clean_text`.
from utils import *

In [4]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and library warnings
# are hidden as well; consider restricting it to a specific category.
warnings.filterwarnings('ignore')

In [5]:
# Load the reviews from Drive; later cells read the `review` and `sentiment` columns.
df = pd.read_csv('/content/drive/MyDrive/Datasets/IMDB_clear.csv')

In [6]:
# Peek at the first review to see what the text looks like.
df['review'][0]

'One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked They are right as this is exactly what happened with meThe first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the wordIt is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to manyAryans Muslims gangstas Latinos Christians Italians Irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayI would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare Forget pretty pictur

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the data for testing, then take 20% of the remaining
# training rows as a validation set. Both splits use a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    df.sentiment,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# TF-IDF features over the 8000 most frequent terms, English stop words removed,
# rows L2-normalised; raw text is first passed through `clean_text`.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides the
# built-in strip_accents/lowercase stage, so `lowercase=True` likely has no
# effect here — confirm that `clean_text` lowercases the text itself.
vectorizer = TfidfVectorizer(
    preprocessor=clean_text,
    lowercase=True,
    stop_words="english",
    max_features=8000,
    norm="l2",
)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [14]:
# Chain the TF-IDF vectorizer and a multinomial Naive Bayes classifier so that
# raw review strings can be passed straight to fit/predict.
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('model', MultinomialNB()),
])

In [15]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [16]:
# Accuracy on the training split (baseline for spotting overfitting).
pipeline.score(X_train, y_train)

0.86865625

In [17]:
# Accuracy on the validation split.
pipeline.score(X_val, y_val)

0.857625

In [18]:
# Accuracy on the held-out test split.
pipeline.score(X_test, y_test)

0.8501

In [19]:
from sklearn.metrics import classification_report

In [22]:
# Per-class precision / recall / F1 on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [27]:
import re
from functools import lru_cache

# A word is a run of letters, optionally with internal apostrophes ("wasn't").
# Digits, punctuation and whitespace are never handed to the spell checker.
_WORD_PATTERN = re.compile(r"[^\W\d_]+(?:'[^\W\d_]+)*")


@lru_cache(maxsize=1)
def _get_spell_checker():
  """Create the SpellChecker once and reuse it across predict() calls."""
  return SpellChecker()


def _correct_spelling(text, spell):
  """Return `text` with each word replaced by `spell.correction(word)` when one exists."""
  def _fix(match):
    word = match.group(0)
    suggestion = spell.correction(word)
    # correction() may return None when it has no candidate; keep the word as typed.
    return suggestion if suggestion else word

  return _WORD_PATTERN.sub(_fix, text)


def predict(review):
  """Spell-correct a raw review and classify it with the fitted pipeline.

  `SpellChecker.correction` is a single-word API. The previous version passed
  the entire review to it, so the text was treated as one unknown word: no
  per-word correction took place and the call could return None, which was
  then fed to the pipeline. Here the review is corrected word by word, with
  punctuation and spacing left untouched.

  Args:
    review: The review text as a single string.

  Returns:
    Whatever `pipeline.predict` returns for a one-element batch containing
    the corrected review.
  """
  corrected = _correct_spelling(review, _get_spell_checker())
  return pipeline.predict([corrected])

In [28]:
# Spot-check the model on an excerpt from a critical review.
predict("""

Bay, at least, wasn't afraid to take chances. Rise of the Beasts,
unfortunately, feels as though it's trying way too hard to color in the lines.

""")

array(['negative'], dtype='<U8')

In [30]:
# Spot-check the model on an excerpt from an enthusiastic review.
predict("""
Spider-Man: Across the Spider-Verse boldly throws out any restraint and leans
full-tilt into the zaniness of its visually and thematically kaleidoscopic world.

""")

array(['positive'], dtype='<U8')

In [31]:
import joblib

In [32]:
# Persist the fitted pipeline (vectorizer + classifier) to Drive.
# NOTE(review): the vectorizer holds a reference to `clean_text`; pickles store
# functions by reference, so loading this file presumably requires the module
# that defines `clean_text` to be importable — confirm before deploying.
joblib.dump(pipeline,'/content/drive/MyDrive/IMDB_Model/model.pkl')

['/content/drive/MyDrive/IMDB_Model/model.pkl']