In [1]:
from datasets import load_dataset

dataset = load_dataset("ajaykarthick/imdb-movie-reviews")
dataset.keys()

dict_keys(['train', 'test'])

In [2]:
import pandas as pd
df = pd.DataFrame(dataset["train"])
df.drop(df.tail(20000).index, inplace = True)
df.shape

(20000, 2)

In [3]:
import re

In [4]:
df["review"] = df["review"].str.lower()
xx = "If you are 10 years old and never seen a movie before, maybe this film may be entertainment for you, but if you've seen several movies, this one will be a silly fully-cliched cheap and predictable for you. Don't waste your time with this."
xx = xx.lower()
xx

"if you are 10 years old and never seen a movie before, maybe this film may be entertainment for you, but if you've seen several movies, this one will be a silly fully-cliched cheap and predictable for you. don't waste your time with this."

In [5]:
df.head()


Unnamed: 0,review,label
0,"ms aparna sen, the maker of mr & mrs iyer, dir...",0
1,"i have seen this film only once, on tv, and it...",0
2,i was only fourteen when i first saw the alien...,1
3,this marvelous short will hit home with everyo...,0
4,if you are 10 years old and never seen a movie...,1


In [6]:
df["review"] = df["review"].str.replace("<.+?/?>", "", regex=True)
xx = re.sub("<.+?/?>", "", xx)
xx

"if you are 10 years old and never seen a movie before, maybe this film may be entertainment for you, but if you've seen several movies, this one will be a silly fully-cliched cheap and predictable for you. don't waste your time with this."

In [7]:
import string

In [8]:
punctuation = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("", "", punctuation))

df["review"] = df["review"].apply(remove_punctuation)
xx = remove_punctuation(xx)
xx

'if you are 10 years old and never seen a movie before maybe this film may be entertainment for you but if youve seen several movies this one will be a silly fullycliched cheap and predictable for you dont waste your time with this'

In [9]:
df["review"] = df["review"].str.replace("\d+", "", regex=True)
xx = re.sub("\d+", "", xx)
xx

'if you are  years old and never seen a movie before maybe this film may be entertainment for you but if youve seen several movies this one will be a silly fullycliched cheap and predictable for you dont waste your time with this'

In [10]:
df["review"] = df["review"].str.replace("  ", " ", regex=True)
xx = re.sub("  ", " ", xx)
xx


'if you are years old and never seen a movie before maybe this film may be entertainment for you but if youve seen several movies this one will be a silly fullycliched cheap and predictable for you dont waste your time with this'

In [11]:
from nltk.corpus import stopwords
stopwords_str = ", ".join(stopwords.words('english'))


In [12]:
def remove_stopwords(text):
    new_text = []
    for token in text.split():
        if not token in stopwords_str:
            new_text.append(token)
    
    return " ".join(new_text)

In [13]:
df["review"] = df["review"].apply(remove_stopwords)
df["review"]
xx = remove_stopwords(xx)
xx

'years old never seen movie maybe film may entertainment youve seen several movies one silly fullycliched cheap predictable dont waste time'

In [14]:
type(df["review"].values)

numpy.ndarray

In [15]:
from collections import Counter

counter = Counter()

for review in df["review"]:       
    counter.update(review.split())

counter

Counter({'movie': 33442,
         'film': 29752,
         'one': 20054,
         'like': 15523,
         'good': 11438,
         'even': 9672,
         'time': 9285,
         'really': 9076,
         'see': 8884,
         'story': 8861,
         'well': 7457,
         'much': 7372,
         'get': 7227,
         'also': 7067,
         'great': 7055,
         'bad': 7047,
         'people': 7014,
         'first': 6790,
         'dont': 6627,
         'made': 6269,
         'movies': 6239,
         'films': 6074,
         'way': 6026,
         'make': 6021,
         'characters': 5947,
         'think': 5839,
         'watch': 5356,
         'many': 5353,
         'seen': 5285,
         'character': 5191,
         'two': 5137,
         'never': 5092,
         'love': 4989,
         'know': 4964,
         'best': 4925,
         'acting': 4872,
         'little': 4856,
         'plot': 4853,
         'ever': 4695,
         'life': 4624,
         'show': 4615,
         'better': 4352,
    

In [16]:
vocab = sorted([token for token in counter if counter[token] > 10])
vocab_data = "\n".join(vocab)
with open("./../artifacts/vocab.txt", "w", encoding="utf-8") as file:
    file.write(vocab_data)

len(vocab)

16411

In [22]:
with open("./../artifacts/vocab.txt", "r", encoding="utf-8") as file2:
    vt = file2.readlines()

vt = [s.strip() for s in vt]
result = [w == v for w,v in zip(vocab, vt)]
print(vt == vocab)
print("Length Vocab:- ", len(vt))
print(len(list(filter(lambda g: g == False, result))))

True
Length Vocab:-  16411
0


In [17]:
from sklearn.model_selection import train_test_split

x = df["review"]
y = df["label"]

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [19]:
import numpy as np

In [54]:
def vectorize(dataset, vocabulary):
    vectors = []
    for tweet in dataset:
        text = np.zeros(len(vocabulary))
        
        words = tweet.split()
        for i in range(len(vocabulary)):
            if vocabulary[i] in words:
                text[i] = 1
        
        vectors.append(text)
    vectorized_list = np.asarray(vectors, dtype=np.float32)
    return vectorized_list

In [21]:
x_train_vectorized = vectorize(x_train, vocab)

In [22]:
x_test_vectorized = vectorize(x_test, vocab)

In [35]:
xx

'years old never seen movie maybe film may entertainment youve seen several movies one silly fullycliched cheap predictable dont waste time'

In [56]:
x_vectorized = vectorize([xx], vocab)

16411


In [37]:
len(x_train_vectorized[0])

16411

In [39]:
y_train.value_counts()

label
0    8022
1    7978
Name: count, dtype: int64

In [41]:
from sklearn.linear_model import LogisticRegression

In [42]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')
    
def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

In [43]:
lr = LogisticRegression()
lr.fit(x_train_vectorized, y_train)

y_train_pred = lr.predict(x_train_vectorized)

y_test_pred = lr.predict(x_test_vectorized)

training_scores(y_train, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.998
	Precision = 0.998
	Recall = 0.997
	F1-Score = 0.997
Testing Scores:
	Accuracy = 0.861
	Precision = 0.877
	Recall = 0.842
	F1-Score = 0.859


In [None]:
y_test_pred

In [44]:
y_pred = lr.predict(np.asarray(x_vectorized))

In [45]:
y_pred[0]

1

In [76]:
import re
import string

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from numpy import ndarray

nltk.download('stopwords')
nltk.download('wordnet')


class TextPipeline:
    def __init__(self, text):
        self.__wnl = WordNetLemmatizer()
        self.__stopwords_str = ", ".join(nltk.corpus.stopwords.words('english'))
        self.text = text

    def _preprocess(self):
        self.text = self.text.lower()
        self.text = re.sub("<.+?/?>", "", self.text)
        self.text = self.text.translate(str.maketrans("", "", string.punctuation))
        self.text = re.sub(r"\d+", "", self.text)
        self.text = re.sub(r"  ", " ", self.text)

    def _remove_stopwords(self):
        new_text = []
        for token in self.text.split():
            if not token in self.__stopwords_str:
                new_text.append(token)
        self.text = " ".join(new_text)

    def _lemmatize(self):
        new_text = []
        for word in self.text.split():
            word = self.__wnl.lemmatize(word)
            new_text.append(word)
        self.text = " ".join(new_text)

    def preprocess_text(self) -> str:
        self._preprocess()
        self._remove_stopwords()
        self._lemmatize()
        print(self.text)
        return self.text

    def vectorize_text(self) -> ndarray:
        print(self.text)
        vector_array = []
        vocab_d = pd.read_csv("./../artifacts/vocab.txt", header=None)
        vocab_d = vocab_d[0].tolist()
        vector = np.zeros(len(vocab))
        words = self.text.split()
        for i in range(len(vocab)):
            if vocab[i] in words:
                vector[i] = 1
        print(len(vector))
        vector_array.append(vector)
        np_array = np.asarray(vector_array, dtype=np.float32)
        return np_array


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hesha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hesha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [77]:
text = "If you are 10 years old and never seen a movie before, maybe this film may be entertainment for you, but if you've seen several movies, this one will be a silly fully-cliched cheap and predictable for you. Don't waste your time with this."
pipeline = TextPipeline(text)
pipeline.preprocess_text()
vector = pipeline.vectorize_text()

prediction = lr.predict(x_vectorized)
prediction

year old never seen movie maybe film may entertainment youve seen several movie one silly fullycliched cheap predictable dont waste time
year old never seen movie maybe film may entertainment youve seen several movie one silly fullycliched cheap predictable dont waste time
16411


array([1], dtype=int64)

In [None]:
import pickle

with open('./../artifacts/model.pickle', 'wb') as file:
    pickle.dump(lr, file)

In [52]:
tt = vector == x_vectorized
tt.all()

False