In [16]:
from datasets import load_dataset

dataset = load_dataset("ajaykarthick/imdb-movie-reviews")
dataset.keys()

dict_keys(['train', 'test'])

In [17]:
import pandas as pd
df = pd.DataFrame(dataset["train"])
df.drop(df.tail(20000).index, inplace = True)
df.shape

(20000, 2)

In [18]:
df["review"] = df["review"].str.lower()

In [19]:
df["review"]

0        ms aparna sen, the maker of mr & mrs iyer, dir...
1        i have seen this film only once, on tv, and it...
2        i was only fourteen when i first saw the alien...
3        this marvelous short will hit home with everyo...
4        if you are 10 years old and never seen a movie...
                               ...                        
19995    this movie is very good. the screenplay is enc...
19996    the authors know nothing about russians prison...
19997    the 1997 low-key indie dramedy henry fool woul...
19998    what can i say? this was hands-down the worst ...
19999    this was one of the worst movies i've ever see...
Name: review, Length: 20000, dtype: object

In [20]:
df["review"] = df["review"].str.replace("<.+?/?>", "", regex=True)
df["review"]

0        ms aparna sen, the maker of mr & mrs iyer, dir...
1        i have seen this film only once, on tv, and it...
2        i was only fourteen when i first saw the alien...
3        this marvelous short will hit home with everyo...
4        if you are 10 years old and never seen a movie...
                               ...                        
19995    this movie is very good. the screenplay is enc...
19996    the authors know nothing about russians prison...
19997    the 1997 low-key indie dramedy henry fool woul...
19998    what can i say? this was hands-down the worst ...
19999    this was one of the worst movies i've ever see...
Name: review, Length: 20000, dtype: object

In [21]:
import string

In [22]:
punctuation = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("", "", punctuation))

df["review"] = df["review"].apply(remove_punctuation)
df["review"]

0        ms aparna sen the maker of mr  mrs iyer direct...
1        i have seen this film only once on tv and it h...
2        i was only fourteen when i first saw the alien...
3        this marvelous short will hit home with everyo...
4        if you are 10 years old and never seen a movie...
                               ...                        
19995    this movie is very good the screenplay is ench...
19996    the authors know nothing about russians prison...
19997    the 1997 lowkey indie dramedy henry fool would...
19998    what can i say this was handsdown the worst mo...
19999    this was one of the worst movies ive ever seen...
Name: review, Length: 20000, dtype: object

In [23]:
df["review"] = df["review"].str.replace("\d+", "", regex=True)
df["review"]            

0        ms aparna sen the maker of mr  mrs iyer direct...
1        i have seen this film only once on tv and it h...
2        i was only fourteen when i first saw the alien...
3        this marvelous short will hit home with everyo...
4        if you are  years old and never seen a movie b...
                               ...                        
19995    this movie is very good the screenplay is ench...
19996    the authors know nothing about russians prison...
19997    the  lowkey indie dramedy henry fool would see...
19998    what can i say this was handsdown the worst mo...
19999    this was one of the worst movies ive ever seen...
Name: review, Length: 20000, dtype: object

In [24]:
df["review"] = df["review"].str.replace("  ", " ", regex=True)
df["review"]

0        ms aparna sen the maker of mr mrs iyer directs...
1        i have seen this film only once on tv and it h...
2        i was only fourteen when i first saw the alien...
3        this marvelous short will hit home with everyo...
4        if you are years old and never seen a movie be...
                               ...                        
19995    this movie is very good the screenplay is ench...
19996    the authors know nothing about russians prison...
19997    the lowkey indie dramedy henry fool would seem...
19998    what can i say this was handsdown the worst mo...
19999    this was one of the worst movies ive ever seen...
Name: review, Length: 20000, dtype: object

In [25]:
from nltk.corpus import stopwords
stopwords_str = ", ".join(stopwords.words('english'))


In [26]:
def remove_stopwords(text):
    new_text = []
    for token in text.split():
        if not token in stopwords_str:
            new_text.append(token)
    
    return " ".join(new_text)

In [27]:
df["review"] = df["review"].apply(remove_stopwords)
df["review"]

0        aparna sen maker mr mrs iyer directs movie you...
1        seen film tv repeated strange consider rubbish...
2        fourteen first saw alien movies immediately ca...
3        marvelous short hit home everyone child specif...
4        years old never seen movie maybe film may ente...
                               ...                        
19995    movie good screenplay enchanting meryl streep ...
19996    authors know nothing russians prisons movie ab...
19997    lowkey indie dramedy henry fool seemingly secu...
19998    say handsdown worst movie ever seen life belie...
19999    one worst movies ive ever seen horrible acting...
Name: review, Length: 20000, dtype: object

In [28]:
type(df["review"].values)

numpy.ndarray

In [29]:
from collections import Counter

counter = Counter()

for review in df["review"]:       
    counter.update(review.split())

counter

Counter({'movie': 33442,
         'film': 29752,
         'one': 20054,
         'like': 15523,
         'good': 11438,
         'even': 9672,
         'time': 9285,
         'really': 9076,
         'see': 8884,
         'story': 8861,
         'well': 7457,
         'much': 7372,
         'get': 7227,
         'also': 7067,
         'great': 7055,
         'bad': 7047,
         'people': 7014,
         'first': 6790,
         'dont': 6627,
         'made': 6269,
         'movies': 6239,
         'films': 6074,
         'way': 6026,
         'make': 6021,
         'characters': 5947,
         'think': 5839,
         'watch': 5356,
         'many': 5353,
         'seen': 5285,
         'character': 5191,
         'two': 5137,
         'never': 5092,
         'love': 4989,
         'know': 4964,
         'best': 4925,
         'acting': 4872,
         'little': 4856,
         'plot': 4853,
         'ever': 4695,
         'life': 4624,
         'show': 4615,
         'better': 4352,
    

In [32]:
vocab = [token for token in counter if counter[token] > 10]
vocab_data = "\n".join(vocab)
with open("./../artifacts/vocab.txt", "w", encoding="utf-8") as file:
    file.write(vocab_data)

len(vocab)

16411

In [33]:
from sklearn.model_selection import train_test_split

x = df["review"]
y = df["label"]

In [34]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [35]:
import numpy as np

In [36]:
def vectorize(dataset, vocabulary):
    vectors = []
    for tweet in dataset:
        text = np.zeros(len(vocabulary))
        
        words = tweet.split()
        for i in range(len(vocabulary)):
            if vocabulary[i] in words:
                text[i] = 1
        
        vectors.append(text)
    vectorized_list = np.array(vectors, dtype=np.float32)
    return vectorized_list

In [37]:
x_train_vectorized = vectorize(x_train, vocab)

In [38]:
x_test_vectorized = vectorize(x_test, vocab)

In [45]:
len(x_train_vectorized[0])

16411

In [39]:
y_train.value_counts()

label
0    8060
1    7940
Name: count, dtype: int64

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')
    
def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

In [42]:
lr = LogisticRegression()
lr.fit(x_train_vectorized, y_train)

y_train_pred = lr.predict(x_train_vectorized)

y_test_pred = lr.predict(x_test_vectorized)

training_scores(y_train, y_train_pred)

validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.997
	Precision = 0.998
	Recall = 0.997
	F1-Score = 0.997
Testing Scores:
	Accuracy = 0.867
	Precision = 0.879
	Recall = 0.859
	F1-Score = 0.869


In [43]:
import pickle

with open('./../artifacts/model.pickle', 'wb') as file:
    pickle.dump(lr, file)