In [4]:
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

In [5]:
raw_data = pd.read_csv("IMDB Dataset.csv")
raw_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
nltk.download("stopwords")
nltk.download("punkt", quiet=False, force=True)
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
punctuation = string.punctuation
stopWords = nltk.corpus.stopwords.words('english')
lm = WordNetLemmatizer()

In [8]:
def GetCleanText(text):
    onlyText = BeautifulSoup(text, "html.parser").get_text()
    textNoPunctNLow = "".join([char.lower() for char in onlyText if char not in punctuation]) 
    tokens = word_tokenize(textNoPunctNLow)
    finalText = " ".join(lm.lemmatize(word) for word in tokens if word not in stopWords)
    return finalText

In [9]:
GetCleanText(raw_data['review'][1])

'wonderful little production filming technique unassuming oldtimebbc fashion give comforting sometimes discomforting sense realism entire piece actor extremely well chosen michael sheen got polari voice pat truly see seamless editing guided reference williams diary entry well worth watching terrificly written performed piece masterful production one great master comedy life realism really come home little thing fantasy guard rather use traditional dream technique remains solid disappears play knowledge sens particularly scene concerning orton halliwell set particularly flat halliwells mural decorating every surface terribly well done'

In [10]:
X = raw_data['review'].apply(lambda x: GetCleanText(x))
X

0        one reviewer mentioned watching 1 oz episode y...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: review, Length: 50000, dtype: object

In [11]:
Y = raw_data['sentiment'].apply(lambda x: 1 if x=='positive' else 0)
Y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25)

## Text Vectorization

In [13]:
from tensorflow.keras.layers import TextVectorization

In [14]:
vocabSize = 20000
maxLen = 200
vectorizeLayer = TextVectorization(
    max_tokens = vocabSize,
    output_mode = 'int',
    output_sequence_length = maxLen
)

In [15]:
#Building vocabulary since there's non in the dataset
vectorizeLayer.adapt(X_train)

vectorizedTrainingSet = vectorizeLayer(X_train)

In [16]:
X_train.to_list()[2]

'movie came went theater due nature see wasnt well received unfairly panned subject matter actual film higher learning spectacular good film tried talk feared subject america racismplotstory higher learning mostly centered around malikplayed omar epps naive track star deal fast enough stern professorplayed laurence fishburne befriends fudgeice cube well gorgeous lady named dejatyra bank later end deal skinhead campus remyplay michael rapaport confused kid end befriending local skinhead campus impose view becomes racist areopinion higher learning without flaw character development scarce okay performance omar epps tyra bank leader skinheadswhose name forgot busta rhyme doesnt fit movie plus woman turning gay abused men cliché film know didnt like part film kristy swanson movie wasnt half bad laurence fishburne good professor phippseven though could without accent fudge ice cube best performance michael rapaport good confused remy one john singleton best movie one one reason frown rubbis

In [17]:
vectorizedTrainingSet[2,:]

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([    2,   276,   307,   501,   551,   809,    14,   192,    18,
        1795, 10041,  9284,   576,   340,   656,     3,  1745,  2587,
        1903,     7,     3,   644,   452,  7941,   576,   724,     1,
        1745,  2587,   541,  5124,    99,     1,  7509, 10125,  2356,
         979,   108,   482,   746,   101,  7055,     1,  5543,  7172,
        5127,     1,  4577,    18,  1399,   479,   651,     1,  1656,
         212,    45,   482, 19678,  5565,     1,   382, 18937,  1294,
         133,    45,     1,   508, 19678,  5565, 14418,   457,   358,
        2637,     1,  1745,  2587,   115,  1002,     8,   791, 15164,
         727,    70,  7509, 10125, 10976,  1656,  1373,     1,   226,
        2323,     1,  6405,    67,   693,     2,   798,    72,  1462,
         714,  4557,   244,  1503,     3,    33,    69,     5,    56,
           3, 14847,  7777,     2,   192,   253,    21,  5543,  7172,
           7,  2023,     1,    73,    30,   