In [19]:
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

In [20]:
raw_data = pd.read_csv("IMDB Dataset.csv")
raw_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [21]:
nltk.download("stopwords")
nltk.download("punkt", quiet=False, force=True)
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\marqu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [22]:
punctuation = string.punctuation
stopWords = nltk.corpus.stopwords.words('english')
lm = WordNetLemmatizer()

In [23]:
def GetCleanText(text):
    onlyText = BeautifulSoup(text, "html.parser").get_text()
    textNoPunctNLow = "".join([char.lower() for char in onlyText if char not in punctuation]) 
    tokens = word_tokenize(textNoPunctNLow)
    finalText = " ".join(lm.lemmatize(word) for word in tokens if word not in stopWords)
    return finalText

In [24]:
GetCleanText(raw_data['review'][1])

'wonderful little production filming technique unassuming oldtimebbc fashion give comforting sometimes discomforting sense realism entire piece actor extremely well chosen michael sheen got polari voice pat truly see seamless editing guided reference williams diary entry well worth watching terrificly written performed piece masterful production one great master comedy life realism really come home little thing fantasy guard rather use traditional dream technique remains solid disappears play knowledge sens particularly scene concerning orton halliwell set particularly flat halliwells mural decorating every surface terribly well done'

In [25]:
X = raw_data['review'].apply(lambda x: GetCleanText(x))
X

0        one reviewer mentioned watching 1 oz episode y...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: review, Length: 50000, dtype: object

In [26]:
Y = raw_data['sentiment'].apply(lambda x: 1 if x=='positive' else 0)
Y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25)

## Text Vectorization

In [28]:
from tensorflow.keras.layers import TextVectorization

In [36]:
vocabSize = 20000
maxLen = 200
vectorizeLayer = TextVectorization(
    max_tokens = vocabSize,
    output_mode = 'int',
    output_sequence_length = maxLen
)

In [39]:
#Building vocabulary since there's non in the dataset
vectorizeLayer.adapt(X_train)

vectorizedTrainingSet = vectorizeLayer(X_train)

In [31]:
X_train.to_list()[2]

'first thing first movie achingly beautiful someone work 3d cg film lightercompositor visuals blew away every second stunned screen story well okay going set world fire like futuristic blade runneresquire tale doesnt finei say felt voice acting particularly bland detracted movie whole saw cinema english hoping french version floating around somewheredefinitely worth seeing'

In [32]:
vectorizedTrainingSet[2,:]

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([   23,    27,    23,     2, 13480,   211,   179,    55,  3949,
        4310,     3,     1,  1866,  3729,   154,    82,   204,  4572,
         187,    11,    18,   726,    77,   128,    89,   761,     5,
        4140,  2812,     1,   576,    68,     1,    40,   325,   405,
          44,   472,  1757, 14560,     2,   129,   116,   332,   521,
        1206,   584,   196,  3909,    96,     1,   184,   213,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [41]:
from tensorflow.keras.layers import Embedding

embed = Embedding(
    input_dim = vocabSize,
    output_dim = 100,
    

)

In [43]:
vectorizedTrainingSet[2,:]

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([   23,    27,    23,     2, 13480,   211,   179,    55,  3949,
        4310,     3,     1,  1866,  3729,   154,    82,   204,  4572,
         187,    11,    18,   726,    77,   128,    89,   761,     5,
        4140,  2812,     1,   576,    68,     1,    40,   325,   405,
          44,   472,  1757, 14560,     2,   129,   116,   332,   521,
        1206,   584,   196,  3909,    96,     1,   184,   213,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [44]:
embed(vectorizedTrainingSet[2,:])

<tf.Tensor: shape=(200, 100), dtype=float32, numpy=
array([[-0.01717552,  0.03517122, -0.0231637 , ..., -0.02296155,
         0.01240039, -0.02624132],
       [-0.01822238, -0.0128541 ,  0.04156313, ..., -0.02275989,
         0.0179451 , -0.02814208],
       [-0.01717552,  0.03517122, -0.0231637 , ..., -0.02296155,
         0.01240039, -0.02624132],
       ...,
       [ 0.01921555,  0.03018453,  0.00447141, ...,  0.01050133,
        -0.0328871 , -0.03372177],
       [ 0.01921555,  0.03018453,  0.00447141, ...,  0.01050133,
        -0.0328871 , -0.03372177],
       [ 0.01921555,  0.03018453,  0.00447141, ...,  0.01050133,
        -0.0328871 , -0.03372177]], dtype=float32)>

In [45]:
vectorizeLayer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x1ecf0cda350>

In [46]:
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input

In [50]:
model = Sequential()
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorizeLayer)
model.add(embed)
model.add(LSTM(100))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (TextV  (None, 200)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 200, 100)          2000000   
                                                                 
 lstm_2 (LSTM)               (None, 100)               80400     
                                                                 
 dense_1 (Dense)             (None, 64)                6464      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                      

In [51]:
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])