In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.layers import ReLU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
#read data
data = pd.read_csv("./data/IMDB Dataset.csv")

In [3]:
#size of data
data.shape

(50000, 2)

In [4]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
#check null value 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [7]:
data['sent_bin'] = data['sentiment'].replace({'positive': 1, 'negative': 0})

In [8]:
data.head()

Unnamed: 0,review,sentiment,sent_bin
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [9]:
stop_word_text = ['a', 'an', 'br', 'the', 'and', 'or', 'but', 'if', 'then', 'else', 'when', 'where', 'who', 
                  'whom', 'which', 'that', 'this', 'these', 'those', 'in', 'on', 'at', 'to', 'from', 'by', 'for', 'of', 'with',
                  'without', 'over', 'under', 'above', 'below', 'between', 'among', 'through', 'throughout', 'until', 'while', 
                  'since', 'during', 'within', 'without', 'beyond', 'beside', 'between', 'except', 'but', 'up', 'down', 'in', 
                  'out', 'off', 'above', 'below', 'under', 'too', 'very', 'so', 'such', 'just', 'as', 'both', 'neither', 'either', 
                  'although', 'because', 'since', 'so that', 'though', 'this', 'I', 'i', 'she', 'he', 'they', 'it', 'unless', 
                  'until', 'whether', 'while', 'why', '<', '>', 'it', 'that']

In [11]:
#filtering the text
def full_form(text):
    text = text.lower()
    plain = re.sub(r'[<>?\.,!"(\)\/[\]]', '', text)
    plain = plain.replace("don't", "do not")
    plain = plain.replace("won't", "will not")
    plain = plain.replace("haven't", "have not")
    plain = plain.replace("can't", "cannot")
    plain = plain.replace("she's", "she is")
    plain = plain.replace("he's", "he is")
    plain = plain.replace("there're", "there are")
    plain = plain.replace("they'd", "they would")
    plain = plain.replace("\'ll", " will")
    return plain              

In [11]:
#load language model for preprocessing text
nlp = spacy.load("en_core_web_sm")

In [12]:
def preprocess_text(data):
    corpus = []
    for i in range(0, len(data)):
        plain = full_form(data['review'][i])
        # Apply spacy pipeline to the text
        doc = nlp(plain)
    
        # Apply stemming and remove stopwords
        stemmed_text = []
        for token in doc:
            # Check if the token is not a stop word and is alphabetic
            if not token.is_stop and token.is_alpha:
                stemmed_text.append(token.lemma_)

        # Remove custom stopwords
        stemmed_text = [word for word in stemmed_text if word.lower() not in stop_word_text]
    
        # Join the stemmed words into a text
        preprocessed_text = ' '.join(stemmed_text)
        corpus.append(preprocessed_text)
    
    return corpus

In [13]:
#review_processed = preprocess_text(data)
data['review'] = data['review'].apply(full_form)

In [15]:
y = data['sent_bin']

In [14]:
#vocabular size
voc_size = 5000

In [16]:
#convert into one hot vector
onehot_text = [one_hot(word, voc_size) for word in data['review']]

In [89]:
len(onehot_text[0])

310

In [17]:
len(onehot_text)

50000

# Embedding

In [18]:
sent_length = 200

In [19]:
#embedding
embedd_docs = pad_sequences(onehot_text, padding='pre', maxlen=sent_length)

In [20]:
len(data['review'][0].split()), len(onehot_text[0])

(310, 310)

In [21]:
embedd_docs[0]

array([2426,  100, 1898, 3376,  167, 3603, 4409, 4928, 2761, 3941, 2919,
       4928, 3921,   75, 4443, 4525, 4739, 1622, 1159, 3095, 1385,   25,
       1075,  187, 2426, 4928, 1858,   39, 1898,   25, 3291, 4095,  974,
       3796,  404, 4798,  513,   73, 1164, 4739, 2034,  517,  576, 4600,
       2428, 3988, 4739, 4616, 4701, 1273, 1403, 3740, 3795,  779, 3861,
       1942, 1407, 4928, 4846, 4501, 4409, 4928,  139,   25, 1460, 4095,
       4928, 3710, 3725, 1192, 2655, 3941, 4001, 1526, 4094,  699, 1433,
       1750,  165, 4412,  928, 1336, 3989, 1433, 4362, 1433, 2668, 4332,
       3938, 1585, 4928, 2982, 1700, 3861, 4887,  234,  336, 3203,  527,
       3095, 2992, 1192,  731, 3208, 3861,  857, 1407, 3861,  731, 4474,
        928, 1192, 4575,  527, 3861, 3320, 3590, 3861, 3182, 2139, 2941,
        928, 4879, 4739, 4413, 4371, 4095, 4928,  187,  392, 4409, 1732,
       1305, 1075, 4987, 1305, 4575, 3205, 2078,  503,  854, 4860, 1494,
       1089, 1071,  928, 2139,  981, 1301,  854, 48

In [22]:
#shape of data
len(embedd_docs), y.shape

(50000, (50000,))

In [23]:
#convert into numpy array
X_data = np.array(embedd_docs)
y_data = np.array(y)
#y_data = np.squeeze(y_data)

In [24]:
X_data.shape, y_data.shape

((50000, 200), (50000,))

In [29]:
#split dataset for training and testing
X_train, temp_data, y_train, temp_label = train_test_split(X_data, y_data, test_size=0.4, random_state=42)
val_data, X_test, val_label, y_test = train_test_split(temp_data, temp_label, test_size=0.3, random_state=42)

In [30]:
print(f'training: {X_train.shape} - {y_train.shape}')
print(f'Validation: {val_data.shape} - {val_label.shape}')
print(f'testing: {X_test.shape} - {y_test.shape}')

training: (30000, 200) - (30000,)
Validation: (14000, 200) - (14000,)
testing: (6000, 200) - (6000,)


In [31]:
y_train[0:10]

array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1], dtype=int64)

# Model

In [33]:
## Creating model
embedding_vector_features=100
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(128))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 100)          500000    
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 617,377
Trainable params: 617,377
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
model.fit(X_train, y_train, validation_data=(val_data, val_label), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a78e961af0>

# Model Evaluation

In [35]:
model.evaluate(X_test, y_test)



[0.7257998585700989, 0.8343333601951599]

In [36]:
predictions = (model.predict(X_test) > 0.5).astype("int32")



In [37]:
#confusion matrix 
confusion_matrix(y_test,predictions)

array([[2295,  698],
       [ 296, 2711]], dtype=int64)

In [38]:
accuracy_score(y_test,predictions)

0.8343333333333334

# Testing model with text

In [None]:
print(data['review'][0],'\n')
print(data['sentiment'][0])

In [118]:
text = "this movie is very bad and i don't like this movie"
#preprocessing 
processed_text = full_form(text)
#one hot encoding
onehot_sent = [one_hot(word, voc_size)[0] for word in processed_text.split(" ")]
#embedding
embedd_docs = pad_sequences([onehot_sent], padding='pre', maxlen=sent_length)
#convert into numpy array
X_sample = np.array(embedd_docs)

In [119]:
X_sample

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [120]:
sample_prediction = (model.predict(X_sample) > 0.5).astype("int32")



In [121]:
(sample_prediction)

array([[0]])

# Save the model

In [102]:
model.save("./assets/movieSentiAnalysisV-5.h5")

In [130]:
text = "this movie is very bad and i don't like this movie."

In [131]:
processed_text = full_form(text)
#one hot encoding
onehot_sent = [one_hot(word, voc_size)[0] for word in processed_text.split(" ")]
#embedding
embedd_docs = pad_sequences([onehot_sent], padding='pre', maxlen=sent_length)
#convert into numpy array
sample = np.array(embedd_docs)

In [132]:
sample

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [133]:
sample_prediction = (rnn.predict(sample) > 0.9).astype("int32")
sample_prediction



array([[0]])