In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.layers import ReLU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
#read data
data = pd.read_csv("C:\\Users\\Dark-Devil\\Desktop\\IMDB Dataset.csv")

In [3]:
#size of data
data.shape

(50000, 2)

In [4]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
#check null value 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [7]:
data['sent_bin'] = data['sentiment'].replace({'positive': 1, 'negative': 0})

In [8]:
data.head()

Unnamed: 0,review,sentiment,sent_bin
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [9]:
stop_word_text = ['a', 'an', 'br', 'the', 'and', 'or', 'but', 'if', 'then', 'else', 'when', 'where', 'who', 
                  'whom', 'which', 'that', 'this', 'these', 'those', 'in', 'on', 'at', 'to', 'from', 'by', 'for', 'of', 'with',
                  'without', 'over', 'under', 'above', 'below', 'between', 'among', 'through', 'throughout', 'until', 'while', 
                  'since', 'during', 'within', 'without', 'beyond', 'beside', 'between', 'except', 'but', 'up', 'down', 'in', 
                  'out', 'off', 'above', 'below', 'under', 'too', 'very', 'so', 'such', 'just', 'as', 'both', 'neither', 'either', 
                  'although', 'because', 'since', 'so that', 'though', 'this', 'I', 'i', 'she', 'he', 'they', 'it', 'unless', 
                  'until', 'whether', 'while', 'why', '<', '>', 'it', 'that']

In [10]:
#filtering the text
def full_form(text):
    text = text.lower()
    plain = re.sub(r'[<>?\.,!"(\)\/[\]]', '', text)
    plain = plain.replace("don't", "do not")
    plain = plain.replace("won't", "will not")
    plain = plain.replace("haven't", "have not")
    plain = plain.replace("can't", "cannot")
    plain = plain.replace("she's", "she is")
    plain = plain.replace("he's", "he is")
    plain = plain.replace("there're", "there are")
    plain = plain.replace("they'd", "they would")
    plain = plain.replace("\'ll", " will")
    return plain              

In [11]:
#load language model for preprocessing text
nlp = spacy.load("en_core_web_sm")

In [12]:
def preprocess_text(data):
    corpus = []
    for i in range(0, len(data)):
        plain = full_form(data['review'][i])
        # Apply spacy pipeline to the text
        doc = nlp(plain)
    
        # Apply stemming and remove stopwords
        stemmed_text = []
        for token in doc:
            # Check if the token is not a stop word and is alphabetic
            if not token.is_stop and token.is_alpha:
                stemmed_text.append(token.lemma_)

        # Remove custom stopwords
        stemmed_text = [word for word in stemmed_text if word.lower() not in stop_word_text]
    
        # Join the stemmed words into a text
        preprocessed_text = ' '.join(stemmed_text)
        corpus.append(preprocessed_text)
    
    return corpus

In [13]:
review_processed = preprocess_text(data)

In [14]:
len(review_processed)

50000

In [15]:
y = data['sent_bin']

In [16]:
#vocabular size
voc_size = 5000

In [17]:
#convert into one hot vector
onehot_text = [one_hot(word, voc_size) for word in review_processed]

In [18]:
len(onehot_text)

50000

# Embedding

In [19]:
sent_length = 200

In [20]:
#embedding
embedd_docs = pad_sequences(onehot_text, padding='pre', maxlen=sent_length)

In [21]:
len(review_processed[0].split()), len(onehot_text[0])

(140, 140)

In [22]:
embedd_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 1878, 2420, 1467, 2012, 1214, 4430,
       1797, 1628, 4416, 1443, 3730, 4493, 2012,  878,  147, 1495, 3565,
       4470, 1797,  885, 1554, 2041, 3917, 1360, 1283, 3562, 4926, 1776,
        919, 3565, 2947,  250, 2126, 4909, 1709, 2012, 3602,  506, 1797,
        898, 1616, 3664, 4915,  723, 3919,  298, 4489, 4349, 3588, 1650,
        583,  330, 4067, 1433, 4914,  644, 1699, 4596, 2055, 4489, 4936,
       1197, 3616, 2223, 1448, 1855,  589, 4642,   25, 3383, 2026, 1972,
       4675,  354, 3948, 1421, 1903, 2791, 1046, 1344, 4449,  116, 3478,
       3819, 4049, 3198, 1545, 2102,  510, 1128, 40

In [23]:
#shape of data
len(embedd_docs), y.shape

(50000, (50000,))

In [24]:
#convert into numpy array
X_data = np.array(embedd_docs)
y_data = np.array(y)
#y_data = np.squeeze(y_data)

In [25]:
X_data.shape, y_data.shape

((50000, 200), (50000,))

In [26]:
#split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

In [27]:
print(f'training: {X_train.shape} - {y_train.shape}')
print(f'testing: {X_test.shape} - {y_test.shape}')

training: (35000, 200) - (35000,)
testing: (15000, 200) - (15000,)


In [28]:
y_train[0:10]

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int64)

# Model

In [29]:
## Creating model
embedding_vector_features=100
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          500000    
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 580,501
Trainable params: 580,501
Non-trainable params: 0
_________________________________________________________________
None


In [30]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fff9989790>

# Model Evaluation

In [31]:
model.evaluate(X_test, y_test)



[0.7251515984535217, 0.8245333433151245]

In [32]:
predictions = (model.predict(X_test) > 0.5).astype("int32")



In [33]:
#confusion matrix 
confusion_matrix(y_test,predictions)

array([[6193, 1218],
       [1414, 6175]], dtype=int64)

In [34]:
accuracy_score(y_test,predictions)

0.8245333333333333

# Testing model with text

In [None]:
print(data['review'][0],'\n')
print(data['sentiment'][0])

In [35]:
#preprocessing  text 
def preprocess_text_data(data):
    corpus = []
  
    #split the sentence
    plain = full_form(data)
    #stemming
    doc = nlp(plain)
    # Apply stemming and remove stopwords
    stemmed_text = []
    for token in doc:
        stemmed_text.append(token.lemma_)
    
    stemmed_text = [word for word in stemmed_text if word.lower() not in stop_word_text]
    
    #rejoining text
    preprocessed_text = ' '.join(stemmed_text)
    #add the sentence into list
    corpus.append(preprocessed_text)
    return corpus

In [36]:
text = "this movie is very bad and i don't like this movie"
#preprocessing 
processed_text = preprocess_text_data(text)
#one hot encoding
onehot_sent = [one_hot(word, voc_size) for word in processed_text]
#embedding
embedd_docs = pad_sequences(onehot_sent, padding='pre', maxlen=sent_length)
#convert into numpy array
X_sample = np.array(embedd_docs)

In [51]:
processed_text

['movie be really fucking distinguish movie people really do not like type movie']

In [52]:
sample_prediction = (model.predict(X_sample) > 0.5).astype("int32")



In [53]:
sample_prediction

array([[1]])

# Save the model

In [54]:
model.save("./assets/movieSentiAnalysisV-5.h5")

In [55]:
from tensorflow.keras.models import load_model

model_json = model.to_json()

# Save the JSON string to a file
with open("./assets/my_model-V5.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("./assets/my_model_weights-V5.h5")

In [56]:
from tensorflow.keras.models import load_model

In [57]:
rnn = load_model("./assets/movieSentiAnalysisV-5.h5")

In [58]:
text = "This movie is really fucking distinguish movie. people really do not like this type of movies."

In [59]:
#preprocessing 
processed_text = preprocess_text_data(text)
#one hot encoding
onehot_sent = [one_hot(word, voc_size) for word in processed_text]
#embedding
embedd_docs = pad_sequences(onehot_sent, padding='pre', maxlen=sent_length)
#convert into numpy array
X_sample = np.array(embedd_docs)

In [60]:
processed_text

['movie be really fucking distinguish movie people really do not like type movie']

In [61]:
sample_prediction = (rnn.predict(X_sample) > 0.9).astype("int32")
sample_prediction



array([[1]])