Data Loading and Preprocessing:
Load the data and preprocess it. This usually involves:


1.   Loading the CSV file.
2.   Removing HTML tags and converting the reviews to lower case.
3.   Tokenizing the text data.
4.   Removing Stopwords.
5.   Splitting the data into training and testing sets.





In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('IMDB Dataset.csv')


In [2]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Removing HTML tags and converting review to lower case
import re
from bs4 import BeautifulSoup

def clean_text(text):
    # Remove HTML tags using BeautifulSoup
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove extra whitespaces
    text = ' '.join(text.split())

    return text

# Apply the cleaning function to the review column
df['cleaned_review'] = df['review'].apply(clean_text)


  text = BeautifulSoup(text, "html.parser").get_text()


In [4]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...


# Tokenizing data using Word2Vec Embedding.


1.   Tokenize the Text
2.   Load Pre-trained Word2Vec Model.
3.   Convert Text to Embeddings




In [5]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
from nltk.tokenize import word_tokenize

# Tokenize the cleaned reviews
df['tokenized_review'] = df['cleaned_review'].apply(word_tokenize)
# Load the tokenized reviews
tokenized_reviews = df['tokenized_review'].tolist()

import os

# Create the directory if it does not exist
if not os.path.exists('/content/drive/My Drive/'):
    os.makedirs('/content/drive/My Drive/')

# Create the file
with open('/content/drive/My Drive/word2vec_model.bin', 'wb') as f:
    pass

from gensim.models import Word2Vec

model = Word2Vec(tokenized_reviews, vector_size=128, window=4, min_count=5, workers=4)
model.save('/content/drive/My Drive/word2vec_model.bin')

from gensim.models import Word2Vec

model = Word2Vec.load('/content/drive/My Drive/word2vec_model.bin')
import numpy as np





In [7]:
import numpy as np
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=128):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

df['embedding'] = df['tokenized_review'].apply(lambda x: get_average_word2vec(x, model.wv))

In [8]:
# Convert sentiments to binary labels (0 for negative, 1 for positive)
df['binary_sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
embeddings = df['embedding'].to_numpy()
# Stack the individual numpy arrays into one 2D array
X_data = np.stack(embeddings)

# Labels
y_data = df['binary_sentiment'].values



In [9]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)


2. Building the LSTM Model:
We'll build a simple architecture that includes:

  An Embedding layer: This will turn positive integers (indexes) into dense vectors of fixed size.
  An LSTM layer: This will allow our network to remember patterns over long sequences.
  A Dense layer: This will produce the final prediction, with a sigmoid activation function for binary classification.

In [10]:
import tensorflow as tf

# Reshape the data to fit the LSTM input shape (samples, timesteps, features)
X_train_reshaped = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test_reshaped = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# LSTM model parameters
embedding_dim = 128  # As you've used for Word2Vec
lstm_units = 64

# Model architecture
model_nn = tf.keras.Sequential([
    tf.keras.layers.LSTM(lstm_units, input_shape=(1, embedding_dim)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model_nn.fit(
    X_train_reshaped, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_reshaped, y_test)
)
model_nn.summary()
# Evaluate the model
loss, accuracy = model_nn.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 49473 (193.25 KB)
Trainable params: 49473 (193.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test Loss: 0.34078818559646606
Test Accuracy: 0.8561999797821045


In [11]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout

# Model architecture
model_with_dropout = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(1, embedding_dim), return_sequences=True),
    Dropout(0.5),
    tf.keras.layers.LSTM(64),
    Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with custom learning rate
optimizer = Adam(learning_rate=0.0001)
model_with_dropout.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Train the model
history_dropout = model_with_dropout.fit(
    X_train_reshaped, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_reshaped, y_test),
    callbacks=[early_stopping]
)

model_with_dropout.summary()
# Evaluate the model
loss_with_dropout, accuracy_with_dropout = model_with_dropout.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {loss_with_dropout}")
print(f"Test Accuracy: {accuracy_with_dropout}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 1, 128)            131584    
                                                                 
 dropout (Dropout)           (None, 1, 128)            0         
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                             

In [12]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')


--2023-09-24 13:35:38--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-09-24 13:35:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-09-24 13:35:39--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [13]:
def get_average_glove_embedding(tokens_list, embeddings_index, generate_missing=True):
    if len(tokens_list) < 1:
        return np.zeros(len(next(iter(embeddings_index.values()))))

    embeddings = [embeddings_index[word] if word in embeddings_index else np.zeros(len(next(iter(embeddings_index.values())))) for word in tokens_list]

    if generate_missing:
        return np.mean(embeddings, axis=0)
    else:
        length = len(embeddings)
        summed = np.sum(embeddings, axis=0)
        return np.divide(summed, length)

df['glove_embedding'] = df['tokenized_review'].apply(lambda x: get_average_glove_embedding(x, glove_embeddings))


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
X_train_glove, X_test_glove, y_train_glove, y_test_glove = train_test_split(df['glove_embedding'], df['binary_sentiment'], test_size=0.2, random_state=42)

# Pad the sequences to ensure uniform length
max_length_glove = max([len(x) for x in X_train_glove])
X_train_glove = pad_sequences(X_train_glove, maxlen=max_length_glove, padding='post')
X_test_glove = pad_sequences(X_test_glove, maxlen=max_length_glove, padding='post')

# Build the LSTM model with GloVe embeddings
model_glove = Sequential()
model_glove.add(Embedding(input_dim=len(glove_embeddings), output_dim=len(next(iter(glove_embeddings.values()))), weights=[np.array(list(glove_embeddings.values()))], input_length=max_length_glove, trainable=False))
model_glove.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_glove.add(Dense(1, activation='sigmoid'))

# Compile the model
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping_glove = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history_glove = model_glove.fit(X_train_glove, y_train_glove, epochs=10, batch_size=64, validation_data=(X_test_glove, y_test_glove), callbacks=[early_stopping_glove])

# Evaluate the model
loss_glove, accuracy_glove = model_glove.evaluate(X_test_glove, y_test_glove)
print(f"Test Loss (GloVe): {loss_glove}")
print(f"Test Accuracy (GloVe): {accuracy_glove}")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Test Loss (GloVe): 0.6931183338165283
Test Accuracy (GloVe): 0.5038999915122986


In [16]:
df.columns

Index(['review', 'sentiment', 'cleaned_review', 'tokenized_review',
       'embedding', 'binary_sentiment', 'glove_embedding'],
      dtype='object')

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, BatchNormalization, Dropout, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

def build_and_train_model(train_data, val_data, train_labels, val_labels, vocab_size, embed_dim, max_len, num_epochs):
    # Building the model
    inp = Input(shape=(max_len,))
    x = Embedding(vocab_size + 1, embed_dim, input_length=max_len)(inp)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Conv1D(32, 5, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = MaxPooling1D(2)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = LSTM(64)(x)
    x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=x)
    model.summary()

    # Compiling and training the model
    model.compile(optimizer=Adam(0.0005), loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(train_data, train_labels, validation_data=(val_data, val_labels), epochs=num_epochs, verbose=2, batch_size=32)

    print("Training Score:", model.evaluate(train_data, train_labels))
    print("Validation Score:", model.evaluate(val_data, val_labels))

    return history, model
# Prepare the sequences
MAX_VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_review'])

sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
word_index = tokenizer.word_index

# Padding
max_seq_len = 293  # This value can be set based on your specific needs
padded_sequences = pad_sequences(sequences, maxlen=max_seq_len, truncating='post', padding='post')

# Labels
labels = df['binary_sentiment'].values

# Split data
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=10)

# Now you can use the function to build and train your model
vocab_size = len(word_index)
embed_dim = 64
num_epochs = 5

history, trained_model = build_and_train_model(X_train, X_val, y_train, y_val, vocab_size, embed_dim, max_seq_len, num_epochs)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 293)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 293, 64)           6489664   
                                                                 
 batch_normalization (Batch  (None, 293, 64)           256       
 Normalization)                                                  
                                                                 
 dropout_2 (Dropout)         (None, 293, 64)           0         
                                                                 
 conv1d (Conv1D)             (None, 289, 32)           10272     
                                                                 
 dropout_3 (Dropout)         (None, 289, 32)           0         
                                                             

In [40]:
df.rename(columns={'embedding': 'Word2Vec_embedding'}, inplace=True)

df.head()

Unnamed: 0,review,sentiment,cleaned_review,tokenized_review,Word2Vec_embedding,binary_sentiment,glove_embedding
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[-0.08409601429314799, 0.7679504786297074, -0....",1,"[-0.04562624060837463, 0.20324038589200655, 0...."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"[a, wonderful, little, production, the, filmin...","[0.004923407902242616, 0.6077968509373022, -0....",1,"[-0.12205874067876721, 0.19907215390703642, 0...."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[-0.22519955, 0.9055954, -0.4656713, -0.450061...",1,"[-0.06054061, 0.21474889, 0.35279295, -0.26057..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, there, s, a, family, where, a, lit...","[-0.14202915, 0.93962675, -0.41742346, -0.3339...",0,"[-0.035699263, 0.1847207, 0.37936354, -0.30113..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, s, love, in, the, time, of, m...","[-0.15036648048676157, 0.6601591575482305, -0....",1,"[-0.07253314, 0.17413004, 0.39302725, -0.20514..."


In [34]:
import numpy as np

# Function to classify sentiment of user input
def classify_sentiment(user_input, model, tokenizer, max_seq_len):
    # Tokenize and preprocess the user input
    user_input = [user_input]
    user_sequences = tokenizer.texts_to_sequences(user_input)
    user_padded_sequences = pad_sequences(user_sequences, maxlen=max_seq_len, truncating='post', padding='post')

    # Predict sentiment
    sentiment_prob = model.predict(user_padded_sequences)[0]

    # Classify as 'positive' or 'negative' based on probability threshold
    threshold = 0.5
    if sentiment_prob >= threshold:
        sentiment = 'positive'
    else:
        sentiment = 'negative'

    return sentiment, sentiment_prob

# Example usage:
user_input = input("Enter your statement: ")
sentiment, sentiment_prob = classify_sentiment(user_input, trained_model, tokenizer, max_seq_len)
print(f"Sentiment: {sentiment} (Probability: {sentiment_prob[0]:.4f})")



Enter your statement: Worst and bad movie. the taking is poor. No production values
Sentiment: negative (Probability: 0.0418)
