<a href="https://www.kaggle.com/code/abiraazmary/imdb-sentiment-analysis?scriptVersionId=155302047" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Importing Dataset CSV**

In [None]:
import pandas as pd

# Map a short dataset label to its CSV path; update the path (or add entries)
# to load different files.
filepath_dict = {'imdb': '/kaggle/input/idmb-dataset/IMDB Dataset.csv'}

# Read every listed CSV with pandas' defaults (no explicit column names or separator).
df_list = [pd.read_csv(filepath) for filepath in filepath_dict.values()]

# ignore_index=True gives the combined frame one fresh 0..n-1 index, so row labels
# stay unique even when more than one file is concatenated.
df = pd.concat(df_list, ignore_index=True)
print(df.iloc[0])

In [None]:
# Report the frame's size, then show a short rich-rendered preview rather than
# printing the whole frame as plain text.
print(df.shape)
df.head()

# **Data Splitting**

In [None]:
from sklearn.model_selection import train_test_split
# The review text is the model input; the sentiment column is the target.
X, y = df['review'], df['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Report the size of each side of the split.
for split_name, features, targets in (
    ("Training set shape:", X_train, y_train),
    ("Testing set shape:", X_test, y_test),
):
    print(split_name, features.shape, targets.shape)

# **Tokenization**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Fit the vocabulary on the training reviews only; the test reviews are merely
# converted with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Turn each review into a list of integer word ids.
X_train_tokenize = tokenizer.texts_to_sequences(X_train)
X_test_tokenize = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# X_train still carries the shuffled row labels from the split, so X_train[2]
# would look up the row *labelled* 2 (possibly absent from the training split),
# whereas X_train_tokenize[2] is positional. .iloc[2] selects the same sample
# as the tokenised list does.
print(X_train.iloc[2])
print(X_train_tokenize[2])

In [None]:
# Spot-check the integer ids the tokenizer assigned to a few words.
word_to_id = tokenizer.word_index
for word in ('the', 'all', 'happy', 'sad'):
    print('{}: {}'.format(word, word_to_id[word]))

# **Padding**

In [None]:
import matplotlib.pyplot as plt

# Token count of every tokenised training review, plus the mean of those counts.
sentence_lengths = list(map(len, X_train_tokenize))
mean_length = np.mean(sentence_lengths)

# Histogram of the lengths with the mean marked by a dashed red line.
plt.hist(sentence_lengths, alpha=0.75, bins=50)
plt.axvline(
    x=mean_length,
    label='Mean Length',
    color='red',
    linestyle='dashed',
    linewidth=2,
)
plt.title('Distribution of Sentence Lengths')
plt.xlabel('Sentence Length')
plt.ylabel('Frequency')
plt.legend()
plt.show()

print("Average sequence length:", mean_length)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length used for every padded review (and later as the
# models' input length).
val = 235

# Bring both splits to exactly `val` tokens; padding goes after the tokens
# ('post'). The truncation side is not passed, so the library default applies.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=val, padding='post')
    for sequences in (X_train_tokenize, X_test_tokenize)
)

print("Padded sequence example:")
print(X_train_pad[0])

# **Encoding labels**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the test labels. Re-fitting on the test split could
# produce a different mapping if its set of labels differed, and would make
# train and test encodings silently inconsistent.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)



# **Embedding**

In [None]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text file of word vectors.

    Each usable line of the file holds a word followed by its whitespace-separated
    vector components. Blank lines and lines that contain a word but no
    components are skipped.

    Args:
        filepath: Path to the UTF-8 encoded vector file.
        word_index: Mapping from word to its integer row index in the matrix.
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, embedding_dim)``. Rows whose
        word does not appear in the file are left as zeros.

    Raises:
        ValueError: If a matching word's components are not numeric, or if there
            are fewer of them than ``embedding_dim``.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # A blank line yields no tokens and a word-only line yields no
            # vector; neither can fill a row, so skip instead of crashing.
            if len(parts) < 2:
                continue
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                # Keep only the first `embedding_dim` components of the vector.
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [None]:
# Number of vector components kept per word when filling the matrix.
embedding_dim = 100

# One row per tokenizer word id, filled from the pretrained vector file.
embedding_matrix = create_embedding_matrix(
    filepath='glove.6B.100d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [None]:
# Count the matrix rows that hold at least one non-zero component, i.e. the
# rows that were filled from the vector file.
nonzero_elements = np.count_nonzero(embedding_matrix.any(axis=1))

# Share of the vocabulary rows that received a vector.
nonzero_elements / vocab_size

In [None]:
# Number of rows in the embedding matrix (one per word id, plus the reserved index 0).
len(embedding_matrix)

# **Observations before training**

In [None]:
# Padded test sequences (output of pad_sequences on the tokenised test reviews).
X_test_pad

In [None]:
# Padded training sequences (output of pad_sequences on the tokenised training reviews).
X_train_pad

In [None]:
# Training labels as taken from the 'sentiment' column, before label encoding.
y_train

In [None]:
# Training labels after integer encoding with the LabelEncoder.
y_train_encoded

In [None]:
# Padded training sequences, displayed once more (same array as shown earlier).
X_train_pad

In [None]:
# Integer-encoded training labels, displayed once more.
y_train_encoded

# **Shallow Model**

Model compiling

In [None]:
from keras.models import Sequential
from keras import layers

embedding_dim = 100

# Shallow baseline: embedding layer initialised from the pretrained matrix
# (and left trainable), flattened into a small dense head that ends in a
# single sigmoid unit for the binary sentiment label.
model = Sequential([
    layers.Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=val,
        trainable=True,
    ),
    layers.Flatten(),  # Flatten the 3D tensor to 2D
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)
model.summary()


# **Fitting**

In [None]:
# Padded test sequences, shown again before fitting.
X_test_pad

In [None]:
# Padded training sequences that are passed to model.fit.
X_train_pad

In [None]:
# Training labels before encoding (the encoded version is what model.fit receives).
y_train

In [None]:
# Integer-encoded training labels that are passed to model.fit.
y_train_encoded

In [None]:
# Padded training sequences, displayed once more.
X_train_pad

In [None]:
# Integer-encoded training labels, displayed once more.
y_train_encoded

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping


# # Early stopping callback
# early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit the shallow model; the last 20% of the training data is held out by
# Keras for validation after every epoch.
history = model.fit(
    X_train_pad,
    y_train_encoded,
    batch_size=10,
    epochs=20,
    validation_split=0.2,
    verbose=True,
)

# Training vs. validation accuracy per epoch.
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# **Results of shallow model**

In [None]:
# Accuracy of the fitted model on the training data and on the held-out test data.
for report_template, features, targets in (
    ("Training Accuracy: {:.4f}", X_train_pad, y_train_encoded),
    ("Testing Accuracy:  {:.4f}", X_test_pad, y_test_encoded),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy))

In [None]:
# Model evaluation
from sklearn.metrics import confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_train_pred = (model.predict(X_train_pad) > 0.5).astype("int32")
y_test_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

# Confusion matrix and F1 score for each split.
conf_matrix_train = confusion_matrix(y_train_encoded, y_train_pred)
f1_train = f1_score(y_train_encoded, y_train_pred)
conf_matrix_test = confusion_matrix(y_test_encoded, y_test_pred)
f1_test = f1_score(y_test_encoded, y_test_pred)

# Side-by-side heatmaps: training on the left, testing on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (conf_matrix_train, 'Training Set Confusion Matrix'),
    (conf_matrix_test, 'Testing Set Confusion Matrix'),
)
for ax, (matrix, panel_title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

plt.show()

# Report the F1 scores.
print("Training F1 Score: {:.4f}".format(f1_train))
print("Testing F1 Score: {:.4f}".format(f1_test))

In [None]:
from sklearn.metrics import precision_score, recall_score
# Precision and recall of the thresholded predictions, per split.
precision_train, recall_train = (
    scorer(y_train_encoded, y_train_pred)
    for scorer in (precision_score, recall_score)
)
precision_test, recall_test = (
    scorer(y_test_encoded, y_test_pred)
    for scorer in (precision_score, recall_score)
)

# Report all four values, training first (a blank line separates the splits).
print("Training Precision: {:.4f}".format(precision_train))
print("Training Recall: {:.4f}".format(recall_train))

for report_template, value in (
    ("Testing Precision: {:.4f}", precision_test),
    ("Testing Recall: {:.4f}", recall_test),
):
    print(report_template.format(value))

# **LSTM Model**

Compiling model

In [None]:
from keras.models import Sequential
from keras import layers
from keras.layers import LSTM, Flatten, LeakyReLU

embedding_dim = 100

# LSTM classifier: pretrained (trainable) embeddings -> one 10-unit LSTM ->
# single sigmoid unit for the binary sentiment label.
# NOTE(review): the LSTM overrides its default activation with 'relu' - confirm
# this is intended; it may also affect which LSTM implementation Keras selects.
# NOTE(review): the Embedding does not set mask_zero, so the post-padded zeros
# are fed through the LSTM like ordinary tokens - confirm this is intended.
model = Sequential([
    layers.Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=val,
        trainable=True,
    ),
    LSTM(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)
model.summary()

# **Model Fit**

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping

# Fit the LSTM model; the last 20% of the training data is held out by Keras
# for validation after every epoch.
history = model.fit(
    X_train_pad,
    y_train_encoded,
    batch_size=40,
    epochs=20,
    validation_split=0.2,
    verbose=True,
)

# Training vs. validation accuracy per epoch.
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# **Results of LSTM Model**

In [None]:
# Accuracy of the fitted LSTM model on the training and the held-out test data.
for report_template, features, targets in (
    ("Training Accuracy: {:.4f}", X_train_pad, y_train_encoded),
    ("Testing Accuracy:  {:.4f}", X_test_pad, y_test_encoded),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy))

In [None]:
# Model evaluation
from sklearn.metrics import confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_train_pred = (model.predict(X_train_pad) > 0.5).astype("int32")
y_test_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

# Confusion matrix and F1 score for each split.
conf_matrix_train = confusion_matrix(y_train_encoded, y_train_pred)
f1_train = f1_score(y_train_encoded, y_train_pred)
conf_matrix_test = confusion_matrix(y_test_encoded, y_test_pred)
f1_test = f1_score(y_test_encoded, y_test_pred)

# Side-by-side heatmaps: training on the left, testing on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (conf_matrix_train, 'Training Set Confusion Matrix'),
    (conf_matrix_test, 'Testing Set Confusion Matrix'),
)
for ax, (matrix, panel_title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

plt.show()

# Report the F1 scores.
print("Training F1 Score: {:.4f}".format(f1_train))
print("Testing F1 Score: {:.4f}".format(f1_test))

In [None]:
from sklearn.metrics import precision_score, recall_score
# Precision and recall of the thresholded LSTM predictions, per split.
precision_train, recall_train = (
    scorer(y_train_encoded, y_train_pred)
    for scorer in (precision_score, recall_score)
)
precision_test, recall_test = (
    scorer(y_test_encoded, y_test_pred)
    for scorer in (precision_score, recall_score)
)

# Report all four values, training first (a blank line separates the splits).
print("Training Precision: {:.4f}".format(precision_train))
print("Training Recall: {:.4f}".format(recall_train))

for report_template, value in (
    ("Testing Precision: {:.4f}", precision_test),
    ("Testing Recall: {:.4f}", recall_test),
):
    print(report_template.format(value))

# **BLSTM Model**

Model Compile

In [None]:
from keras.models import Sequential
from keras import layers

embedding_dim = 100

# Bidirectional LSTM classifier: pretrained (trainable) embeddings -> a 10-unit
# LSTM wrapped in Bidirectional -> single sigmoid unit for the binary label.
# NOTE(review): the LSTM overrides its default activation with 'relu', and the
# Embedding does not set mask_zero although the inputs are post-padded -
# confirm both are intended.
model = Sequential([
    layers.Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=val,
        trainable=True,
    ),
    layers.Bidirectional(layers.LSTM(10, activation='relu')),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)
model.summary()

# **Model Fit**

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping

# Fit the bidirectional model; the last 20% of the training data is held out
# by Keras for validation after every epoch.
history = model.fit(
    X_train_pad,
    y_train_encoded,
    batch_size=40,
    epochs=20,
    validation_split=0.2,
    verbose=True,
)

# Training vs. validation accuracy per epoch.
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# **Results of BLSTM Model**

In [None]:
# Accuracy of the fitted bidirectional model on the training and the held-out test data.
for report_template, features, targets in (
    ("Training Accuracy: {:.4f}", X_train_pad, y_train_encoded),
    ("Testing Accuracy:  {:.4f}", X_test_pad, y_test_encoded),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy))

In [None]:
# Model evaluation
from sklearn.metrics import confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_train_pred = (model.predict(X_train_pad) > 0.5).astype("int32")
y_test_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

# Confusion matrix and F1 score for each split.
conf_matrix_train = confusion_matrix(y_train_encoded, y_train_pred)
f1_train = f1_score(y_train_encoded, y_train_pred)
conf_matrix_test = confusion_matrix(y_test_encoded, y_test_pred)
f1_test = f1_score(y_test_encoded, y_test_pred)

# Side-by-side heatmaps: training on the left, testing on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (conf_matrix_train, 'Training Set Confusion Matrix'),
    (conf_matrix_test, 'Testing Set Confusion Matrix'),
)
for ax, (matrix, panel_title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

plt.show()

# Report the F1 scores.
print("Training F1 Score: {:.4f}".format(f1_train))
print("Testing F1 Score: {:.4f}".format(f1_test))

In [None]:
from sklearn.metrics import precision_score, recall_score
# Precision and recall of the thresholded bidirectional-model predictions, per split.
precision_train, recall_train = (
    scorer(y_train_encoded, y_train_pred)
    for scorer in (precision_score, recall_score)
)
precision_test, recall_test = (
    scorer(y_test_encoded, y_test_pred)
    for scorer in (precision_score, recall_score)
)

# Report all four values, training first (a blank line separates the splits).
print("Training Precision: {:.4f}".format(precision_train))
print("Training Recall: {:.4f}".format(recall_train))

for report_template, value in (
    ("Testing Precision: {:.4f}", precision_test),
    ("Testing Recall: {:.4f}", recall_test),
):
    print(report_template.format(value))