In [4]:
pip uninstall tensorflow -y

Found existing installation: tensorflow 2.14.0
Uninstalling tensorflow-2.14.0:
  Successfully uninstalled tensorflow-2.14.0


In [None]:
pip install tensorflow==2.12.0

In [1]:
import re
import nltk
import random
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense, GlobalAveragePooling1D, SimpleRNN
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


**IMDb Dataset**

In [2]:
dataset ='/content/gdrive/My Drive/Colab Notebooks/IMBD_Dataset/IMDB Dataset.csv'
df = pd.read_csv(dataset)
df.sentiment = [1 if s == 'positive' else 0 for s in df.sentiment]

**Preprocessing**

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
idx = random.randint(0, len(df)-1)
before_process = df.iloc[idx][0]

def process(x):
    x = re.sub('[,\.!?:()"]', '', x)
    x = re.sub('<.*?>', ' ', x)
    x = re.sub('http\S+', ' ', x)
    x = re.sub('[^a-zA-Z0-9]', ' ', x)
    x = re.sub('\s+', ' ', x)
    return x.lower().strip()

df['review'] = df['review'].apply(lambda x: process(x))
after_process = df.iloc[idx][0]

sw_set = set(nltk.corpus.stopwords.words('english'))

def sw_remove(x):
    words = nltk.tokenize.word_tokenize(x)
    filtered_list = [word for word in words if word not in sw_set]
    return ' '.join(filtered_list)

df['review'] = df['review'].apply(lambda x: sw_remove(x))
after_removal = sw_remove(after_process)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Tokenization and Padding**

In [4]:
train_rev, test_rev, train_sent, test_sent = train_test_split(df['review'], df['sentiment'], test_size=0.1, random_state=42)

dict_size = 35000
tokenizer = Tokenizer(num_words=dict_size)
tokenizer.fit_on_texts(df['review'])

train_rev_tokens = tokenizer.texts_to_sequences(train_rev)
test_rev_tokens = tokenizer.texts_to_sequences(test_rev)
seq_lengths =  np.array([len(sequence) for sequence in train_rev_tokens])

upper_bound = int(np.mean(seq_lengths) + 2 * np.std(seq_lengths))
percentage = stats.percentileofscore(seq_lengths, upper_bound)

train_rev_pad = pad_sequences(train_rev_tokens, maxlen=upper_bound)
test_rev_pad = pad_sequences(test_rev_tokens, maxlen=upper_bound)
idx_pad = random.randint(0, len(train_rev_pad)-1)

**GloVe Embedding**

In [5]:
glove_path = '/content/gdrive/My Drive/Colab Notebooks/glove.6B/glove.6B.100d.txt'


def create_embedding_matrix(embedding_path, word_index, embedding_dim):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))

    with open(embedding_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(values[1:], dtype="float32")
            return embedding_matrix

embedding_dimension =100
embedding_matrix = create_embedding_matrix(glove_path, tokenizer.word_index, embedding_dimension)

Embedding matrix shape: (125792, 100)


**Tuning Hyper parameters of LSTM Model**

In [15]:
validation_split = 0.2
batch_size = 50
epochs = 5

# Define a function to create the model
def create_model(state_dim, learning_rate, dropout_rate, batch_size, epochs):
     model = Sequential()
     model.add(Embedding(len(tokenizer.word_index)+1, embedding_dimension, input_length=upper_bound, weights=[embedding_matrix], trainable=True, mask_zero=True))
     model.add(LSTM(state_dim, return_sequences=True))
     model.add(GlobalAveragePooling1D())
     model.add(Dropout(dropout_rate))
     model.add(Dense(1, activation='sigmoid'))
     model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
     return model

# Define the hyperparameter grid
param_dist = {
    'state_dim': [20, 50, 100, 200, 500],
    'learning_rate': [0.01, 0.001, 0.0001],
    'dropout_rate': [0.2, 0.3, 0.5],
    'batch_size': [32, 64, 128],
    'epochs': [5, 10, 15]
}

# Wrap the Keras model for use with scikit-learn
keras_model = KerasClassifier(build_fn=create_model, verbose=2)

# Create RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=keras_model, param_distributions=param_dist, cv=2, n_iter=10)

# Fit the model
random_search.fit(train_rev_pad, train_sent, validation_split=validation_split, batch_size=batch_size, epochs=epochs, shuffle=True)

# Convert the results to a pandas DataFrame
results_df = pd.DataFrame(random_search.cv_results_)

# Print the DataFrame
print("Results stored in DataFrame:")
print(results_df)

# Print best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Get and print the best test accuracy
best_accuracy = random_search.score(test_rev_pad, test_sent)
print("Best Test Accuracy:", best_accuracy)

  keras_model = KerasClassifier(build_fn=create_model, verbose=2)


Epoch 1/5
360/360 - 98s - loss: 0.5575 - accuracy: 0.7514 - val_loss: 0.4080 - val_accuracy: 0.8473 - 98s/epoch - 273ms/step
Epoch 2/5
360/360 - 94s - loss: 0.2940 - accuracy: 0.8960 - val_loss: 0.3324 - val_accuracy: 0.8669 - 94s/epoch - 261ms/step
Epoch 3/5
360/360 - 94s - loss: 0.1837 - accuracy: 0.9426 - val_loss: 0.3577 - val_accuracy: 0.8553 - 94s/epoch - 260ms/step
Epoch 4/5
360/360 - 94s - loss: 0.1219 - accuracy: 0.9672 - val_loss: 0.3808 - val_accuracy: 0.8553 - 94s/epoch - 262ms/step
Epoch 5/5
360/360 - 94s - loss: 0.0815 - accuracy: 0.9813 - val_loss: 0.4599 - val_accuracy: 0.8478 - 94s/epoch - 261ms/step
352/352 - 17s - loss: 0.4656 - accuracy: 0.8425 - 17s/epoch - 49ms/step
Epoch 1/5
360/360 - 100s - loss: 0.5559 - accuracy: 0.7265 - val_loss: 0.4188 - val_accuracy: 0.8387 - 100s/epoch - 277ms/step
Epoch 2/5
360/360 - 97s - loss: 0.3145 - accuracy: 0.8936 - val_loss: 0.3444 - val_accuracy: 0.8598 - 97s/epoch - 269ms/step
Epoch 3/5
360/360 - 96s - loss: 0.1945 - accuracy: 

In [21]:
from google.colab import files
results_df.to_csv('LSTM_Hypertuning.csv')
files.download('LSTM_Hypertuning.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Tuning Hyper parameters of RNN Model**

In [16]:
validation_split = 0.2
batch_size = 50
epochs = 5

# Define a function to create the model
def create_model(state_dim, learning_rate, dropout_rate, batch_size, epochs):
     model = Sequential()
     model.add(Embedding(len(tokenizer.word_index)+1, embedding_dimension, input_length=upper_bound, weights=[embedding_matrix], trainable=True, mask_zero=True))
     model.add(SimpleRNN(state_dim, return_sequences=True))
     model.add(GlobalAveragePooling1D())
     model.add(Dropout(dropout_rate))
     model.add(Dense(1, activation='sigmoid'))
     model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
     return model

# Define the hyperparameter grid
param_dist = {
    'state_dim': [20, 50, 100, 200, 500],
    'learning_rate': [0.01, 0.001, 0.0001],
    'dropout_rate': [0.2, 0.3, 0.5],
    'batch_size': [32, 64, 128],
    'epochs': [5, 10, 15]
}

# Wrap the Keras model for use with scikit-learn
keras_model = KerasClassifier(build_fn=create_model, verbose=2)

# Create RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=keras_model, param_distributions=param_dist, cv=2, n_iter=10)

# Fit the model
random_search.fit(train_rev_pad, train_sent, validation_split=validation_split, batch_size=batch_size, epochs=epochs, shuffle=True)

# Convert the results to a pandas DataFrame
result_df = pd.DataFrame(random_search.cv_results_)

# Print the DataFrame
print("Results stored in DataFrame:")
print(result_df)

# Print best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Get and print the best test accuracy
best_accuracy = random_search.score(test_rev_pad, test_sent)
print("Best Test Accuracy:", best_accuracy)

  keras_model = KerasClassifier(build_fn=create_model, verbose=2)


Epoch 1/5
360/360 - 71s - loss: 0.3797 - accuracy: 0.8372 - val_loss: 0.2997 - val_accuracy: 0.8738 - 71s/epoch - 197ms/step
Epoch 2/5
360/360 - 69s - loss: 0.1465 - accuracy: 0.9493 - val_loss: 0.3387 - val_accuracy: 0.8642 - 69s/epoch - 192ms/step
Epoch 3/5
360/360 - 69s - loss: 0.0678 - accuracy: 0.9797 - val_loss: 0.4375 - val_accuracy: 0.8580 - 69s/epoch - 191ms/step
Epoch 4/5
360/360 - 69s - loss: 0.0345 - accuracy: 0.9899 - val_loss: 0.6502 - val_accuracy: 0.8511 - 69s/epoch - 192ms/step
Epoch 5/5
360/360 - 69s - loss: 0.0208 - accuracy: 0.9939 - val_loss: 0.6226 - val_accuracy: 0.8469 - 69s/epoch - 193ms/step
704/704 - 20s - loss: 0.6216 - accuracy: 0.8554 - 20s/epoch - 29ms/step
Epoch 1/5
360/360 - 70s - loss: 0.3756 - accuracy: 0.8393 - val_loss: 0.2890 - val_accuracy: 0.8789 - 70s/epoch - 196ms/step
Epoch 2/5
360/360 - 69s - loss: 0.1408 - accuracy: 0.9529 - val_loss: 0.3610 - val_accuracy: 0.8702 - 69s/epoch - 192ms/step
Epoch 3/5
360/360 - 70s - loss: 0.0664 - accuracy: 0.

In [22]:
from google.colab import files
results_df.to_csv('RNN_Hypertuning.csv')
files.download('RNN_Hypertuning.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>