In [85]:
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer


In [86]:
# Load the dataset
IMDB = '/NLP/Dataset/IMDB Dataset.csv'
IMDB = pd.read_csv(IMDB)

In [87]:
IMDB.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [88]:
# Text cleaning function
def clean_text(review):
    # Replace <br> HTML tags with space
    review = re.sub(r'<br\s*/?>', ' ', review)
    # Retain only alphabetic characters and spaces
    review = re.sub(r'[^A-Za-z\s]', '', review)
    return review.lower().strip()  # Convert to lowercase and remove leading/trailing spaces

# Applying the cleaning function to reviews
IMDB['review'] = IMDB['review'].map(clean_text)



In [90]:
# Apply binary conversion for sentiment column
def convert_sentiment(value):
    return 1 if value == 'positive' else 0

IMDB['sentiment'] = IMDB['sentiment'].apply(convert_sentiment)


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,
1,a wonderful little production the filming tec...,
2,i thought this was a wonderful way to spend ti...,
3,basically theres a family where a little boy j...,
4,petter matteis love in the time of money is a ...,


In [92]:

# Tokenizing and padding sequences
vocab_limit = 20000
max_sequence_length = 200
text_tokenizer = Tokenizer(num_words=vocab_limit)

# Fitting tokenizer on the IMDB reviews
text_tokenizer.fit_on_texts(IMDB['review'])

# Converting text to sequences
tokenized_sequences = text_tokenizer.texts_to_sequences(IMDB['review'])

# Padding the sequences to ensure uniform length
X = pad_sequences(tokenized_sequences, maxlen=max_sequence_length)

# Extracting sentiment values as labels
y = IMDB['sentiment'].to_numpy()



In [95]:


# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1)

In [98]:
# Loading GloVe embeddings into a dictionary
embedding_dim = 100
glove_file_path = '/NLP/glove.6B.100d.txt'
word_embeddings = {}

# Reading the GloVe file and storing the word vectors
with open(glove_file_path, 'r', encoding='utf-8') as f:
    for entry in f:
        split_entry = entry.split()
        token = split_entry[0]
        embedding_vector = np.array(split_entry[1:], dtype='float32')
        word_embeddings[token] = embedding_vector


In [100]:
embedding_index['the'].shape

(100,)

In [105]:
word_index = tokenizer.word_index
num_words = min(vocab_size, len(word_index) + 1)
embedding_matrix_glove = np.zeros((num_words, embedding_size))

In [106]:
embedding_matrix_glove.shape

(20000, 100)

In [107]:
for word, i in word_index.items():
    if i < vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix_glove[i] = embedding_vector

In [108]:
embedding_matrix_glove

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.0035074 , -0.14286   ,  0.80261999, ..., -0.58814001,
         0.31889999,  0.012209  ],
       [ 0.20203   , -0.25244001, -0.12557   , ..., -0.16885   ,
        -0.99378997,  0.32501   ],
       [ 0.097328  ,  0.37051001, -0.34889001, ...,  0.037943  ,
         0.27794001,  0.68112999]])

In [113]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.initializers import Constant

# RNN Model
rnn_model = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              embeddings_initializer=Constant(embedding_matrix_glove),
              input_length=sequence_len,
              trainable=False),  # Pre-trained embeddings are frozen
    SimpleRNN(10, return_sequences=True),
    SimpleRNN(5, return_sequences=False),
    Dense(1, activation='sigmoid')
])



In [114]:
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rnn_history = rnn_model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2, verbose=1)

Epoch 1/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 32ms/step - accuracy: 0.9998 - loss: 0.2009 - val_accuracy: 1.0000 - val_loss: 0.0639
Epoch 2/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 1.0000 - loss: 0.0540 - val_accuracy: 1.0000 - val_loss: 0.0334
Epoch 3/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 1.0000 - loss: 0.0295 - val_accuracy: 1.0000 - val_loss: 0.0206
Epoch 4/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 1.0000 - loss: 0.0187 - val_accuracy: 1.0000 - val_loss: 0.0140
Epoch 5/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 1.0000 - loss: 0.0129 - val_accuracy: 1.0000 - val_loss: 0.0101


In [115]:
from sklearn.metrics import accuracy_score
y_pred = (rnn_model.predict(X_val) > 0.5).astype("int32")
accuracy = accuracy_score(y_val, y_pred)
print(f"Test Accuracy: {accuracy}")

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step
Test Accuracy: 1.0


Pytorch Embedding Implementation

In [116]:
IMDB.head(5)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,0
1,a wonderful little production the filming tec...,0
2,i thought this was a wonderful way to spend ti...,0
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,0


In [129]:
reviews = IMDB['review'].values
labels = IMDB['sentiment'].values

In [130]:
from nltk.tokenize import word_tokenize

# Tokenize reviews
def tokenize(text):
    return word_tokenize(text.lower())

# Tokenize all reviews
tokenized_reviews = [tokenize(review) for review in reviews]

In [131]:
from collections import Counter

vocab = Counter([word for review in tokenized_reviews for word in review])

# Create word to index mapping
word2idx = {word: i+2 for i, word in enumerate(vocab)}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

In [132]:
embedding_dim = 100
embedding_matrix = np.zeros((len(word2idx), embedding_dim))

for word, idx in word2idx.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

In [128]:
import torch
from torch.utils.data import Dataset, DataLoader

class IMDBTextDataset(Dataset):
    def __init__(self, reviews, sentiments, vocab_index, max_sequence_length=200):
        self.texts = reviews
        self.sentiments = sentiments
        self.vocab_index = vocab_index
        self.max_sequence_length = max_sequence_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        sentiment = self.sentiments[index]
        
        # Convert words to indices, pad if necessary
        word_indices = [self.vocab_index.get(word, self.vocab_index['<UNK>']) for word in text]
        if len(word_indices) > self.max_sequence_length:
            word_indices = word_indices[:self.max_sequence_length]
        else:
            word_indices += [self.vocab_index['<PAD>']] * (self.max_sequence_length - len(word_indices))
        
        return torch.tensor(word_indices), torch.tensor(sentiment)

In [133]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_sentiments, test_sentiments = train_test_split(
    tokenized_reviews, labels, test_size=0.2, random_state=42)

# Create dataset instances
train_data = IMDBTextDataset(train_texts, train_sentiments, word2idx)
test_data = IMDBTextDataset(test_texts, test_sentiments, word2idx)

# Create data loaders
batch_size = 64
train_data_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test_data, batch_size=batch_size)

In [134]:
from torch import nn

class SentimentLSTMNet(nn.Module):
    def __init__(self, embed_matrix, lstm_hidden_size, output_size, num_lstm_layers, dropout_rate):
        super(SentimentLSTMNet, self).__init__()
        
        vocab_size, embedding_dim = embed_matrix.shape
        
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_layer.weight = nn.Parameter(torch.tensor(embed_matrix, dtype=torch.float32))
        self.embedding_layer.weight.requires_grad = False  # Freeze pre-trained GloVe embeddings
        
        self.lstm_layer = nn.LSTM(embedding_dim, lstm_hidden_size, num_layers=num_lstm_layers, batch_first=True, dropout=dropout_rate)
        self.fc_layer = nn.Linear(lstm_hidden_size, output_size)
        self.dropout_layer = nn.Dropout(dropout_rate)
    
    def forward(self, inputs):
        embedded_words = self.embedding_layer(inputs)
        lstm_output, (hidden_state, cell_state) = self.lstm_layer(embedded_words)
        final_output = self.fc_layer(self.dropout_layer(hidden_state[-1]))
        return final_output

In [135]:
lstm_hidden_units = 128
num_output_classes = 1
num_lstm_layers = 2
dropout_prob = 0.5

In [137]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_model = SentimentLSTMNet(embedding_matrix, lstm_hidden_units, num_output_classes, num_lstm_layers, dropout_prob).to(device)

In [138]:
import torch.optim as optim

loss_function = nn.BCEWithLogitsLoss()
optimizer_function = optim.Adam(sentiment_model.parameters(), lr=1e-3)

In [139]:
def train_lstm_model(model, data_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for text_batch, sentiment_batch in data_loader:
            text_batch, sentiment_batch = text_batch.to(device), sentiment_batch.to(device).float()
            
            # Forward pass
            optimizer.zero_grad()
            predictions = model(text_batch).squeeze(1)
            loss = criterion(predictions, sentiment_batch)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(data_loader)}')

In [140]:
train_lstm_model(sentiment_model, train_data_loader, loss_function, optimizer_function, epochs=5)

Epoch 1/5, Loss: 0.009956482363387477
Epoch 2/5, Loss: 3.709256580332294e-05
Epoch 3/5, Loss: 1.5574855865270365e-05
Epoch 4/5, Loss: 8.478736374672735e-06
Epoch 5/5, Loss: 5.0284756598557575e-06


In [141]:
def evaluate_lstm_model(model, data_loader):
    model.eval()
    with torch.no_grad():
        correct_predictions = 0
        total_samples = 0
        for text_batch, sentiment_batch in data_loader:
            text_batch, sentiment_batch = text_batch.to(device), sentiment_batch.to(device).float()
            predictions = model(text_batch).squeeze(1)
            predicted_labels = torch.round(torch.sigmoid(predictions))
            correct_predictions += (predicted_labels == sentiment_batch).sum().item()
            total_samples += sentiment_batch.size(0)
        
        accuracy = correct_predictions / total_samples * 100
        print(f'Accuracy: {accuracy:.2f}%')

In [142]:
evaluate_lstm_model(sentiment_model, test_data_loader)

Accuracy: 100.00%
