In [1]:
# Import necessary libraries 
import pandas as pd
import re
import spacy
from collections import Counter
from nltk import ngrams
import pickle
import numpy as np
import gensim.downloader as api

In [2]:
# Download necessary data
import spacy.cli
spacy.cli.download('en_core_web_sm') # Download English model

# Load the IMDB dataset
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum())

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
(50000, 2)
review       0
sentiment    0
dtype: int64


In [3]:
# Clean and preprocess the raw data
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # Load spacy without unnecessary components (for this project) for efficiency

# Preprocessing function to clean reviews
def preprocess_dataset(texts):
    cleaned_texts = []
    docs = list(nlp.pipe(texts, batch_size=1000))
    for doc in docs:
        # Collect lemmas for valid tokens in this doc
        cleaned = [token.lemma_ for token in doc if not token.is_stop and (token.is_alpha or token.is_digit)]
        # Join into a single string per review
        cleaned_texts.append(' '.join(cleaned))
    return cleaned_texts

# Apply preprocessing to the reviews
df['cleaned_review'] = preprocess_dataset(df['review'])
print(df[['review', 'cleaned_review']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  reviewer mention watch 1 Oz episode hook right...  
1  wonderful little production br filming techniq...  
2  think wonderful way spend time hot summer week...  
3  basically family little boy Jake think zombie ...  
4  Petter Mattei love Time money visually stunnin...  


In [4]:
# Generate n-grams from cleaned text
def generate_ngrams(text, n=2):
    words = text.split()
    return list(ngrams(words, n))

# Apply n-gram generation (bigrams)
df['bigrams'] = df['cleaned_review'].apply(lambda x: generate_ngrams(x, 2))
print(df[['cleaned_review', 'bigrams']].head())

                                      cleaned_review  \
0  reviewer mention watch 1 Oz episode hook right...   
1  wonderful little production br filming techniq...   
2  think wonderful way spend time hot summer week...   
3  basically family little boy Jake think zombie ...   
4  Petter Mattei love Time money visually stunnin...   

                                             bigrams  
0  [(reviewer, mention), (mention, watch), (watch...  
1  [(wonderful, little), (little, production), (p...  
2  [(think, wonderful), (wonderful, way), (way, s...  
3  [(basically, family), (family, little), (littl...  
4  [(Petter, Mattei), (Mattei, love), (love, Time...  


In [5]:
# Load pre-trained GloVe word embeddings
model = api.load("glove-wiki-gigaword-100")

# Prepare data for neural network using pretrained GloVe embeddings

# Use cleaned_review without special tokens
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Define special tokens for sequence modeling
special_tokens = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']

# Get all unique words
all_words = set()
for review in df['cleaned_review']:
    words = review.split()
    all_words.update(words)

# The vocabulary contains all the necessary words and tokens for the embeddings
vocab = special_tokens + list(all_words)
vocab_size = len(vocab)

# Every word should be assigned with an index for vectorization 
word_to_index = {word: i for i, word in enumerate(vocab)}

# Create embedding matrix
embedding_dim = 100
# vocab size (x) x embedding dimensions (y) 
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# The embedding matrix has all the words that exist in both the vocabulary of the dataset and pre-trained model
for word, i in word_to_index.items():
    if word in model and word not in special_tokens:
        # Returns vector with 100 values
        embedding_matrix[i] = model[word]

# Convert to sequences with <SOS> and <EOS>
X = []
for review in df['cleaned_review']:
    # Assigns index of SOS to begin and if a word doesnt exist it assigns index of UNK and in the end EOS 
    seq = [word_to_index['<SOS>']] + [word_to_index.get(word, word_to_index['<UNK>']) for word in review.split()] + [word_to_index['<EOS>']]
    X.append(seq)

# Pad sequences with <PAD> (index 0)
max_len = max(len(seq) for seq in X)
X = np.array([seq + [word_to_index['<PAD>']] * (max_len - len(seq)) for seq in X])

y = df['label'].values

# Save preprocessing artifacts
with open('neural_preprocessing.pkl', 'wb') as f:
    pickle.dump({
        'word_to_index': word_to_index,
        'embedding_matrix': embedding_matrix,
        'max_len': max_len,
        'X': X,
        'y': df['label'].values
    }, f)

print("Neural preprocessing complete. Data saved for model training.")

Neural preprocessing complete. Data saved for model training.


In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, stratify=df['label'], random_state=42)

print("Data split complete.")
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Input shape: {X_train.shape}")

# Convert to tensors
X_train = torch.from_numpy(X_train).long() 
y_train = torch.from_numpy(y_train.values).float()
X_test = torch.from_numpy(X_test).long()
y_test = torch.from_numpy(y_test.values).float()

# Create DataLoaders
batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Data split complete.
Training set size: 40000
Test set size: 10000
Input shape: (40000, 1298)


In [39]:
# Neural Model for Sentimantic Analysis
class SemanticClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout):
        super().__init__()
        weights = torch.as_tensor(embedding_matrix, dtype=torch.float)
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        
        # Concatenated Mean + Max pooling = 2 * embedding_dim
        input_dim = weights.shape[1] * 2 
        
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(), # Better gradient flow than ReLU
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        embedded = self.embedding(x)
        mask = (x != 0).unsqueeze(-1).float()
        
        # Masked Mean Pooling
        mean_pooled = (embedded * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        # Masked Max Pooling (set padding to very low value)
        max_pooled, _ = torch.max(embedded * mask + (1 - mask) * -1e9, dim=1)
        
        # Combine both to capture average sentiment AND peak signals
        combined = torch.cat([mean_pooled, max_pooled], dim=1)
        return self.net(combined).squeeze()

# Implement GRU and LSTM to compare them with the other neural and classical approaches

# Optimized RNN Base (Used for both GRU and LSTM)
class OptimizedRNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout, rnn_type="GRU"):
        super().__init__()
        weights = torch.as_tensor(embedding_matrix, dtype=torch.float)
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        
        # Added Bidirectionality (looks at context from both ends)
        # Added Multiple Layers
        RNN_Class = nn.GRU if rnn_type == "GRU" else nn.LSTM
        self.rnn = RNN_Class(
            weights.shape[1], 
            hidden_dim, 
            num_layers=1, 
            bidirectional=True, 
            batch_first=True, 
            dropout=0
        )
        
        # Input is hidden_dim * 2 (bidirectional)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        embedded = self.embedding(x)
        
        # Use torch.sum on current device
        lengths = (x != 0).sum(dim=1).to(torch.int64).cpu() 
        
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        
        # Use full output instead of just hidden state
        packed_output, _ = self.rnn(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Global Max Pooling over time: Captures the most important feature found in the sequence
        pooled, _ = torch.max(output, dim=1)
        
        return self.fc(pooled).squeeze()




# GRU approach (faster, simpler)
class SemanticClassifierGRU(OptimizedRNN):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout):
        super().__init__(embedding_matrix, hidden_dim, output_dim, dropout, rnn_type="GRU")

# LSTM approach (more reliable)
class SemanticClassifierLSTM(OptimizedRNN):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout):
        super().__init__(embedding_matrix, hidden_dim, output_dim, dropout, rnn_type="LSTM")

In [44]:
from sklearn.metrics import f1_score

def train_and_evaluate_model(model_class, embedding_matrix, train_loader, test_loader):
    # Hyperparameters for the model (optimizer, regularization, learning_rate)
    model = model_class(embedding_matrix, hidden_dim=64, output_dim=1, dropout=0.7)  # Reduced hidden_dim, increased dropout
    # Freeze embeddings initially
    model.embedding.weight.requires_grad = False
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-3)  # Increased weight_decay
    criterion = nn.BCEWithLogitsLoss()  # More stable
    patience = 0
    best_f1 = 0  # Initialize best F1

    # Training with 20 epochs
    epochs = 20

    for epoch in range(epochs):
        # Unfreeze embeddings after 5 epochs
        if epoch == 5:
            model.embedding.weight.requires_grad = True
            print("Embeddings unfrozen.")
        
        model.train()
        train_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                val_outputs = model(batch_X)
                loss = criterion(val_outputs, batch_y)
                val_loss += loss.item()
                preds = (val_outputs > 0).int()  # Note: > 0 since logits, not probs
                all_preds.extend(preds.numpy())
                all_labels.extend(batch_y.int().numpy())
        
        acc = accuracy_score(all_labels, all_preds)
        prec = precision_score(all_labels, all_preds, zero_division=0)
        rec = recall_score(all_labels, all_preds, zero_division=0)
        f1 = f1_score(all_labels, all_preds, zero_division=0)
        print(f"Epoch {epoch+1}: Train Loss {train_loss/len(train_loader):.4f}, Val Loss {val_loss/len(test_loader):.4f}, Acc {acc:.4f}, Prec {prec:.4f}, Rec {rec:.4f}, F1 {f1:.4f}")

        # Early stopping based on F1
        if f1 > best_f1:
            best_f1 = f1
            patience = 0
            torch.save(model.state_dict(), f'best_{model_class.__name__}.pth')
        else:
            patience += 1
            if patience >= 5:
                break

    # Load best and evaluate
    model.load_state_dict(torch.load(f'best_{model_class.__name__}.pth'))
    model.eval()
    with torch.no_grad():
        all_final_preds = []
        all_final_labels = []
        for batch_X, batch_y in test_loader:
            final_outputs = model(batch_X)
            final_preds = (final_outputs > 0).int()
            all_final_preds.extend(final_preds.numpy())
            all_final_labels.extend(batch_y.int().numpy())
        
        final_acc = accuracy_score(all_final_labels, all_final_preds)
        final_prec = precision_score(all_final_labels, all_final_preds, zero_division=0)
        final_rec = recall_score(all_final_labels, all_final_preds, zero_division=0)
        final_f1 = f1_score(all_final_labels, all_final_preds, zero_division=0)
    
    return {
        'accuracy': final_acc,
        'precision': final_prec,
        'recall': final_rec,
        'f1': final_f1
    }

# Train and compare all models
results = {}
models = [SemanticClassifier, SemanticClassifierGRU, SemanticClassifierLSTM]

for model_class in models:
    print(f"\nTraining {model_class.__name__}...")
    results[model_class.__name__] = train_and_evaluate_model(model_class, embedding_matrix, train_loader, test_loader)

print("\nComparison of Final Test Results:")
print("Model\t\tAccuracy\tPrecision\tRecall\t\tF1")
for name, metrics in results.items():
    print(f"{name}\t{metrics['accuracy']:.4f}\t\t{metrics['precision']:.4f}\t\t{metrics['recall']:.4f}\t\t{metrics['f1']:.4f}")
 



Training SemanticClassifier...
Epoch 1: Train Loss 0.5154, Val Loss 0.4653, Acc 0.7775, Prec 0.8494, Rec 0.6746, F1 0.7520
Epoch 2: Train Loss 0.4672, Val Loss 0.4389, Acc 0.7946, Prec 0.7581, Rec 0.8652, F1 0.8081
Epoch 3: Train Loss 0.4570, Val Loss 0.4252, Acc 0.8024, Prec 0.8149, Rec 0.7826, F1 0.7984
Epoch 4: Train Loss 0.4514, Val Loss 0.4250, Acc 0.8052, Prec 0.8122, Rec 0.7940, F1 0.8030
Epoch 5: Train Loss 0.4471, Val Loss 0.4291, Acc 0.7997, Prec 0.8373, Rec 0.7440, F1 0.7879
Embeddings unfrozen.
Epoch 6: Train Loss 0.3480, Val Loss 0.2744, Acc 0.8859, Prec 0.8931, Rec 0.8768, F1 0.8849
Epoch 7: Train Loss 0.2227, Val Loss 0.2721, Acc 0.8906, Prec 0.8900, Rec 0.8914, F1 0.8907
Epoch 8: Train Loss 0.1528, Val Loss 0.3141, Acc 0.8831, Prec 0.8813, Rec 0.8854, F1 0.8834
Epoch 9: Train Loss 0.1082, Val Loss 0.4014, Acc 0.8763, Prec 0.8531, Rec 0.9092, F1 0.8802
Epoch 10: Train Loss 0.0747, Val Loss 0.4661, Acc 0.8730, Prec 0.8624, Rec 0.8876, F1 0.8748
Epoch 11: Train Loss 0.053