## LSTM - Sentiment Analysis

In [1]:
#%pip install torchtext
#%pip install torchdata
#%pip install torchdata==0.7.1
#%pip install portalocker
#%pip install datasets



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import random
import numpy as np
from collections import Counter
import torch
import math
from torchtext.vocab import GloVe


# set device
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

# set seed
def set_seed(seed: int = 42):
    random.seed(seed)                           # Python random
    np.random.seed(seed)                        # NumPy
    torch.manual_seed(seed)                     # PyTorch CPU
    torch.cuda.manual_seed_all(seed)            # PyTorch GPU (if using)
    torch.backends.cudnn.deterministic = True   # For reproducibility
    torch.backends.cudnn.benchmark = False      # Disable auto-tuning (slower but stable)

set_seed(42)

# ========== Hyperparameters ==========
EMBEDDING_DIM = 50
HIDDEN_DIM = 64
BATCH_SIZE = 64
EPOCHS = 5
MAX_LEN = 300
LEARNING_RATE = 0.005

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [None]:
# ========== Find some twitter data ============

# train_data = dataset["train"]
# test_data = dataset["test"]

In [None]:
# ========== Tokenizer and GloVe ==========
tokenizer = get_tokenizer("basic_english")
glove = GloVe(name="6B", dim=EMBEDDING_DIM)

def yield_glove_tokens(data_iter):
    for example in data_iter:
        tokens = tokenizer(example["text"])
        yield [token for token in tokens if token in glove.stoi]

# ========== Build Vocab (only GloVe tokens) ==========
vocab = build_vocab_from_iterator(yield_glove_tokens(train_data), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])

# ========== GloVe Coverage Check ==========
known = sum(1 for token in vocab.get_itos() if token in glove.stoi)
print(f"GloVe coverage: {known / len(vocab):.2%}")

# ========== Preprocessing ==========
def preprocess(example):
    tokens = tokenizer(example["text"])
    input_ids = vocab(tokens)[:MAX_LEN]
    label = int(example["label"])
    return {"input_ids": input_ids, "label": label}

train_data = train_data.map(preprocess)
test_data = test_data.map(preprocess)
train_data.set_format(type="python", columns=["input_ids", "label"])
test_data.set_format(type="python", columns=["input_ids", "label"])

# ========== Collate Function ==========
def collate_batch(batch):
    texts = [torch.tensor(sample["input_ids"], dtype=torch.int64) for sample in batch]
    labels = [torch.tensor(sample["label"], dtype=torch.float32) for sample in batch]
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"])
    return texts_padded.to(device), torch.stack(labels).to(device)

# ========== DataLoaders ==========
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, 
                          collate_fn=collate_batch, num_workers=0, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, 
                         collate_fn=collate_batch, num_workers=0, pin_memory=True)

# ========== Check Class Balance ==========
print("Label distribution in training set:")
print(Counter([sample["label"] for sample in train_data]))

# ========== Build Embedding Matrix ==========
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)

for idx, token in enumerate(vocab.get_itos()):
    if token in glove.stoi:
        embedding_matrix[idx] = glove[token]
    else:
        embedding_matrix[idx] = torch.randn(EMBEDDING_DIM) * 0.6  # small random vector for unknowns


In [6]:
#Crearte the LSTM model
class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1):
        super(LSTMSentiment, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=vocab["<pad>"]) # embedding layer
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True, dropout=0.2) # LSTM layer
        self.attention = nn.Linear(hidden_dim*2, hidden_dim*2) # Attention layer
        self.fc = nn.Linear(hidden_dim*2, 1)
        
    def forward(self, x):
        x = self.embedding(x) # run the embedding layer
        lstm_out, _ = self.lstm(x) # run lstm layer

        att_scores = self.attention(lstm_out) # compute attention scores
        attn_weights = torch.softmax(att_scores, dim=1) # normalize scores to weights

        context = torch.sum(attn_weights * lstm_out, dim=1) # weighted sum to get context vector

        out = self.fc(context)
        return out.squeeze()


# Initialize model, loss function, and optimizer
model = LSTMSentiment(len(vocab), EMBEDDING_DIM, HIDDEN_DIM).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)

model.apply(init_weights)




LSTMSentiment(
  (embedding): Embedding(63925, 50, padding_idx=0)
  (lstm): LSTM(50, 64, batch_first=True, dropout=0.2, bidirectional=True)
  (attention): Linear(in_features=128, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [7]:
from tqdm import tqdm

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for texts, labels in loop:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=total_loss / (loop.n + 1))
    
    # Step the scheduler once per epoch
    scheduler.step(total_loss / len(train_loader))  # at end of each epoch

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

Epoch 1/5: 100%|██████████| 391/391 [01:29<00:00,  4.36it/s, loss=0.369]


Epoch 1, Loss: 0.3690


Epoch 2/5: 100%|██████████| 391/391 [01:19<00:00,  4.94it/s, loss=0.146]


Epoch 2, Loss: 0.1458


Epoch 3/5: 100%|██████████| 391/391 [01:29<00:00,  4.39it/s, loss=0.0369]


Epoch 3, Loss: 0.0369


Epoch 4/5: 100%|██████████| 391/391 [01:28<00:00,  4.44it/s, loss=0.00774]


Epoch 4, Loss: 0.0077


Epoch 5/5: 100%|██████████| 391/391 [01:27<00:00,  4.45it/s, loss=0.00148] 

Epoch 5, Loss: 0.0015





In [None]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        predicted = (torch.sigmoid(outputs) >= 0.50).float()  # only apply sigmoid here
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

# Now compute metrics on all collected predictions
acc = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Accuracy:  0.8615
Precision: 0.8565
Recall:    0.8693
F1 Score:  0.8629


In [None]:
print("Unique predictions:", torch.unique(torch.tensor(y_pred), return_counts=True))

Unique predictions: (tensor([0., 1.]), tensor([1965, 2035]))


In [None]:
Counter([sample["label"] for sample in test_data])

Counter({1: 2005, 0: 1995})