## MLP - Sentiment Analysis

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import random
import numpy as np
from collections import Counter
import torch
import math
from torchtext.vocab import GloVe


# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# set seed
def set_seed(seed: int = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices) with ``seed``, then switches cuDNN to deterministic mode with
    auto-tuning turned off.

    Args:
        seed: Value handed to each generator. Defaults to 42.
    """
    # Same seed for each library, in the order: Python, NumPy, torch CPU, torch CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Reproducibility over speed: deterministic kernels, no benchmark-based auto-tuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

# ========== Hyperparameters ==========
# --- Model size ---
EMBEDDING_DIM = 50     # width of the GloVe vectors requested below and of each embedding-matrix row
HIDDEN_DIM = 64        # presumably the MLP hidden-layer width (its consumer is outside this section)

# --- Data ---
MAX_LEN = 300          # token ids kept per example; longer texts are truncated in preprocess()
BATCH_SIZE = 64        # examples per batch for both DataLoaders

# --- Optimisation (consumers are outside this section) ---
EPOCHS = 5
LEARNING_RATE = 5e-3

In [None]:
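# ========== Load a Twitter sentiment dataset ==========
# TODO: load the dataset in this cell (e.g. with `load_dataset`) and bind `train_data` and
# `test_data`. The cells below read both names and expect every example to carry a
# "text" field and a "label" field.
# train_data = dataset["train"]
# test_data = dataset["test"]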

In [None]:
# ========== Tokenizer and GloVe ==========
# Pretrained GloVe "6B" vectors with EMBEDDING_DIM components per token.
glove = GloVe(name="6B", dim=EMBEDDING_DIM)
# torchtext's "basic_english" tokenizer, shared by vocabulary building and preprocessing.
tokenizer = get_tokenizer("basic_english")

def yield_glove_tokens(data_iter):
    """Yield, per example, the tokens of its text that have a GloVe vector.

    Args:
        data_iter: Iterable of examples; each one is indexed with ``"text"``.

    Yields:
        A list of tokens (in original order) that are keys of ``glove.stoi``.
        Tokens without a pretrained vector are dropped.
    """
    for record in data_iter:
        words = tokenizer(record["text"])
        in_glove = [word for word in words if word in glove.stoi]
        yield in_glove

# ========== Build Vocab (only GloVe tokens) ==========
# The vocabulary holds the two special tokens plus every training token that GloVe knows.
vocab = build_vocab_from_iterator(
    yield_glove_tokens(train_data),
    specials=["<pad>", "<unk>"],
)
# Any token missing from the vocabulary is mapped to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

# ========== GloVe Coverage Check ==========
# NOTE: the vocabulary was built from tokens already filtered through glove.stoi, so only
# the special tokens can be missing here; the figure is close to 100% by construction.
known = sum(token in glove.stoi for token in vocab.get_itos())
print(f"GloVe coverage: {known / len(vocab):.2%}")

# ========== Preprocessing ==========
def preprocess(example):
    """Convert one raw example into vocabulary ids and an integer label.

    Args:
        example: Mapping with a ``"text"`` entry and a ``"label"`` entry.

    Returns:
        A dict with ``"input_ids"`` (the vocabulary indices of the tokenized
        text, cut to at most ``MAX_LEN`` entries) and ``"label"`` (the label
        converted with ``int``).
    """
    token_ids = vocab(tokenizer(example["text"]))
    return {
        "input_ids": token_ids[:MAX_LEN],  # keep only the first MAX_LEN ids
        "label": int(example["label"]),
    }

# Run preprocess() over both splits (train first, then test) ...
train_data, test_data = (_split.map(preprocess) for _split in (train_data, test_data))
# ... and expose only the two columns the collate function reads, as plain Python objects.
for _split in (train_data, test_data):
    _split.set_format(type="python", columns=["input_ids", "label"])

# ========== Collate Function ==========
def collate_batch(batch):
    """Turn a list of preprocessed samples into padded, device-resident tensors.

    Args:
        batch: Sequence of samples, each indexed with ``"input_ids"`` and
            ``"label"``.

    Returns:
        A tuple ``(texts, labels)``: ``texts`` is an int64 tensor in which
        every sequence is right-padded with the ``"<pad>"`` index to the
        longest sequence in the batch (batch dimension first); ``labels`` is
        a float32 tensor with one entry per sample. Both are moved to
        ``device`` before being returned.
    """
    pad_id = vocab["<pad>"]
    sequences = []
    targets = []
    for item in batch:
        sequences.append(torch.tensor(item["input_ids"], dtype=torch.int64))
        targets.append(torch.tensor(item["label"], dtype=torch.float32))

    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_id)
    stacked_targets = torch.stack(targets)
    return padded.to(device), stacked_targets.to(device)

# ========== DataLoaders ==========
# collate_batch already returns tensors on `device`, so memory pinning must stay off:
# the DataLoader pins whatever the collate function returns, and only CPU tensors can be
# pinned, so pin_memory=True raises a RuntimeError as soon as `device` is a CUDA device.
# num_workers=0 keeps collation (and therefore the device transfer) in the main process.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_batch, num_workers=0, pin_memory=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE,
                         collate_fn=collate_batch, num_workers=0, pin_memory=False)

# ========== Check Class Balance ==========
# Count how many training examples carry each label value.
label_counts = Counter(sample["label"] for sample in train_data)
print("Label distribution in training set:")
print(label_counts)

# ========== Build Embedding Matrix ==========
# One row per vocabulary entry, EMBEDDING_DIM columns; rows start out as zeros.
embedding_matrix = torch.zeros(len(vocab), EMBEDDING_DIM)

for idx, token in enumerate(vocab.get_itos()):
    if token == "<pad>":
        # Padding must contribute nothing, so its row stays the zero vector instead of
        # falling into the "unknown token" branch and being filled with random noise.
        continue
    if token in glove.stoi:
        # Copy the pretrained GloVe vector for this token.
        embedding_matrix[idx] = glove[token]
    else:
        embedding_matrix[idx] = torch.randn(EMBEDDING_DIM) * 0.6  # small random vector for unknowns