# Assignment 2

In [17]:
import json
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from time import time, sleep
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score, PredefinedSplit
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.base import BaseEstimator, TransformerMixin

# Gensim for Dense Embeddings
from gensim.models import FastText

# NLTK downloads: fetch each resource in turn; quiet=True is passed to every call
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(_nltk_resource, quiet=True)

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


# Seed the PyTorch and NumPy global RNGs from one shared constant for reproducibility
SEED = 42
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(SEED)


In [18]:
# ==========================================
# 1. DATA LOADING & LABEL ENGINEERING
# ==========================================
def _majority_vote(labels_list):
    """Collapse one list of annotator labels into a single label.

    Returns the label with the highest count. When several labels share the
    highest count, 'YES' wins if it is one of them; otherwise the first tied
    label in ``value_counts`` order is returned. Anything that is not a
    non-empty list (or a list holding only nulls) yields ``np.nan``.
    """
    if not isinstance(labels_list, list) or not labels_list:
        return np.nan
    counts = pd.Series(labels_list).value_counts()
    if counts.empty:
        # value_counts drops nulls, so an all-null list leaves nothing to vote on
        return np.nan
    top_labels = counts[counts == counts.max()].index
    # Tie-breaking: prioritise 'YES' (sexism) whenever it is among the tied labels
    if 'YES' in top_labels:
        return 'YES'
    return top_labels[0]


def load_and_parse_data(filepath):
    """
    Parses nested JSON and applies Majority Voting for labels.

    Args:
        filepath: Path to a UTF-8 JSON file whose top-level object maps a key
            to one record (dict) per sample.

    Returns:
        A DataFrame with one row per record. When the records carry a
        'labels_task1_1' column, two columns are added: 'final_label_str'
        (the majority vote) and 'label' (1 for 'YES', 0 for 'NO'); rows whose
        vote does not map to 0/1 are dropped. Without that column the frame
        is returned unlabelled.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame.from_dict(data, orient='index')
    # drop=True discards the JSON keys, so no 'index' column is created here;
    # the rename only takes effect if the records themselves have an 'index' field.
    df = df.reset_index(drop=True).rename(columns={'index': 'id_EXIST'})

    # Label Processing (Majority Voting)
    if 'labels_task1_1' in df.columns:
        df['final_label_str'] = df['labels_task1_1'].apply(_majority_vote)
        df['label'] = df['final_label_str'].map({'YES': 1, 'NO': 0})
        # .copy() gives an independent frame so the assignment below cannot
        # raise SettingWithCopyWarning on the filtered result
        df = df.dropna(subset=['label']).copy()
        df['label'] = df['label'].astype(int)

    return df

print("Loading Data...")
df_train = load_and_parse_data('../data/training/EXIST2025_training.json')
df_val = load_and_parse_data('../data/dev/EXIST2025_dev.json')
df_test = load_and_parse_data('../data/test/EXIST2025_test_clean.json')

# Report each split's size and, when it carries labels, its class balance.
# An explicit column check replaces the former bare `except:` so that real
# errors are no longer silently reported as "no labels".
for split_name, split_df in (("Training", df_train), ("Validation", df_val), ("Test", df_test)):
    print(f"\nTotal Samples - {split_name}: {len(split_df)}")
    if 'final_label_str' in split_df.columns:
        print(split_df['final_label_str'].value_counts())
    else:
        print("No labels available for this data split")


Loading Data...

Total Samples - Training: 6920
final_label_str
YES    3553
NO     3367
Name: count, dtype: int64

Total Samples - Validation: 1038
final_label_str
YES    559
NO     479
Name: count, dtype: int64

Total Samples - Test: 2076
No labels available for this data split


In [19]:
from sklearn.model_selection import cross_validate

# English + Spanish stop words in one set, plus the stemmer and lemmatizer
# instances that preprocess_text reads as module-level globals
stop_words = set(stopwords.words('english')).union(stopwords.words('spanish'))
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

def preprocess_text(text, strategy='raw'):
    """Normalise one piece of text according to the named strategy.

    The input is always converted with ``str(text)`` first.

    Strategies:
        'raw'          -- the stringified text, unchanged.
        'lowercase'    -- lower-cased text.
        'no_punct'     -- every character that is neither a word character
                          nor whitespace is removed, then the text is
                          lower-cased.
        'no_stopwords' -- lower-cased, whitespace-split tokens with members
                          of the module-level ``stop_words`` removed.
        'stemmed'      -- lower-cased, whitespace-split tokens passed through
                          the module-level ``stemmer``.
        'lemmatized'   -- lower-cased, whitespace-split tokens passed through
                          the module-level ``lemmatizer``.

    Token-based strategies re-join their tokens with single spaces. Any other
    strategy name returns the stringified text unchanged.
    """
    as_str = str(text)

    if strategy == 'lowercase':
        return as_str.lower()

    if strategy == 'no_punct':
        # Punctuation is stripped before lower-casing
        return re.sub(r'[^\w\s]', '', as_str).lower()

    if strategy in ('no_stopwords', 'stemmed', 'lemmatized'):
        # All token-level strategies share the same lower-case + split step
        tokens = as_str.lower().split()
        if strategy == 'no_stopwords':
            kept = [tok for tok in tokens if tok not in stop_words]
        elif strategy == 'stemmed':
            kept = [stemmer.stem(tok) for tok in tokens]
        else:
            kept = [lemmatizer.lemmatize(tok) for tok in tokens]
        return " ".join(kept)

    # 'raw' and unrecognised strategies: hand the stringified input back as-is
    return as_str

# Add a lower-cased copy of every tweet to each split as 'text_clean'
for split_df in (df_train, df_val, df_test):
    split_df['text_clean'] = split_df['tweet'].apply(lambda tweet: preprocess_text(tweet, 'lowercase'))

# Original tweet text and targets per split (no label column is read for the test split)
X_train_raw, y_train = df_train['tweet'], df_train['label']
X_val_raw, y_val = df_val['tweet'], df_val['label']
X_test_raw = df_test['tweet']

class Vocabulary:
    """Token <-> index mapping built from a corpus with a minimum-frequency cut-off.

    Index 0 is reserved for "<PAD>" and index 1 for "<UNK>"; corpus tokens are
    numbered from 2 in the order in which they reach ``min_freq`` occurrences.

    Args:
        min_freq: Number of occurrences a token needs (within one
            ``build_vocabulary`` call) before it is added. Values below 1
            are treated as 1.
    """

    def __init__(self, min_freq=2):
        self.itos = {0: "<PAD>", 1: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<UNK>": 1}
        self.min_freq = min_freq

    def __len__(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.stoi)

    def build_vocabulary(self, sentence_list):
        """Add every token that occurs at least ``min_freq`` times in ``sentence_list``.

        Safe to call more than once: new tokens continue from the next free
        index and tokens that are already known keep their index.
        """
        # A threshold below 1 could never be reached by a running count
        threshold = max(1, self.min_freq)
        frequencies = {}
        # Continue after the highest index in use instead of restarting at 2,
        # so a repeated call cannot overwrite existing entries
        next_idx = len(self.itos)
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                count = frequencies.get(word, 0) + 1
                frequencies[word] = count
                # Register the token at the moment it reaches the threshold
                if count == threshold and word not in self.stoi:
                    self.stoi[word] = next_idx
                    self.itos[next_idx] = word
                    next_idx += 1

    def tokenize(self, text):
        """Split ``text`` into runs of word characters (regex ``\\w+``)."""
        return re.findall(r'\w+', text)

    def numericalize(self, text):
        """Map ``text`` to a list of token indices, using the <UNK> index for unknown tokens."""
        unk_idx = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_idx) for token in self.tokenize(text)]

# Build the vocabulary from the training split's cleaned text only; tokens that
# occur fewer than twice there are not registered and numericalize to <UNK>.
vocab = Vocabulary(min_freq=2)
vocab.build_vocabulary(df_train['text_clean'].tolist())

class EXISTDataset(Dataset):
    """Torch dataset yielding fixed-length token-id tensors and integer labels.

    Args:
        df: DataFrame with a 'text_clean' column and, optionally, a 'label' column.
        vocab: Object exposing ``numericalize(text)`` and a ``stoi`` mapping
            that contains "<PAD>".
        max_len: Length every token sequence is padded or truncated to.
    """

    def __init__(self, df, vocab, max_len=64):
        self.df = df
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for the row at position ``index``.

        ``token_ids`` is a long tensor of length ``max_len``; ``label`` is a
        long scalar tensor, or -1 when the frame has no 'label' column.
        """
        # Scalar positional access: avoids building a full row Series per item
        text = self.df['text_clean'].iat[index]
        # Check if label exists (it might not for test set)
        if 'label' in self.df.columns:
            label = self.df['label'].iat[index]
        else:
            label = -1  # Placeholder

        # Truncate first, then right-pad up to max_len with the <PAD> index
        pad_idx = self.vocab.stoi["<PAD>"]
        tokens = self.vocab.numericalize(text)[:self.max_len]
        tokens.extend([pad_idx] * (self.max_len - len(tokens)))

        # Explicit dtype keeps the ids integral even when the list is empty
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# Wrap each split in a dataset, then in a batch loader (only the training loader shuffles)
train_dataset, val_dataset, test_dataset = (
    EXISTDataset(split, vocab) for split in (df_train, df_val, df_test)
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=32) for split_dataset in (val_dataset, test_dataset)
)


In [20]:
# 3. Bidirectional LSTM Model
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier with max pooling over the sequence dimension.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM per direction.
        output_dim: Number of output logits.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings, to the pooled
            features and (when ``n_layers > 1``) passed to the LSTM.
            Defaults to 0.5.
        pad_idx: Token index used as ``padding_idx`` of the embedding.
            Defaults to 0.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 dropout=0.5, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True,
            # LSTM dropout is only requested for stacked configurations
            dropout=dropout if n_layers > 1 else 0,
        )
        # Forward and backward directions are concatenated, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return logits of shape [batch_size, output_dim] for a batch of token ids."""
        embedded = self.dropout(self.embedding(text))
        # output: [batch_size, seq_len, hidden_dim * 2]
        output, _ = self.lstm(embedded)

        # Max pooling over the sequence dimension.
        # NOTE(review): no mask is applied, so padded positions take part in the max.
        pooled = torch.max(output, dim=1)[0]

        return self.fc(self.dropout(pooled))
# Hyperparameters and model instantiation
VOCAB_SIZE = len(vocab.stoi)
EMBEDDING_DIM, HIDDEN_DIM = 100, 128
OUTPUT_DIM, N_LAYERS = 2, 2
model = BiLSTMClassifier(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
)
print(model)

BiLSTMClassifier(
  (embedding): Embedding(11013, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [None]:
# 1. Device setup (use the GPU when one is available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 2. Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 3. Training loop
EPOCHS = 10
best_val_loss = float('inf')

print(f"Entrenando en: {device}")

for epoch in range(EPOCHS):
    # ---- Training pass: accumulate summed batch losses and hit counts ----
    model.train()
    train_loss = correct = total = 0

    for texts, labels in train_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Accuracy bookkeeping: predicted class is the arg-max logit
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # ---- Validation pass: same bookkeeping, no gradients, no updates ----
    model.eval()
    val_loss = val_correct = val_total = 0

    with torch.no_grad():
        for texts, labels in val_loader:
            texts = texts.to(device)
            labels = labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    # Losses are reported as the mean over batches
    print(f'Epoch {epoch+1}/{EPOCHS}:')
    print(f'  Train Loss: {train_loss/len(train_loader):.4f} | Accuracy: {100.*correct/total:.2f}%')
    print(f'  Val   Loss: {val_loss/len(val_loader):.4f} | Accuracy: {100.*val_correct/val_total:.2f}%')
    print("-" * 30)

    # Checkpoint whenever the summed validation loss improves on the best so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print("¡Modelo guardado!")