In [1]:
import json
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from time import time, sleep
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset

# Sklearn imports
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# NLTK downloads
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Configurar semilla para reproducibilidad
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [2]:
def load_and_parse_data(filepath):
    """
    Parses nested JSON and applies Majority Voting for labels.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    df = pd.DataFrame.from_dict(data, orient='index')
    df = df.reset_index(drop=True).rename(columns={'index': 'id_EXIST'})
    
    # Label Processing (Majority Voting)
    if 'labels_task1_1' in df.columns:
        def get_majority_vote(labels_list):
            if not isinstance(labels_list, list): return np.nan
            counts = pd.Series(labels_list).value_counts()
            # Tie-breaking: Prioritize 'YES' (Sexism) if tie
            if len(counts) > 1 and counts.iloc[0] == counts.iloc[1]:
                if 'YES' in counts.index[:2]: return 'YES'
            return counts.idxmax()
        
        df['final_label_str'] = df['labels_task1_1'].apply(get_majority_vote)
        df['label'] = df['final_label_str'].map({'YES': 1, 'NO': 0})
        df = df.dropna(subset=['label'])
        df['label'] = df['label'].astype(int)
        
    return df

print("Loading Data...")
# Ajusta las rutas según donde tengas tus datos
df_train = load_and_parse_data('../data/training/EXIST2025_training.json')
df_val = load_and_parse_data('../data/dev/EXIST2025_dev.json')
# El test original no tiene labels, pero lo cargamos por si acaso
df_test = load_and_parse_data('../data/test/EXIST2025_test_clean.json')

print(f"\nTotal Samples - Training: {len(df_train)}")
print(df_train['final_label_str'].value_counts())
print(f"\nTotal Samples - Validation (Will be used as TEST): {len(df_val)}")
print(df_val['final_label_str'].value_counts())

Loading Data...

Total Samples - Training: 6920
final_label_str
YES    3553
NO     3367
Name: count, dtype: int64

Total Samples - Validation (Will be used as TEST): 1038
final_label_str
YES    559
NO     479
Name: count, dtype: int64


In [3]:
stop_words = set(stopwords.words('english')) | set(stopwords.words('spanish'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text, strategy='raw'):
    text_processed = str(text)
    if strategy == 'raw':
        return text_processed
    if strategy == 'lowercase':
        return text_processed.lower()
    if strategy == 'no_punct':
        text_processed = re.sub(r'[^\w\s]', '', text_processed)
        return text_processed.lower()
    if strategy == 'no_stopwords':
        text_processed = text_processed.lower()
        words = text_processed.split()
        return " ".join([w for w in words if w not in stop_words])
    if strategy == 'stemmed':
        text_processed = text_processed.lower()
        words = text_processed.split()
        return " ".join([stemmer.stem(w) for w in words])
    if strategy == 'lemmatized':
        text_processed = text_processed.lower()
        words = text_processed.split() 
        return " ".join([lemmatizer.lemmatize(w) for w in words])
    return text_processed

# Limpiamos textos
df_train['text_clean'] = df_train['tweet'].apply(lambda x: preprocess_text(x, 'lowercase'))
df_val['text_clean'] = df_val['tweet'].apply(lambda x: preprocess_text(x, 'lowercase'))
df_test['text_clean'] = df_test['tweet'].apply(lambda x: preprocess_text(x, 'lowercase'))

class Vocabulary:
    def __init__(self, min_freq=2):
        self.itos = {0: "<PAD>", 1: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<UNK>": 1}
        self.min_freq = min_freq
        
    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 2
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                if frequencies[word] == self.min_freq:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
                    
    def tokenize(self, text):
        return re.findall(r'\w+', text)
        
    def numericalize(self, text):
        tokenized_text = self.tokenize(text)
        return [self.stoi.get(token, self.stoi["<UNK>"]) for token in tokenized_text]

vocab = Vocabulary(min_freq=2)
vocab.build_vocabulary(df_train['text_clean'].tolist())

In [4]:
class EXISTDataset(Dataset):
    def __init__(self, df, vocab, max_len=64):
        self.df = df
        self.vocab = vocab
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
        
    def __getitem__(self, index):
        text = self.df.iloc[index]['text_clean']
        label = self.df.iloc[index]['label'] if 'label' in self.df.columns else -1
        
        tokens = self.vocab.numericalize(text)
        length = len(tokens)
        
        # Evitar secuencias vacías (rompen el pack_padded_sequence)
        if length == 0:
            tokens = [self.vocab.stoi["<UNK>"]]
            length = 1
            
        # Truncar si es más largo que max_len
        if length > self.max_len:
            tokens = tokens[:self.max_len]
            length = self.max_len
            
        # Rellenar (Padding)
        padded_tokens = tokens + [self.vocab.stoi["<PAD>"]] * (self.max_len - length)
            
        return torch.tensor(padded_tokens), torch.tensor(label, dtype=torch.long), torch.tensor(length, dtype=torch.long)

# Dataset global de Train (se dividirá en Folds)
train_dataset = EXISTDataset(df_train, vocab)

# El dataset "dev" original será nuestro "TEST" real
test_dataset_real = EXISTDataset(df_val, vocab)
test_loader_real = DataLoader(test_dataset_real, batch_size=32, shuffle=False)

In [5]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                           bidirectional=True, batch_first=True, 
                           dropout=dropout if n_layers > 1 else 0)
        
        # El tamaño es hidden_dim * 2 por ser bidireccional
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, lengths):
        # text: [batch_size, seq_len]
        embedded = self.dropout(self.embedding(text))
        
        # Empaquetar secuencias para ignorar el padding en el LSTM
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        
        # LSTM processing
        packed_output, (hidden, cell) = self.lstm(packed)
        
        # hidden tiene forma: [num_layers * 2, batch_size, hidden_dim]
        # Concatenamos los estados ocultos de la última capa forward y backward
        forward_hidden = hidden[-2]
        backward_hidden = hidden[-1]
        final_hidden = torch.cat([forward_hidden, backward_hidden], dim=1)
        
        # Clasificación
        output = self.dropout(final_hidden)
        logits = self.fc(output)
        
        return logits

class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_model_state = None

    def __call__(self, val_score, model):
        if self.best_score is None:
            self.best_score = val_score
            self.best_model_state = model.state_dict().copy()
        # Evaluamos con F1-Macro, por lo que buscamos que el score SUBA
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.best_model_state = model.state_dict().copy()
            self.counter = 0
            
        return self.early_stop

# Configuración de arquitectura
VOCAB_SIZE = len(vocab.stoi)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 2
N_LAYERS = 2

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Iniciando entrenamiento en: {device}\n")

EPOCHS = 20
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

y_train_array = df_train['label'].values
fold_metrics = []

# Guardaremos el mejor modelo global del mejor fold (opcional, útil para la inferencia final)
best_global_f1 = 0
best_global_model_state = None

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(y_train_array)), y_train_array)):
    print(f"================ FOLD {fold + 1}/{N_SPLITS} ================")
    
    train_sub = Subset(train_dataset, train_idx)
    val_sub = Subset(train_dataset, val_idx)
    
    train_loader = DataLoader(train_sub, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_sub, batch_size=32, shuffle=False)
    
    model = BiLSTMClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    early_stopping = EarlyStopping(patience=5) 
    
    for epoch in range(EPOCHS):
        # TRAIN
        model.train()
        train_loss = 0
        all_train_preds, all_train_labels = [], []
        
        for texts, labels, lengths in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(texts, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            
            # Gradient Clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            all_train_preds.extend(predicted.cpu().numpy())
            all_train_labels.extend(labels.cpu().numpy())
            
        train_f1 = f1_score(all_train_labels, all_train_preds, average='macro')
            
        # VALIDATION
        model.eval()
        val_loss = 0
        all_val_preds, all_val_labels = [], []
        
        with torch.no_grad():
            for texts, labels, lengths in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts, lengths)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                all_val_preds.extend(predicted.cpu().numpy())
                all_val_labels.extend(labels.cpu().numpy())
                
        val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')
        
        print(f'Epoch {epoch+1:02d}/{EPOCHS} | Train Loss: {train_loss/len(train_loader):.4f} | Train F1: {train_f1:.4f} | Val Loss: {val_loss/len(val_loader):.4f} | Val F1: {val_f1:.4f}')
        
        # Early Stopping
        if early_stopping(val_f1, model):
            print(f"--> Early stopping activado en la época {epoch+1}")
            break
            
    # Al final del Fold, guardamos el mejor F1 de este Fold
    fold_best_f1 = early_stopping.best_score
    fold_metrics.append(fold_best_f1)
    print(f"\nMejor F1-Macro en Fold {fold+1}: {fold_best_f1:.4f}\n")
    
    # Trackeamos el mejor modelo de los 5 Folds para usarlo luego en Test
    if fold_best_f1 > best_global_f1:
        best_global_f1 = fold_best_f1
        best_global_model_state = early_stopping.best_model_state.copy()

print("=" * 40)
print(f"F1-Macro Promedio Cross-Validation (5 Folds): {np.mean(fold_metrics):.4f}")
print("=" * 40)

Iniciando entrenamiento en: cuda

Epoch 01/20 | Train Loss: 0.6773 | Train F1: 0.5679 | Val Loss: 0.6591 | Val F1: 0.5863
Epoch 02/20 | Train Loss: 0.6400 | Train F1: 0.6267 | Val Loss: 0.6171 | Val F1: 0.6473
Epoch 03/20 | Train Loss: 0.6025 | Train F1: 0.6784 | Val Loss: 0.6043 | Val F1: 0.6597
Epoch 04/20 | Train Loss: 0.5682 | Train F1: 0.7029 | Val Loss: 0.6171 | Val F1: 0.6720
Epoch 05/20 | Train Loss: 0.5243 | Train F1: 0.7347 | Val Loss: 0.6275 | Val F1: 0.6747
Epoch 06/20 | Train Loss: 0.4948 | Train F1: 0.7597 | Val Loss: 0.6746 | Val F1: 0.6712
Epoch 07/20 | Train Loss: 0.4691 | Train F1: 0.7780 | Val Loss: 0.6784 | Val F1: 0.6906
Epoch 08/20 | Train Loss: 0.4320 | Train F1: 0.8012 | Val Loss: 0.7056 | Val F1: 0.6961
Epoch 09/20 | Train Loss: 0.3971 | Train F1: 0.8216 | Val Loss: 0.7375 | Val F1: 0.6944
Epoch 10/20 | Train Loss: 0.3719 | Train F1: 0.8357 | Val Loss: 0.8967 | Val F1: 0.7000
Epoch 11/20 | Train Loss: 0.3329 | Train F1: 0.8548 | Val Loss: 0.8100 | Val F1: 0.703

In [7]:
print("Evaluando el mejor modelo encontrado en el conjunto de TEST (Dev Original)...")

# Cargar el mejor modelo de todos los Folds
final_model = BiLSTMClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(device)
final_model.load_state_dict(best_global_model_state)
final_model.eval()

test_preds = []
test_labels = []

start_time = time()
with torch.no_grad():
    for texts, labels, lengths in test_loader_real:
        texts, labels = texts.to(device), labels.to(device)
        outputs = final_model(texts, lengths)
        
        _, predicted = outputs.max(1)
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())
inference_time = time() - start_time

test_f1 = f1_score(test_labels, test_preds, average='macro')

print(f"\nF1-Macro Final (Test Set): {test_f1:.4f}")
print(f"Tiempo de Inferencia total: {inference_time:.3f} segundos")
print("\nMatriz de Confusión:\n", confusion_matrix(test_labels, test_preds))
print("\nReporte de Clasificación:\n", classification_report(test_labels, test_preds))

Evaluando el mejor modelo encontrado en el conjunto de TEST (Dev Original)...

F1-Macro Final (Test Set): 0.7168
Tiempo de Inferencia total: 0.486 segundos

Matriz de Confusión:
 [[343 136]
 [157 402]]

Reporte de Clasificación:
               precision    recall  f1-score   support

           0       0.69      0.72      0.70       479
           1       0.75      0.72      0.73       559

    accuracy                           0.72      1038
   macro avg       0.72      0.72      0.72      1038
weighted avg       0.72      0.72      0.72      1038



# Transformers

In [9]:
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
import torch.nn as nn
import torch

# Usaremos un modelo base ligero, ideal para empezar.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
class BERTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
        
    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]
        
        # Llamada directa al tokenizador (la forma moderna y estándar)
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Preparamos los textos crudos
X_train_bert = df_train['tweet']
y_train_bert = df_train['label']

X_test_bert = df_val['tweet'] 
y_test_bert = df_val['label']

bert_test_dataset = BERTDataset(X_test_bert, y_test_bert, tokenizer)
bert_test_loader = DataLoader(bert_test_dataset, batch_size=16, shuffle=False)

In [14]:
class BERTClassifier(nn.Module):
    def __init__(self, model_name, num_classes, freeze_bert=False):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
                
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Usamos el token [CLS] (primera posición)
        pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)
        return logits

def unfreeze_last_n_layers(model, n):
    """Utilidad para tu Estudio de Ablación"""
    # Congelar todo primero
    for param in model.bert.parameters():
        param.requires_grad = False
    
    # Descongelar las últimas n capas del encoder
    if n > 0:
        for layer in model.bert.encoder.layer[-n:]:
            for param in layer.parameters():
                param.requires_grad = True
                
    # La cabeza de clasificación siempre debe poder entrenarse
    for param in model.classifier.parameters():
        param.requires_grad = True

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from torch.utils.data import Subset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Entrenando BERT en: {device}\n")

# Parámetros específicos para Fine-Tuning según el PDF
EPOCHS_BERT = 4 
BATCH_SIZE = 16 
N_SPLITS = 5

skf_bert = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
y_train_array_bert = y_train_bert.values
bert_fold_metrics = []
best_bert_global_f1 = 0
best_bert_model_state = None

bert_full_dataset = BERTDataset(X_train_bert, y_train_bert, tokenizer)

for fold, (train_idx, val_idx) in enumerate(skf_bert.split(np.zeros(len(y_train_array_bert)), y_train_array_bert)):
    print(f"================ BERT FOLD {fold + 1}/{N_SPLITS} ================")
    
    train_sub = Subset(bert_full_dataset, train_idx)
    val_sub = Subset(bert_full_dataset, val_idx)
    
    train_loader = DataLoader(train_sub, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_sub, batch_size=BATCH_SIZE, shuffle=False)
    
    # Instanciar el modelo (Full fine-tuning inicial)
    model = BERTClassifier(MODEL_NAME, num_classes=2, freeze_bert=False).to(device)
    
    # Diferentes learning rates: bajo para BERT, más alto para la nueva capa (Exigencia del PDF)
    optimizer = torch.optim.AdamW([
        {'params': model.bert.parameters(), 'lr': 2e-5},
        {'params': model.classifier.parameters(), 'lr': 1e-4}
    ])
    
    # Scheduler con Warmup (10% de los pasos totales)
    total_steps = len(train_loader) * EPOCHS_BERT
    warmup_steps = int(total_steps * 0.1)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    
    criterion = nn.CrossEntropyLoss()
    best_fold_f1 = 0
    
    for epoch in range(EPOCHS_BERT):
        model.train()
        train_loss = 0
        all_train_preds, all_train_labels = [], []
        
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            
            # Gradient clipping (max_norm = 1.0)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            scheduler.step() # Actualizar learning rate
            
            train_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            all_train_preds.extend(predicted.cpu().numpy())
            all_train_labels.extend(labels.cpu().numpy())
            
        # Validación
        model.eval()
        val_loss = 0
        all_val_preds, all_val_labels = [], []
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                
                val_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                all_val_preds.extend(predicted.cpu().numpy())
                all_val_labels.extend(labels.cpu().numpy())
                
        val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')
        print(f'Epoch {epoch+1}/{EPOCHS_BERT} | Train Loss: {train_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f} | Val F1: {val_f1:.4f}')
        
        # Guardar el mejor modelo del Fold
        if val_f1 > best_fold_f1:
            best_fold_f1 = val_f1
            best_model_state_for_this_fold = model.state_dict().copy()
            
    bert_fold_metrics.append(best_fold_f1)
    print(f"\nMejor F1-Macro en Fold {fold+1}: {best_fold_f1:.4f}\n")
    
    if best_fold_f1 > best_bert_global_f1:
        best_bert_global_f1 = best_fold_f1
        best_bert_model_state = best_model_state_for_this_fold

print("=" * 40)
print(f"F1-Macro Promedio BERT (5 Folds): {np.mean(bert_fold_metrics):.4f}")
print("=" * 40)

Entrenando BERT en: cuda



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Epoch 1/4 | Train Loss: 0.5995 | Val Loss: 0.5007 | Val F1: 0.7452
Epoch 2/4 | Train Loss: 0.4460 | Val Loss: 0.5136 | Val F1: 0.7764
Epoch 3/4 | Train Loss: 0.3107 | Val Loss: 0.5435 | Val F1: 0.7879
Epoch 4/4 | Train Loss: 0.2146 | Val Loss: 0.6583 | Val F1: 0.7876

Mejor F1-Macro en Fold 1: 0.7879



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [None]:
print("Evaluando el mejor modelo BERT en el conjunto de TEST...")

final_bert = BERTClassifier(MODEL_NAME, num_classes=2).to(device)
final_bert.load_state_dict(best_bert_model_state)
final_bert.eval()

test_preds = []
test_labels = []

from time import time
start_time = time()

with torch.no_grad():
    for batch in bert_test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        logits = final_bert(input_ids, attention_mask)
        _, predicted = torch.max(logits, 1)
        
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())
        
inference_time = time() - start_time

test_f1 = f1_score(test_labels, test_preds, average='macro')

print(f"\nF1-Macro Final BERT (Test Set): {test_f1:.4f}")
print(f"Tiempo de Inferencia total: {inference_time:.3f} segundos")
print("\nReporte de Clasificación:\n", classification_report(test_labels, test_preds))