# Instalación de librerías necesarias

In [None]:
 #!pip install gensim
 #!pip install -U "ray[data,train,tune,serve]"
import numpy as np
import json
from ray.tune.search.hyperopt import HyperOptSearch
from gensim.models import Word2Vec
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
)
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.optim import AdamW
from transformers import get_scheduler
import matplotlib.pyplot as plt


# **Ejercicio 1**
---



In [None]:
# Load the label array and list the distinct class names it contains.
# NOTE: allow_pickle=True lets NumPy unpickle object arrays, so only load trusted files.
y = np.load("y_total.npy", allow_pickle=True)
class_names = np.unique(y)
print(class_names)

['negative' 'neutral' 'positive']


## Creación de embeddings con Word2Vec

In [None]:
import json
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

# y comes from y_total.npy (produced in practice 2)
# NOTE: allow_pickle=True unpickles object arrays; only load trusted files.
y = np.load("y_total.npy", allow_pickle=True)

# ----- EMBEDDINGS -----

# Load the balanced corpus; each review carries its tokens grouped by sentence
with open("corpus_features_balanced.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Load the pretrained Word2Vec vectors (Google News, 300d) in binary format
w2v = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True
)

def avg_embedding(tokens, w2v_model):
    """Average the word vectors of the tokens that the model knows.

    Args:
        tokens: iterable of token strings.
        w2v_model: object supporting ``in``, ``[]`` lookup and ``vector_size``.

    Returns:
        float32 vector of length ``w2v_model.vector_size``; all zeros when no
        token is in the vocabulary.
    """
    known = []
    for word in tokens:
        if word in w2v_model:
            known.append(w2v_model[word])

    if len(known) == 0:
        # Nothing to average: fall back to a zero vector of the right size.
        return np.zeros(w2v_model.vector_size, dtype=np.float32)

    mean_vector = np.mean(known, axis=0)
    return mean_vector.astype(np.float32)

def avg_sentence_embeddings(review, w2v_model):
    """Represent a review as the mean of its per-sentence average embeddings.

    Args:
        review: mapping with a ``"tokens_by_sentence"`` entry (list of token lists).
        w2v_model: word-vector model passed through to ``avg_embedding``.

    Returns:
        float32 vector of length ``w2v_model.vector_size``; all zeros when the
        review has no non-empty sentence.

    Note:
        A non-empty sentence whose tokens are all out of vocabulary still
        contributes a zero vector to the mean.
    """
    per_sentence = []
    for sentence in review["tokens_by_sentence"]:
        if not sentence:
            # Empty sentences are skipped entirely.
            continue
        per_sentence.append(avg_embedding(sentence, w2v_model))

    if len(per_sentence) == 0:
        return np.zeros(w2v_model.vector_size, dtype=np.float32)

    review_vector = np.mean(per_sentence, axis=0)
    return review_vector.astype(np.float32)

# Build the feature matrix X: one averaged embedding per review
# (num_reviews x vector_size)
X = np.array(
    [avg_sentence_embeddings(review, w2v) for review in data],
    dtype=np.float32
)

# Sanity check: X and y must have the same number of rows
print("Shape X:", X.shape)
print("Len y:", len(y))

# Train / test split, stratified on the original (string) labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Encode the string labels as integers (done after the split, so y_train /
# y_test are rebound from string arrays to int64 arrays here)
label2id = {"negative": 0, "neutral": 1, "positive": 2}
y_train = np.array([label2id[l] for l in y_train], dtype=np.int64)
y_test  = np.array([label2id[l] for l in y_test], dtype=np.int64)

print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("Clases en train:", np.unique(y_train, return_counts=True))


Shape X: (4464, 300)
Len y: 4464
X_train: (3571, 300) X_test: (893, 300)
Clases en train: (array([0, 1, 2]), array([1191, 1190, 1190]))


## **DEFINICIÓN DE LOS MODELOS FNN Y RNN**

In [None]:
class Classifier(nn.Module):
    """Feed-forward classifier over fixed-size feature vectors.

    Three hidden ``Linear`` + ``ReLU`` stages (dropout after the first two)
    followed by a linear head with 3 outputs. The head emits raw logits: no
    softmax is applied inside the model.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width shared by the three hidden layers.
        drop1: dropout probability after the first hidden layer.
        drop2: dropout probability after the second hidden layer.
    """

    def __init__(self, input_dim, hidden_dim, drop1, drop2):
        super().__init__()

        n_classes = 3  # three output classes

        # Modules are created in the same order as they are applied, so the
        # Sequential indices (and state_dict keys) are layers.0 ... layers.8.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(drop1),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(drop2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_classes),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        logits = self.layers(x)
        return logits

In [None]:
class LSTMClassifier(nn.Module):
    """LSTM sequence classifier that emits 3 logits per sequence.

    The input is a batch of sequences right-padded with all-zero timesteps.
    The sequences are packed before entering the LSTM so that the final hidden
    state is taken at each sequence's last real timestep rather than after the
    trailing padding.

    Args:
        input_dim: size of each timestep vector.
        hidden_dim: LSTM hidden size per direction.
        lstm_layers: number of stacked LSTM layers.
        dropout: inter-layer dropout; only applied when ``lstm_layers > 1``.
        bidirectional: whether to run the LSTM in both directions.
    """

    def __init__(self, input_dim, hidden_dim, lstm_layers, dropout, bidirectional):
        super().__init__()

        self.bidirectional = bidirectional
        direction_factor = 2 if bidirectional else 1

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            # Dropout is only passed through for stacked LSTMs; a single layer gets 0.0.
            dropout=dropout if lstm_layers > 1 else 0.0,
            bidirectional=bidirectional,
            batch_first=True
        )

        # Final projection to the 3 classes
        self.fc = nn.Linear(hidden_dim * direction_factor, 3)

    def forward(self, x):
        """Classify a batch of zero-padded sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_dim). Timesteps after a
               sequence's last non-zero vector are treated as padding.

        Returns:
            Logits of shape (batch, 3); no softmax is applied.
        """
        # True length of each sequence = position of its last non-zero timestep.
        has_token = x.ne(0).any(dim=-1)                                # (batch, seq_len)
        positions = torch.arange(1, x.size(1) + 1, device=x.device)    # 1..seq_len
        lengths = (has_token.long() * positions).max(dim=1).values
        # pack_padded_sequence rejects zero lengths; all-padding rows keep one step.
        lengths = lengths.clamp(min=1)

        packed = nn.utils.rnn.pack_padded_sequence(
            x,
            lengths.cpu(),          # lengths must live on the CPU
            batch_first=True,
            enforce_sorted=False,
        )

        _, (h_n, _) = self.lstm(packed)

        # h_n shape: (num_layers * num_directions, batch, hidden_dim)
        if self.bidirectional:
            # Last layer: h_n[-2] is the forward direction, h_n[-1] the backward one.
            h_last = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h_last = h_n[-1]  # (batch, hidden_dim)

        logits = self.fc(h_last)  # (batch, 3)
        return logits

# - Se eliminó Softmax en la salida: CrossEntropyLoss lo aplica internamente.
# - Se aseguró que la última capa proyecta a 3 clases, no a 1.
# - Se corrigió la extracción del estado oculto final h_n para LSTM bidireccional.
# - Se documentó la forma esperada de la entrada (batch, seq_len, input_dim).
# - Se dejó claro que el modelo devuelve logits, no probabilidades.



In [None]:
from hyperopt import hp
import torch
import torch.nn as nn
from torch.optim import AdamW

# Convert the NumPy splits to tensors with the dtypes the model and loss expect
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_test_t  = torch.tensor(X_test, dtype=torch.float32)

y_train_t = torch.tensor(y_train, dtype=torch.long)   # CrossEntropyLoss needs integer (long) class indices
y_test_t  = torch.tensor(y_test, dtype=torch.long)


# ----- search space -----
# hp.loguniform bounds are given in log space: lr is drawn from
# [exp(-9), exp(-3)], i.e. roughly 1.2e-4 to 5e-2.
search_space = {
    "lr": hp.loguniform("lr", -9, -3),
    "hidden_dim": hp.choice("hidden_dim", [64, 128, 256, 300]),
    "drop1": hp.uniform("drop1", 0.0, 0.5),
    "drop2": hp.uniform("drop2", 0.0, 0.5),
}


def objective(params):
    """Hyperopt objective: train a fresh ``Classifier`` and return its loss.

    Args:
        params: dict with ``hidden_dim``, ``drop1``, ``drop2`` and ``lr``.

    Returns:
        Cross-entropy loss (float) on ``X_test_t`` / ``y_test_t`` after 40
        full-batch updates on ``X_train_t`` / ``y_train_t``.

    Note:
        The score is computed on the same test tensors used later for the
        final evaluation, so the selected hyperparameters have seen that data.
    """
    print("Trial Params:", params)

    net = Classifier(
        input_dim=300,
        hidden_dim=params["hidden_dim"],
        drop1=params["drop1"],
        drop2=params["drop2"],
    )
    loss_fn = nn.CrossEntropyLoss()
    opt = AdamW(net.parameters(), lr=params["lr"])

    n_epochs = 40

    # One optimizer step per epoch on the whole training set.
    for _ in range(n_epochs):
        net.train()
        opt.zero_grad()
        train_logits = net(X_train_t)  # shape: (batch, 3)
        train_loss = loss_fn(train_logits, y_train_t)
        train_loss.backward()
        opt.step()

    # Score the trained network without tracking gradients.
    net.eval()
    with torch.no_grad():
        held_out_loss = loss_fn(net(X_test_t), y_test_t)

    return held_out_loss.item()


# ----- ejecutar búsqueda -----
from hyperopt import fmin, tpe, Trials

# Trials keeps the history of every evaluated configuration
trials = Trials()

# Minimise `objective` over `search_space` with TPE for 20 evaluations.
# `best` stores an index (not the value) for hp.choice parameters such as
# "hidden_dim", so it has to be mapped back to the choice list before use.
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials
)

print("\nBest hyperparameters found:")
print(best)

# CORRECCIONES REALIZADAS:
# - Las etiquetas y_train_t y y_test_t deben ser dtype=torch.long para CrossEntropyLoss.
# - Se eliminó outputs.squeeze() ya que rompe la dimensión (batch, num_classes).
# - CrossEntropyLoss requiere logits sin softmax y sin conversión a float de las etiquetas.
# - La red final tuvo que corregirse para tener 3 salidas en lugar de 1.
# - Hyperopt devuelve índices para hp.choice; ahora se mapea correctamente al hidden_dim real.



Trial Params:
{'drop1': 0.3314938329256995, 'drop2': 0.14566338335932144, 'hidden_dim': 300, 'lr': 0.00038228319413398636}
Trial Params:
{'drop1': 0.06424647133815148, 'drop2': 0.13408920915625555, 'hidden_dim': 64, 'lr': 0.00969448355330154}
Trial Params:
{'drop1': 0.20625396685840158, 'drop2': 0.450182406112486, 'hidden_dim': 128, 'lr': 0.037366683223831576}
Trial Params:
{'drop1': 0.045562560516046735, 'drop2': 0.06557874183330781, 'hidden_dim': 256, 'lr': 0.007134710238313631}
Trial Params:
{'drop1': 0.4122485292607581, 'drop2': 0.1519414353045308, 'hidden_dim': 128, 'lr': 0.0023884250443194565}
Trial Params:
{'drop1': 0.23416830259861432, 'drop2': 0.4298406745659694, 'hidden_dim': 128, 'lr': 0.025766374719081386}
Trial Params:
{'drop1': 0.06859455271646209, 'drop2': 0.3323230644570991, 'hidden_dim': 64, 'lr': 0.007189178679985996}
Trial Params:
{'drop1': 0.4540620149606078, 'drop2': 0.020317794990104, 'hidden_dim': 300, 'lr': 0.009134197176444142}
Trial Params:
{'drop1': 0.2684181

## **Entrenar el modelo con los mejores hiperparámetros**:

In [None]:
# Turn the search result into concrete hyperparameters.
# `best` holds an index for hp.choice parameters, so "hidden_dim" is mapped
# back through the same choice list used in the search space. All values are
# read from `best`, so a new search run is always what gets trained here.
hidden_dim_choices = [64, 128, 256, 300]
best_hidden_dim = hidden_dim_choices[best["hidden_dim"]]
best_drop1 = float(best["drop1"])
best_drop2 = float(best["drop2"])
best_lr = float(best["lr"])

# Fix the seed so weight initialisation and dropout are reproducible
# for a given set of hyperparameters.
torch.manual_seed(42)

# Train the final FNN model
model_final = Classifier(
    input_dim=300,
    hidden_dim=best_hidden_dim,
    drop1=best_drop1,
    drop2=best_drop2
)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model_final.parameters(), lr=best_lr)

EPOCHS = 80  # final training

# Each epoch is a single full-batch update on the training tensors.
for epoch in range(EPOCHS):
    model_final.train()
    optimizer.zero_grad()

    logits = model_final(X_train_t)
    loss = criterion(logits, y_train_t)

    loss.backward()
    optimizer.step()

print("Entrenamiento final completado")


Entrenamiento final completado


## Evaluación del modelo FNN:

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Predict the test split with the trained FNN (inference mode, no gradients)
model_final.eval()
with torch.no_grad():
    logits = model_final(X_test_t)
preds = logits.argmax(dim=1).cpu().numpy()

# Headline metrics
print("Accuracy:", accuracy_score(y_test, preds))
print("F1 weighted:", f1_score(y_test, preds, average="weighted"))

# Per-class breakdown; target names follow the 0/1/2 label encoding
print("\nClassification report:")
print(
    classification_report(
        y_test,
        preds,
        target_names=["negative", "neutral", "positive"],
    )
)

print("\nConfusion matrix:")
print(confusion_matrix(y_test, preds))


Accuracy: 0.6248600223964166
F1 weighted: 0.6214825136890122

Classification report:
              precision    recall  f1-score   support

    negative       0.65      0.69      0.67       297
     neutral       0.53      0.47      0.50       298
    positive       0.68      0.72      0.70       298

    accuracy                           0.62       893
   macro avg       0.62      0.62      0.62       893
weighted avg       0.62      0.62      0.62       893


Confusion matrix:
[[204  69  24]
 [ 81 140  77]
 [ 27  57 214]]


El modelo FNN con embeddings estáticos de Word2Vec alcanzó un 62% de accuracy y un F1-weighted de 0.62.
Las clases positiva y negativa obtienen los mejores resultados (F1 = 0.70 y 0.67, respectivamente), mientras que la clase neutral presenta mayor dificultad (F1 = 0.50), un comportamiento habitual en tareas de análisis de polaridad debido a la ambigüedad léxica y semántica.
La matriz de confusión muestra que muchos ejemplos neutrales son absorbidos por las categorías polarizadas, lo que confirma el reto inherente de esta clase.
En conjunto, el rendimiento obtenido es consistente con modelos basados en embeddings promediados y redes feedforward simples, proporcionando una línea base sólida para comparar con arquitecturas recurrentes o modelos basados en secuencias.

## **RNN (LSTM)**

In [None]:
def flatten_tokens(review):
    """Concatenate a review's per-sentence token lists into one flat list.

    Args:
        review: mapping with a ``"tokens_by_sentence"`` entry (list of token lists).

    Returns:
        List with every token, in sentence order.
    """
    flat = []
    for sentence in review["tokens_by_sentence"]:
        flat.extend(sentence)
    return flat


In [None]:
def tokens_to_embeddings(tokens, w2v):
    """Look up each in-vocabulary token and return the vectors in order.

    Unlike the averaged representation, no pooling is done here: the result is
    one vector per known token; unknown tokens are dropped.

    Args:
        tokens: iterable of token strings.
        w2v: object supporting ``in`` and ``[]`` lookup by token.

    Returns:
        List of word vectors (possibly empty).
    """
    sequence = []
    for token in tokens:
        if token in w2v:
            sequence.append(w2v[token])
    return sequence


# Number of timesteps every review sequence is truncated / padded to
max_len = 50
# Width of each timestep vector, taken from the loaded word-vector model
embedding_dim = w2v.vector_size


def pad_sequence(seq, max_len, embedding_dim):
    """Truncate or right-pad a sequence of embedding vectors to ``max_len`` rows.

    Args:
        seq: sequence of vectors, each of length ``embedding_dim``.
        max_len: number of rows in the result.
        embedding_dim: width of each vector (used for the zero padding).

    Returns:
        Array of shape (max_len, embedding_dim). Longer sequences keep their
        first ``max_len`` vectors; shorter ones are followed by zero rows. An
        empty sequence yields an all-zero float32 array.
    """
    if len(seq) >= max_len:
        return np.array(seq[:max_len])

    if len(seq) == 0:
        # np.vstack cannot stack an empty list against the padding block.
        return np.zeros((max_len, embedding_dim), dtype=np.float32)

    rows = np.asarray(seq)
    # Pad in the sequence's own floating dtype so float32 embeddings are not
    # silently upcast to float64 (integer input still becomes floating point).
    pad = np.zeros(
        (max_len - len(seq), embedding_dim),
        dtype=np.result_type(rows.dtype, np.float32),
    )
    return np.vstack([rows, pad])

# Build the full X_lstm tensor: one padded embedding sequence per review

X_lstm = []

for review in data:
    tokens = flatten_tokens(review)
    emb_seq = tokens_to_embeddings(tokens, w2v)

    if len(emb_seq) == 0:
        emb_seq = [np.zeros(embedding_dim)]   # reviews with no in-vocabulary words get a single zero vector

    X_lstm.append(pad_sequence(emb_seq, max_len, embedding_dim))

# Stack the per-review arrays into one float32 array (the name is rebound from list to ndarray)
X_lstm = np.array(X_lstm, dtype=np.float32)


## **Split/test por tensores**

In [None]:
# Original y (strings) comes from y_total.npy
y = np.load("y_total.npy", allow_pickle=True)

# 1) Encode the labels as integers
label2id = {"negative": 0, "neutral": 1, "positive": 2}
y_encoded = np.array([label2id[l] for l in y], dtype=np.int64)

# 2) Stratified split using the numeric labels.
#    Note: this rebinds y_train / y_test defined in the FNN section.
X_lstm_train, X_lstm_test, y_train, y_test = train_test_split(
    X_lstm,               # per-review padded embedding sequences
    y_encoded,            # already numeric
    test_size=0.2,
    stratify=y_encoded,
    random_state=42
)

# 3) Convert to PyTorch tensors (y_train_t / y_test_t are rebound as well)
X_lstm_train_t = torch.tensor(X_lstm_train, dtype=torch.float32)
X_lstm_test_t  = torch.tensor(X_lstm_test, dtype=torch.float32)

y_train_t = torch.tensor(y_train, dtype=torch.long)
y_test_t  = torch.tensor(y_test, dtype=torch.long)

print(X_lstm_train_t.shape, y_train_t.shape)


torch.Size([3571, 50, 300]) torch.Size([3571])


## **Entrenar el LSTMClassifier**

In [None]:
# Single-layer bidirectional LSTM. With lstm_layers=1 the dropout value is not
# used, because LSTMClassifier only forwards it to nn.LSTM for stacked layers.
model_lstm = LSTMClassifier(
    input_dim=300,
    hidden_dim=128,
    lstm_layers=1,
    dropout=0.2,
    bidirectional=True
)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model_lstm.parameters(), lr=1e-3)

EPOCHS = 12

# Each epoch is one full-batch update, so the model receives only 12
# optimizer steps in total.
# NOTE(review): mini-batch training would give many more updates per epoch — consider it.
for epoch in range(EPOCHS):
    model_lstm.train()
    optimizer.zero_grad()

    logits = model_lstm(X_lstm_train_t)
    loss = criterion(logits, y_train_t)

    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")


Epoch 1 - Loss: 1.1000
Epoch 2 - Loss: 1.0962
Epoch 3 - Loss: 1.0928
Epoch 4 - Loss: 1.0894
Epoch 5 - Loss: 1.0858
Epoch 6 - Loss: 1.0820
Epoch 7 - Loss: 1.0778
Epoch 8 - Loss: 1.0731
Epoch 9 - Loss: 1.0678
Epoch 10 - Loss: 1.0618
Epoch 11 - Loss: 1.0550
Epoch 12 - Loss: 1.0471


## **Evaluación del LSTM**

In [None]:
# Predict the test split with the trained LSTM (inference mode, no gradients)
model_lstm.eval()
with torch.no_grad():
    logits = model_lstm(X_lstm_test_t)
preds = logits.argmax(dim=1).cpu().numpy()

# Headline metrics
print("Accuracy:", accuracy_score(y_test, preds))
print("F1 weighted:", f1_score(y_test, preds, average="weighted"))

# Per-class breakdown; target names follow the 0/1/2 label encoding
print("\nClassification report:")
print(
    classification_report(
        y_test,
        preds,
        target_names=["negative", "neutral", "positive"],
    )
)

print("\nConfusion matrix:")
print(confusion_matrix(y_test, preds))


Accuracy: 0.5162374020156775
F1 weighted: 0.5142908530352828

Classification report:
              precision    recall  f1-score   support

    negative       0.52      0.49      0.51       297
     neutral       0.47      0.44      0.46       298
    positive       0.55      0.61      0.58       298

    accuracy                           0.52       893
   macro avg       0.51      0.52      0.51       893
weighted avg       0.51      0.52      0.51       893


Confusion matrix:
[[146  81  70]
 [ 86 132  80]
 [ 48  67 183]]


# *Ejercicio 2*

In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split

# Load the corpus
with open("corpus_features_balanced.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One text per review: prefer "text_clean", fall back to "text_raw", else ""
texts = [
    (review.get("text_clean") or review.get("text_raw") or "").strip()
    for review in data
]

# Load the original labels from practice 2
y = np.load("y_total.npy", allow_pickle=True)

# Encode the string labels as integers (y is rebound to the int64 array)
label2id = {"negative": 0, "neutral": 1, "positive": 2}
y = np.array([label2id[label] for label in y], dtype=np.int64)

# Stratified train/test split; X_train / X_test are lists of strings here
X_train, X_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, stratify=y, random_state=42
)


## 2. Tokenizer y Dataset

In [None]:
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

# Pretrained checkpoint shared by the tokenizer and the BERT encoder
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 128   # max tokens per review; the author notes this is much better than 20

class ReviewDataset(Dataset):
    """Dataset that tokenizes review texts on access.

    Args:
        texts: indexable collection of review texts.
        targets: indexable collection of integer class labels.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors of shape (1, max_len).
        max_len: length every text is truncated / padded to.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded review at ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar long ``targets`` tensor.
        """
        text = str(self.texts[idx])
        target = self.targets[idx]

        # Use the tokenizer given to this dataset, not a module-level global,
        # and call it directly (encode_plus is the deprecated spelling).
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # return_tensors="pt" adds a leading batch axis; flatten drops it
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "targets": torch.tensor(target, dtype=torch.long),
        }

# Wrap both splits; texts are tokenized lazily inside ReviewDataset.__getitem__
train_dataset = ReviewDataset(X_train, y_train, tokenizer, MAX_LEN)
test_dataset  = ReviewDataset(X_test,  y_test,  tokenizer, MAX_LEN)

# Only the training loader shuffles; the test loader keeps dataset order
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=16)


## 3. Modelo BERT para clasificación

In [None]:
from transformers import BertModel
import torch.nn as nn

class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: number of output logits (default 3).
    """

    def __init__(self, n_classes=3):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        # The head's input width follows the encoder's configured hidden size.
        encoder_width = self.bert.config.hidden_size
        self.out = nn.Linear(encoder_width, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.drop(encoded.pooler_output)
        return self.out(regularised)


## 4. Entrenamiento

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentimentClassifier().to(device)

EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss().to(device)

# The scheduler is stepped once per batch in train_epoch, so the schedule
# spans (batches per epoch) x (epochs) steps.
total_steps = len(train_loader) * EPOCHS

# Linear learning-rate schedule with no warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


## 5. Training & Evaluation loops

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Run one training pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` returning logits.
        data_loader: yields dict batches with ``input_ids``, ``attention_mask``
            and ``targets``; must expose ``.dataset`` for the accuracy denominator.
        loss_fn: loss applied to ``(logits, targets)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor
        (correct predictions / dataset size); mean_loss is the mean of the
        per-batch losses, or NaN when the loader yields no batches.
    """
    model.train()
    losses = []
    # Tensor accumulator: keeps ``.double()`` valid even if no batch is seen.
    correct = torch.zeros((), dtype=torch.long, device=device)

    # Drop any gradients left over from before this call so they cannot be
    # added to the first batch's gradients.
    optimizer.zero_grad()

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, targets)

        _, preds = torch.max(outputs, dim=1)
        correct += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct.double() / len(data_loader.dataset), mean_loss


def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without updating it.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` returning logits.
        data_loader: yields dict batches with ``input_ids``, ``attention_mask``
            and ``targets``; must expose ``.dataset`` for the accuracy denominator.
        loss_fn: loss applied to ``(logits, targets)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor
        (correct predictions / dataset size); mean_loss is the mean of the
        per-batch losses, or NaN when the loader yields no batches.
    """
    model.eval()
    losses = []
    # Tensor accumulator: keeps ``.double()`` valid even if no batch is seen.
    correct = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, targets)

            _, preds = torch.max(outputs, dim=1)
            correct += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct.double() / len(data_loader.dataset), mean_loss


In [None]:
## 6. Ciclo de entrenamiento

In [None]:
# Best accuracy seen so far; a checkpoint is written whenever it improves
best_acc = 0

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")

    train_acc, train_loss = train_epoch(
        model, train_loader, loss_fn, optimizer, device, scheduler
    )

    # The "validation" metrics are computed on test_loader, i.e. the same
    # split that is reported as the final test result.
    val_acc, val_loss = eval_model(
        model, test_loader, loss_fn, device
    )

    print(f"Train acc: {train_acc:.4f}, loss: {train_loss:.4f}")
    print(f"Val   acc: {val_acc:.4f}, loss: {val_loss:.4f}")

    # Keep only the weights of the best-scoring epoch. If accuracy never
    # exceeds 0, no file is written.
    if val_acc > best_acc:
        torch.save(model.state_dict(), "best_bert_model.bin")
        best_acc = val_acc


Epoch 1/3
Train acc: 0.5553, loss: 0.8959
Val   acc: 0.6753, loss: 0.7016
Epoch 2/3
Train acc: 0.7496, loss: 0.5950
Val   acc: 0.6999, loss: 0.6642
Epoch 3/3
Train acc: 0.8409, loss: 0.4185
Val   acc: 0.7122, loss: 0.6773


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import torch

# Load the trained model: rebuild the architecture, then restore the
# checkpoint saved by the training loop.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model = SentimentClassifier().to(device)
model.load_state_dict(torch.load("best_bert_model.bin", map_location=device))
model.eval()

# Predictions and gold labels accumulated over all test batches
all_preds = []
all_targets = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        outputs = model(input_ids, attention_mask)
        # Predicted class = index of the largest logit
        _, preds = torch.max(outputs, dim=1)

        # Move to CPU / NumPy before collecting for scikit-learn
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Metrics
print("Accuracy:", accuracy_score(all_targets, all_preds))
print("F1 macro:", f1_score(all_targets, all_preds, average="macro"))
print("F1 weighted:", f1_score(all_targets, all_preds, average="weighted"))

print("\nClassification report:")
print(classification_report(all_targets, all_preds, target_names=["negative","neutral","positive"]))

print("\nConfusion matrix:")
print(confusion_matrix(all_targets, all_preds))


Accuracy: 0.7122060470324748
F1 macro: 0.7125321994207185
F1 weighted: 0.7124840209703995

Classification report:
              precision    recall  f1-score   support

    negative       0.77      0.74      0.76       297
     neutral       0.60      0.61      0.61       298
    positive       0.77      0.78      0.78       298

    accuracy                           0.71       893
   macro avg       0.71      0.71      0.71       893
weighted avg       0.71      0.71      0.71       893


Confusion matrix:
[[221  63  13]
 [ 59 182  57]
 [  8  57 233]]
