In [None]:
# ✅ Instalar en Colab (si es necesario)
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load data
# Training set: zipped CSV fetched from GitHub; its first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' arrives as text; parse each cell into a Python object
# (presumably a list of genre names -- TODO confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Multi-hot encode the genres: one column per distinct genre, one row per movie.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 3. Custom dataset
class MovieDataset(Dataset):
    """Tokenized texts paired with their multi-hot label rows.

    Every text is tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncated to at most 256 tokens and padded
    to a common length). Indexing afterwards is plain tensor slicing.
    """

    def __init__(self, texts, labels):
        # The tokenizer output maps field names to tensors with one row per text.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        # Labels are stored as floats.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample = the idx-th row of every encoding field plus its label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 4. Train/test split
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['plot'], y, test_size=0.33, random_state=42)
# Each MovieDataset tokenizes its texts eagerly in __init__.
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 5. Multi-label BERT model
class BERTClassifier(nn.Module):
    """Multi-label classifier: a pretrained BERT encoder plus one linear head.

    Args:
        num_labels: Number of output labels (one independent score per label).
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint that used to be hard-coded.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to the value that used to be hard-coded.
    """

    def __init__(self, num_labels, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width is read from the encoder's own config.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, return_logits=False):
        """Score every label for a batch of tokenized texts.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
            return_logits: When True, return the raw linear-head outputs so
                the caller can train with ``nn.BCEWithLogitsLoss``, which
                PyTorch documents as more numerically stable than a sigmoid
                followed by ``nn.BCELoss``. The default (False) keeps the
                original behaviour of returning sigmoid probabilities.

        Returns:
            A tensor with ``num_labels`` values per input row: probabilities
            by default, raw logits when ``return_logits`` is True.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output serves as the text representation.
        x = self.dropout(outputs.pooler_output)
        logits = self.classifier(x)
        if return_logits:
            return logits
        return torch.sigmoid(logits)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output per column of the multi-hot label matrix.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
# All model parameters (encoder and head) are optimized.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BERTClassifier.forward already applies a sigmoid, so the loss takes probabilities.
criterion = nn.BCELoss()

# 6. Training
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

# 7. Evaluation with macro-averaged AUC on the held-out split
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

# Stack the per-batch arrays into single prediction / label matrices.
y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC:", roc_auc_score(y_true, y_pred, average='macro'))

# 8. Predict on the real (unlabeled) test set and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
test_encodings = tokenizer(list(dataTesting['plot']), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is now a DataLoader; it replaces the
# validation MovieDataset that was bound to the same name earlier in this cell.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named p_<genre>; rows keep dataTesting's index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_torch.csv', index_label='ID')
print("✅ Archivo 'pred_genres_text_BERT_torch.csv' generado correctamente.")
# 0.87906 (presumably the competition score obtained by this submission -- TODO confirm)

Epoch 1 completado. Loss promedio: 0.3212
Epoch 2 completado. Loss promedio: 0.2285
Epoch 3 completado. Loss promedio: 0.1862
📈 MCAUC: 0.8778287892393899
✅ Archivo 'pred_genres_text_BERT_torch.csv' generado correctamente.


# New Section

In [None]:
# ✅ Instalar en Colab (si es necesario)
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load data
# Training set: zipped CSV fetched from GitHub; its first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' arrives as text; parse each cell into a Python object
# (presumably a list of genre names -- TODO confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Multi-hot encode the genres: one column per distinct genre, one row per movie.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 3. Custom dataset
class MovieDataset(Dataset):
    """Tokenized texts paired with their multi-hot label rows.

    Every text is tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncated to at most 256 tokens and padded
    to a common length). Indexing afterwards is plain tensor slicing.
    """

    def __init__(self, texts, labels):
        # The tokenizer output maps field names to tensors with one row per text.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        # Labels are stored as floats.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample = the idx-th row of every encoding field plus its label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 4. Train/test split
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['plot'], y, test_size=0.33, random_state=42)
# Each MovieDataset tokenizes its texts eagerly in __init__.
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 5. Multi-label BERT model
class BERTClassifier(nn.Module):
    """Multi-label classifier: a pretrained BERT encoder plus one linear head.

    Args:
        num_labels: Number of output labels (one independent score per label).
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint that used to be hard-coded.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to the value that used to be hard-coded.
    """

    def __init__(self, num_labels, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width is read from the encoder's own config.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, return_logits=False):
        """Score every label for a batch of tokenized texts.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
            return_logits: When True, return the raw linear-head outputs so
                the caller can train with ``nn.BCEWithLogitsLoss``, which
                PyTorch documents as more numerically stable than a sigmoid
                followed by ``nn.BCELoss``. The default (False) keeps the
                original behaviour of returning sigmoid probabilities.

        Returns:
            A tensor with ``num_labels`` values per input row: probabilities
            by default, raw logits when ``return_logits`` is True.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output serves as the text representation.
        x = self.dropout(outputs.pooler_output)
        logits = self.classifier(x)
        if return_logits:
            return logits
        return torch.sigmoid(logits)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output per column of the multi-hot label matrix.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
# All model parameters (encoder and head) are optimized.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BERTClassifier.forward already applies a sigmoid, so the loss takes probabilities.
criterion = nn.BCELoss()

# 6. Training (10 epochs in this experiment)
model.train()
for epoch in range(10):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

# 7. Evaluation with macro-averaged AUC on the held-out split
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

# Stack the per-batch arrays into single prediction / label matrices.
y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC:", roc_auc_score(y_true, y_pred, average='macro'))

# 8. Predict on the real (unlabeled) test set and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
test_encodings = tokenizer(list(dataTesting['plot']), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is now a DataLoader; it replaces the
# validation MovieDataset that was bound to the same name earlier in this cell.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named p_<genre>; rows keep dataTesting's index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_torch.csv', index_label='ID')
print("✅ Archivo 'pred_genres_text_BERT_torch.csv' generado correctamente.")
# 0.89671 (presumably the competition score obtained by this submission -- TODO confirm)

Epoch 1 completado. Loss promedio: 0.3349
Epoch 2 completado. Loss promedio: 0.2336
Epoch 3 completado. Loss promedio: 0.1913
Epoch 4 completado. Loss promedio: 0.1609
Epoch 5 completado. Loss promedio: 0.1360
Epoch 6 completado. Loss promedio: 0.1152
Epoch 7 completado. Loss promedio: 0.0982
Epoch 8 completado. Loss promedio: 0.0832
Epoch 9 completado. Loss promedio: 0.0719
Epoch 10 completado. Loss promedio: 0.0622
📈 MCAUC: 0.8993715939939145
✅ Archivo 'pred_genres_text_BERT_torch.csv' generado correctamente.


In [None]:
# ✅ Instalar en Colab (si es necesario)
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load data
# Training set: zipped CSV fetched from GitHub; its first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' arrives as text; parse each cell into a Python object
# (presumably a list of genre names -- TODO confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Multi-hot encode the genres: one column per distinct genre, one row per movie.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 3. Custom dataset
class MovieDataset(Dataset):
    """Tokenized texts paired with their multi-hot label rows.

    Every text is tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncated to at most 256 tokens and padded
    to a common length). Indexing afterwards is plain tensor slicing.
    """

    def __init__(self, texts, labels):
        # The tokenizer output maps field names to tensors with one row per text.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        # Labels are stored as floats.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample = the idx-th row of every encoding field plus its label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 4. Train/test split
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['plot'], y, test_size=0.33, random_state=42)
# Each MovieDataset tokenizes its texts eagerly in __init__.
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 5. Multi-label BERT model
class BERTClassifier(nn.Module):
    """Multi-label classifier: a pretrained BERT encoder plus one linear head.

    Args:
        num_labels: Number of output labels (one independent score per label).
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint that used to be hard-coded.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to the value that used to be hard-coded.
    """

    def __init__(self, num_labels, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width is read from the encoder's own config.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, return_logits=False):
        """Score every label for a batch of tokenized texts.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
            return_logits: When True, return the raw linear-head outputs so
                the caller can train with ``nn.BCEWithLogitsLoss``, which
                PyTorch documents as more numerically stable than a sigmoid
                followed by ``nn.BCELoss``. The default (False) keeps the
                original behaviour of returning sigmoid probabilities.

        Returns:
            A tensor with ``num_labels`` values per input row: probabilities
            by default, raw logits when ``return_logits`` is True.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output serves as the text representation.
        x = self.dropout(outputs.pooler_output)
        logits = self.classifier(x)
        if return_logits:
            return logits
        return torch.sigmoid(logits)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output per column of the multi-hot label matrix.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
# All model parameters (encoder and head) are optimized.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BERTClassifier.forward already applies a sigmoid, so the loss takes probabilities.
criterion = nn.BCELoss()

# 6. Training (20 epochs in this experiment)
model.train()
for epoch in range(20):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

# 7. Evaluation with macro-averaged AUC on the held-out split
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

# Stack the per-batch arrays into single prediction / label matrices.
y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC:", roc_auc_score(y_true, y_pred, average='macro'))

# 8. Predict on the real (unlabeled) test set and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
test_encodings = tokenizer(list(dataTesting['plot']), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is now a DataLoader; it replaces the
# validation MovieDataset that was bound to the same name earlier in this cell.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named p_<genre>; rows keep dataTesting's index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_torch.csv', index_label='ID')
print("✅ Archivo 'pred_genres_text_BERT_torch.csv' generado correctamente.")
# 0.89639 (presumably the competition score obtained by this submission -- TODO confirm)

Epoch 1 completado. Loss promedio: 0.3218
Epoch 2 completado. Loss promedio: 0.2250
Epoch 3 completado. Loss promedio: 0.1842
Epoch 4 completado. Loss promedio: 0.1542
Epoch 5 completado. Loss promedio: 0.1300
Epoch 6 completado. Loss promedio: 0.1102
Epoch 7 completado. Loss promedio: 0.0940
Epoch 8 completado. Loss promedio: 0.0809
Epoch 9 completado. Loss promedio: 0.0698
Epoch 10 completado. Loss promedio: 0.0603
Epoch 11 completado. Loss promedio: 0.0521
Epoch 12 completado. Loss promedio: 0.0452
Epoch 13 completado. Loss promedio: 0.0391
Epoch 14 completado. Loss promedio: 0.0347
Epoch 15 completado. Loss promedio: 0.0294
Epoch 16 completado. Loss promedio: 0.0258
Epoch 17 completado. Loss promedio: 0.0226
Epoch 18 completado. Loss promedio: 0.0198
Epoch 19 completado. Loss promedio: 0.0178
Epoch 20 completado. Loss promedio: 0.0155
📈 MCAUC: 0.8918559901795774
✅ Archivo 'pred_genres_text_BERT_torch.csv' generado correctamente.


In [None]:
# 🛠️ Instala las librerías necesarias (si aún no están)
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare data
# Training set: zipped CSV fetched from GitHub; its first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' arrives as text; parse each cell into a Python object
# (presumably a list of genre names -- TODO confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input is "<title> (<year>): <plot>", so title and year are fed in along with the plot.
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Multi-hot encode the genres: one column per distinct genre, one row per movie.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Tokenized texts paired with their multi-hot label rows.

    Every text is tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncated to at most 256 tokens and padded
    to a common length). Indexing afterwards is plain tensor slicing.
    """

    def __init__(self, texts, labels):
        # The tokenizer output maps field names to tensors with one row per text.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        # Labels are stored as floats.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample = the idx-th row of every encoding field plus its label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Train/test split
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
# Each MovieDataset tokenizes its texts eagerly in __init__.
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 4. Multi-label BERT model
class BERTClassifier(nn.Module):
    """Multi-label classifier: a pretrained BERT encoder plus one linear head.

    Args:
        num_labels: Number of output labels (one independent score per label).
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint that used to be hard-coded.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to the value that used to be hard-coded.
    """

    def __init__(self, num_labels, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width is read from the encoder's own config.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, return_logits=False):
        """Score every label for a batch of tokenized texts.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
            return_logits: When True, return the raw linear-head outputs so
                the caller can train with ``nn.BCEWithLogitsLoss``, which
                PyTorch documents as more numerically stable than a sigmoid
                followed by ``nn.BCELoss``. The default (False) keeps the
                original behaviour of returning sigmoid probabilities.

        Returns:
            A tensor with ``num_labels`` values per input row: probabilities
            by default, raw logits when ``return_logits`` is True.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output serves as the text representation.
        x = self.dropout(outputs.pooler_output)
        logits = self.classifier(x)
        if return_logits:
            return logits
        return torch.sigmoid(logits)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output per column of the multi-hot label matrix.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
# All model parameters (encoder and head) are optimized.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BERTClassifier.forward already applies a sigmoid, so the loss takes probabilities.
criterion = nn.BCELoss()

# 5. Training
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

# 6. Evaluation (macro-averaged AUC on the held-out split)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

# Stack the per-batch arrays into single prediction / label matrices.
y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC:", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Build the test input with the same "<title> (<year>): <plot>" template used for training.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is now a DataLoader; it replaces the
# validation MovieDataset that was bound to the same name earlier in this cell.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

preds = []
model.eval()
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named p_<genre>; rows keep dataTesting's index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_concat.csv', index_label='ID')
print("✅ Archivo generado: pred_genres_text_BERT_concat.csv")
# 0.89671 (presumably the competition score obtained by this submission -- TODO confirm)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 🛠️ Instala las librerías necesarias (si aún no están)
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare data
# Training set: zipped CSV fetched from GitHub; its first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' arrives as text; parse each cell into a Python object
# (presumably a list of genre names -- TODO confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input is "<title> (<year>): <plot>", so title and year are fed in along with the plot.
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Multi-hot encode the genres: one column per distinct genre, one row per movie.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Tokenized texts paired with their multi-hot label rows.

    Every text is tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncated to at most 256 tokens and padded
    to a common length). Indexing afterwards is plain tensor slicing.
    """

    def __init__(self, texts, labels):
        # The tokenizer output maps field names to tensors with one row per text.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        # Labels are stored as floats.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample = the idx-th row of every encoding field plus its label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Train/test split
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
# Each MovieDataset tokenizes its texts eagerly in __init__.
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 4. Multi-label BERT model
class BERTClassifier(nn.Module):
    """Multi-label classifier: a pretrained BERT encoder plus one linear head.

    Args:
        num_labels: Number of output labels (one independent score per label).
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint that used to be hard-coded.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to the value that used to be hard-coded.
    """

    def __init__(self, num_labels, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width is read from the encoder's own config.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, return_logits=False):
        """Score every label for a batch of tokenized texts.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
            return_logits: When True, return the raw linear-head outputs so
                the caller can train with ``nn.BCEWithLogitsLoss``, which
                PyTorch documents as more numerically stable than a sigmoid
                followed by ``nn.BCELoss``. The default (False) keeps the
                original behaviour of returning sigmoid probabilities.

        Returns:
            A tensor with ``num_labels`` values per input row: probabilities
            by default, raw logits when ``return_logits`` is True.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output serves as the text representation.
        x = self.dropout(outputs.pooler_output)
        logits = self.classifier(x)
        if return_logits:
            return logits
        return torch.sigmoid(logits)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output per column of the multi-hot label matrix.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
# All model parameters (encoder and head) are optimized.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BERTClassifier.forward already applies a sigmoid, so the loss takes probabilities.
criterion = nn.BCELoss()

# 5. Training (10 epochs in this experiment)
model.train()
for epoch in range(10):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

# 6. Evaluation (macro-averaged AUC on the held-out split)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

# Stack the per-batch arrays into single prediction / label matrices.
y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC:", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Build the test input with the same "<title> (<year>): <plot>" template used for training.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is now a DataLoader; it replaces the
# validation MovieDataset that was bound to the same name earlier in this cell.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

preds = []
model.eval()
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named p_<genre>; rows keep dataTesting's index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_concat.csv', index_label='ID')
print("✅ Archivo generado: pred_genres_text_BERT_concat.csv")
# 0.90159 (presumably the competition score obtained by this submission -- TODO confirm)

Epoch 1 completado. Loss promedio: 0.3219
Epoch 2 completado. Loss promedio: 0.2221
Epoch 3 completado. Loss promedio: 0.1800
Epoch 4 completado. Loss promedio: 0.1488
Epoch 5 completado. Loss promedio: 0.1245
Epoch 6 completado. Loss promedio: 0.1050
Epoch 7 completado. Loss promedio: 0.0897
Epoch 8 completado. Loss promedio: 0.0758
Epoch 9 completado. Loss promedio: 0.0650
Epoch 10 completado. Loss promedio: 0.0550
📈 MCAUC: 0.8997952115373078
✅ Archivo generado: pred_genres_text_BERT_concat.csv


In [None]:
# 🛠️ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare data
# Training set: zipped CSV fetched from GitHub; its first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' arrives as text; parse each cell into a Python object
# (presumably a list of genre names -- TODO confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input is "<title> (<year>): <plot>", so title and year are fed in along with the plot.
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Multi-hot encode the genres: one column per distinct genre, one row per movie.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Tokenized texts paired with their multi-hot label rows.

    Every text is tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncated to at most 256 tokens and padded
    to a common length). Indexing afterwards is plain tensor slicing.
    """

    def __init__(self, texts, labels):
        # The tokenizer output maps field names to tensors with one row per text.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        # Labels are stored as floats.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample = the idx-th row of every encoding field plus its label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Use the whole dataset for both training and validation (no held-out split in this cell)
X_all, y_all = df['input_text'], y
# MovieDataset tokenizes all texts eagerly in __init__.
full_dataset = MovieDataset(X_all, y_all)
# This shuffled loader is reused later in the cell for the reference evaluation.
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# 4. Multi-label BERT model
class BERTClassifier(nn.Module):
    """Multi-label classifier: a pretrained BERT encoder plus one linear head.

    Args:
        num_labels: Number of output labels (one independent score per label).
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint that used to be hard-coded.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to the value that used to be hard-coded.
    """

    def __init__(self, num_labels, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width is read from the encoder's own config.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, return_logits=False):
        """Score every label for a batch of tokenized texts.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
            return_logits: When True, return the raw linear-head outputs so
                the caller can train with ``nn.BCEWithLogitsLoss``, which
                PyTorch documents as more numerically stable than a sigmoid
                followed by ``nn.BCELoss``. The default (False) keeps the
                original behaviour of returning sigmoid probabilities.

        Returns:
            A tensor with ``num_labels`` values per input row: probabilities
            by default, raw logits when ``return_logits`` is True.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output serves as the text representation.
        x = self.dropout(outputs.pooler_output)
        logits = self.classifier(x)
        if return_logits:
            return logits
        return torch.sigmoid(logits)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output per column of the multi-hot label matrix.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
# All model parameters (encoder and head) are optimized.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BERTClassifier.forward already applies a sigmoid, so the loss takes probabilities.
criterion = nn.BCELoss()

# 5. Training on the whole dataset (15 epochs)
model.train()
for epoch in range(15):
    total_loss = 0
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the same data used for training (internal reference only;
#    this is not a held-out estimate).
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    # full_loader shuffles, but predictions and labels are collected from the
    # same batch, so they stay aligned row by row.
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

# Stack the per-batch arrays into single prediction / label matrices.
y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Build the test input with the same "<title> (<year>): <plot>" template used for training.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is a DataLoader over the unlabeled test encodings.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named p_<genre>; rows keep dataTesting's index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_BERT_full.csv")
# 0.91335 (presumably the Kaggle score obtained by this submission -- TODO confirm)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3018
Epoch 2 completado. Loss promedio: 0.2033
Epoch 3 completado. Loss promedio: 0.1634
Epoch 4 completado. Loss promedio: 0.1336
Epoch 5 completado. Loss promedio: 0.1113
Epoch 6 completado. Loss promedio: 0.0933
Epoch 7 completado. Loss promedio: 0.0774
Epoch 8 completado. Loss promedio: 0.0654
Epoch 9 completado. Loss promedio: 0.0550
Epoch 10 completado. Loss promedio: 0.0462
Epoch 11 completado. Loss promedio: 0.0390
Epoch 12 completado. Loss promedio: 0.0329
Epoch 13 completado. Loss promedio: 0.0284
Epoch 14 completado. Loss promedio: 0.0239
Epoch 15 completado. Loss promedio: 0.0203
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9996660574926551
✅ Archivo generado para Kaggle: pred_genres_text_BERT_full.csv


In [None]:
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load the training data.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# The genres column is parsed with ast.literal_eval so each cell becomes a
# Python object that MultiLabelBinarizer can consume.
df['genres'] = df['genres'].map(ast.literal_eval)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 2. Checkpoints to compare, as (Hugging Face model id, display name) pairs.
modelos = [
    ("bert-base-uncased", "BERT"),
    ("roberta-base", "RoBERTa"),
    ("distilbert-base-uncased", "DistilBERT"),
    ("albert-base-v2", "ALBERT"),
    ("google/electra-base-discriminator", "ELECTRA"),
    ("microsoft/MiniLM-L12-H384-uncased", "MiniLM"),
]

# 3. Train and evaluate every candidate checkpoint with the same protocol.

# 4. Train/test partition. It depends only on the data and the fixed seed,
# so it is computed once here instead of being recomputed for every model.
X_train, X_test, y_train, y_test = train_test_split(df['plot'], y, test_size=0.33, random_state=42)

for model_name, display_name in modelos:
    print(f"\n🔄 Entrenando modelo: {display_name} ({model_name})")

    # Drop the references to the previous iteration's model and optimizer
    # before building the next one; otherwise the old model (plus its AdamW
    # state) is still alive while the new one is created and moved to the
    # device. The last trained model stays bound to `model` after the loop.
    model = optimizer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Defined inside the loop on purpose: the class reads the loop's current
    # `tokenizer` when it is instantiated below.
    class MovieDataset(Dataset):
        """Texts tokenized once in the constructor, paired with label rows."""

        def __init__(self, texts, labels):
            # Truncate to 256 tokens and pad so the result is one tensor per key.
            self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=256, return_tensors='pt')
            self.labels = torch.tensor(labels).float()

        def __getitem__(self, idx):
            # One row of every tokenizer output plus the matching label row.
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]
            return item

        def __len__(self):
            return len(self.labels)

    # Tokenization is tokenizer-specific, so the datasets are rebuilt per model.
    train_dataset = MovieDataset(X_train, y_train)
    test_dataset = MovieDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # 5. Same head for every checkpoint: dropout + linear layer + sigmoid.
    class Classifier(nn.Module):
        """Encoder loaded from `model_name` with a multi-label sigmoid head."""

        def __init__(self, num_labels):
            super().__init__()
            self.encoder = AutoModel.from_pretrained(model_name)
            hidden_size = self.encoder.config.hidden_size
            self.dropout = nn.Dropout(0.3)
            self.classifier = nn.Linear(hidden_size, num_labels)

        def forward(self, input_ids, attention_mask):
            outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            # Prefer the encoder's pooled output; fall back to the first token
            # when the field is absent OR present but None (hasattr alone
            # would pass None on to the dropout layer).
            pooled = getattr(outputs, 'pooler_output', None)
            if pooled is None:
                pooled = outputs.last_hidden_state[:, 0, :]
            x = self.dropout(pooled)
            # Probabilities, because the loss below is nn.BCELoss.
            return torch.sigmoid(self.classifier(x))

    model = Classifier(y.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCELoss()

    # 6. Training (3 epochs)
    model.train()
    for epoch in range(3):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # 7. Evaluation: macro-averaged ROC AUC on the held-out split.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    print(f"📈 MCAUC ({display_name}): {roc_auc_score(y_true, y_pred, average='macro'):.5f}")



🔄 Entrenando modelo: BERT (bert-base-uncased)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3325
Epoch 2 completado. Loss promedio: 0.2338
Epoch 3 completado. Loss promedio: 0.1922
📈 MCAUC (BERT): 0.87729

🔄 Entrenando modelo: RoBERTa (roberta-base)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completado. Loss promedio: 0.3003
Epoch 2 completado. Loss promedio: 0.2172
Epoch 3 completado. Loss promedio: 0.1842
📈 MCAUC (RoBERTa): 0.88614

🔄 Entrenando modelo: DistilBERT (distilbert-base-uncased)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2829
Epoch 2 completado. Loss promedio: 0.2066
Epoch 3 completado. Loss promedio: 0.1734
📈 MCAUC (DistilBERT): 0.89603

🔄 Entrenando modelo: ALBERT (albert-base-v2)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3195
Epoch 2 completado. Loss promedio: 0.2892
Epoch 3 completado. Loss promedio: 0.2729
📈 MCAUC (ALBERT): 0.65739

🔄 Entrenando modelo: ELECTRA (google/electra-base-discriminator)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3119
Epoch 2 completado. Loss promedio: 0.2294
Epoch 3 completado. Loss promedio: 0.1944
📈 MCAUC (ELECTRA): 0.87506

🔄 Entrenando modelo: MiniLM (microsoft/MiniLM-L12-H384-uncased)


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.4016
Epoch 2 completado. Loss promedio: 0.2930
Epoch 3 completado. Loss promedio: 0.2740
📈 MCAUC (MiniLM): 0.66168


In [None]:
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load the training data.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each genres cell is parsed with ast.literal_eval before binarization.
df['genres'] = df['genres'].map(ast.literal_eval)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 2. Checkpoints to compare, as (Hugging Face model id, display name) pairs.
modelos = [
    ("bert-base-uncased", "BERT"),
    ("roberta-base", "RoBERTa"),
    ("distilbert-base-uncased", "DistilBERT"),
    ("albert-base-v2", "ALBERT"),
    ("google/electra-base-discriminator", "ELECTRA"),
    ("microsoft/MiniLM-L12-H384-uncased", "MiniLM"),
]

# 3. Train and evaluate every candidate checkpoint with the same protocol.

# 4. Train/test partition. The split is fully determined by the data and the
# fixed seed, so it is done once and shared by all models.
X_train, X_test, y_train, y_test = train_test_split(df['plot'], y, test_size=0.33, random_state=42)

for model_name, display_name in modelos:
    print(f"\n🔄 Entrenando modelo: {display_name} ({model_name})")

    # Release the previous iteration's model and optimizer before the next
    # checkpoint is loaded, so two models are never resident at once. The
    # last trained model stays bound to `model` after the loop.
    model = optimizer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Defined inside the loop on purpose: the class reads the loop's current
    # `tokenizer` when it is instantiated below.
    class MovieDataset(Dataset):
        """Texts tokenized once in the constructor, paired with label rows."""

        def __init__(self, texts, labels):
            # Truncate to 384 tokens and pad so the result is one tensor per key.
            self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=384, return_tensors='pt')
            self.labels = torch.tensor(labels).float()

        def __getitem__(self, idx):
            # One row of every tokenizer output plus the matching label row.
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]
            return item

        def __len__(self):
            return len(self.labels)

    # Tokenization is tokenizer-specific, so the datasets are rebuilt per model.
    train_dataset = MovieDataset(X_train, y_train)
    test_dataset = MovieDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # 5. Same head for every checkpoint: dropout + linear layer + sigmoid.
    class Classifier(nn.Module):
        """Encoder loaded from `model_name` with a multi-label sigmoid head."""

        def __init__(self, num_labels):
            super().__init__()
            self.encoder = AutoModel.from_pretrained(model_name)
            hidden_size = self.encoder.config.hidden_size
            self.dropout = nn.Dropout(0.3)
            self.classifier = nn.Linear(hidden_size, num_labels)

        def forward(self, input_ids, attention_mask):
            outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            # Use the pooled output when the encoder actually produced one;
            # a missing attribute and a None value both fall back to the
            # first token's hidden state.
            pooled = getattr(outputs, 'pooler_output', None)
            if pooled is None:
                pooled = outputs.last_hidden_state[:, 0, :]
            x = self.dropout(pooled)
            # Probabilities, because the loss below is nn.BCELoss.
            return torch.sigmoid(self.classifier(x))

    model = Classifier(y.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCELoss()

    # 6. Training (3 epochs)
    model.train()
    for epoch in range(3):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # 7. Evaluation: macro-averaged ROC AUC on the held-out split.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    print(f"📈 MCAUC ({display_name}): {roc_auc_score(y_true, y_pred, average='macro'):.5f}")



🔄 Entrenando modelo: BERT (bert-base-uncased)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3243
Epoch 2 completado. Loss promedio: 0.2275
Epoch 3 completado. Loss promedio: 0.1861
📈 MCAUC (BERT): 0.88187

🔄 Entrenando modelo: RoBERTa (roberta-base)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completado. Loss promedio: 0.2962
Epoch 2 completado. Loss promedio: 0.2104
Epoch 3 completado. Loss promedio: 0.1772
📈 MCAUC (RoBERTa): 0.88725

🔄 Entrenando modelo: DistilBERT (distilbert-base-uncased)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2903
Epoch 2 completado. Loss promedio: 0.2078
Epoch 3 completado. Loss promedio: 0.1739
📈 MCAUC (DistilBERT): 0.89506

🔄 Entrenando modelo: ALBERT (albert-base-v2)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3014
Epoch 2 completado. Loss promedio: 0.2268
Epoch 3 completado. Loss promedio: 0.1930
📈 MCAUC (ALBERT): 0.87236

🔄 Entrenando modelo: ELECTRA (google/electra-base-discriminator)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3110
Epoch 2 completado. Loss promedio: 0.2256
Epoch 3 completado. Loss promedio: 0.1906
📈 MCAUC (ELECTRA): 0.88203

🔄 Entrenando modelo: MiniLM (microsoft/MiniLM-L12-H384-uncased)


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.4008
Epoch 2 completado. Loss promedio: 0.2924
Epoch 3 completado. Loss promedio: 0.2730
📈 MCAUC (MiniLM): 0.66701


In [None]:
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import DistilBertTokenizer, DistilBertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load the training data.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each genres cell is parsed with ast.literal_eval before binarization.
df['genres'] = df['genres'].map(ast.literal_eval)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer matching the DistilBERT checkpoint used by the model below.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

# 3. Custom dataset
class MovieDataset(Dataset):
    """Texts tokenized once in the constructor, paired with label rows.

    Indexing returns a dict holding one row of every tokenizer output plus
    the matching row of labels under the key 'labels'.
    """

    def __init__(self, texts, labels):
        # Uses the module-level `tokenizer`; sequences are cut at 384 tokens
        # and padded so every key is a single tensor.
        plot_list = list(texts)
        self.encodings = tokenizer(
            plot_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        # One sample per label row.
        return len(self.labels)

# 4. Train/test split: 33% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['plot'],
    y,
    test_size=0.33,
    random_state=42,
)
train_dataset = MovieDataset(texts=X_train, labels=y_train)
test_dataset = MovieDataset(texts=X_test, labels=y_test)

# Only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

# 5. Multi-label DistilBERT model
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by dropout, a linear layer and a sigmoid.

    forward() returns one probability per label for every input sequence.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token, i.e. the [CLS] position.
        first_token = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# Device, model, optimizer and loss. The model outputs probabilities, so the
# loss is plain binary cross-entropy.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = DistilBERTClassifier(num_labels=y.shape[1])
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()

# 6. Training
model.train()
for epoch in range(15):
    total_loss = 0
    for batch in train_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        loss = criterion(model(ids, mask), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

# 7. Evaluation: macro-averaged ROC AUC on the held-out split.
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        probs = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        all_preds.append(probs.cpu().numpy())
        all_labels.append(batch['labels'].cpu().numpy())

# Stack the per-batch arrays row-wise into one matrix each.
y_pred = np.concatenate(all_preds, axis=0)
y_true = np.concatenate(all_labels, axis=0)
print("📈 MCAUC:", roc_auc_score(y_true, y_pred, average='macro'))

# 8. Predict on the unlabeled test file and export the probabilities.
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Tokenize with max_length=384, the same limit used when the training data
# was tokenized. The previous value (256) truncated inference inputs more
# aggressively than the inputs the model was trained on.
test_encodings = tokenizer(list(dataTesting['plot']), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE: despite its name this is a DataLoader, and it rebinds the name that
# previously held the validation dataset. The name is kept as-is so that
# anything reading `test_dataset` afterwards sees the same kind of object.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# One row per test plot, one 'p_<genre>' column per class of the binarizer.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_DistilBERT.csv', index_label='ID')
print("✅ Archivo 'pred_genres_text_DistilBERT.csv' generado correctamente.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2863
Epoch 2 completado. Loss promedio: 0.2048
Epoch 3 completado. Loss promedio: 0.1707
Epoch 4 completado. Loss promedio: 0.1450
Epoch 5 completado. Loss promedio: 0.1220
Epoch 6 completado. Loss promedio: 0.1030
Epoch 7 completado. Loss promedio: 0.0865
Epoch 8 completado. Loss promedio: 0.0730
Epoch 9 completado. Loss promedio: 0.0617
Epoch 10 completado. Loss promedio: 0.0517
Epoch 11 completado. Loss promedio: 0.0443
Epoch 12 completado. Loss promedio: 0.0374
Epoch 13 completado. Loss promedio: 0.0314
Epoch 14 completado. Loss promedio: 0.0270
Epoch 15 completado. Loss promedio: 0.0227
📈 MCAUC: 0.8937670420967231
✅ Archivo 'pred_genres_text_DistilBERT.csv' generado correctamente.


In [None]:
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import RobertaTokenizer, RobertaModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
df['genres'] = df['genres'].map(ast.literal_eval)
# Model input is "<title> (<year>): <plot>".
df["input_text"] = (
    df["title"]
    + " ("
    + df["year"].astype(str)
    + "): "
    + df["plot"]
)

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base'
)

class MovieDataset(Dataset):
    """Texts tokenized once in the constructor, paired with label rows.

    Indexing returns a dict holding one row of every tokenizer output plus
    the matching row of labels under the key 'labels'.
    """

    def __init__(self, texts, labels):
        # Uses the module-level `tokenizer`; sequences are cut at 384 tokens
        # and padded so every key is a single tensor.
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        # One sample per label row.
        return len(self.labels)

# 3. Use the whole labeled dataset for both training and the reference
# evaluation (no hold-out split in this cell).
X_all = df['input_text']
y_all = y
full_dataset = MovieDataset(texts=X_all, labels=y_all)
full_loader = DataLoader(dataset=full_dataset, batch_size=16, shuffle=True)

# 4. Multi-label RoBERTa model (same structure as the other classifiers)
class BERTClassifier(nn.Module):
    """RoBERTa encoder followed by dropout, a linear layer and a sigmoid.

    forward() returns one probability per label for every input sequence.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at the first token position.
        first_token = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# Device, model, optimizer and loss. The model outputs probabilities, so the
# loss is plain binary cross-entropy.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = BERTClassifier(num_labels=y.shape[1])
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()

# 5. Training on the whole dataset
model.train()
for epoch in range(20):
    total_loss = 0
    for batch in full_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        loss = criterion(model(ids, mask), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the very data the model was trained on (reference only;
# this is not a generalization estimate).
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in full_loader:
        probs = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        # Predictions and labels come from the same batch, so they stay
        # aligned even though the loader shuffles.
        all_preds.append(probs.cpu().numpy())
        all_labels.append(batch['labels'].cpu().numpy())

y_pred = np.concatenate(all_preds, axis=0)
y_true = np.concatenate(all_labels, axis=0)
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export.
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Build the input text exactly as it was built for the training rows.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
# Tokenize with max_length=384, the same limit used when the training data
# was tokenized. The previous value (256) truncated inference inputs more
# aggressively than the inputs the model was trained on.
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE: despite its name this is a DataLoader over (input_ids, attention_mask).
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# One row per test movie, one 'p_<genre>' column per class of the binarizer.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_RoBERTa_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_RoBERTa_full.csv")
# 0.90437  (author's recorded score, presumably from Kaggle and obtained with
# the earlier max_length=256 inference setting; TODO confirm and re-measure)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completado. Loss promedio: 0.2564
Epoch 2 completado. Loss promedio: 0.1834
Epoch 3 completado. Loss promedio: 0.1540
Epoch 4 completado. Loss promedio: 0.1309
Epoch 5 completado. Loss promedio: 0.1124
Epoch 6 completado. Loss promedio: 0.0961
Epoch 7 completado. Loss promedio: 0.0829
Epoch 8 completado. Loss promedio: 0.0704
Epoch 9 completado. Loss promedio: 0.0585
Epoch 10 completado. Loss promedio: 0.0491
Epoch 11 completado. Loss promedio: 0.0415
Epoch 12 completado. Loss promedio: 0.0342
Epoch 13 completado. Loss promedio: 0.0293
Epoch 14 completado. Loss promedio: 0.0250
Epoch 15 completado. Loss promedio: 0.0217
Epoch 16 completado. Loss promedio: 0.0196
Epoch 17 completado. Loss promedio: 0.0165
Epoch 18 completado. Loss promedio: 0.0148
Epoch 19 completado. Loss promedio: 0.0140
Epoch 20 completado. Loss promedio: 0.0128
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9999551270132044
✅ Archivo generado para Kaggle: pred_genres_text_RoBERTa_full.csv


In [None]:
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import RobertaTokenizer, RobertaModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
df['genres'] = df['genres'].map(ast.literal_eval)
# Model input is "<title> (<year>): <plot>".
df["input_text"] = (
    df["title"]
    + " ("
    + df["year"].astype(str)
    + "): "
    + df["plot"]
)

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base'
)

class MovieDataset(Dataset):
    """Texts tokenized once in the constructor, paired with label rows.

    Indexing returns a dict holding one row of every tokenizer output plus
    the matching row of labels under the key 'labels'.
    """

    def __init__(self, texts, labels):
        # Uses the module-level `tokenizer`; sequences are cut at 384 tokens
        # and padded so every key is a single tensor.
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        # One sample per label row.
        return len(self.labels)

# 3. Use the whole labeled dataset for both training and the reference
# evaluation (no hold-out split in this cell).
X_all = df['input_text']
y_all = y
full_dataset = MovieDataset(texts=X_all, labels=y_all)
full_loader = DataLoader(dataset=full_dataset, batch_size=16, shuffle=True)

# 4. Multi-label RoBERTa model (same structure as the other classifiers)
class BERTClassifier(nn.Module):
    """RoBERTa encoder followed by dropout, a linear layer and a sigmoid.

    forward() returns one probability per label for every input sequence.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at the first token position.
        first_token = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output unit per column of the binarized label matrix y.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss takes probabilities; BERTClassifier.forward already applies the sigmoid.
criterion = nn.BCELoss()

# 5. Training on the full dataset
model.train()
for epoch in range(15):
    total_loss = 0  # running sum of the per-batch losses for this epoch
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimization step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the very same data used for training (reference only, not a held-out score)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels come from the same batch as the predictions, so rows stay
        # aligned even though full_loader shuffles.
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
# Macro-averaged ROC AUC over the label columns.
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" template that was used for the training text.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
# max_length must match the 384 used by MovieDataset when the model was trained.
# The previous value (256) truncated test texts more aggressively than training
# texts, so the model was scored on inputs shorter than the ones it learned from.
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE: despite its name, test_dataset is a DataLoader yielding (input_ids, attention_mask) batches.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# The loader is not shuffled, so stacked rows line up with dataTesting.index.
y_pred_test_final = np.vstack(preds)
# One probability column per genre, named 'p_<genre>', in mlb.classes_ order.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_RoBERTa_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_RoBERTa_full.csv")
# 0.91238  (value noted by the author for the run that used max_length=256 here;
#           presumably the Kaggle score — confirm and re-measure after this change)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completado. Loss promedio: 0.2543
Epoch 2 completado. Loss promedio: 0.1818
Epoch 3 completado. Loss promedio: 0.1514
Epoch 4 completado. Loss promedio: 0.1301
Epoch 5 completado. Loss promedio: 0.1104
Epoch 6 completado. Loss promedio: 0.0943
Epoch 7 completado. Loss promedio: 0.0810
Epoch 8 completado. Loss promedio: 0.0689
Epoch 9 completado. Loss promedio: 0.0575
Epoch 10 completado. Loss promedio: 0.0478
Epoch 11 completado. Loss promedio: 0.0407
Epoch 12 completado. Loss promedio: 0.0332
Epoch 13 completado. Loss promedio: 0.0275
Epoch 14 completado. Loss promedio: 0.0242
Epoch 15 completado. Loss promedio: 0.0208
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9999594742812302
✅ Archivo generado para Kaggle: pred_genres_text_RoBERTa_full.csv


In [None]:
# 🛠️ Instalar librerías necesarias
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
# Read the zipped training CSV straight from GitHub; the first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Each 'genres' cell is a string holding a Python literal; parse it into a real object
# (presumably a list of genre names — confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
df["input_text"] = df["title"] + ": " + df["plot"]  # year removed: input is "<title>: <plot>"

# Binarize the genre collections into a multi-label indicator matrix;
# the column order is given by mlb.classes_.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Pre-tokenized texts paired with multi-label float targets.

    All texts are tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncation and padding enabled,
    ``max_length=256``, PyTorch tensors returned).
    """

    def __init__(self, texts, labels):
        tokenizer_options = dict(truncation=True, padding=True, max_length=256, return_tensors='pt')
        self.encodings = tokenizer(list(texts), **tokenizer_options)
        # Float targets, as required by the BCE loss used for training.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per tokenizer output field, plus the label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Use the entire dataset for both training and the reference evaluation (no hold-out split)
X_all, y_all = df['input_text'], y
# Every text is tokenized here, inside MovieDataset.__init__.
full_dataset = MovieDataset(X_all, y_all)
# Batches of 16 samples, drawn in shuffled order.
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# 4. Multi-label BERT model
class BERTClassifier(nn.Module):
    """BERT encoder whose pooled output feeds dropout and a linear multi-label head.

    ``forward`` returns per-label probabilities (sigmoid already applied).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Regularize the encoder's pooled sentence vector before the head.
        pooled = self.dropout(encoded.pooler_output)
        logits = self.classifier(pooled)
        return logits.sigmoid()

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output unit per column of the binarized label matrix y.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss takes probabilities; BERTClassifier.forward already applies the sigmoid.
criterion = nn.BCELoss()

# 5. Training on the full dataset
model.train()
for epoch in range(15):
    total_loss = 0  # running sum of the per-batch losses for this epoch
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimization step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the very same data used for training (internal reference only)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels come from the same batch as the predictions, so rows stay
        # aligned even though full_loader shuffles.
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
# Macro-averaged ROC AUC over the label columns.
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
dataTesting["input_text"] = dataTesting["title"] + ": " + dataTesting["plot"]  # year removed, same template as training
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, test_dataset is a DataLoader yielding (input_ids, attention_mask) batches.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named 'p_<genre>', in mlb.classes_ order.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_no_year.csv', index_label='ID')
print("✅ Archivo generado sin año: pred_genres_text_BERT_no_year.csv")
# 0.90346  (value noted by the author; presumably the Kaggle score of this submission — confirm)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2923
Epoch 2 completado. Loss promedio: 0.2025
Epoch 3 completado. Loss promedio: 0.1634
Epoch 4 completado. Loss promedio: 0.1336
Epoch 5 completado. Loss promedio: 0.1103
Epoch 6 completado. Loss promedio: 0.0919
Epoch 7 completado. Loss promedio: 0.0763
Epoch 8 completado. Loss promedio: 0.0644
Epoch 9 completado. Loss promedio: 0.0536
Epoch 10 completado. Loss promedio: 0.0453
Epoch 11 completado. Loss promedio: 0.0381
Epoch 12 completado. Loss promedio: 0.0321
Epoch 13 completado. Loss promedio: 0.0271
Epoch 14 completado. Loss promedio: 0.0231
Epoch 15 completado. Loss promedio: 0.0194
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9997665672670131
✅ Archivo generado sin año: pred_genres_text_BERT_no_year.csv


In [None]:
# 🛠️ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
# Read the zipped training CSV straight from GitHub; the first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Each 'genres' cell is a string holding a Python literal; parse it into a real object
# (presumably a list of genre names — confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text has the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Binarize the genre collections into a multi-label indicator matrix;
# the column order is given by mlb.classes_.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Pre-tokenized texts paired with multi-label float targets.

    All texts are tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncation and padding enabled,
    ``max_length=256``, PyTorch tensors returned).
    """

    def __init__(self, texts, labels):
        tokenizer_options = dict(truncation=True, padding=True, max_length=256, return_tensors='pt')
        self.encodings = tokenizer(list(texts), **tokenizer_options)
        # Float targets, as required by the BCE loss used for training.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per tokenizer output field, plus the label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Use the entire dataset for both training and the reference evaluation (no hold-out split)
X_all, y_all = df['input_text'], y
# Every text is tokenized here, inside MovieDataset.__init__.
full_dataset = MovieDataset(X_all, y_all)
# Batches of 16 samples, drawn in shuffled order.
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# 4. Multi-label BERT model
class BERTClassifier(nn.Module):
    """BERT encoder whose pooled output feeds dropout and a linear multi-label head.

    ``forward`` returns per-label probabilities (sigmoid already applied).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Regularize the encoder's pooled sentence vector before the head.
        pooled = self.dropout(encoded.pooler_output)
        logits = self.classifier(pooled)
        return logits.sigmoid()

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output unit per column of the binarized label matrix y.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss takes probabilities; BERTClassifier.forward already applies the sigmoid.
criterion = nn.BCELoss()

# 5. Training on the full dataset (12 epochs in this variant)
model.train()
for epoch in range(12):
    total_loss = 0  # running sum of the per-batch losses for this epoch
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimization step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the very same data used for training (internal reference only)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels come from the same batch as the predictions, so rows stay
        # aligned even though full_loader shuffles.
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
# Macro-averaged ROC AUC over the label columns.
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" template that was used for the training text.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, test_dataset is a DataLoader yielding (input_ids, attention_mask) batches.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named 'p_<genre>', in mlb.classes_ order.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_BERT_full.csv")
# 0.91257  (value noted by the author; presumably the Kaggle score of this submission — confirm)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m113.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 🛠️ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
# Read the zipped training CSV straight from GitHub; the first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Each 'genres' cell is a string holding a Python literal; parse it into a real object
# (presumably a list of genre names — confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text has the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Binarize the genre collections into a multi-label indicator matrix;
# the column order is given by mlb.classes_.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Pre-tokenized texts paired with multi-label float targets.

    All texts are tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncation and padding enabled,
    ``max_length=256``, PyTorch tensors returned).
    """

    def __init__(self, texts, labels):
        tokenizer_options = dict(truncation=True, padding=True, max_length=256, return_tensors='pt')
        self.encodings = tokenizer(list(texts), **tokenizer_options)
        # Float targets, as required by the BCE loss used for training.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per tokenizer output field, plus the label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Use the entire dataset for both training and the reference evaluation (no hold-out split)
X_all, y_all = df['input_text'], y
# Every text is tokenized here, inside MovieDataset.__init__.
full_dataset = MovieDataset(X_all, y_all)
# Batches of 16 samples, drawn in shuffled order.
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# 4. Multi-label BERT model
class BERTClassifier(nn.Module):
    """BERT encoder whose pooled output feeds dropout and a linear multi-label head.

    ``forward`` returns per-label probabilities (sigmoid already applied).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Regularize the encoder's pooled sentence vector before the head.
        pooled = self.dropout(encoded.pooler_output)
        logits = self.classifier(pooled)
        return logits.sigmoid()

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output unit per column of the binarized label matrix y.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss takes probabilities; BERTClassifier.forward already applies the sigmoid.
criterion = nn.BCELoss()

# 5. Training on the full dataset (18 epochs in this variant)
model.train()
for epoch in range(18):
    total_loss = 0  # running sum of the per-batch losses for this epoch
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimization step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the very same data used for training (internal reference only)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels come from the same batch as the predictions, so rows stay
        # aligned even though full_loader shuffles.
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
# Macro-averaged ROC AUC over the label columns.
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" template that was used for the training text.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# NOTE: despite its name, test_dataset is a DataLoader yielding (input_ids, attention_mask) batches.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named 'p_<genre>', in mlb.classes_ order.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_BERT_full.csv")
# 0.91304  (value noted by the author; presumably the Kaggle score of this submission — confirm)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2986
Epoch 2 completado. Loss promedio: 0.2044
Epoch 3 completado. Loss promedio: 0.1639
Epoch 4 completado. Loss promedio: 0.1349
Epoch 5 completado. Loss promedio: 0.1113
Epoch 6 completado. Loss promedio: 0.0927
Epoch 7 completado. Loss promedio: 0.0779
Epoch 8 completado. Loss promedio: 0.0643
Epoch 9 completado. Loss promedio: 0.0537
Epoch 10 completado. Loss promedio: 0.0457
Epoch 11 completado. Loss promedio: 0.0379
Epoch 12 completado. Loss promedio: 0.0317
Epoch 13 completado. Loss promedio: 0.0266
Epoch 14 completado. Loss promedio: 0.0227
Epoch 15 completado. Loss promedio: 0.0198
Epoch 16 completado. Loss promedio: 0.0169
Epoch 17 completado. Loss promedio: 0.0150
Epoch 18 completado. Loss promedio: 0.0137
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9996191970935708
✅ Archivo generado para Kaggle: pred_genres_text_BERT_full.csv


In [None]:
# 🛠️ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
# Read the zipped training CSV straight from GitHub; the first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Each 'genres' cell is a string holding a Python literal; parse it into a real object
# (presumably a list of genre names — confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)

# Year placed first: input text is "<year> <title>: <plot>".
df["input_text"] = df["year"].astype(str) + " " + df["title"] + ": " + df["plot"]

# Binarize the genre collections into a multi-label indicator matrix;
# the column order is given by mlb.classes_.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Pre-tokenized texts paired with multi-label float targets.

    All texts are tokenized once, at construction time, with the
    module-level ``tokenizer`` (truncation and padding enabled, PyTorch
    tensors returned). This variant uses the increased ``max_length=384``.
    """

    def __init__(self, texts, labels):
        # max_length raised to 384 for this experiment.
        tokenizer_options = dict(truncation=True, padding=True, max_length=384, return_tensors='pt')
        self.encodings = tokenizer(list(texts), **tokenizer_options)
        # Float targets, as required by the BCE loss used for training.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per tokenizer output field, plus the label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Use the entire dataset for both training and the reference evaluation (no hold-out split)
X_all, y_all = df['input_text'], y
# Every text is tokenized here, inside MovieDataset.__init__.
full_dataset = MovieDataset(X_all, y_all)
# Batches of 16 samples, drawn in shuffled order.
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# 4. Multi-label BERT model
class BERTClassifier(nn.Module):
    """BERT encoder whose pooled output feeds dropout and a linear multi-label head.

    ``forward`` returns per-label probabilities (sigmoid already applied).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Regularize the encoder's pooled sentence vector before the head.
        pooled = self.dropout(encoded.pooler_output)
        logits = self.classifier(pooled)
        return logits.sigmoid()

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output unit per column of the binarized label matrix y.
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss takes probabilities; BERTClassifier.forward already applies the sigmoid.
criterion = nn.BCELoss()

# 5. Training on the full dataset
model.train()
for epoch in range(15):
    total_loss = 0  # running sum of the per-batch losses for this epoch
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimization step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the very same data used for training (internal reference only)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels come from the same batch as the predictions, so rows stay
        # aligned even though full_loader shuffles.
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
# Macro-averaged ROC AUC over the label columns.
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<year> <title>: <plot>" template that was used for the training text.
dataTesting["input_text"] = dataTesting["year"].astype(str) + " " + dataTesting["title"] + ": " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')  # same max_length as MovieDataset
# NOTE: despite its name, test_dataset is a DataLoader yielding (input_ids, attention_mask) batches.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One probability column per genre, named 'p_<genre>', in mlb.classes_ order.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_BERT_year_first.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_BERT_year_first.csv")
# 0.91258  (value noted by the author; presumably the Kaggle score of this submission — confirm)


Epoch 1 completado. Loss promedio: 0.2936
Epoch 2 completado. Loss promedio: 0.1966
Epoch 3 completado. Loss promedio: 0.1569
Epoch 4 completado. Loss promedio: 0.1291
Epoch 5 completado. Loss promedio: 0.1064
Epoch 6 completado. Loss promedio: 0.0884
Epoch 7 completado. Loss promedio: 0.0737
Epoch 8 completado. Loss promedio: 0.0611
Epoch 9 completado. Loss promedio: 0.0512
Epoch 10 completado. Loss promedio: 0.0426
Epoch 11 completado. Loss promedio: 0.0360
Epoch 12 completado. Loss promedio: 0.0301
Epoch 13 completado. Loss promedio: 0.0257
Epoch 14 completado. Loss promedio: 0.0222
Epoch 15 completado. Loss promedio: 0.0187
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9998319396345208
✅ Archivo generado para Kaggle: pred_genres_text_BERT_year_first.csv


In [None]:
# ✅ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Models to evaluate: (Hugging Face model identifier, human-readable display name)
model_list = [
    ("microsoft/deberta-v3-base", "DeBERTa v3"),
    ("xlm-roberta-base", "XLM-RoBERTa"),
    ("bert-base-cased", "BERT cased"),
    ("google-bert/bert-base-uncased", "Google BERT"),
    ("google/mobilebert-uncased", "MobileBERT")
]

# Load and prepare the data
# Read the zipped training CSV straight from GitHub; the first column becomes the index.
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Each 'genres' cell is a string holding a Python literal; parse it into a real object
# (presumably a list of genre names — confirm against the CSV).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text has the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Binarize the genre collections into a multi-label indicator matrix;
# the column order is given by mlb.classes_.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# Custom dataset
class MovieDataset(Dataset):
    """Pre-tokenized texts paired with multi-label float targets.

    Args:
        texts: iterable of input strings (materialized with ``list``).
        labels: array-like of label rows, converted to a float tensor.
        tokenizer: callable applied once to all texts; its result must
            expose ``.items()`` yielding index-able values per field.
        max_length: forwarded to the tokenizer call (default 256).
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt',
        )
        # Float targets, as required by the BCE loss used for training.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per tokenizer output field, plus the label row.
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Split into train/test: 33% of the rows are held out; random_state=42 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)

# 🧠 Clase para cada modelo
class CustomClassifier(nn.Module):
    """Multi-label head (dropout + linear + sigmoid) on a pretrained encoder.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)`` so the
    same class can wrap different checkpoints.
    """

    def __init__(self, model_name, num_labels):
        """Load the encoder and build the classification head.

        Args:
            model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
            num_labels: Number of output units of the linear layer.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores, one per label, for each input sequence.

        The pooled output is used when the encoder provides one; otherwise the
        hidden state of the first token is used.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # hasattr() is not a sufficient check: an output object can expose the
        # ``pooler_output`` attribute with value None (e.g. an encoder built
        # without a pooling layer), which would then be fed to dropout and fail.
        x = getattr(outputs, "pooler_output", None)
        if x is None:
            x = outputs.last_hidden_state[:, 0]  # first (CLS) token
        x = self.dropout(x)
        return torch.sigmoid(self.classifier(x))

# 🚀 Entrenar y evaluar cada modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results = []  # (display_name, macro ROC AUC) for every model in model_list

for model_name, display_name in model_list:
    print(f"\n🔄 Entrenando modelo: {display_name}")

    # Drop the previous iteration's model and optimizer *before* loading the
    # next checkpoint. Without this, the old weights, gradients and AdamW
    # state stay referenced while the new model is moved to the device,
    # raising peak GPU memory for no benefit.
    model = None
    optimizer = None
    if device.type == "cuda":
        torch.cuda.empty_cache()

    # Each checkpoint has its own vocabulary, so the data is re-tokenized per model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = MovieDataset(X_train, y_train, tokenizer)
    test_dataset = MovieDataset(X_test, y_test, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    model = CustomClassifier(model_name, y.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    # NOTE(review): the model applies sigmoid itself, hence BCELoss here.
    # Sigmoid followed by BCELoss is less numerically stable than
    # BCEWithLogitsLoss on raw logits; the recorded MobileBERT run (loss ~45,
    # AUC 0.50) looks consistent with saturated outputs -- confirm, and
    # consider switching once the model can return logits.
    criterion = nn.BCELoss()

    # Fine-tune for 3 epochs.
    model.train()
    for epoch in range(3):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation on the held-out split (dropout disabled, no gradients).
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    # Stack the per-batch arrays and score with macro-averaged ROC AUC.
    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    score = roc_auc_score(y_true, y_pred, average='macro')
    results.append((display_name, score))
    print(f"📈 MCAUC ({display_name}): {score:.5f}")

# Final summary, best score first.
results.sort(key=lambda x: x[1], reverse=True)
print("\n🏁 Resultados finales:")
for name, score in results:
    print(f"{name:20s}: {score:.5f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2951
Epoch 2 completado. Loss promedio: 0.2161
Epoch 3 completado. Loss promedio: 0.1850
📈 MCAUC (DeBERTa v3): 0.89357

🔄 Entrenando modelo: XLM-RoBERTa


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3284
Epoch 2 completado. Loss promedio: 0.2678
Epoch 3 completado. Loss promedio: 0.2315
📈 MCAUC (XLM-RoBERTa): 0.82531

🔄 Entrenando modelo: BERT cased


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3140
Epoch 2 completado. Loss promedio: 0.2211
Epoch 3 completado. Loss promedio: 0.1810
📈 MCAUC (BERT cased): 0.88235

🔄 Entrenando modelo: Google BERT


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3234
Epoch 2 completado. Loss promedio: 0.2272
Epoch 3 completado. Loss promedio: 0.1843
📈 MCAUC (Google BERT): 0.88813

🔄 Entrenando modelo: MobileBERT


config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 45.9943
Epoch 2 completado. Loss promedio: 45.5143
Epoch 3 completado. Loss promedio: 45.3794
📈 MCAUC (MobileBERT): 0.50000

🏁 Resultados finales:
DeBERTa v3          : 0.89357
Google BERT         : 0.88813
BERT cased          : 0.88235
XLM-RoBERTa         : 0.82531
MobileBERT          : 0.50000


In [None]:
# 🛠️ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as a string literal; parse it into a Python object.
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input has the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix (one column per class in mlb.classes_).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
# model_name is also read by BERTClassifier below, so tokenizer and encoder share one checkpoint.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class MovieDataset(Dataset):
    """Tokenized texts paired with multi-label targets.

    Tokenization uses the module-level ``tokenizer`` and happens once, in the
    constructor; indexing afterwards only slices the stored tensors.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` (max length 256) and keep ``labels`` as floats."""
        text_batch = list(texts)
        self.encodings = tokenizer(
            text_batch,
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus the ``'labels'`` entry."""
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

# 3. Use the whole dataset for training (no held-out split in this cell)
X_all, y_all = df['input_text'], y
full_dataset = MovieDataset(X_all, y_all)
# Batches of 16, reshuffled on every pass over the loader.
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# 4. Modelo DeBERTa v3 Multilabel
class BERTClassifier(nn.Module):
    """Multi-label head on the encoder named by the module-level ``model_name``.

    The head is dropout (p=0.3), one linear layer and an element-wise sigmoid.
    """

    def __init__(self, num_labels):
        """Load the encoder and size the linear layer to ``num_labels`` outputs."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores computed from the first token's hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is the CLS token.
        first_token = encoded.last_hidden_state[:, 0, :]
        scores = self.classifier(self.dropout(first_token))
        return scores.sigmoid()

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The model already applies sigmoid, so plain BCELoss is used on its outputs.
criterion = nn.BCELoss()

# 5. Training
model.train()
for epoch in range(15):
    total_loss = 0
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation (reference only: it runs on the same data used for training)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    # full_loader shuffles, but predictions and labels are collected from the
    # same batches, so their rows stay aligned.
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" format as the training input.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# Despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs; not shuffled.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One 'p_<genre>' column per class, in mlb.classes_ order; rows keep the test index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_DeBERTa_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa_full.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2725
Epoch 2 completado. Loss promedio: 0.1988
Epoch 3 completado. Loss promedio: 0.1686
Epoch 4 completado. Loss promedio: 0.1467
Epoch 5 completado. Loss promedio: 0.1287
Epoch 6 completado. Loss promedio: 0.1143
Epoch 7 completado. Loss promedio: 0.1002
Epoch 8 completado. Loss promedio: 0.0887
Epoch 9 completado. Loss promedio: 0.0776
Epoch 10 completado. Loss promedio: 0.0683
Epoch 11 completado. Loss promedio: 0.0602
Epoch 12 completado. Loss promedio: 0.0516
Epoch 13 completado. Loss promedio: 0.0445
Epoch 14 completado. Loss promedio: 0.0388
Epoch 15 completado. Loss promedio: 0.0346
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9993652272846125
✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa_full.csv


In [None]:
# ✅ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as a string literal; parse it into a Python object.
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input has the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix (one column per class in mlb.classes_).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
# model_name is also read by DeBERTaClassifier below, so tokenizer and encoder share one checkpoint.
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class MovieDataset(Dataset):
    """Pre-tokenized movie texts with their multi-label targets.

    The module-level ``tokenizer`` encodes all texts up front; item access
    simply indexes the resulting tensors.
    """

    def __init__(self, texts, labels):
        """Encode ``texts`` (truncated to 256 tokens) and store float labels."""
        raw_texts = list(texts)
        self.encodings = tokenizer(
            raw_texts,
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return the encoding fields and the ``'labels'`` entry at ``idx``."""
        row = {}
        for name, values in self.encodings.items():
            row[name] = values[idx]
        row['labels'] = self.labels[idx]
        return row

    def __len__(self):
        """Sample count, i.e. the length of the label tensor."""
        return len(self.labels)

# 3. Use the whole dataset for both training and the reference evaluation
X_all, y_all = df['input_text'], y
full_dataset = MovieDataset(X_all, y_all)
# Batches of 16, reshuffled on every pass over the loader.
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# 4. Modelo DeBERTa Multilabel
class DeBERTaClassifier(nn.Module):
    """Multi-label head on the encoder named by the module-level ``model_name``.

    The head is dropout (p=0.3), one linear layer and an element-wise sigmoid.
    """

    def __init__(self, num_labels):
        """Load the encoder and size the linear layer to ``num_labels`` outputs."""
        super().__init__()
        self.deberta = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.deberta.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores computed from the first token's hidden state."""
        encoded = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # The CLS token's hidden state is used in place of pooler_output.
        cls_state = encoded.last_hidden_state[:, 0, :]
        scores = self.classifier(self.dropout(cls_state))
        return scores.sigmoid()

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The model already applies sigmoid, so plain BCELoss is used on its outputs.
criterion = nn.BCELoss()

# 5. Training
model.train()
for epoch in range(15):
    total_loss = 0
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Evaluation on the same data used for training (internal reference only)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    # full_loader shuffles, but predictions and labels are collected from the
    # same batches, so their rows stay aligned.
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids, attention_mask).cpu().numpy()
        all_preds.append(outputs)
        all_labels.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_labels)
print("📈 MCAUC (referencia, mismo set de entrenamiento):", roc_auc_score(y_true, y_pred, average='macro'))

# 7. Final prediction for Kaggle and export
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" format as the training input.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=256, return_tensors='pt')
# Despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs; not shuffled.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

y_pred_test_final = np.vstack(preds)
# One 'p_<genre>' column per class, in mlb.classes_ order; rows keep the test index.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_DeBERTa.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa.csv")


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2873
Epoch 2 completado. Loss promedio: 0.2062
Epoch 3 completado. Loss promedio: 0.1776
Epoch 4 completado. Loss promedio: 0.1559
Epoch 5 completado. Loss promedio: 0.1387
Epoch 6 completado. Loss promedio: 0.1241
Epoch 7 completado. Loss promedio: 0.1110
Epoch 8 completado. Loss promedio: 0.0983
Epoch 9 completado. Loss promedio: 0.0873
Epoch 10 completado. Loss promedio: 0.0776
Epoch 11 completado. Loss promedio: 0.0691
Epoch 12 completado. Loss promedio: 0.0606
Epoch 13 completado. Loss promedio: 0.0549
Epoch 14 completado. Loss promedio: 0.0473
Epoch 15 completado. Loss promedio: 0.0417
📈 MCAUC (referencia, mismo set de entrenamiento): 0.9985977470867393
✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa.csv


In [None]:
# 🛠️ Instalar librerías necesarias (si aún no están)
# !pip install transformers scikit-learn torch pandas --quiet

import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as a string literal; parse it into a Python object.
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input has the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix (one column per class in mlb.classes_).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Tokenized texts paired with multi-label targets.

    Tokenization uses the module-level ``tokenizer`` and happens once, in the
    constructor; indexing afterwards only slices the stored tensors.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` (max length 256) and keep ``labels`` as floats."""
        text_batch = list(texts)
        self.encodings = tokenizer(
            text_batch,
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus the ``'labels'`` entry."""
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

# 3. Train/test split
# Hold out 33% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Only the training loader shuffles; evaluation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 4. Modelo BERT Multilabel
class BERTClassifier(nn.Module):
    """Multi-label head on top of the ``bert-base-uncased`` encoder.

    The head is dropout (p=0.3), one linear layer and an element-wise sigmoid.
    """

    def __init__(self, num_labels):
        """Load the encoder and size the linear layer to ``num_labels`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output
        scores = self.classifier(self.dropout(pooled))
        return scores.sigmoid()

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The model already applies sigmoid, so plain BCELoss is used on its outputs.
criterion = nn.BCELoss()

# 5. Training with an evaluation pass after every epoch
for epoch in range(30):  # Change this for more or fewer epochs
    # Switch back to training mode; the evaluation below leaves the model in eval mode.
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Per-epoch evaluation on the held-out split
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    # Stack the per-batch arrays and score with macro-averaged ROC AUC.
    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.3216
📈 MCAUC Epoch 1: 0.79803
Epoch 2 completado. Loss promedio: 0.2249
📈 MCAUC Epoch 2: 0.86380
Epoch 3 completado. Loss promedio: 0.1825
📈 MCAUC Epoch 3: 0.89083
Epoch 4 completado. Loss promedio: 0.1514
📈 MCAUC Epoch 4: 0.89456
Epoch 5 completado. Loss promedio: 0.1273
📈 MCAUC Epoch 5: 0.89729
Epoch 6 completado. Loss promedio: 0.1072
📈 MCAUC Epoch 6: 0.89474
Epoch 7 completado. Loss promedio: 0.0915
📈 MCAUC Epoch 7: 0.89810
Epoch 8 completado. Loss promedio: 0.0781
📈 MCAUC Epoch 8: 0.90008
Epoch 9 completado. Loss promedio: 0.0663
📈 MCAUC Epoch 9: 0.90120
Epoch 10 completado. Loss promedio: 0.0570
📈 MCAUC Epoch 10: 0.89981
Epoch 11 completado. Loss promedio: 0.0487
📈 MCAUC Epoch 11: 0.89945
Epoch 12 completado. Loss promedio: 0.0425
📈 MCAUC Epoch 12: 0.90124
Epoch 13 completado. Loss promedio: 0.0368
📈 MCAUC Epoch 13: 0.90163
Epoch 14 completado. Loss promedio: 0.0313
📈 MCAUC Epoch 14: 0.89975
Epoch 15 completado. Loss promedio: 0.0272
📈 MCAUC E

In [None]:
# 🛠️ Instalar librerías necesarias
# !pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as a string literal; parse it into a Python object.
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input is "<plot> <title> <year>", so the plot comes first and title/year last.
df["input_text"] = df["plot"] + " " + df["title"] + " " + df["year"].astype(str)

# Encode the genre collections as a multi-label target matrix (one column per class in mlb.classes_).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MovieDataset(Dataset):
    """Pre-tokenized movie texts with their multi-label targets.

    The module-level ``tokenizer`` encodes all texts up front; item access
    simply indexes the resulting tensors.
    """

    def __init__(self, texts, labels):
        """Encode ``texts`` (truncated to 256 tokens) and store float labels."""
        raw_texts = list(texts)
        self.encodings = tokenizer(
            raw_texts,
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return the encoding fields and the ``'labels'`` entry at ``idx``."""
        row = {}
        for name, values in self.encodings.items():
            row[name] = values[idx]
        row['labels'] = self.labels[idx]
        return row

    def __len__(self):
        """Sample count, i.e. the length of the label tensor."""
        return len(self.labels)

# 3. Train/test split
# Hold out 33% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Only the training loader shuffles; evaluation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 4. Modelo BERT Multilabel
class BERTClassifier(nn.Module):
    """``bert-base-uncased`` encoder followed by a sigmoid multi-label head."""

    def __init__(self, num_labels):
        """Create the encoder, dropout (p=0.3) and a ``num_labels``-wide linear layer."""
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        encoder_width = self.bert.config.hidden_size
        self.classifier = nn.Linear(encoder_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Map token ids and mask to per-label sigmoid scores via the pooled output."""
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(bert_out.pooler_output)
        raw_scores = self.classifier(features)
        return raw_scores.sigmoid()

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The model already applies sigmoid, so plain BCELoss is used on its outputs.
criterion = nn.BCELoss()

# 5. Training with an evaluation pass after every epoch
for epoch in range(30):  # Change this for more or fewer epochs
    # Switch back to training mode; the evaluation below leaves the model in eval mode.
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Per-epoch evaluation on the held-out split
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    # Stack the per-batch arrays and score with macro-averaged ROC AUC.
    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")


Epoch 1 completado. Loss promedio: 0.3276
📈 MCAUC Epoch 1: 0.80167
Epoch 2 completado. Loss promedio: 0.2281
📈 MCAUC Epoch 2: 0.86653
Epoch 3 completado. Loss promedio: 0.1850
📈 MCAUC Epoch 3: 0.88237
Epoch 4 completado. Loss promedio: 0.1537
📈 MCAUC Epoch 4: 0.88970
Epoch 5 completado. Loss promedio: 0.1287
📈 MCAUC Epoch 5: 0.89749
Epoch 6 completado. Loss promedio: 0.1082
📈 MCAUC Epoch 6: 0.89461
Epoch 7 completado. Loss promedio: 0.0922
📈 MCAUC Epoch 7: 0.89329
Epoch 8 completado. Loss promedio: 0.0788
📈 MCAUC Epoch 8: 0.89797
Epoch 9 completado. Loss promedio: 0.0674
📈 MCAUC Epoch 9: 0.89646
Epoch 10 completado. Loss promedio: 0.0580
📈 MCAUC Epoch 10: 0.89943
Epoch 11 completado. Loss promedio: 0.0502
📈 MCAUC Epoch 11: 0.89765
Epoch 12 completado. Loss promedio: 0.0434
📈 MCAUC Epoch 12: 0.89913
Epoch 13 completado. Loss promedio: 0.0379
📈 MCAUC Epoch 13: 0.89784
Epoch 14 completado. Loss promedio: 0.0322
📈 MCAUC Epoch 14: 0.89948
Epoch 15 completado. Loss promedio: 0.0281
📈 MCAUC E

In [None]:
# 🛠️ Instalar librerías necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as a string literal; parse it into a Python object.
df['genres'] = df['genres'].apply(ast.literal_eval)
df["input_text"] = df["plot"] + " " + df["title"] + " " + df["year"].astype(str)  # plot-title-year order

# Encode the genre collections as a multi-label target matrix (one column per class in mlb.classes_).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
# Must stay in sync with the checkpoint name hard-coded in DeBERTaClassifier below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Tokenized texts paired with multi-label targets.

    Tokenization uses the module-level ``tokenizer`` and happens once, in the
    constructor; indexing afterwards only slices the stored tensors.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` (max length 256) and keep ``labels`` as floats."""
        text_batch = list(texts)
        self.encodings = tokenizer(
            text_batch,
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus the ``'labels'`` entry."""
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

# 3. Train/test split
# Hold out 33% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Smaller batches than the other cells (4 / 16) -- presumably to fit the large
# checkpoint in GPU memory; confirm before changing.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# 4. Multi-label DeBERTa classifier.
class DeBERTaClassifier(nn.Module):
    """Pretrained ``microsoft/deberta-v3-large`` encoder with a linear head.

    ``forward`` already applies a sigmoid, so its output must be fed to a
    loss that expects probabilities (this cell uses ``nn.BCELoss``).
    """

    def __init__(self, num_labels):
        """Load the encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid scores for one batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.
        """
        encoded = self.bert(attention_mask=attention_mask, input_ids=input_ids)
        # The hidden state at sequence position 0 is used as the pooled summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1])
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss (not the with-logits variant): the model's forward already applies a sigmoid.
criterion = nn.BCELoss()

# 5. Ten epochs of training; the held-out split is scored after each one.
for epoch in range(10):
    # --- training pass ---
    model.train()
    total_loss = 0
    for train_batch in train_loader:
        ids = train_batch['input_ids'].to(device)
        mask = train_batch['attention_mask'].to(device)
        targets = train_batch['labels'].to(device)

        optimizer.zero_grad()
        loss = criterion(model(ids, mask), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} completado. Loss promedio: {mean_loss:.4f}")

    # --- evaluation pass ---
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for eval_batch in test_loader:
            ids = eval_batch['input_ids'].to(device)
            mask = eval_batch['attention_mask'].to(device)
            all_preds.append(model(ids, mask).cpu().numpy())
            all_labels.append(eval_batch['labels'].cpu().numpy())

    # Stack the per-batch arrays row-wise and compute the macro-averaged ROC AUC.
    y_pred = np.concatenate(all_preds, axis=0)
    y_true = np.concatenate(all_labels, axis=0)
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2626
📈 MCAUC Epoch 1: 0.82587
Epoch 2 completado. Loss promedio: 0.1970
📈 MCAUC Epoch 2: 0.90088
Epoch 3 completado. Loss promedio: 0.1599
📈 MCAUC Epoch 3: 0.91676
Epoch 4 completado. Loss promedio: 0.1334
📈 MCAUC Epoch 4: 0.91934
Epoch 5 completado. Loss promedio: 0.1076
📈 MCAUC Epoch 5: 0.92440
Epoch 6 completado. Loss promedio: 0.0899
📈 MCAUC Epoch 6: 0.92285
Epoch 7 completado. Loss promedio: 0.0703
📈 MCAUC Epoch 7: 0.92441
Epoch 8 completado. Loss promedio: 0.0563
📈 MCAUC Epoch 8: 0.92015
Epoch 9 completado. Loss promedio: 0.0492
📈 MCAUC Epoch 9: 0.90898
Epoch 10 completado. Loss promedio: 0.0399
📈 MCAUC Epoch 10: 0.91244


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load the training data and assemble the text fed to the model.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each genres cell is parsed with ast.literal_eval before being binarized below.
df['genres'] = df['genres'].map(ast.literal_eval)
# Text layout: plot, then title, then year, separated by single spaces.
# NOTE(review): title/year sit at the end, so the truncation done in MovieDataset
# may cut them off for long plots (assuming right-side truncation) — confirm.
df["input_text"] = (
    df["plot"]
    + " "
    + df["title"]
    + " "
    + df["year"].astype(str)
)

# Multi-hot label matrix: one column per genre found in the data.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer used by the Dataset defined right after this block.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with float label vectors.

    Every text is tokenized once, in the constructor, with the module-level
    ``tokenizer`` (truncated to at most 256 tokens, padded, returned as
    PyTorch tensors).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and store ``labels`` as a float tensor.

        Args:
            texts: Iterable of strings; materialized with ``list`` first.
            labels: Array-like accepted by ``torch.tensor``, one entry per
                text (presumably the multi-hot genre matrix — confirm).
        """
        raw_texts = list(texts)
        self.encodings = tokenizer(
            raw_texts,
            max_length=256,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every tokenizer field at ``idx`` plus the ``'labels'`` row."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# 3. Wrap the whole training set (no hold-out) for the final fit.
full_dataset = MovieDataset(texts=df['input_text'], labels=y)
full_loader = DataLoader(
    dataset=full_dataset,
    batch_size=4,
    shuffle=True,
)

# 4. DeBERTa model.
class DeBERTaClassifier(nn.Module):
    """Pretrained ``microsoft/deberta-v3-large`` encoder with a linear head.

    ``forward`` already applies a sigmoid, so its output must be fed to a
    loss that expects probabilities (this cell uses ``nn.BCELoss``).
    """

    def __init__(self, num_labels):
        """Load the encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid scores for one batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.
        """
        encoded = self.bert(attention_mask=attention_mask, input_ids=input_ids)
        # The hidden state at sequence position 0 is used as the pooled summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 5. Training on the full dataset.
#
# The recorded run of this cell diverged: the average loss rose from 0.2606 to
# 0.2999 after the first epoch and then stalled near 0.297. The original loop
# used a constant learning rate with no warmup, no gradient clipping and no
# seed. The three additions below (seed, warmup + linear decay, gradient-norm
# clipping) are the usual stabilizers for fine-tuning a large encoder with a
# batch size of 4.
# Imported here because this cell's import list does not include it.
from transformers import get_linear_schedule_with_warmup

NUM_EPOCHS = 5          # drives both the schedule length and the loop
WARMUP_FRACTION = 0.1   # share of optimizer steps spent ramping the LR up
MAX_GRAD_NORM = 1.0     # upper bound for the global gradient norm

# Seed before the model is built so the head initialization and the shuffle
# order are repeatable from run to run.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss (not the with-logits variant): the model's forward already applies a sigmoid.
criterion = nn.BCELoss()

num_training_steps = len(full_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(WARMUP_FRACTION * num_training_steps),
    num_training_steps=num_training_steps,
)

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        # Clip before the optimizer step so one bad batch cannot blow up the weights.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Score the Kaggle test set with the model trained above.
dataTesting = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip',
    index_col=0,
)
# Same "<plot> <title> <year>" layout as the training text built in this cell.
dataTesting["input_text"] = (
    dataTesting["plot"] + " " + dataTesting["title"] + " " + dataTesting["year"].astype(str)
)
encoded_test = tokenizer(
    list(dataTesting["input_text"]),
    max_length=256,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
kaggle_tensors = torch.utils.data.TensorDataset(
    encoded_test['input_ids'],
    encoded_test['attention_mask'],
)
kaggle_loader = DataLoader(kaggle_tensors, batch_size=32)

# Inference only: eval mode and no gradient tracking.
model.eval()
preds = []
with torch.no_grad():
    for ids, mask in kaggle_loader:
        scores = model(ids.to(device), mask.to(device))
        preds.append(scores.cpu().numpy())

# 7. Write the submission: one p_<genre> column per class, rows keyed by the test index.
y_pred_test_final = np.concatenate(preds, axis=0)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_DeBERTa_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa_full.csv")
# NOTE(review): 0.50974 was recorded here, presumably the Kaggle score of this run — confirm.



Epoch 1 completado. Loss promedio: 0.2606
Epoch 2 completado. Loss promedio: 0.2999
Epoch 3 completado. Loss promedio: 0.2984
Epoch 4 completado. Loss promedio: 0.2978
Epoch 5 completado. Loss promedio: 0.2974
✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa_full.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load the training data and assemble the text fed to the model.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each genres cell is parsed with ast.literal_eval before being binarized below.
df['genres'] = df['genres'].map(ast.literal_eval)
# Text layout: plot, then title, then year, separated by single spaces.
# NOTE(review): title/year sit at the end, so the truncation done in MovieDataset
# may cut them off for long plots (assuming right-side truncation) — confirm.
df["input_text"] = (
    df["plot"]
    + " "
    + df["title"]
    + " "
    + df["year"].astype(str)
)

# Multi-hot label matrix: one column per genre found in the data.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer used by the Dataset defined right after this block.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with float label vectors.

    Every text is tokenized once, in the constructor, with the module-level
    ``tokenizer`` (truncated to at most 256 tokens, padded, returned as
    PyTorch tensors).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and store ``labels`` as a float tensor.

        Args:
            texts: Iterable of strings; materialized with ``list`` first.
            labels: Array-like accepted by ``torch.tensor``, one entry per
                text (presumably the multi-hot genre matrix — confirm).
        """
        raw_texts = list(texts)
        self.encodings = tokenizer(
            raw_texts,
            max_length=256,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every tokenizer field at ``idx`` plus the ``'labels'`` row."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# 3. Wrap the whole training set (no hold-out) for the final fit.
full_dataset = MovieDataset(texts=df['input_text'], labels=y)
full_loader = DataLoader(
    dataset=full_dataset,
    batch_size=4,
    shuffle=True,
)

# 4. DeBERTa model.
class DeBERTaClassifier(nn.Module):
    """Pretrained ``microsoft/deberta-v3-large`` encoder with a linear head.

    ``forward`` already applies a sigmoid, so its output must be fed to a
    loss that expects probabilities (this cell uses ``nn.BCELoss``).
    """

    def __init__(self, num_labels):
        """Load the encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid scores for one batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.
        """
        encoded = self.bert(attention_mask=attention_mask, input_ids=input_ids)
        # The hidden state at sequence position 0 is used as the pooled summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 5. Training on the full dataset.
#
# The original loop used a constant learning rate with no warmup, no gradient
# clipping and no seed, so results varied from run to run: a recorded run of
# this same configuration elsewhere in the notebook stalled near 0.30 average
# loss. The three additions below (seed, warmup + linear decay, gradient-norm
# clipping) are the usual stabilizers for fine-tuning a large encoder with a
# batch size of 4.
# Imported here because this cell's import list does not include it.
from transformers import get_linear_schedule_with_warmup

NUM_EPOCHS = 5          # drives both the schedule length and the loop
WARMUP_FRACTION = 0.1   # share of optimizer steps spent ramping the LR up
MAX_GRAD_NORM = 1.0     # upper bound for the global gradient norm

# Seed before the model is built so the head initialization and the shuffle
# order are repeatable from run to run.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# BCELoss (not the with-logits variant): the model's forward already applies a sigmoid.
criterion = nn.BCELoss()

num_training_steps = len(full_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(WARMUP_FRACTION * num_training_steps),
    num_training_steps=num_training_steps,
)

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        # Clip before the optimizer step so one bad batch cannot blow up the weights.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Score the Kaggle test set with the model trained above.
dataTesting = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip',
    index_col=0,
)
# Same "<plot> <title> <year>" layout as the training text built in this cell.
dataTesting["input_text"] = (
    dataTesting["plot"] + " " + dataTesting["title"] + " " + dataTesting["year"].astype(str)
)
encoded_test = tokenizer(
    list(dataTesting["input_text"]),
    max_length=256,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
kaggle_tensors = torch.utils.data.TensorDataset(
    encoded_test['input_ids'],
    encoded_test['attention_mask'],
)
kaggle_loader = DataLoader(kaggle_tensors, batch_size=32)

# Inference only: eval mode and no gradient tracking.
model.eval()
preds = []
with torch.no_grad():
    for ids, mask in kaggle_loader:
        scores = model(ids.to(device), mask.to(device))
        preds.append(scores.cpu().numpy())

# 7. Write the submission: one p_<genre> column per class, rows keyed by the test index.
y_pred_test_final = np.concatenate(preds, axis=0)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_DeBERTa_full.csv', index_label='ID')
print("✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa_full.csv")




Epoch 1 completado. Loss promedio: 0.2283
Epoch 2 completado. Loss promedio: 0.1643
Epoch 3 completado. Loss promedio: 0.1305
Epoch 4 completado. Loss promedio: 0.1080
Epoch 5 completado. Loss promedio: 0.0824
✅ Archivo generado para Kaggle: pred_genres_text_DeBERTa_full.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load the training data and assemble the text fed to the model.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each genres cell is parsed with ast.literal_eval before being binarized below.
df['genres'] = df['genres'].map(ast.literal_eval)
# Text layout: plot followed by title; the year is left out in this experiment.
df["input_text"] = (
    df["plot"]
    + " "
    + df["title"]
)

# Multi-hot label matrix: one column per genre found in the data.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer used by the Dataset defined right after this block.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with float label vectors.

    Every text is tokenized once, in the constructor, with the module-level
    ``tokenizer`` (truncated to at most 384 tokens, padded, returned as
    PyTorch tensors).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and store ``labels`` as a float tensor.

        Args:
            texts: Iterable of strings; materialized with ``list`` first.
            labels: Array-like accepted by ``torch.tensor``, one entry per
                text (presumably the multi-hot genre matrix — confirm).
        """
        raw_texts = list(texts)
        self.encodings = tokenizer(
            raw_texts,
            max_length=384,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every tokenizer field at ``idx`` plus the ``'labels'`` row."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# 3. Hold out 33% of the rows, then wrap both parts in datasets and loaders.
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'],
    y,
    test_size=0.33,
    random_state=42,
)

train_dataset = MovieDataset(texts=X_train, labels=y_train)
# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

test_dataset = MovieDataset(texts=X_test, labels=y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

# 4. Multi-label DeBERTa classifier.
class DeBERTaClassifier(nn.Module):
    """Pretrained ``microsoft/deberta-v3-large`` encoder with a linear head.

    ``forward`` already applies a sigmoid, so its output must be fed to a
    loss that expects probabilities (this cell uses ``nn.BCELoss``).
    """

    def __init__(self, num_labels):
        """Load the encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid scores for one batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.
        """
        encoded = self.bert(attention_mask=attention_mask, input_ids=input_ids)
        # The hidden state at sequence position 0 is used as the pooled summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 5. Training with a linearly decaying learning rate; the held-out split is
# scored after every epoch.
#
# NUM_EPOCHS is the single source of truth for both the scheduler length and
# the loop. It used to be hard-coded in two places, so changing one silently
# mis-sized the LR schedule.
NUM_EPOCHS = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# BCELoss (not the with-logits variant): the model's forward already applies a sigmoid.
criterion = nn.BCELoss()
# The scheduler is stepped once per batch, so its horizon is batches x epochs.
num_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Track the best-scoring epoch so it does not have to be read off the log.
best_mauc, best_epoch = float("-inf"), None

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation: eval mode and no gradient tracking.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    # Stack the per-batch arrays row-wise and compute the macro-averaged ROC AUC.
    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")

    if mauc > best_mauc:
        best_mauc, best_epoch = mauc, epoch + 1

if best_epoch is not None:
    print(f"🏆 Mejor MCAUC: {best_mauc:.5f} (epoch {best_epoch})")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2346
📈 MCAUC Epoch 1: 0.89370
Epoch 2 completado. Loss promedio: 0.1659
📈 MCAUC Epoch 2: 0.91494
Epoch 3 completado. Loss promedio: 0.1318
📈 MCAUC Epoch 3: 0.91900
Epoch 4 completado. Loss promedio: 0.1055
📈 MCAUC Epoch 4: 0.92504
Epoch 5 completado. Loss promedio: 0.0842
📈 MCAUC Epoch 5: 0.92557
Epoch 6 completado. Loss promedio: 0.0670
📈 MCAUC Epoch 6: 0.92384
Epoch 7 completado. Loss promedio: 0.0523
📈 MCAUC Epoch 7: 0.92384
Epoch 8 completado. Loss promedio: 0.0419
📈 MCAUC Epoch 8: 0.92177
Epoch 9 completado. Loss promedio: 0.0343
📈 MCAUC Epoch 9: 0.91894
Epoch 10 completado. Loss promedio: 0.0298
📈 MCAUC Epoch 10: 0.91991


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load the training data and assemble the text fed to the model.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each genres cell is parsed with ast.literal_eval before being binarized below.
df['genres'] = df['genres'].map(ast.literal_eval)
# Text layout: plot, then title, then year (the year is included in this experiment).
# NOTE(review): title/year sit at the end, so the truncation done in MovieDataset
# may cut them off for long plots (assuming right-side truncation) — confirm.
df["input_text"] = (
    df["plot"]
    + " "
    + df["title"]
    + " "
    + df["year"].astype(str)
)

# Multi-hot label matrix: one column per genre found in the data.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer used by the Dataset defined right after this block.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with float label vectors.

    Every text is tokenized once, in the constructor, with the module-level
    ``tokenizer`` (truncated to at most 384 tokens, padded, returned as
    PyTorch tensors).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and store ``labels`` as a float tensor.

        Args:
            texts: Iterable of strings; materialized with ``list`` first.
            labels: Array-like accepted by ``torch.tensor``, one entry per
                text (presumably the multi-hot genre matrix — confirm).
        """
        raw_texts = list(texts)
        self.encodings = tokenizer(
            raw_texts,
            max_length=384,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every tokenizer field at ``idx`` plus the ``'labels'`` row."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# 3. Hold out 33% of the rows, then wrap both parts in datasets and loaders.
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'],
    y,
    test_size=0.33,
    random_state=42,
)

train_dataset = MovieDataset(texts=X_train, labels=y_train)
# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

test_dataset = MovieDataset(texts=X_test, labels=y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

# 4. Multi-label DeBERTa classifier.
class DeBERTaClassifier(nn.Module):
    """Pretrained ``microsoft/deberta-v3-large`` encoder with a linear head.

    ``forward`` already applies a sigmoid, so its output must be fed to a
    loss that expects probabilities (this cell uses ``nn.BCELoss``).
    """

    def __init__(self, num_labels):
        """Load the encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid scores for one batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.
        """
        encoded = self.bert(attention_mask=attention_mask, input_ids=input_ids)
        # The hidden state at sequence position 0 is used as the pooled summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 5. Training with a linearly decaying learning rate; the held-out split is
# scored after every epoch.
#
# NUM_EPOCHS is the single source of truth for both the scheduler length and
# the loop. It used to be hard-coded in two places, so changing one silently
# mis-sized the LR schedule.
NUM_EPOCHS = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# BCELoss (not the with-logits variant): the model's forward already applies a sigmoid.
criterion = nn.BCELoss()
# The scheduler is stepped once per batch, so its horizon is batches x epochs.
num_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Track the best-scoring epoch so it does not have to be read off the log.
best_mauc, best_epoch = float("-inf"), None

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation: eval mode and no gradient tracking.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    # Stack the per-batch arrays row-wise and compute the macro-averaged ROC AUC.
    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")

    if mauc > best_mauc:
        best_mauc, best_epoch = mauc, epoch + 1

if best_epoch is not None:
    print(f"🏆 Mejor MCAUC: {best_mauc:.5f} (epoch {best_epoch})")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2409
📈 MCAUC Epoch 1: 0.87442
Epoch 2 completado. Loss promedio: 0.1740
📈 MCAUC Epoch 2: 0.91414
Epoch 3 completado. Loss promedio: 0.1408
📈 MCAUC Epoch 3: 0.92479
Epoch 4 completado. Loss promedio: 0.1153
📈 MCAUC Epoch 4: 0.93015
Epoch 5 completado. Loss promedio: 0.0933
📈 MCAUC Epoch 5: 0.93173
Epoch 6 completado. Loss promedio: 0.0743
📈 MCAUC Epoch 6: 0.93141
Epoch 7 completado. Loss promedio: 0.0599
📈 MCAUC Epoch 7: 0.93036
Epoch 8 completado. Loss promedio: 0.0480
📈 MCAUC Epoch 8: 0.92719
Epoch 9 completado. Loss promedio: 0.0398
📈 MCAUC Epoch 9: 0.92762
Epoch 10 completado. Loss promedio: 0.0345
📈 MCAUC Epoch 10: 0.92668


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load the training data and assemble the text fed to the model.
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each genres cell is parsed with ast.literal_eval before being binarized below.
df['genres'] = df['genres'].map(ast.literal_eval)
# Text layout: "<title> (<year>): <plot>", i.e. title and year come before the plot.
df["input_text"] = (
    df["title"]
    + " ("
    + df["year"].astype(str)
    + "): "
    + df["plot"]
)

# Multi-hot label matrix: one column per genre found in the data.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer used by the Dataset defined right after this block.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with float label vectors.

    Every text is tokenized once, in the constructor, with the module-level
    ``tokenizer`` (truncated to at most 384 tokens, padded, returned as
    PyTorch tensors).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and store ``labels`` as a float tensor.

        Args:
            texts: Iterable of strings; materialized with ``list`` first.
            labels: Array-like accepted by ``torch.tensor``, one entry per
                text (presumably the multi-hot genre matrix — confirm).
        """
        raw_texts = list(texts)
        self.encodings = tokenizer(
            raw_texts,
            max_length=384,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return every tokenizer field at ``idx`` plus the ``'labels'`` row."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# 3. Hold out 33% of the rows, then wrap both parts in datasets and loaders.
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'],
    y,
    test_size=0.33,
    random_state=42,
)

train_dataset = MovieDataset(texts=X_train, labels=y_train)
# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

test_dataset = MovieDataset(texts=X_test, labels=y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

# 4. Multi-label DeBERTa classifier.
class DeBERTaClassifier(nn.Module):
    """Pretrained ``microsoft/deberta-v3-large`` encoder with a linear head.

    ``forward`` already applies a sigmoid, so its output must be fed to a
    loss that expects probabilities (this cell uses ``nn.BCELoss``).
    """

    def __init__(self, num_labels):
        """Load the encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid scores for one batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.
        """
        encoded = self.bert(attention_mask=attention_mask, input_ids=input_ids)
        # The hidden state at sequence position 0 is used as the pooled summary.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 5. Training with a linearly decaying learning rate; the held-out split is
# scored after every epoch.
#
# NUM_EPOCHS is the single source of truth for both the scheduler length and
# the loop. It used to be hard-coded in two places, so changing one silently
# mis-sized the LR schedule.
NUM_EPOCHS = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# BCELoss (not the with-logits variant): the model's forward already applies a sigmoid.
criterion = nn.BCELoss()
# The scheduler is stepped once per batch, so its horizon is batches x epochs.
num_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Track the best-scoring epoch so it does not have to be read off the log.
best_mauc, best_epoch = float("-inf"), None

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation: eval mode and no gradient tracking.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    # Stack the per-batch arrays row-wise and compute the macro-averaged ROC AUC.
    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")

    if mauc > best_mauc:
        best_mauc, best_epoch = mauc, epoch + 1

if best_epoch is not None:
    print(f"🏆 Mejor MCAUC: {best_mauc:.5f} (epoch {best_epoch})")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2341
📈 MCAUC Epoch 1: 0.90520
Epoch 2 completado. Loss promedio: 0.1628
📈 MCAUC Epoch 2: 0.92606
Epoch 3 completado. Loss promedio: 0.1271
📈 MCAUC Epoch 3: 0.93374
Epoch 4 completado. Loss promedio: 0.1015
📈 MCAUC Epoch 4: 0.93489
Epoch 5 completado. Loss promedio: 0.0801
📈 MCAUC Epoch 5: 0.93564
Epoch 6 completado. Loss promedio: 0.0627
📈 MCAUC Epoch 6: 0.93609
Epoch 7 completado. Loss promedio: 0.0487
📈 MCAUC Epoch 7: 0.93307
Epoch 8 completado. Loss promedio: 0.0389
📈 MCAUC Epoch 8: 0.93205
Epoch 9 completado. Loss promedio: 0.0319
📈 MCAUC Epoch 9: 0.92942
Epoch 10 completado. Loss promedio: 0.0274
📈 MCAUC Epoch 10: 0.92947


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Parse each 'genres' cell with literal_eval (presumably the string form of a list — confirm).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text: "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Multi-label target matrix; its column order is mlb.classes_ (reused for the output column names).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Pre-tokenized texts paired with float label rows.

    All texts are tokenized once in the constructor using the module-level
    `tokenizer` (truncation enabled, max_length=384, PyTorch tensors).
    Each item is a dict holding the encoding fields for one example plus a
    'labels' entry.
    """

    def __init__(self, texts, labels):
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __len__(self):
        # Number of examples equals the number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Full dataset for training (this cell makes no hold-out split)
full_dataset = MovieDataset(df['input_text'], y)
# Shuffled mini-batches of 4 examples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Pretrained DeBERTa v3 large encoder with a dropout + linear multi-label head.

    `forward` takes the last-layer hidden state at sequence position 0,
    applies dropout and the linear layer, and returns the sigmoid of the
    result (one value per label).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Representation of the first token of every sequence.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 5. Training (fixed at 6 epochs, the epoch count previously picked as best)
# NOTE(review): the learning rate stays constant at 1e-5 here. If the run that selected
# epoch 6 used a decaying LR schedule, this final fit follows a different schedule — confirm.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies sigmoid, so BCELoss is applied directly to its outputs.
criterion = nn.BCELoss()

# Training mode is set once; there is no evaluation step inside the loop.
model.train()
for epoch in range(6):  # Only up to the best epoch found
    total_loss = 0
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Reports the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" text format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is a DataLoader yielding (input_ids, attention_mask) batches of 32.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        # Model outputs are already sigmoid-activated; move them to CPU as NumPy arrays.
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays into one (rows, labels) matrix.
y_pred_test_final = np.vstack(preds)
# One column per genre, named 'p_<genre>', following mlb.classes_ order.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_DeBERTaV3_Ep6.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv")




Epoch 1 completado. Loss promedio: 0.2320
Epoch 2 completado. Loss promedio: 0.1610
Epoch 3 completado. Loss promedio: 0.1276
Epoch 4 completado. Loss promedio: 0.1035
Epoch 5 completado. Loss promedio: 0.0828
Epoch 6 completado. Loss promedio: 0.0655
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Parse each 'genres' cell with literal_eval (presumably the string form of a list — confirm).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text: "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Multi-label target matrix; y.shape[1] is later used as the number of model outputs.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset of tokenized movie texts and their multi-label targets.

    Tokenization happens eagerly in the constructor through the
    module-level `tokenizer` (truncation on, max_length=384, tensors
    returned as PyTorch). Labels are stored as a float tensor.
    """

    def __init__(self, texts, labels):
        tokenizer_kwargs = dict(truncation=True, padding=True, max_length=384, return_tensors='pt')
        self.encodings = tokenizer(list(texts), **tokenizer_kwargs)
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then attach the matching label row.
        row = dict((name, column[idx]) for name, column in self.encodings.items())
        row['labels'] = self.labels[idx]
        return row

# 3. Train/test split
# 33% of the rows are held out; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Training batches are shuffled; evaluation batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# 4. Modelo DeBERTa Multilabel
class DeBERTaClassifier(nn.Module):
    """Multi-label classifier built on the pretrained DeBERTa v3 large encoder.

    The head is dropout (p=0.2) followed by a linear layer with
    `num_labels` outputs; `forward` returns sigmoid-activated values.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(
            self.bert.config.hidden_size,
            num_labels,
        )

    def forward(self, input_ids, attention_mask):
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # Use the hidden state at sequence position 0 as the sequence summary.
        pooled = self.dropout(hidden_states[:, 0, :])
        return self.classifier(pooled).sigmoid()

# 5. Training with evaluation after every epoch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies sigmoid, so BCELoss is applied directly to its outputs.
criterion = nn.BCELoss()
# The scheduler is stepped once per batch; the factor 10 must match range(10) below.
num_training_steps = len(train_loader) * 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

for epoch in range(10):
    # Switch back to training mode (the evaluation step below sets eval mode).
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    # Reports the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation on the held-out split
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only needed for the metric, so they go straight to NumPy.
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    # Macro-averaged ROC AUC over the label columns.
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2373
📈 MCAUC Epoch 1: 0.89268
Epoch 2 completado. Loss promedio: 0.1681
📈 MCAUC Epoch 2: 0.92031
Epoch 3 completado. Loss promedio: 0.1326
📈 MCAUC Epoch 3: 0.92968
Epoch 4 completado. Loss promedio: 0.1072
📈 MCAUC Epoch 4: 0.93321
Epoch 5 completado. Loss promedio: 0.0857
📈 MCAUC Epoch 5: 0.93186
Epoch 6 completado. Loss promedio: 0.0682
📈 MCAUC Epoch 6: 0.93215
Epoch 7 completado. Loss promedio: 0.0542
📈 MCAUC Epoch 7: 0.93208
Epoch 8 completado. Loss promedio: 0.0432
📈 MCAUC Epoch 8: 0.93058
Epoch 9 completado. Loss promedio: 0.0356
📈 MCAUC Epoch 9: 0.92905
Epoch 10 completado. Loss promedio: 0.0308
📈 MCAUC Epoch 10: 0.92973


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Parse each 'genres' cell with literal_eval (presumably the string form of a list — confirm).
df['genres'] = df['genres'].apply(ast.literal_eval)

# Add a decade feature: year floored to a multiple of 10 plus "s"
# (e.g. 1994 -> "1990s", assuming `year` is an integer column — confirm).
df['decade'] = (df['year'] // 10 * 10).astype(str) + "s"

# input_text format including the decade: "<title> (<year>, <decade>): <plot>"
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + ", " + df["decade"] + "): " + df["plot"]

# Multi-label target matrix; y.shape[1] is later used as the number of model outputs.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Pre-tokenized texts paired with float label rows.

    All texts are tokenized once in the constructor using the module-level
    `tokenizer` (truncation enabled, max_length=384, PyTorch tensors).
    Each item is a dict holding the encoding fields for one example plus a
    'labels' entry.
    """

    def __init__(self, texts, labels):
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __len__(self):
        # Number of examples equals the number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 3. Train/test split
# 33% of the rows are held out; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Training batches are shuffled; evaluation batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# 4. Modelo DeBERTa Multilabel
class DeBERTaClassifier(nn.Module):
    """Pretrained DeBERTa v3 large encoder with a dropout + linear multi-label head.

    `forward` takes the last-layer hidden state at sequence position 0,
    applies dropout and the linear layer, and returns the sigmoid of the
    result (one value per label).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Representation of the first token of every sequence.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 5. Training with evaluation after every epoch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies sigmoid, so BCELoss is applied directly to its outputs.
criterion = nn.BCELoss()
# The scheduler is stepped once per batch; the factor 10 must match range(10) below.
num_training_steps = len(train_loader) * 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

for epoch in range(10):
    # Switch back to training mode (the evaluation step below sets eval mode).
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    # Reports the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation on the held-out split
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only needed for the metric, so they go straight to NumPy.
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    # Macro-averaged ROC AUC over the label columns.
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")




Epoch 1 completado. Loss promedio: 0.2369
📈 MCAUC Epoch 1: 0.88839
Epoch 2 completado. Loss promedio: 0.1645
📈 MCAUC Epoch 2: 0.92617
Epoch 3 completado. Loss promedio: 0.1294
📈 MCAUC Epoch 3: 0.93254
Epoch 4 completado. Loss promedio: 0.1032
📈 MCAUC Epoch 4: 0.93456
Epoch 5 completado. Loss promedio: 0.0820
📈 MCAUC Epoch 5: 0.93378


KeyboardInterrupt: 

In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Parse each 'genres' cell with literal_eval (presumably the string form of a list — confirm).
df['genres'] = df['genres'].apply(ast.literal_eval)

# Add a decade feature: year floored to a multiple of 10 plus "s"
# (e.g. 1994 -> "1990s", assuming `year` is an integer column — confirm).
df['decade'] = (df['year'] // 10 * 10).astype(str) + "s"

# input_text format with the decade replacing the exact year: "<title> (<decade>): <plot>"
df["input_text"] = df["title"] + " (" + df["decade"] + "): " + df["plot"]

# Multi-label target matrix; y.shape[1] is later used as the number of model outputs.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset of tokenized movie texts and their multi-label targets.

    Tokenization happens eagerly in the constructor through the
    module-level `tokenizer` (truncation on, max_length=384, tensors
    returned as PyTorch). Labels are stored as a float tensor.
    """

    def __init__(self, texts, labels):
        tokenizer_kwargs = dict(truncation=True, padding=True, max_length=384, return_tensors='pt')
        self.encodings = tokenizer(list(texts), **tokenizer_kwargs)
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then attach the matching label row.
        row = dict((name, column[idx]) for name, column in self.encodings.items())
        row['labels'] = self.labels[idx]
        return row

# 3. Train/test split
# 33% of the rows are held out; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Training batches are shuffled; evaluation batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# 4. Modelo DeBERTa Multilabel
class DeBERTaClassifier(nn.Module):
    """Multi-label classifier built on the pretrained DeBERTa v3 large encoder.

    The head is dropout (p=0.2) followed by a linear layer with
    `num_labels` outputs; `forward` returns sigmoid-activated values.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(
            self.bert.config.hidden_size,
            num_labels,
        )

    def forward(self, input_ids, attention_mask):
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # Use the hidden state at sequence position 0 as the sequence summary.
        pooled = self.dropout(hidden_states[:, 0, :])
        return self.classifier(pooled).sigmoid()

# 5. Training with evaluation after every epoch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies sigmoid, so BCELoss is applied directly to its outputs.
criterion = nn.BCELoss()
# The scheduler is stepped once per batch; the factor 10 must match range(10) below.
num_training_steps = len(train_loader) * 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

for epoch in range(10):
    # Switch back to training mode (the evaluation step below sets eval mode).
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    # Reports the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation on the held-out split
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only needed for the metric, so they go straight to NumPy.
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    # Macro-averaged ROC AUC over the label columns.
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")




Epoch 1 completado. Loss promedio: 0.2364
📈 MCAUC Epoch 1: 0.89618
Epoch 2 completado. Loss promedio: 0.1664
📈 MCAUC Epoch 2: 0.92118
Epoch 3 completado. Loss promedio: 0.1310
📈 MCAUC Epoch 3: 0.92906
Epoch 4 completado. Loss promedio: 0.1045
📈 MCAUC Epoch 4: 0.93452
Epoch 5 completado. Loss promedio: 0.0832
📈 MCAUC Epoch 5: 0.93465
Epoch 6 completado. Loss promedio: 0.0656
📈 MCAUC Epoch 6: 0.93314


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Parse each 'genres' cell with literal_eval (presumably the string form of a list — confirm).
df['genres'] = df['genres'].apply(ast.literal_eval)

# Feature engineering: KMeans cluster id over a TF-IDF representation of the plot.
# The vectorizer and the clustering are fitted on the training rows only, so the
# held-out rows that are scored later in this cell do not shape the feature they are
# evaluated with (avoids train/test leakage). Calling train_test_split with the same
# test_size and random_state on an array of the same length reproduces the partition
# made by the split further down in this cell.
train_pos, _holdout_pos = train_test_split(np.arange(len(df)), test_size=0.33, random_state=42)
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
vectorizer.fit(df['plot'].iloc[train_pos])
# TF-IDF matrix for every row, using the vocabulary/IDF learned from the training rows.
X_tfidf = vectorizer.transform(df['plot'])
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_tfidf[train_pos])
# Assign every row (train and hold-out) to its nearest training-derived centroid.
df['plot_cluster'] = kmeans.predict(X_tfidf)

# Original text format plus the cluster id appended as a text token ("Cluster<k>").
# NOTE(review): the token sits at the very end of the text while the Dataset tokenizes
# with truncation at max_length=384, so it may be cut off for long plots — confirm.
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"] + " Cluster" + df["plot_cluster"].astype(str)

# 2. Multi-label targets
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 3. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Pre-tokenized texts paired with float label rows.

    All texts are tokenized once in the constructor using the module-level
    `tokenizer` (truncation enabled, max_length=384, PyTorch tensors).
    Each item is a dict holding the encoding fields for one example plus a
    'labels' entry.
    """

    def __init__(self, texts, labels):
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __len__(self):
        # Number of examples equals the number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = self.labels[idx]
        return sample

# 4. Train/test split
# 33% of the rows are held out; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.33, random_state=42)
train_dataset = MovieDataset(X_train, y_train)
test_dataset = MovieDataset(X_test, y_test)
# Training batches are shuffled; evaluation batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# 5. Modelo DeBERTa
class DeBERTaClassifier(nn.Module):
    """Pretrained DeBERTa v3 large encoder with a dropout + linear multi-label head.

    `forward` takes the last-layer hidden state at sequence position 0,
    applies dropout and the linear layer, and returns the sigmoid of the
    result (one value per label).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Representation of the first token of every sequence.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return torch.sigmoid(logits)

# 6. Training with evaluation after every epoch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies sigmoid, so BCELoss is applied directly to its outputs.
criterion = nn.BCELoss()
# The scheduler is stepped once per batch; the factor 10 must match range(10) below.
num_training_steps = len(train_loader) * 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

for epoch in range(10):
    # Switch back to training mode (the evaluation step below sets eval mode).
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    # Reports the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(train_loader):.4f}")

    # Evaluation on the held-out split
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only needed for the metric, so they go straight to NumPy.
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            all_preds.append(outputs)
            all_labels.append(labels)

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    # Macro-averaged ROC AUC over the label columns.
    mauc = roc_auc_score(y_true, y_pred, average='macro')
    print(f"📈 MCAUC Epoch {epoch+1}: {mauc:.5f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2405
📈 MCAUC Epoch 1: 0.89219
Epoch 2 completado. Loss promedio: 0.1732
📈 MCAUC Epoch 2: 0.91379
Epoch 3 completado. Loss promedio: 0.1390
📈 MCAUC Epoch 3: 0.92348
Epoch 4 completado. Loss promedio: 0.1133
📈 MCAUC Epoch 4: 0.93100
Epoch 5 completado. Loss promedio: 0.0912
📈 MCAUC Epoch 5: 0.93510
Epoch 6 completado. Loss promedio: 0.0730
📈 MCAUC Epoch 6: 0.93194


KeyboardInterrupt: 

In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# Parse each 'genres' cell with literal_eval (presumably the string form of a list — confirm).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text: "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Multi-label target matrix; its column order is mlb.classes_ (reused for the output column names).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset of tokenized movie texts and their multi-label targets.

    Tokenization happens eagerly in the constructor through the
    module-level `tokenizer` (truncation on, max_length=384, tensors
    returned as PyTorch). Labels are stored as a float tensor.
    """

    def __init__(self, texts, labels):
        tokenizer_kwargs = dict(truncation=True, padding=True, max_length=384, return_tensors='pt')
        self.encodings = tokenizer(list(texts), **tokenizer_kwargs)
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then attach the matching label row.
        row = dict((name, column[idx]) for name, column in self.encodings.items())
        row['labels'] = self.labels[idx]
        return row

# 3. Full dataset for training (this cell makes no hold-out split)
full_dataset = MovieDataset(df['input_text'], y)
# Shuffled mini-batches of 4 examples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Multi-label classifier built on the pretrained DeBERTa v3 large encoder.

    The head is dropout (p=0.2) followed by a linear layer with
    `num_labels` outputs; `forward` returns sigmoid-activated values.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(
            self.bert.config.hidden_size,
            num_labels,
        )

    def forward(self, input_ids, attention_mask):
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # Use the hidden state at sequence position 0 as the sequence summary.
        pooled = self.dropout(hidden_states[:, 0, :])
        return self.classifier(pooled).sigmoid()

# 5. Training (fixed at 6 epochs, the epoch count previously picked as best)
# NOTE(review): the learning rate stays constant at 1e-5 here, while the train/validation
# cells of this notebook decay it linearly over a 10-epoch horizon, so the epoch count was
# validated under a different schedule — confirm this is intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies sigmoid, so BCELoss is applied directly to its outputs.
criterion = nn.BCELoss()

# Training mode is set once; there is no evaluation step inside the loop.
model.train()
for epoch in range(6):  # Only up to the best epoch found
    total_loss = 0
    for batch in full_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Reports the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" text format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE: despite its name, `test_dataset` is a DataLoader yielding (input_ids, attention_mask) batches of 32.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        # Model outputs are already sigmoid-activated; move them to CPU as NumPy arrays.
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays into one (rows, labels) matrix.
y_pred_test_final = np.vstack(preds)
# One column per genre, named 'p_<genre>', following mlb.classes_ order.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_DeBERTaV3_Ep6.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv")
# We could try this with fewer epochs



Epoch 1 completado. Loss promedio: 0.2248
Epoch 2 completado. Loss promedio: 0.1551
Epoch 3 completado. Loss promedio: 0.1232
Epoch 4 completado. Loss promedio: 0.0980
Epoch 5 completado. Loss promedio: 0.0772
Epoch 6 completado. Loss promedio: 0.0601
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as text; parse each cell as a Python literal (presumably a list of genre names).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Build one model input per row in the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix; mlb.classes_ keeps the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their multi-label targets.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer``; samples are afterwards served by indexing the stored tensors.
    """

    def __init__(self, texts, labels):
        # Tokenize the full collection in a single call (truncation on,
        # padding on, at most 384 tokens, PyTorch tensors as output).
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        # Targets are kept as a float tensor.
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __getitem__(self, idx):
        """Return a dict with every encoding field for row ``idx`` plus its 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

# 3. Full dataset for training (all rows of df are passed in; no split is made here)
full_dataset = MovieDataset(df['input_text'], y)
# Small shuffled batches of 4 samples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Pretrained "microsoft/deberta-v3-large" encoder with a dropout + linear head.

    ``forward`` returns sigmoid-activated scores, one per label.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        # The head maps the encoder's hidden size to one output per label.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and score each label from the position-0 hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training.
# The epoch count is defined once so that the training loop, the output file
# name and the final message always agree. Previously this 5-epoch run wrote
# its predictions to "...Ep6.csv" while reporting "...Ep5.csv", which could
# silently overwrite the submission of a real 6-epoch run.
NUM_EPOCHS = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies a sigmoid, so BCELoss receives probabilities.
criterion = nn.BCELoss()

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in full_loader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimisation step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" input format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE(review): despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

# Evaluation mode and no gradient tracking for inference.
model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays row-wise and name the columns "p_<genre>" after mlb.classes_.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
# File name is derived from NUM_EPOCHS so it matches the run that produced it.
output_path = f'pred_genres_DeBERTaV3_Ep{NUM_EPOCHS}.csv'
res.to_csv(output_path, index_label='ID')
print(f"✅ Archivo final generado: {output_path}")
# We could try it with fewer epochs.
# NOTE(review): 0.93820 is presumably the score obtained by this submission — confirm.

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2192
Epoch 2 completado. Loss promedio: 0.1540
Epoch 3 completado. Loss promedio: 0.1210
Epoch 4 completado. Loss promedio: 0.0964
Epoch 5 completado. Loss promedio: 0.0757
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as text; parse each cell as a Python literal (presumably a list of genre names).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Build one model input per row in the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix; mlb.classes_ keeps the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their multi-label targets.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer``; samples are afterwards served by indexing the stored tensors.
    """

    def __init__(self, texts, labels):
        # Tokenize the full collection in a single call (truncation on,
        # padding on, at most 384 tokens, PyTorch tensors as output).
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        # Targets are kept as a float tensor.
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __getitem__(self, idx):
        """Return a dict with every encoding field for row ``idx`` plus its 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

# 3. Full dataset for training (all rows of df are passed in; no split is made here)
full_dataset = MovieDataset(df['input_text'], y)
# Small shuffled batches of 4 samples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Pretrained "microsoft/deberta-v3-large" encoder with a dropout + linear head.

    ``forward`` returns sigmoid-activated scores, one per label.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        # The head maps the encoder's hidden size to one output per label.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and score each label from the position-0 hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training.
# NOTE(review): the original note said epoch 6 was the best one, but this run trains 4 epochs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies a sigmoid, so BCELoss receives probabilities.
criterion = nn.BCELoss()

model.train()
for epoch in range(4):  # Number of training epochs for this run
    total_loss = 0
    for batch in full_loader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimisation step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" input format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE(review): despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

# Evaluation mode and no gradient tracking for inference.
model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays row-wise and name the columns "p_<genre>" after mlb.classes_.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_DeBERTaV3_Ep4.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep4.csv")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2189
Epoch 2 completado. Loss promedio: 0.1552
Epoch 3 completado. Loss promedio: 0.1234
Epoch 4 completado. Loss promedio: 0.0994
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep4.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as text; parse each cell as a Python literal (presumably a list of genre names).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Build one model input per row in the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix; mlb.classes_ keeps the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their multi-label targets.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer``; samples are afterwards served by indexing the stored tensors.
    """

    def __init__(self, texts, labels):
        # Tokenize the full collection in a single call (truncation on,
        # padding on, at most 384 tokens, PyTorch tensors as output).
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        # Targets are kept as a float tensor.
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __getitem__(self, idx):
        """Return a dict with every encoding field for row ``idx`` plus its 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

# 3. Full dataset for training (all rows of df are passed in; no split is made here)
full_dataset = MovieDataset(df['input_text'], y)
# Small shuffled batches of 4 samples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Pretrained "microsoft/deberta-v3-large" encoder with a dropout + linear head.

    ``forward`` returns sigmoid-activated scores, one per label.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        # The head maps the encoder's hidden size to one output per label.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and score each label from the position-0 hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training.
# NOTE(review): the original note said epoch 6 was the best one, but this run trains 5 epochs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies a sigmoid, so BCELoss receives probabilities.
criterion = nn.BCELoss()

model.train()
for epoch in range(5):  # Number of training epochs for this run
    total_loss = 0
    for batch in full_loader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimisation step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" input format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE(review): despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

# Evaluation mode and no gradient tracking for inference.
model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays row-wise and name the columns "p_<genre>" after mlb.classes_.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_DeBERTaV3_Ep5.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv")
# Run label 5.1 — NOTE(review): presumably "<epochs>.<attempt>"; confirm.

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m111.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2223
Epoch 2 completado. Loss promedio: 0.1541
Epoch 3 completado. Loss promedio: 0.1224
Epoch 4 completado. Loss promedio: 0.0982
Epoch 5 completado. Loss promedio: 0.0777
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as text; parse each cell as a Python literal (presumably a list of genre names).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Build one model input per row in the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix; mlb.classes_ keeps the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their multi-label targets.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer``; samples are afterwards served by indexing the stored tensors.
    """

    def __init__(self, texts, labels):
        # Tokenize the full collection in a single call (truncation on,
        # padding on, at most 384 tokens, PyTorch tensors as output).
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        # Targets are kept as a float tensor.
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __getitem__(self, idx):
        """Return a dict with every encoding field for row ``idx`` plus its 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

# 3. Full dataset for training (all rows of df are passed in; no split is made here)
full_dataset = MovieDataset(df['input_text'], y)
# Small shuffled batches of 4 samples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Pretrained "microsoft/deberta-v3-large" encoder with a dropout + linear head.

    ``forward`` returns sigmoid-activated scores, one per label.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        # The head maps the encoder's hidden size to one output per label.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and score each label from the position-0 hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training (running only up to epoch 6, noted as the best one)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies a sigmoid, so BCELoss receives probabilities.
criterion = nn.BCELoss()

model.train()
for epoch in range(6):  # Only up to the best epoch found
    total_loss = 0
    for batch in full_loader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimisation step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" input format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE(review): despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

# Evaluation mode and no gradient tracking for inference.
model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays row-wise and name the columns "p_<genre>" after mlb.classes_.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_DeBERTaV3_Ep6.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv")
# Run label 6.2 — NOTE(review): presumably "<epochs>.<attempt>"; confirm.



Epoch 1 completado. Loss promedio: 0.2195
Epoch 2 completado. Loss promedio: 0.1546
Epoch 3 completado. Loss promedio: 0.1233
Epoch 4 completado. Loss promedio: 0.0992
Epoch 5 completado. Loss promedio: 0.0793
Epoch 6 completado. Loss promedio: 0.0619
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as text; parse each cell as a Python literal (presumably a list of genre names).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Build one model input per row in the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix; mlb.classes_ keeps the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their multi-label targets.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer``; samples are afterwards served by indexing the stored tensors.
    """

    def __init__(self, texts, labels):
        # Tokenize the full collection in a single call (truncation on,
        # padding on, at most 384 tokens, PyTorch tensors as output).
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        # Targets are kept as a float tensor.
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __getitem__(self, idx):
        """Return a dict with every encoding field for row ``idx`` plus its 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

# 3. Full dataset for training (all rows of df are passed in; no split is made here)
full_dataset = MovieDataset(df['input_text'], y)
# Small shuffled batches of 4 samples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Pretrained "microsoft/deberta-v3-large" encoder with a dropout + linear head.

    ``forward`` returns sigmoid-activated scores, one per label.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        # The head maps the encoder's hidden size to one output per label.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and score each label from the position-0 hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training (running only up to epoch 6, noted as the best one)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies a sigmoid, so BCELoss receives probabilities.
criterion = nn.BCELoss()

model.train()
for epoch in range(6):  # Only up to the best epoch found
    total_loss = 0
    for batch in full_loader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimisation step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" input format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE(review): despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

# Evaluation mode and no gradient tracking for inference.
model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays row-wise and name the columns "p_<genre>" after mlb.classes_.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_DeBERTaV3_Ep6.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv")
# Run label 6.3 — NOTE(review): presumably "<epochs>.<attempt>"; confirm.



Epoch 1 completado. Loss promedio: 0.2193
Epoch 2 completado. Loss promedio: 0.1541
Epoch 3 completado. Loss promedio: 0.1222
Epoch 4 completado. Loss promedio: 0.0985
Epoch 5 completado. Loss promedio: 0.0772
Epoch 6 completado. Loss promedio: 0.0604
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# 'genres' is stored as text; parse each cell as a Python literal (presumably a list of genre names).
df['genres'] = df['genres'].apply(ast.literal_eval)
# Build one model input per row in the form "<title> (<year>): <plot>".
df["input_text"] = df["title"] + " (" + df["year"].astype(str) + "): " + df["plot"]

# Encode the genre collections as a multi-label target matrix; mlb.classes_ keeps the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer and Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their multi-label targets.

    Every text is tokenized once, at construction time, with the module-level
    ``tokenizer``; samples are afterwards served by indexing the stored tensors.
    """

    def __init__(self, texts, labels):
        # Tokenize the full collection in a single call (truncation on,
        # padding on, at most 384 tokens, PyTorch tensors as output).
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        # Targets are kept as a float tensor.
        label_tensor = torch.tensor(labels)
        self.labels = label_tensor.float()

    def __getitem__(self, idx):
        """Return a dict with every encoding field for row ``idx`` plus its 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

# 3. Full dataset for training (all rows of df are passed in; no split is made here)
full_dataset = MovieDataset(df['input_text'], y)
# Small shuffled batches of 4 samples.
full_loader = DataLoader(full_dataset, batch_size=4, shuffle=True)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """Pretrained "microsoft/deberta-v3-large" encoder with a dropout + linear head.

    ``forward`` returns sigmoid-activated scores, one per label.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.dropout = nn.Dropout(0.2)
        # The head maps the encoder's hidden size to one output per label.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and score each label from the position-0 hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training.
# NOTE(review): the original note said epoch 6 was the best one, but this run trains 5 epochs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1]).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model's forward already applies a sigmoid, so BCELoss receives probabilities.
criterion = nn.BCELoss()

model.train()
for epoch in range(5):  # Number of training epochs for this run
    total_loss = 0
    for batch in full_loader:
        # Move the batch tensors to the selected device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # One optimisation step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for Kaggle
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', index_col=0)
# Same "<title> (<year>): <plot>" input format as the training data.
dataTesting["input_text"] = dataTesting["title"] + " (" + dataTesting["year"].astype(str) + "): " + dataTesting["plot"]
test_enc = tokenizer(list(dataTesting["input_text"]), truncation=True, padding=True, max_length=384, return_tensors='pt')
# NOTE(review): despite its name, test_dataset is a DataLoader over (input_ids, attention_mask) pairs.
test_dataset = DataLoader(torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask']), batch_size=32)

# Evaluation mode and no gradient tracking for inference.
model.eval()
preds = []
with torch.no_grad():
    for input_ids_batch, attn_mask_batch in test_dataset:
        input_ids_batch = input_ids_batch.to(device)
        attn_mask_batch = attn_mask_batch.to(device)
        pred_batch = model(input_ids_batch, attn_mask_batch).cpu().numpy()
        preds.append(pred_batch)

# 7. Save the CSV file
# Stack the per-batch arrays row-wise and name the columns "p_<genre>" after mlb.classes_.
y_pred_test_final = np.vstack(preds)
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_DeBERTaV3_Ep5.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv")
# Run label 5.5 — NOTE(review): presumably "<epochs>.<attempt>"; confirm.



Epoch 1 completado. Loss promedio: 0.2218
Epoch 2 completado. Loss promedio: 0.1564
Epoch 3 completado. Loss promedio: 0.1239
Epoch 4 completado. Loss promedio: 0.0997
Epoch 5 completado. Loss promedio: 0.0793
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each `genres` cell holds the text of a Python literal; parse it without eval().
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text has the form "<title> (<year>): <plot>".
df["input_text"] = (
    df["title"]
    + " ("
    + df["year"].astype(str)
    + "): "
    + df["plot"]
)

# Multi-hot encode the genre collections; `mlb.classes_` keeps the label order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer for the DeBERTa v3 large checkpoint
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Torch dataset pairing tokenized texts with their multi-label targets.

    The texts are tokenized once, at construction time, with the module-level
    ``tokenizer`` (truncation and padding enabled, ``max_length=384``).
    Labels are stored as a float tensor.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and convert ``labels`` to a float tensor."""
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return a dict with every tokenizer output at ``idx`` plus ``'labels'``."""
        sample = dict((field, tensor[idx]) for field, tensor in self.encodings.items())
        sample.update({'labels': self.labels[idx]})
        return sample

    def __len__(self):
        """Return the number of samples (rows of the label tensor)."""
        return self.labels.shape[0]

# 3. Training set built from every labelled row (no hold-out split in this cell)
full_dataset = MovieDataset(texts=df['input_text'], labels=y)
full_loader = DataLoader(dataset=full_dataset, shuffle=True, batch_size=4)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """DeBERTa v3 large encoder followed by a linear multi-label head.

    ``forward`` returns sigmoid-activated scores (one per label), not raw
    logits.
    """

    def __init__(self, num_labels):
        """Load the pretrained encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        backbone = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.bert = backbone
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(backbone.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return per-label probabilities."""
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # The hidden state at sequence position 0 serves as the pooled representation.
        first_token = hidden_states[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training: 5 epochs over the full training set (matches the Ep5 output file name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1])
model = model.to(device)
# The model's forward pass already applies a sigmoid, hence BCELoss on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(5):  # epoch count chosen from earlier runs, per the original note
    total_loss = 0
    for batch in full_loader:
        # Move only the tensors the model and the loss need onto the device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for the Kaggle test set
dataTesting = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip',
    index_col=0,
)
# Same "<title> (<year>): <plot>" text format as the training inputs.
dataTesting["input_text"] = (
    dataTesting["title"]
    + " ("
    + dataTesting["year"].astype(str)
    + "): "
    + dataTesting["plot"]
)
test_enc = tokenizer(
    dataTesting["input_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=384,
    return_tensors='pt',
)
# NOTE: despite its name, `test_dataset` is a DataLoader yielding
# (input_ids, attention_mask) batches in the original row order.
test_tensors = torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask'])
test_dataset = DataLoader(test_tensors, batch_size=32)

# Evaluation mode (dropout off) and no gradient tracking while predicting.
model.eval()
with torch.no_grad():
    preds = [
        model(ids.to(device), mask.to(device)).cpu().numpy()
        for ids, mask in test_dataset
    ]

# 7. Save the submission CSV
# Stack the per-batch prediction arrays row-wise into a single matrix.
y_pred_test_final = np.vstack(preds)
# One "p_<genre>" column per class known to the fitted binarizer.
cols = ['p_' + label for label in mlb.classes_]
res = pd.DataFrame(data=y_pred_test_final, columns=cols, index=dataTesting.index)
res.to_csv('pred_genres_DeBERTaV3_Ep5.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv")
# 5.11

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m113.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2225
Epoch 2 completado. Loss promedio: 0.1552
Epoch 3 completado. Loss promedio: 0.1237
Epoch 4 completado. Loss promedio: 0.0987
Epoch 5 completado. Loss promedio: 0.0774
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each `genres` cell holds the text of a Python literal; parse it without eval().
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text has the form "<title> (<year>): <plot>".
df["input_text"] = (
    df["title"]
    + " ("
    + df["year"].astype(str)
    + "): "
    + df["plot"]
)

# Multi-hot encode the genre collections; `mlb.classes_` keeps the label order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer for the DeBERTa v3 large checkpoint
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Torch dataset pairing tokenized texts with their multi-label targets.

    The texts are tokenized once, at construction time, with the module-level
    ``tokenizer`` (truncation and padding enabled, ``max_length=384``).
    Labels are stored as a float tensor.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and convert ``labels`` to a float tensor."""
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return a dict with every tokenizer output at ``idx`` plus ``'labels'``."""
        sample = dict((field, tensor[idx]) for field, tensor in self.encodings.items())
        sample.update({'labels': self.labels[idx]})
        return sample

    def __len__(self):
        """Return the number of samples (rows of the label tensor)."""
        return self.labels.shape[0]

# 3. Training set built from every labelled row (no hold-out split in this cell)
full_dataset = MovieDataset(texts=df['input_text'], labels=y)
full_loader = DataLoader(dataset=full_dataset, shuffle=True, batch_size=4)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """DeBERTa v3 large encoder followed by a linear multi-label head.

    ``forward`` returns sigmoid-activated scores (one per label), not raw
    logits.
    """

    def __init__(self, num_labels):
        """Load the pretrained encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        backbone = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.bert = backbone
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(backbone.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return per-label probabilities."""
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # The hidden state at sequence position 0 serves as the pooled representation.
        first_token = hidden_states[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training: 6 epochs over the full training set
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1])
model = model.to(device)
# The model's forward pass already applies a sigmoid, hence BCELoss on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(6):  # epoch count chosen from earlier runs, per the original note
    total_loss = 0
    for batch in full_loader:
        # Move only the tensors the model and the loss need onto the device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for the Kaggle test set
dataTesting = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip',
    index_col=0,
)
# Same "<title> (<year>): <plot>" text format as the training inputs.
dataTesting["input_text"] = (
    dataTesting["title"]
    + " ("
    + dataTesting["year"].astype(str)
    + "): "
    + dataTesting["plot"]
)
test_enc = tokenizer(
    dataTesting["input_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=384,
    return_tensors='pt',
)
# NOTE: despite its name, `test_dataset` is a DataLoader yielding
# (input_ids, attention_mask) batches in the original row order.
test_tensors = torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask'])
test_dataset = DataLoader(test_tensors, batch_size=32)

# Evaluation mode (dropout off) and no gradient tracking while predicting.
model.eval()
with torch.no_grad():
    preds = [
        model(ids.to(device), mask.to(device)).cpu().numpy()
        for ids, mask in test_dataset
    ]

# 7. Save the submission CSV
# Stack the per-batch prediction arrays row-wise into a single matrix.
y_pred_test_final = np.vstack(preds)
# One "p_<genre>" column per class known to the fitted binarizer.
cols = ['p_' + genre for genre in mlb.classes_]
res = pd.DataFrame(y_pred_test_final, index=dataTesting.index, columns=cols)
# This cell trains for 6 epochs, so the file must be the Ep6 one. Previously the
# CSV was written to the Ep5 name (overwriting that run's submission) while the
# message announced Ep6. A single variable keeps the path and the message in sync.
output_path = 'pred_genres_DeBERTaV3_Ep6.csv'
res.to_csv(output_path, index_label='ID')
print(f"✅ Archivo final generado: {output_path}")
# 6.12

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch 1 completado. Loss promedio: 0.2241
Epoch 2 completado. Loss promedio: 0.1602
Epoch 3 completado. Loss promedio: 0.1281
Epoch 4 completado. Loss promedio: 0.1032
Epoch 5 completado. Loss promedio: 0.0821
Epoch 6 completado. Loss promedio: 0.0643
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep6.csv


In [None]:
# 🛠️ Instalar dependencias necesarias
!pip install transformers scikit-learn torch pandas --quiet

# 📚 Importar librerías
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch import nn

# 1. Load and prepare the training data
df = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    index_col=0,
)
# Each `genres` cell holds the text of a Python literal; parse it without eval().
df['genres'] = df['genres'].apply(ast.literal_eval)
# Model input text has the form "<title> (<year>): <plot>".
df["input_text"] = (
    df["title"]
    + " ("
    + df["year"].astype(str)
    + "): "
    + df["plot"]
)

# Multi-hot encode the genre collections; `mlb.classes_` keeps the label order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genres'])

# 2. Tokenizer for the DeBERTa v3 large checkpoint
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

class MovieDataset(Dataset):
    """Torch dataset pairing tokenized texts with their multi-label targets.

    The texts are tokenized once, at construction time, with the module-level
    ``tokenizer`` (truncation and padding enabled, ``max_length=384``).
    Labels are stored as a float tensor.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and convert ``labels`` to a float tensor."""
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=384,
            return_tensors='pt',
        )
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, idx):
        """Return a dict with every tokenizer output at ``idx`` plus ``'labels'``."""
        sample = dict((field, tensor[idx]) for field, tensor in self.encodings.items())
        sample.update({'labels': self.labels[idx]})
        return sample

    def __len__(self):
        """Return the number of samples (rows of the label tensor)."""
        return self.labels.shape[0]

# 3. Training set built from every labelled row (no hold-out split in this cell)
full_dataset = MovieDataset(texts=df['input_text'], labels=y)
full_loader = DataLoader(dataset=full_dataset, shuffle=True, batch_size=4)

# 4. Modelo DeBERTa v3 Large
class DeBERTaClassifier(nn.Module):
    """DeBERTa v3 large encoder followed by a linear multi-label head.

    ``forward`` returns sigmoid-activated scores (one per label), not raw
    logits.
    """

    def __init__(self, num_labels):
        """Load the pretrained encoder and build a head with ``num_labels`` outputs."""
        super().__init__()
        backbone = AutoModel.from_pretrained("microsoft/deberta-v3-large")
        self.bert = backbone
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(backbone.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return per-label probabilities."""
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # The hidden state at sequence position 0 serves as the pooled representation.
        first_token = hidden_states[:, 0, :]
        logits = self.classifier(self.dropout(first_token))
        return logits.sigmoid()

# 5. Training: 5 epochs over the full training set (matches the Ep5 output file name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClassifier(num_labels=y.shape[1])
model = model.to(device)
# The model's forward pass already applies a sigmoid, hence BCELoss on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(5):  # epoch count chosen from earlier runs, per the original note
    total_loss = 0
    for batch in full_loader:
        # Move only the tensors the model and the loss need onto the device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1} completado. Loss promedio: {total_loss / len(full_loader):.4f}")

# 6. Prediction for the Kaggle test set
dataTesting = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip',
    index_col=0,
)
# Same "<title> (<year>): <plot>" text format as the training inputs.
dataTesting["input_text"] = (
    dataTesting["title"]
    + " ("
    + dataTesting["year"].astype(str)
    + "): "
    + dataTesting["plot"]
)
test_enc = tokenizer(
    dataTesting["input_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=384,
    return_tensors='pt',
)
# NOTE: despite its name, `test_dataset` is a DataLoader yielding
# (input_ids, attention_mask) batches in the original row order.
test_tensors = torch.utils.data.TensorDataset(test_enc['input_ids'], test_enc['attention_mask'])
test_dataset = DataLoader(test_tensors, batch_size=32)

# Evaluation mode (dropout off) and no gradient tracking while predicting.
model.eval()
with torch.no_grad():
    preds = [
        model(ids.to(device), mask.to(device)).cpu().numpy()
        for ids, mask in test_dataset
    ]

# 7. Save the submission CSV
# Stack the per-batch prediction arrays row-wise into a single matrix.
y_pred_test_final = np.vstack(preds)
# One "p_<genre>" column per class known to the fitted binarizer.
cols = ['p_' + label for label in mlb.classes_]
res = pd.DataFrame(data=y_pred_test_final, columns=cols, index=dataTesting.index)
res.to_csv('pred_genres_DeBERTaV3_Ep5.csv', index_label='ID')
print("✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv")
# 5.19



Epoch 1 completado. Loss promedio: 0.2232
Epoch 2 completado. Loss promedio: 0.1594
Epoch 3 completado. Loss promedio: 0.1285
Epoch 4 completado. Loss promedio: 0.1066
Epoch 5 completado. Loss promedio: 0.0863
✅ Archivo final generado: pred_genres_DeBERTaV3_Ep5.csv
