# Ensemble Optimizado para Predicción de Géneros

Este notebook implementa un ensemble avanzado con:
- Data augmentation (back-translation)
- DeBERTa-v3 (modelo top del leaderboard)
- Optimización de pesos con Optuna
- Test-time augmentation

## 1. Imports y Configuración

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.sparse import hstack as sp_hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sentence_transformers import SentenceTransformer
from transformers import (
    DistilBertTokenizer, DistilBertForSequenceClassification,
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments,
    MarianMTModel, MarianTokenizer
)
from torch.utils.data import Dataset
import torch
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm





## 2. Carga y Preparación de Datos

In [2]:
train_dir = Path("../dataset_train.csv")
test_dir = Path("../dataset_test.csv")

df = pd.read_csv(train_dir)
print(f"Dataset size: {len(df)}")
df.head()

Dataset size: 8475


Unnamed: 0,movie_name,genre,description
0,Silent Hill,"Horror, Mystery","Rose, a desperate mother takes her adopted dau..."
1,Breaking the Waves,"Drama, Romance","In a small and conservative Scottish village, ..."
2,Wind Chill,"Drama, Horror, Thriller",Two college students share a ride home for the...
3,Godmothered,"Family, Fantasy, Comedy",A young and unskilled fairy godmother that ven...
4,Donkey Skin,"Fantasy, Comedy, Music, Romance",A fairy godmother helps a princess disguise he...


In [3]:
df["text"] = df["movie_name"].fillna("") + " [SEP] " + df["description"].fillna("")
y_list = df["genre"].apply(lambda s: [g.strip() for g in str(s).split(",") if g.strip()])

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_list)

print(f"Number of labels: {len(mlb.classes_)}")
print(f"Label distribution shape: {Y.shape}")

Number of labels: 18
Label distribution shape: (8475, 18)


## 3. Data Augmentation - Back Translation

In [4]:
def back_translate(texts, src_lang='en', pivot_lang='fr', sample_ratio=0.2):
    model_name_en_pivot = f'Helsinki-NLP/opus-mt-{src_lang}-{pivot_lang}'
    model_name_pivot_en = f'Helsinki-NLP/opus-mt-{pivot_lang}-{src_lang}'
    
    tokenizer_en_pivot = MarianTokenizer.from_pretrained(model_name_en_pivot)
    model_en_pivot = MarianMTModel.from_pretrained(model_name_en_pivot)
    
    tokenizer_pivot_en = MarianTokenizer.from_pretrained(model_name_pivot_en)
    model_pivot_en = MarianMTModel.from_pretrained(model_name_pivot_en)
    
    augmented_texts = []
    indices_to_augment = np.random.choice(len(texts), size=int(len(texts) * sample_ratio), replace=False)
    
    for i, idx in enumerate(indices_to_augment):
        if i % 50 == 0:
            print(f"Augmenting {i}/{len(indices_to_augment)}...", end='\r')
        
        text = texts.iloc[idx] if hasattr(texts, 'iloc') else texts[idx]
        
        translated = model_en_pivot.generate(**tokenizer_en_pivot(text, return_tensors="pt", padding=True, truncation=True, max_length=128))
        pivot_text = tokenizer_en_pivot.decode(translated[0], skip_special_tokens=True)
        
        back_translated = model_pivot_en.generate(**tokenizer_pivot_en(pivot_text, return_tensors="pt", padding=True, truncation=True, max_length=128))
        final_text = tokenizer_pivot_en.decode(back_translated[0], skip_special_tokens=True)
        
        augmented_texts.append(final_text)
    
    print(f"Augmentation complete!" + " "*20)
    return augmented_texts, indices_to_augment

In [5]:
print(f"Original dataset size: {len(df)}")

augmented_texts, aug_indices = back_translate(df["text"], sample_ratio=0.2)

df_augmented = pd.DataFrame({
    'movie_name': [df.iloc[i]['movie_name'] for i in aug_indices],
    'description': augmented_texts,
    'genre': [df.iloc[i]['genre'] for i in aug_indices]
})

df_combined = pd.concat([df, df_augmented], ignore_index=True)
print(f"Augmented dataset size: {len(df_combined)}")

df_combined["text"] = df_combined["movie_name"].fillna("") + " [SEP] " + df_combined["description"].fillna("")
y_list_combined = df_combined["genre"].apply(lambda s: [g.strip() for g in str(s).split(",") if g.strip()])
Y_combined = mlb.fit_transform(y_list_combined)

X_tr, X_va, y_tr, y_va = train_test_split(df_combined["text"], Y_combined, test_size=0.1, random_state=42)
print(f"Training samples: {len(X_tr)}, Validation samples: {len(X_va)}")

Original dataset size: 8475


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not in

Augmenting 0/1695...

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Augmentation complete!                    
Augmentation complete!                    
Augmented dataset size: 10170
Training samples: 9153, Validation samples: 1017
Augmented dataset size: 10170
Training samples: 9153, Validation samples: 1017


## 4. Feature Engineering - TF-IDF

In [6]:
tfidf_word = TfidfVectorizer(
    ngram_range=(1,3),
    min_df=2,
    max_features=500_000,
    sublinear_tf=True,
    stop_words="english",
    max_df=0.85,
    strip_accents='unicode',
    lowercase=True
)

tfidf_char = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3,6),
    min_df=2,
    max_features=500_000,
    sublinear_tf=True,
    max_df=0.85,
    strip_accents='unicode'
)

Xw_tr = tfidf_word.fit_transform(X_tr)
Xw_va = tfidf_word.transform(X_va)
Xc_tr = tfidf_char.fit_transform(X_tr)
Xc_va = tfidf_char.transform(X_va)

XTR_tfidf = sp_hstack([Xw_tr, Xc_tr], format="csr")
XVA_tfidf = sp_hstack([Xw_va, Xc_va], format="csr")
print(f"Combined TF-IDF features shape: {XTR_tfidf.shape}")

Combined TF-IDF features shape: (9153, 205446)


## 5. Sentence Embeddings

In [7]:
st_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Generating embeddings...")
emb_tr = st_model.encode(X_tr.tolist(), show_progress_bar=True, batch_size=32)
emb_va = st_model.encode(X_va.tolist(), show_progress_bar=True, batch_size=32)

XTR_combined = sp_hstack([XTR_tfidf, csr_matrix(emb_tr)], format="csr")
XVA_combined = sp_hstack([XVA_tfidf, csr_matrix(emb_va)], format="csr")
print(f"Combined features (TF-IDF + Embeddings) shape: {XTR_combined.shape}")

Generating embeddings...


Batches: 100%|██████████| 287/287 [00:53<00:00,  5.32it/s]
Batches:   0%|          | 0/32 [00:00<?, ?it/s]
Batches: 100%|██████████| 32/32 [00:05<00:00,  5.50it/s]



Combined features (TF-IDF + Embeddings) shape: (9153, 205830)


## 6. Modelos Base - Entrenamiento

In [8]:
clf_logreg = OneVsRestClassifier(
    LogisticRegression(C=8.0, solver="saga", max_iter=4000, class_weight='balanced', random_state=42),
    n_jobs=-1
)
print("Training LogisticRegression...")
clf_logreg.fit(XTR_combined, y_tr)
print("LogReg training complete!")

Training LogisticRegression...
LogReg training complete!
LogReg training complete!


In [9]:
logits_logreg = clf_logreg.decision_function(XVA_combined)
ths_logreg = np.zeros(logits_logreg.shape[1])

for k in range(logits_logreg.shape[1]):
    s = logits_logreg[:, k]
    best_f1, best_t = 0.0, 0.0
    candidates = np.quantile(s, np.linspace(0.01, 0.99, 50))
    for t in candidates:
        preds_k = (s >= t).astype(int)
        f1 = f1_score(y_va[:, k], preds_k, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    ths_logreg[k] = best_t

pred_logreg = (logits_logreg >= ths_logreg).astype(int)
print(f"LogReg - micro-F1: {f1_score(y_va, pred_logreg, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_logreg, average='macro'):.4f}")

LogReg - micro-F1: 0.7315, macro-F1: 0.6982


In [10]:
clf_xgb = MultiOutputClassifier(
    XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1)
)
print("Training XGBoost...")
clf_xgb.fit(emb_tr, y_tr)
print("XGBoost training complete!")

Training XGBoost...
XGBoost training complete!
XGBoost training complete!


In [11]:
pred_proba_xgb = clf_xgb.predict_proba(emb_va)
logits_xgb = np.column_stack([p[:, 1] for p in pred_proba_xgb])
ths_xgb = np.zeros(logits_xgb.shape[1])

for k in range(logits_xgb.shape[1]):
    s = logits_xgb[:, k]
    best_f1, best_t = 0.0, 0.0
    candidates = np.quantile(s, np.linspace(0.01, 0.99, 50))
    for t in candidates:
        preds_k = (s >= t).astype(int)
        f1 = f1_score(y_va[:, k], preds_k, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    ths_xgb[k] = best_t

pred_xgb = (logits_xgb >= ths_xgb).astype(int)
print(f"XGBoost - micro-F1: {f1_score(y_va, pred_xgb, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_xgb, average='macro'):.4f}")

XGBoost - micro-F1: 0.6934, macro-F1: 0.6395


In [12]:
clf_svc = OneVsRestClassifier(
    LinearSVC(C=2.0, max_iter=4000, class_weight='balanced', dual='auto', random_state=42),
    n_jobs=-1
)
print("Training LinearSVC...")
clf_svc.fit(XTR_tfidf, y_tr)
print("SVC training complete!")

Training LinearSVC...
SVC training complete!
SVC training complete!


In [13]:
logits_svc = clf_svc.decision_function(XVA_tfidf)
ths_svc = np.zeros(logits_svc.shape[1])

for k in range(logits_svc.shape[1]):
    s = logits_svc[:, k]
    best_f1, best_t = 0.0, 0.0
    candidates = np.quantile(s, np.linspace(0.01, 0.99, 50))
    for t in candidates:
        preds_k = (s >= t).astype(int)
        f1 = f1_score(y_va[:, k], preds_k, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    ths_svc[k] = best_t

pred_svc = (logits_svc >= ths_svc).astype(int)
print(f"LinearSVC - micro-F1: {f1_score(y_va, pred_svc, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_svc, average='macro'):.4f}")

LinearSVC - micro-F1: 0.7258, macro-F1: 0.7030


## 7. DistilBERT Fine-tuning

In [14]:
class MovieGenreDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx]) if hasattr(self.texts, 'iloc') else str(self.texts[idx])
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

In [15]:
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_distilbert = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

train_dataset_distilbert = MovieGenreDataset(X_tr, y_tr, tokenizer_distilbert, max_length=128)
val_dataset_distilbert = MovieGenreDataset(X_va, y_va, tokenizer_distilbert, max_length=128)
print(f"Datasets created: {len(train_dataset_distilbert)} training, {len(val_dataset_distilbert)} validation")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Datasets created: 9153 training, 1017 validation


In [16]:
training_args_distilbert = TrainingArguments(
    output_dir='./distilbert_results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args_distilbert,
    train_dataset=train_dataset_distilbert,
    eval_dataset=val_dataset_distilbert,
)

print("Training DistilBERT...")
trainer_distilbert.train()
print("DistilBERT training complete!")

Training DistilBERT...


Epoch,Training Loss,Validation Loss
1,0.2659,0.261026
2,0.1965,0.20957
3,0.1459,0.203075


DistilBERT training complete!


In [17]:
model_distilbert.eval()
with torch.no_grad():
    val_inputs = tokenizer_distilbert(X_va.tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')
    outputs = model_distilbert(**val_inputs)
    logits_distilbert = torch.sigmoid(outputs.logits).cpu().numpy()

ths_distilbert = np.zeros(logits_distilbert.shape[1])
for k in range(logits_distilbert.shape[1]):
    s = logits_distilbert[:, k]
    best_f1, best_t = 0.0, 0.0
    candidates = np.quantile(s, np.linspace(0.01, 0.99, 50))
    for t in candidates:
        preds_k = (s >= t).astype(int)
        f1 = f1_score(y_va[:, k], preds_k, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    ths_distilbert[k] = best_t

pred_distilbert = (logits_distilbert >= ths_distilbert).astype(int)
print(f"DistilBERT - micro-F1: {f1_score(y_va, pred_distilbert, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_distilbert, average='macro'):.4f}")

DistilBERT - micro-F1: 0.7234, macro-F1: 0.6736


## 8. DeBERTa-v3 (Top Model)

In [18]:
tokenizer_deberta = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
model_deberta = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

train_dataset_deberta = MovieGenreDataset(X_tr, y_tr, tokenizer_deberta, max_length=256)
val_dataset_deberta = MovieGenreDataset(X_va, y_va, tokenizer_deberta, max_length=256)
print(f"DeBERTa datasets created")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

DeBERTa datasets created


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [19]:
training_args_deberta = TrainingArguments(
    output_dir='./deberta_results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    gradient_accumulation_steps=2,
)

trainer_deberta = Trainer(
    model=model_deberta,
    args=training_args_deberta,
    train_dataset=train_dataset_deberta,
    eval_dataset=val_dataset_deberta,
)

print("Training DeBERTa-v3...")
trainer_deberta.train()
print("DeBERTa training complete!")

Training DeBERTa-v3...



Epoch,Training Loss,Validation Loss
1,0.2799,0.265815


KeyboardInterrupt: 

In [None]:
model_deberta.eval()
with torch.no_grad():
    val_inputs = tokenizer_deberta(X_va.tolist(), truncation=True, padding=True, max_length=256, return_tensors='pt')
    outputs = model_deberta(**val_inputs)
    logits_deberta = torch.sigmoid(outputs.logits).cpu().numpy()

ths_deberta = np.zeros(logits_deberta.shape[1])
for k in range(logits_deberta.shape[1]):
    s = logits_deberta[:, k]
    best_f1, best_t = 0.0, 0.0
    candidates = np.quantile(s, np.linspace(0.01, 0.99, 50))
    for t in candidates:
        preds_k = (s >= t).astype(int)
        f1 = f1_score(y_va[:, k], preds_k, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    ths_deberta[k] = best_t

pred_deberta = (logits_deberta >= ths_deberta).astype(int)
print(f"DeBERTa-v3 - micro-F1: {f1_score(y_va, pred_deberta, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_deberta, average='macro'):.4f}")

## 9. Ensemble Optimization con Optuna

In [None]:
import optuna

def objective(trial):
    w_deberta = trial.suggest_float("w_deberta", 0.3, 0.6)
    w_distilbert = trial.suggest_float("w_distilbert", 0.1, 0.4)
    w_logreg = trial.suggest_float("w_logreg", 0.1, 0.3)
    w_xgb = trial.suggest_float("w_xgb", 0.05, 0.25)
    w_svc = max(0.0, 1.0 - w_deberta - w_distilbert - w_logreg - w_xgb)
    
    ensemble_logits_opt = (w_deberta * logits_deberta + 
                           w_distilbert * logits_distilbert + 
                           w_logreg * logits_logreg + 
                           w_xgb * logits_xgb + 
                           w_svc * logits_svc)
    
    ths_opt = np.zeros(ensemble_logits_opt.shape[1])
    for k in range(ensemble_logits_opt.shape[1]):
        s = ensemble_logits_opt[:, k]
        best_f1, best_t = 0.0, 0.0
        candidates = np.quantile(s, np.linspace(0.01, 0.99, 30))
        for t in candidates:
            preds_k = (s >= t).astype(int)
            f1 = f1_score(y_va[:, k], preds_k, zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        ths_opt[k] = best_t
    
    pred_opt = (ensemble_logits_opt >= ths_opt).astype(int)
    return f1_score(y_va, pred_opt, average='macro')

print("Optimizing ensemble weights with Optuna...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print(f"\nBest F1 macro: {study.best_value:.4f}")
print("Best weights:", study.best_params)

In [None]:
best_params = study.best_params
w_deberta_opt = best_params['w_deberta']
w_distilbert_opt = best_params['w_distilbert']
w_logreg_opt = best_params['w_logreg']
w_xgb_opt = best_params['w_xgb']
w_svc_opt = 1.0 - w_deberta_opt - w_distilbert_opt - w_logreg_opt - w_xgb_opt

ensemble_optimized = (w_deberta_opt * logits_deberta + 
                      w_distilbert_opt * logits_distilbert + 
                      w_logreg_opt * logits_logreg + 
                      w_xgb_opt * logits_xgb + 
                      w_svc_opt * logits_svc)

ths_optimized = np.zeros(ensemble_optimized.shape[1])
for k in range(ensemble_optimized.shape[1]):
    s = ensemble_optimized[:, k]
    best_f1, best_t = 0.0, 0.0
    candidates = np.quantile(s, np.linspace(0.01, 0.99, 50))
    for t in candidates:
        preds_k = (s >= t).astype(int)
        f1 = f1_score(y_va[:, k], preds_k, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    ths_optimized[k] = best_t

pred_optimized = (ensemble_optimized >= ths_optimized).astype(int)
print(f"Optimized Ensemble - micro-F1: {f1_score(y_va, pred_optimized, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_optimized, average='macro'):.4f}")

## 10. Test Time Augmentation y Predicción Final

In [None]:
def tta_predict_deberta(texts, model, tokenizer, n_augmentations=3):
    all_predictions = []
    
    model.eval()
    with torch.no_grad():
        test_inputs = tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
        outputs = model(**test_inputs)
        all_predictions.append(torch.sigmoid(outputs.logits).cpu().numpy())
    
    for _ in range(n_augmentations):
        model.train()
        with torch.no_grad():
            test_inputs = tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
            outputs = model(**test_inputs)
            all_predictions.append(torch.sigmoid(outputs.logits).cpu().numpy())
    
    return np.mean(all_predictions, axis=0)

In [None]:
df_test = pd.read_csv(test_dir)
df_test["text"] = df_test["movie_name"].fillna("") + " [SEP] " + df_test["description"].fillna("")

print("Generating DeBERTa predictions with TTA...")
logits_deberta_test_tta = tta_predict_deberta(df_test["text"].tolist(), model_deberta, tokenizer_deberta)

print("Generating other model predictions...")
Xw_test = tfidf_word.transform(df_test["text"])
Xc_test = tfidf_char.transform(df_test["text"])
X_test_tfidf = sp_hstack([Xw_test, Xc_test], format="csr")
emb_test = st_model.encode(df_test["text"].tolist(), show_progress_bar=True, batch_size=32)
X_test_combined = sp_hstack([X_test_tfidf, csr_matrix(emb_test)], format="csr")

with torch.no_grad():
    test_inputs = tokenizer_distilbert(df_test["text"].tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')
    outputs = model_distilbert(**test_inputs)
    logits_distilbert_test = torch.sigmoid(outputs.logits).cpu().numpy()

logits_logreg_test = clf_logreg.decision_function(X_test_combined)
pred_proba_xgb_test = clf_xgb.predict_proba(emb_test)
logits_xgb_test = np.column_stack([p[:, 1] for p in pred_proba_xgb_test])
logits_svc_test = clf_svc.decision_function(X_test_tfidf)

print("Creating optimized ensemble...")
ensemble_final_test = (w_deberta_opt * logits_deberta_test_tta + 
                       w_distilbert_opt * logits_distilbert_test + 
                       w_logreg_opt * logits_logreg_test + 
                       w_xgb_opt * logits_xgb_test + 
                       w_svc_opt * logits_svc_test)

pred_test_final = (ensemble_final_test >= ths_optimized).astype(int)

pred_labels = [", ".join([mlb.classes_[j] for j, v in enumerate(row) if v == 1]) for row in pred_test_final]
result_df = pd.DataFrame({
    "movie_name": df_test["movie_name"],
    "genre": pred_labels,
    "description": df_test["description"]
})
result_df.to_csv("dataset_test_preds_optimized.csv", index=False)
print(f"Optimized predictions saved: {len(result_df)} samples")

## 11. Resumen de Resultados

In [None]:
print("="*70)
print("FINAL PERFORMANCE COMPARISON")
print("="*70)
print(f"1. LogReg (TF-IDF+Embed):       micro-F1: {f1_score(y_va, pred_logreg, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_logreg, average='macro'):.4f}")
print(f"2. XGBoost (Embeddings):        micro-F1: {f1_score(y_va, pred_xgb, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_xgb, average='macro'):.4f}")
print(f"3. LinearSVC (TF-IDF):          micro-F1: {f1_score(y_va, pred_svc, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_svc, average='macro'):.4f}")
print(f"4. DistilBERT (Fine-tuned):     micro-F1: {f1_score(y_va, pred_distilbert, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_distilbert, average='macro'):.4f}")
print(f"5. DeBERTa-v3 (Fine-tuned):     micro-F1: {f1_score(y_va, pred_deberta, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_deberta, average='macro'):.4f}")
print(f"6. OPTIMIZED ENSEMBLE (All 5):  micro-F1: {f1_score(y_va, pred_optimized, average='micro'):.4f}, macro-F1: {f1_score(y_va, pred_optimized, average='macro'):.4f}")
print("="*70)