In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy.sparse import hstack as sp_hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

### **Load data**

In [40]:
test_dir = Path("../dataset_test.csv")
train_dir = Path("../dataset_train.csv")

In [41]:
df = pd.read_csv(train_dir)
len(df)

8475

In [42]:
df

Unnamed: 0,movie_name,genre,description
0,Silent Hill,"Horror, Mystery","Rose, a desperate mother takes her adopted dau..."
1,Breaking the Waves,"Drama, Romance","In a small and conservative Scottish village, ..."
2,Wind Chill,"Drama, Horror, Thriller",Two college students share a ride home for the...
3,Godmothered,"Family, Fantasy, Comedy",A young and unskilled fairy godmother that ven...
4,Donkey Skin,"Fantasy, Comedy, Music, Romance",A fairy godmother helps a princess disguise he...
...,...,...,...
8470,Infested,"Horror, Thriller",Residents of a rundown French apartment buildi...
8471,The Tailor of Panama,"Drama, Thriller",A British spy is banished to Panama after havi...
8472,Bad Education,"Drama, Crime",An examination on the effect of Franco-era rel...
8473,From Dusk Till Dawn,"Horror, Action, Thriller, Crime","After kidnapping a father and his two kids, th..."


## **Text to disperse vector**

In [43]:
df["text"] = df["movie_name"].fillna("") + " [SEP] " + df["description"].fillna("")
y_list = df["genre"].apply(lambda s: [g.strip() for g in str(s).split(",") if g.strip()])

In [44]:
df

Unnamed: 0,movie_name,genre,description,text
0,Silent Hill,"Horror, Mystery","Rose, a desperate mother takes her adopted dau...","Silent Hill [SEP] Rose, a desperate mother tak..."
1,Breaking the Waves,"Drama, Romance","In a small and conservative Scottish village, ...",Breaking the Waves [SEP] In a small and conser...
2,Wind Chill,"Drama, Horror, Thriller",Two college students share a ride home for the...,Wind Chill [SEP] Two college students share a ...
3,Godmothered,"Family, Fantasy, Comedy",A young and unskilled fairy godmother that ven...,Godmothered [SEP] A young and unskilled fairy ...
4,Donkey Skin,"Fantasy, Comedy, Music, Romance",A fairy godmother helps a princess disguise he...,Donkey Skin [SEP] A fairy godmother helps a pr...
...,...,...,...,...
8470,Infested,"Horror, Thriller",Residents of a rundown French apartment buildi...,Infested [SEP] Residents of a rundown French a...
8471,The Tailor of Panama,"Drama, Thriller",A British spy is banished to Panama after havi...,The Tailor of Panama [SEP] A British spy is ba...
8472,Bad Education,"Drama, Crime",An examination on the effect of Franco-era rel...,Bad Education [SEP] An examination on the effe...
8473,From Dusk Till Dawn,"Horror, Action, Thriller, Crime","After kidnapping a father and his two kids, th...",From Dusk Till Dawn [SEP] After kidnapping a f...


In [45]:
y_list[0]

['Horror', 'Mystery']

In [46]:
mlb = MultiLabelBinarizer() # returns a list per sample with 0/1 for each label
Y = mlb.fit_transform(y_list)

X_tr, X_va, y_tr, y_va = train_test_split(
    df["text"], Y, test_size=0.1, random_state=42
)

In [47]:
print(X_tr.iloc[0])
print(y_tr[0])

Scooby-Doo! Camp Scare [SEP] Scooby and the gang experience outdoor fun as they go back to Fred's old summer camp. As summer goes on, it becomes increasingly clear that the spooky camp stories told by the fireplace, are more real than they've though and soon, it's up to the gang to try and solve the mystery of camp scare.
[0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0]


In [48]:
# --- TF-IDF word + char ---
tfidf_word = TfidfVectorizer(
    ngram_range=(1,2), min_df=3, max_features=300_000, sublinear_tf=True, stop_words="english"
)
tfidf_char = TfidfVectorizer(
    analyzer="char_wb", ngram_range=(3,5), min_df=3, max_features=300_000, sublinear_tf=True, stop_words="english"
)

Xw_tr = tfidf_word.fit_transform(X_tr);  Xw_va = tfidf_word.transform(X_va)
Xc_tr = tfidf_char.fit_transform(X_tr);  Xc_va = tfidf_char.transform(X_va)



In [49]:
print(tfidf_word.get_feature_names_out()[10:20])
print(tfidf_char.get_feature_names_out()[10:20])

print(Xw_tr.shape, Xc_tr.shape)

['11' '11 year' '117' '11th' '12' '12 angry' '12 year' '12 years' '12th'
 '13']
[' "br' ' "bu' ' "c' ' "ca' ' "co' ' "cr' ' "cu' ' "d' ' "da' ' "de']
(7627, 14852) (7627, 62853)


In [50]:
XTR = sp_hstack([Xw_tr, Xc_tr], format="csr")
XVA = sp_hstack([Xw_va, Xc_va], format="csr")

# --- Clasificador ---
clf = OneVsRestClassifier(
    LogisticRegression(C=4.0, solver="saga", max_iter=2000, n_jobs=-1),
    n_jobs=-1
)
clf.fit(XTR, y_tr)

# --- Calibración de umbrales por clase ---
logits = clf.decision_function(XVA)  # [n_samples, n_classes]
ths = np.zeros(logits.shape[1])
for k in range(logits.shape[1]):
    s = logits[:, k]
    best_f1, best_t = 0.0, 0.0
    for t in np.quantile(s, np.linspace(0.05, 0.95, 19)):
        f1 = f1_score(y_va[:, k], (s >= t).astype(int), zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    ths[k] = best_t

pred = (logits >= ths).astype(int)
print("micro-F1:", f1_score(y_va, pred, average="micro"))
print("macro-F1:", f1_score(y_va, pred, average="macro"))

# --- Guardar artefactos para inferencia ---
import joblib, json
joblib.dump(tfidf_word, "tfidf_word.joblib")
joblib.dump(tfidf_char, "tfidf_char.joblib")
joblib.dump(clf, "ovr_logreg.joblib")
with open("labels.json","w") as f: json.dump(mlb.classes_.tolist(), f)
np.save("thresholds.npy", ths)

micro-F1: 0.6333592534992224
macro-F1: 0.555072354314252


In [51]:
# predict_classic.py
import pandas as pd, numpy as np, json, joblib
from scipy.sparse import hstack

tfidf_word = joblib.load("tfidf_word.joblib")
tfidf_char = joblib.load("tfidf_char.joblib")
clf = joblib.load("ovr_logreg.joblib")
labels = json.load(open("labels.json"))
ths = np.load("thresholds.npy")

def predict(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    text = df["movie_name"].fillna("") + " [SEP] " + df["description"].fillna("")
    X = hstack([tfidf_word.transform(text), tfidf_char.transform(text)])
    logits = clf.decision_function(X)
    pred = (logits >= ths).astype(int)
    # Formato: lista de géneros separados por coma
    pred_labels = [",".join([labels[j] for j,v in enumerate(row) if v==1]) for row in pred]
    pd.DataFrame({"id": df.index, "genre": pred_labels}).to_csv(output_csv, index=False)

pathTest = Path("../dataset_test.csv")
pathPredict = Path("predictions.csv")
predict(pathTest, pathPredict)


In [52]:
import sys
sys.path.append("..")
from validator import compute_metrics

print(compute_metrics(y_va, pred))

{'accuracy': 0.10141509433962265, 'f1': 0.555072354314252, 'precision': 0.5165202924767383, 'recall': 0.6347798822726912, 'hamming_loss': 0.1235587002096436}


## **Comparación de Modelos**

Vamos a comparar diferentes clasificadores:
1. Logistic Regression con calibración óptima de umbrales (ya entrenado)
2. Naive Bayes
3. Random Forest

In [53]:
# Resultados del modelo actual (Logistic Regression con calibración)
print("="*60)
print("MODELO 1: Logistic Regression con calibración de umbrales")
print("="*60)
print("micro-F1:", f1_score(y_va, pred, average="micro"))
print("macro-F1:", f1_score(y_va, pred, average="macro"))
print("weighted-F1:", f1_score(y_va, pred, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred))

MODELO 1: Logistic Regression con calibración de umbrales
micro-F1: 0.6333592534992224
macro-F1: 0.555072354314252
weighted-F1: 0.6458593734356026

Métricas detalladas:
{'accuracy': 0.10141509433962265, 'f1': 0.555072354314252, 'precision': 0.5165202924767383, 'recall': 0.6347798822726912, 'hamming_loss': 0.1235587002096436}


### **Modelo 2: Naive Bayes (MultinomialNB)**

In [54]:
from sklearn.naive_bayes import MultinomialNB

# Entrenar Naive Bayes
clf_nb = OneVsRestClassifier(
    MultinomialNB(alpha=0.1),
    n_jobs=-1
)

print("Entrenando Naive Bayes...")
clf_nb.fit(XTR, y_tr)

# Predicciones (sin calibración de umbrales, usando probabilidad 0.5)
probs_nb = clf_nb.predict_proba(XVA)
pred_nb = (probs_nb >= 0.5).astype(int)

print("\n" + "="*60)
print("MODELO 2: Naive Bayes (umbral fijo 0.5)")
print("="*60)
print("micro-F1:", f1_score(y_va, pred_nb, average="micro"))
print("macro-F1:", f1_score(y_va, pred_nb, average="macro"))
print("weighted-F1:", f1_score(y_va, pred_nb, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred_nb))

Entrenando Naive Bayes...

MODELO 2: Naive Bayes (umbral fijo 0.5)
micro-F1: 0.6139965861984882
macro-F1: 0.4435358072682958
weighted-F1: 0.5934043290825758

Métricas detalladas:
{'accuracy': 0.1615566037735849, 'f1': 0.4435358072682958, 'precision': 0.6055277954176069, 'recall': 0.39046941033006455, 'hamming_loss': 0.10370807127882599}

MODELO 2: Naive Bayes (umbral fijo 0.5)
micro-F1: 0.6139965861984882
macro-F1: 0.4435358072682958
weighted-F1: 0.5934043290825758

Métricas detalladas:
{'accuracy': 0.1615566037735849, 'f1': 0.4435358072682958, 'precision': 0.6055277954176069, 'recall': 0.39046941033006455, 'hamming_loss': 0.10370807127882599}


In [55]:
# Calibrar umbrales para Naive Bayes (igual que hicimos con Logistic Regression)
print("Calibrando umbrales para Naive Bayes...")
probs_nb_train = clf_nb.predict_proba(XTR)
ths_nb = np.zeros(probs_nb_train.shape[1])

for k in range(probs_nb_train.shape[1]):
    s = probs_nb_train[:, k]
    best_f1, best_t = 0.0, 0.5
    for t in np.quantile(s, np.linspace(0.05, 0.95, 19)):
        f1_tmp = f1_score(y_tr[:, k], (s >= t).astype(int), zero_division=0)
        if f1_tmp > best_f1:
            best_f1, best_t = f1_tmp, t
    ths_nb[k] = best_t

# Predicciones con umbrales calibrados
pred_nb_cal = (probs_nb >= ths_nb).astype(int)

print("\n" + "="*60)
print("MODELO 2: Naive Bayes CON calibración de umbrales")
print("="*60)
print("micro-F1:", f1_score(y_va, pred_nb_cal, average="micro"))
print("macro-F1:", f1_score(y_va, pred_nb_cal, average="macro"))
print("weighted-F1:", f1_score(y_va, pred_nb_cal, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred_nb_cal))

Calibrando umbrales para Naive Bayes...

MODELO 2: Naive Bayes CON calibración de umbrales
micro-F1: 0.601571802810193
macro-F1: 0.4984297864418147
weighted-F1: 0.5989285520011974

Métricas detalladas:
{'accuracy': 0.15919811320754718, 'f1': 0.4984297864418147, 'precision': 0.5891897809876009, 'recall': 0.47013391810723815, 'hamming_loss': 0.10960429769392034}

MODELO 2: Naive Bayes CON calibración de umbrales
micro-F1: 0.601571802810193
macro-F1: 0.4984297864418147
weighted-F1: 0.5989285520011974

Métricas detalladas:
{'accuracy': 0.15919811320754718, 'f1': 0.4984297864418147, 'precision': 0.5891897809876009, 'recall': 0.47013391810723815, 'hamming_loss': 0.10960429769392034}


### **Modelo 3: Random Forest**

In [56]:
from sklearn.ensemble import RandomForestClassifier

# Entrenar Random Forest (con menos árboles para que no tarde mucho)
clf_rf = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, max_depth=20, n_jobs=-1, random_state=42),
    n_jobs=-1
)

print("Entrenando Random Forest (esto puede tardar un poco)...")
clf_rf.fit(XTR, y_tr)

# Predicciones (sin calibración de umbrales, usando probabilidad 0.5)
probs_rf = clf_rf.predict_proba(XVA)
pred_rf = (probs_rf >= 0.5).astype(int)

print("\n" + "="*60)
print("MODELO 3: Random Forest (umbral fijo 0.5)")
print("="*60)
print("micro-F1:", f1_score(y_va, pred_rf, average="micro"))
print("macro-F1:", f1_score(y_va, pred_rf, average="macro"))
print("weighted-F1:", f1_score(y_va, pred_rf, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred_rf))

Entrenando Random Forest (esto puede tardar un poco)...

MODELO 3: Random Forest (umbral fijo 0.5)
micro-F1: 0.2544400144980065
macro-F1: 0.11725036454742388
weighted-F1: 0.21657318624665475

Métricas detalladas:
{'accuracy': 0.04245283018867924, 'f1': 0.11725036454742388, 'precision': 0.5967823535901339, 'recall': 0.07544193117760055, 'hamming_loss': 0.13476153039832284}

MODELO 3: Random Forest (umbral fijo 0.5)
micro-F1: 0.2544400144980065
macro-F1: 0.11725036454742388
weighted-F1: 0.21657318624665475

Métricas detalladas:
{'accuracy': 0.04245283018867924, 'f1': 0.11725036454742388, 'precision': 0.5967823535901339, 'recall': 0.07544193117760055, 'hamming_loss': 0.13476153039832284}


In [57]:
# Calibrar umbrales para Random Forest
print("Calibrando umbrales para Random Forest...")
probs_rf_train = clf_rf.predict_proba(XTR)
ths_rf = np.zeros(probs_rf_train.shape[1])

for k in range(probs_rf_train.shape[1]):
    s = probs_rf_train[:, k]
    best_f1, best_t = 0.0, 0.5
    for t in np.quantile(s, np.linspace(0.05, 0.95, 19)):
        f1_tmp = f1_score(y_tr[:, k], (s >= t).astype(int), zero_division=0)
        if f1_tmp > best_f1:
            best_f1, best_t = f1_tmp, t
    ths_rf[k] = best_t

# Predicciones con umbrales calibrados
pred_rf_cal = (probs_rf >= ths_rf).astype(int)

print("\n" + "="*60)
print("MODELO 3: Random Forest CON calibración de umbrales")
print("="*60)
print("micro-F1:", f1_score(y_va, pred_rf_cal, average="micro"))
print("macro-F1:", f1_score(y_va, pred_rf_cal, average="macro"))
print("weighted-F1:", f1_score(y_va, pred_rf_cal, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred_rf_cal))

Calibrando umbrales para Random Forest...

MODELO 3: Random Forest CON calibración de umbrales
micro-F1: 0.5444832545716047
macro-F1: 0.47330195719224744
weighted-F1: 0.5649687243743775

Métricas detalladas:
{'accuracy': 0.06721698113207547, 'f1': 0.47330195719224744, 'precision': 0.4553511035206816, 'recall': 0.5525472162644545, 'hamming_loss': 0.1452437106918239}

MODELO 3: Random Forest CON calibración de umbrales
micro-F1: 0.5444832545716047
macro-F1: 0.47330195719224744
weighted-F1: 0.5649687243743775

Métricas detalladas:
{'accuracy': 0.06721698113207547, 'f1': 0.47330195719224744, 'precision': 0.4553511035206816, 'recall': 0.5525472162644545, 'hamming_loss': 0.1452437106918239}


### **Resumen Comparativo**

In [58]:
# Crear tabla comparativa
results = []

models = [
    ("Logistic Regression + calibración", pred),
    ("Naive Bayes (umbral 0.5)", pred_nb),
    ("Naive Bayes + calibración", pred_nb_cal),
    ("Random Forest (umbral 0.5)", pred_rf),
    ("Random Forest + calibración", pred_rf_cal)
]

print("="*80)
print("RESUMEN COMPARATIVO DE MODELOS")
print("="*80)
print(f"{'Modelo':<35} {'Micro-F1':>12} {'Macro-F1':>12} {'Weighted-F1':>12}")
print("-"*80)

for name, predictions in models:
    micro = f1_score(y_va, predictions, average="micro")
    macro = f1_score(y_va, predictions, average="macro")
    weighted = f1_score(y_va, predictions, average="weighted")
    print(f"{name:<35} {micro:>12.4f} {macro:>12.4f} {weighted:>12.4f}")
    results.append({
        'Modelo': name,
        'Micro-F1': micro,
        'Macro-F1': macro,
        'Weighted-F1': weighted
    })

print("="*80)

# Crear DataFrame para mejor visualización
df_results = pd.DataFrame(results)
df_results = df_results.sort_values('Micro-F1', ascending=False)
print("\nMejor modelo por Micro-F1:")
print(df_results.head(1))

RESUMEN COMPARATIVO DE MODELOS
Modelo                                  Micro-F1     Macro-F1  Weighted-F1
--------------------------------------------------------------------------------
Logistic Regression + calibración         0.6334       0.5551       0.6459
Naive Bayes (umbral 0.5)                  0.6140       0.4435       0.5934
Naive Bayes + calibración                 0.6016       0.4984       0.5989
Random Forest (umbral 0.5)                0.2544       0.1173       0.2166
Random Forest + calibración               0.5445       0.4733       0.5650

Mejor modelo por Micro-F1:
                              Modelo  Micro-F1  Macro-F1  Weighted-F1
0  Logistic Regression + calibración  0.633359  0.555072     0.645859


## **Comparación con CountVectorizer**

Ahora vamos a probar los mismos modelos pero usando **CountVectorizer** en lugar de TF-IDF para ver si hay diferencias en el rendimiento.

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

# --- CountVectorizer word + char ---
count_word = CountVectorizer(
    ngram_range=(1,2), min_df=3, max_features=300_000, stop_words="english"
)
count_char = CountVectorizer(
    analyzer="char_wb", ngram_range=(3,5), min_df=3, max_features=300_000
)

print("Transformando con CountVectorizer...")
Xw_tr_count = count_word.fit_transform(X_tr);  Xw_va_count = count_word.transform(X_va)
Xc_tr_count = count_char.fit_transform(X_tr);  Xc_va_count = count_char.transform(X_va)

# Combinar word + char
XTR_count = sp_hstack([Xw_tr_count, Xc_tr_count], format="csr")
XVA_count = sp_hstack([Xw_va_count, Xc_va_count], format="csr")

print(f"Shape: {XTR_count.shape}")

Transformando con CountVectorizer...
Shape: (7627, 77705)
Shape: (7627, 77705)


### **Modelo 1: Logistic Regression con CountVectorizer**

In [60]:
# Entrenar Logistic Regression con CountVectorizer
clf_lr_count = OneVsRestClassifier(
    LogisticRegression(C=4.0, solver="saga", max_iter=2000, n_jobs=-1),
    n_jobs=-1
)

print("Entrenando Logistic Regression con CountVectorizer...")
clf_lr_count.fit(XTR_count, y_tr)

# Calibración de umbrales
logits_count = clf_lr_count.decision_function(XVA_count)
ths_lr_count = np.zeros(logits_count.shape[1])

for k in range(logits_count.shape[1]):
    s = logits_count[:, k]
    best_f1, best_t = 0.0, 0.0
    for t in np.quantile(s, np.linspace(0.05, 0.95, 19)):
        f1_tmp = f1_score(y_va[:, k], (s >= t).astype(int), zero_division=0)
        if f1_tmp > best_f1:
            best_f1, best_t = f1_tmp, t
    ths_lr_count[k] = best_t

pred_lr_count = (logits_count >= ths_lr_count).astype(int)

print("\n" + "="*60)
print("Logistic Regression + CountVectorizer + calibración")
print("="*60)
print("micro-F1:", f1_score(y_va, pred_lr_count, average="micro"))
print("macro-F1:", f1_score(y_va, pred_lr_count, average="macro"))
print("weighted-F1:", f1_score(y_va, pred_lr_count, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred_lr_count))

Entrenando Logistic Regression con CountVectorizer...

Logistic Regression + CountVectorizer + calibración
micro-F1: 0.5911634188693806
macro-F1: 0.5084452569620344
weighted-F1: 0.5947186263714275

Métricas detalladas:
{'accuracy': 0.08254716981132075, 'f1': 0.5084452569620344, 'precision': 0.46659793173969966, 'recall': 0.587063405690667, 'hamming_loss': 0.13882337526205452}

Logistic Regression + CountVectorizer + calibración
micro-F1: 0.5911634188693806
macro-F1: 0.5084452569620344
weighted-F1: 0.5947186263714275

Métricas detalladas:
{'accuracy': 0.08254716981132075, 'f1': 0.5084452569620344, 'precision': 0.46659793173969966, 'recall': 0.587063405690667, 'hamming_loss': 0.13882337526205452}


### **Modelo 2: Naive Bayes con CountVectorizer**

In [61]:
# Entrenar Naive Bayes con CountVectorizer
clf_nb_count = OneVsRestClassifier(
    MultinomialNB(alpha=0.1),
    n_jobs=-1
)

print("Entrenando Naive Bayes con CountVectorizer...")
clf_nb_count.fit(XTR_count, y_tr)

# Calibración de umbrales
probs_nb_count = clf_nb_count.predict_proba(XVA_count)
probs_nb_count_train = clf_nb_count.predict_proba(XTR_count)
ths_nb_count = np.zeros(probs_nb_count_train.shape[1])

for k in range(probs_nb_count_train.shape[1]):
    s = probs_nb_count_train[:, k]
    best_f1, best_t = 0.0, 0.5
    for t in np.quantile(s, np.linspace(0.05, 0.95, 19)):
        f1_tmp = f1_score(y_tr[:, k], (s >= t).astype(int), zero_division=0)
        if f1_tmp > best_f1:
            best_f1, best_t = f1_tmp, t
    ths_nb_count[k] = best_t

pred_nb_count = (probs_nb_count >= ths_nb_count).astype(int)

print("\n" + "="*60)
print("Naive Bayes + CountVectorizer + calibración")
print("="*60)
print("micro-F1:", f1_score(y_va, pred_nb_count, average="micro"))
print("macro-F1:", f1_score(y_va, pred_nb_count, average="macro"))
print("weighted-F1:", f1_score(y_va, pred_nb_count, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred_nb_count))

Entrenando Naive Bayes con CountVectorizer...

Naive Bayes + CountVectorizer + calibración
micro-F1: 0.6006148025537952
macro-F1: 0.5039395423835606
weighted-F1: 0.6010710781338638

Métricas detalladas:
{'accuracy': 0.1403301886792453, 'f1': 0.5039395423835606, 'precision': 0.5736236925916347, 'recall': 0.48357022889873424, 'hamming_loss': 0.11065251572327044}

Naive Bayes + CountVectorizer + calibración
micro-F1: 0.6006148025537952
macro-F1: 0.5039395423835606
weighted-F1: 0.6010710781338638

Métricas detalladas:
{'accuracy': 0.1403301886792453, 'f1': 0.5039395423835606, 'precision': 0.5736236925916347, 'recall': 0.48357022889873424, 'hamming_loss': 0.11065251572327044}


### **Modelo 3: Random Forest con CountVectorizer**

In [62]:
# Entrenar Random Forest con CountVectorizer
clf_rf_count = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, max_depth=20, n_jobs=-1, random_state=42),
    n_jobs=-1
)

print("Entrenando Random Forest con CountVectorizer (esto puede tardar un poco)...")
clf_rf_count.fit(XTR_count, y_tr)

# Calibración de umbrales
probs_rf_count = clf_rf_count.predict_proba(XVA_count)
probs_rf_count_train = clf_rf_count.predict_proba(XTR_count)
ths_rf_count = np.zeros(probs_rf_count_train.shape[1])

for k in range(probs_rf_count_train.shape[1]):
    s = probs_rf_count_train[:, k]
    best_f1, best_t = 0.0, 0.5
    for t in np.quantile(s, np.linspace(0.05, 0.95, 19)):
        f1_tmp = f1_score(y_tr[:, k], (s >= t).astype(int), zero_division=0)
        if f1_tmp > best_f1:
            best_f1, best_t = f1_tmp, t
    ths_rf_count[k] = best_t

pred_rf_count = (probs_rf_count >= ths_rf_count).astype(int)

print("\n" + "="*60)
print("Random Forest + CountVectorizer + calibración")
print("="*60)
print("micro-F1:", f1_score(y_va, pred_rf_count, average="micro"))
print("macro-F1:", f1_score(y_va, pred_rf_count, average="macro"))
print("weighted-F1:", f1_score(y_va, pred_rf_count, average="weighted"))
print("\nMétricas detalladas:")
print(compute_metrics(y_va, pred_rf_count))

Entrenando Random Forest con CountVectorizer (esto puede tardar un poco)...

Random Forest + CountVectorizer + calibración
micro-F1: 0.5626009693053312
macro-F1: 0.47467920256782326
weighted-F1: 0.5740550901842221

Métricas detalladas:
{'accuracy': 0.09080188679245282, 'f1': 0.47467920256782326, 'precision': 0.44896292823722495, 'recall': 0.5415173911309075, 'hamming_loss': 0.14190251572327045}

Random Forest + CountVectorizer + calibración
micro-F1: 0.5626009693053312
macro-F1: 0.47467920256782326
weighted-F1: 0.5740550901842221

Métricas detalladas:
{'accuracy': 0.09080188679245282, 'f1': 0.47467920256782326, 'precision': 0.44896292823722495, 'recall': 0.5415173911309075, 'hamming_loss': 0.14190251572327045}


### **Resumen Comparativo: TF-IDF vs CountVectorizer**

In [63]:
# Comparación completa: TF-IDF vs CountVectorizer
all_models = [
    # TF-IDF
    ("TF-IDF: Logistic Regression + calibración", pred),
    ("TF-IDF: Naive Bayes (umbral 0.5)", pred_nb),
    ("TF-IDF: Naive Bayes + calibración", pred_nb_cal),
    ("TF-IDF: Random Forest (umbral 0.5)", pred_rf),
    ("TF-IDF: Random Forest + calibración", pred_rf_cal),
    # CountVectorizer
    ("Count: Logistic Regression + calibración", pred_lr_count),
    ("Count: Naive Bayes + calibración", pred_nb_count),
    ("Count: Random Forest + calibración", pred_rf_count),
]

print("="*85)
print("COMPARACIÓN COMPLETA: TF-IDF vs CountVectorizer")
print("="*85)
print(f"{'Modelo':<45} {'Micro-F1':>12} {'Macro-F1':>12} {'Weighted-F1':>12}")
print("-"*85)

all_results = []
for name, predictions in all_models:
    micro = f1_score(y_va, predictions, average="micro")
    macro = f1_score(y_va, predictions, average="macro")
    weighted = f1_score(y_va, predictions, average="weighted")
    print(f"{name:<45} {micro:>12.4f} {macro:>12.4f} {weighted:>12.4f}")
    all_results.append({
        'Modelo': name,
        'Micro-F1': micro,
        'Macro-F1': macro,
        'Weighted-F1': weighted
    })

print("="*85)

# Crear DataFrame ordenado por Micro-F1
df_all_results = pd.DataFrame(all_results)
df_all_results = df_all_results.sort_values('Micro-F1', ascending=False)

print("\n🏆 TOP 3 MODELOS (por Micro-F1):")
print(df_all_results.head(3).to_string(index=False))

print("\n📊 Comparación por vectorizador:")
tfidf_avg = df_all_results[df_all_results['Modelo'].str.contains('TF-IDF')]['Micro-F1'].mean()
count_avg = df_all_results[df_all_results['Modelo'].str.contains('Count')]['Micro-F1'].mean()
print(f"Promedio TF-IDF: {tfidf_avg:.4f}")
print(f"Promedio CountVectorizer: {count_avg:.4f}")

COMPARACIÓN COMPLETA: TF-IDF vs CountVectorizer
Modelo                                            Micro-F1     Macro-F1  Weighted-F1
-------------------------------------------------------------------------------------
TF-IDF: Logistic Regression + calibración           0.6334       0.5551       0.6459
TF-IDF: Naive Bayes (umbral 0.5)                    0.6140       0.4435       0.5934
TF-IDF: Naive Bayes + calibración                   0.6016       0.4984       0.5989
TF-IDF: Random Forest (umbral 0.5)                  0.2544       0.1173       0.2166
TF-IDF: Random Forest + calibración                 0.5445       0.4733       0.5650
Count: Logistic Regression + calibración            0.5912       0.5084       0.5947
Count: Naive Bayes + calibración                    0.6006       0.5039       0.6011
Count: Random Forest + calibración                  0.5626       0.4747       0.5741

🏆 TOP 3 MODELOS (por Micro-F1):
                                   Modelo  Micro-F1  Macro-F1  Weigh