In [1]:
import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack as sp_hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

### **Load data**

In [2]:
# Locations of the two CSV splits. Despite the *_dir suffix, each path points
# at a single file, not a directory.
train_dir = Path("../dataset_train.csv")
test_dir = Path("../dataset_test.csv")

In [3]:
# Read the training split; the cell's displayed value is the number of rows.
df = pd.read_csv(train_dir)
df.shape[0]

8475

In [182]:
# Display the loaded training frame as this cell's output.
df

Unnamed: 0,movie_name,genre,description
0,Silent Hill,"Horror, Mystery","Rose, a desperate mother takes her adopted dau..."
1,Breaking the Waves,"Drama, Romance","In a small and conservative Scottish village, ..."
2,Wind Chill,"Drama, Horror, Thriller",Two college students share a ride home for the...
3,Godmothered,"Family, Fantasy, Comedy",A young and unskilled fairy godmother that ven...
4,Donkey Skin,"Fantasy, Comedy, Music, Romance",A fairy godmother helps a princess disguise he...
...,...,...,...
8470,Infested,"Horror, Thriller",Residents of a rundown French apartment buildi...
8471,The Tailor of Panama,"Drama, Thriller",A British spy is banished to Panama after havi...
8472,Bad Education,"Drama, Crime",An examination on the effect of Franco-era rel...
8473,From Dusk Till Dawn,"Horror, Action, Thriller, Crime","After kidnapping a father and his two kids, th..."


## **Text to sparse vector**

In [183]:
# Model input: title and synopsis joined by a literal " [SEP] " marker. Missing
# values are replaced by empty strings so the concatenation never yields NaN.
df["text"] = df["movie_name"].fillna("") + " [SEP] " + df["description"].fillna("")


def _split_genres(raw):
    """Turn one comma-separated genre cell into a list of stripped, non-empty labels.

    A missing cell (NaN/None) yields an empty list. Passing it through ``str()``
    would instead produce the text "nan", which would then be treated as a
    genre label.
    """
    if pd.isna(raw):
        return []
    return [g.strip() for g in str(raw).split(",") if g.strip()]


# One list of genre names per row.
y_list = df["genre"].apply(_split_genres)

In [184]:
# Display the frame again, now including the "text" column.
df

Unnamed: 0,movie_name,genre,description,text
0,Silent Hill,"Horror, Mystery","Rose, a desperate mother takes her adopted dau...","Silent Hill [SEP] Rose, a desperate mother tak..."
1,Breaking the Waves,"Drama, Romance","In a small and conservative Scottish village, ...",Breaking the Waves [SEP] In a small and conser...
2,Wind Chill,"Drama, Horror, Thriller",Two college students share a ride home for the...,Wind Chill [SEP] Two college students share a ...
3,Godmothered,"Family, Fantasy, Comedy",A young and unskilled fairy godmother that ven...,Godmothered [SEP] A young and unskilled fairy ...
4,Donkey Skin,"Fantasy, Comedy, Music, Romance",A fairy godmother helps a princess disguise he...,Donkey Skin [SEP] A fairy godmother helps a pr...
...,...,...,...,...
8470,Infested,"Horror, Thriller",Residents of a rundown French apartment buildi...,Infested [SEP] Residents of a rundown French a...
8471,The Tailor of Panama,"Drama, Thriller",A British spy is banished to Panama after havi...,The Tailor of Panama [SEP] A British spy is ba...
8472,Bad Education,"Drama, Crime",An examination on the effect of Franco-era rel...,Bad Education [SEP] An examination on the effe...
8473,From Dusk Till Dawn,"Horror, Action, Thriller, Crime","After kidnapping a father and his two kids, th...",From Dusk Till Dawn [SEP] After kidnapping a f...


In [185]:
# Sanity check: the parsed genre list stored under index label 0.
y_list[0]

['Horror', 'Mystery']

In [186]:
# Encode every sample's genre list as one 0/1 indicator row (one column per label).
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_list)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(df["text"], Y, test_size=0.1, random_state=42)
X_tr, X_va, y_tr, y_va = split_parts

In [187]:
# Show the first training text and its label row, one per line.
print(X_tr.iloc[0], y_tr[0], sep="\n")

Scooby-Doo! Camp Scare [SEP] Scooby and the gang experience outdoor fun as they go back to Fred's old summer camp. As summer goes on, it becomes increasingly clear that the spooky camp stories told by the fireplace, are more real than they've though and soon, it's up to the gang to try and solve the mystery of camp scare.
[0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0]


In [188]:
# --- TF-IDF word + char ---
# Settings common to both vectorizers.
shared_opts = dict(min_df=3, max_features=300_000, sublinear_tf=True)

# Word unigrams and bigrams.
tfidf_word = TfidfVectorizer(ngram_range=(1, 2), **shared_opts)
# Character 3- to 5-grams built with the "char_wb" analyzer.
tfidf_char = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), **shared_opts)

# Vocabularies are fitted on the training texts only; validation texts are
# transformed with the already-fitted vectorizers.
Xw_tr = tfidf_word.fit_transform(X_tr)
Xc_tr = tfidf_char.fit_transform(X_tr)
Xw_va = tfidf_word.transform(X_va)
Xc_va = tfidf_char.transform(X_va)

In [189]:
# Peek at features 10..19 of each vocabulary, then at the training matrix shapes.
for vectorizer in (tfidf_word, tfidf_char):
    print(vectorizer.get_feature_names_out()[10:20])

print(Xw_tr.shape, Xc_tr.shape)

['11' '11 year' '117' '117 is' '11th' '12' '12 angry' '12 year' '12 years'
 '12th']
[' "br' ' "bu' ' "c' ' "ca' ' "co' ' "cr' ' "cu' ' "d' ' "da' ' "de']
(7627, 28369) (7627, 62853)


In [190]:
# Put the word-level and char-level TF-IDF columns side by side in one CSR matrix.
XTR = sp_hstack([Xw_tr, Xc_tr], format="csr")
XVA = sp_hstack([Xw_va, Xc_va], format="csr")

# --- Classifier ---
# One binary logistic regression per label, fitted in parallel by the wrapper.
# - random_state pins the SAGA solver's data shuffling so reruns give the same model.
# - n_jobs is requested once, on the one-vs-rest wrapper; the per-estimator
#   n_jobs was dropped because each sub-problem is binary, which leaves the
#   estimator nothing to parallelise over.
clf = OneVsRestClassifier(
    LogisticRegression(C=4.0, solver="saga", max_iter=2000, random_state=42),
    n_jobs=-1,
)
clf.fit(XTR, y_tr)

# --- Per-class threshold calibration ---
# NOTE(review): the thresholds are selected on (XVA, y_va) and the F1 scores
# printed below are computed on that same split, so they are optimistic.
logits = clf.decision_function(XVA)  # [n_samples, n_classes]
quantile_grid = np.linspace(0.05, 0.95, 19)


def _pick_threshold(scores, truth):
    """Return the decision-score cut-off with the best F1 for one label column.

    Candidates are the 5%..95% quantiles of ``scores``. When several candidates
    tie for the best F1, the first (lowest) one wins. If no candidate reaches
    an F1 above zero, the cut-off falls back to 0.0.
    """
    candidates = np.quantile(scores, quantile_grid)
    f1_per_candidate = [
        f1_score(truth, (scores >= cut).astype(int), zero_division=0)
        for cut in candidates
    ]
    # max() returns the first maximal position, matching a strict ">" scan.
    best = max(range(len(candidates)), key=f1_per_candidate.__getitem__)
    return candidates[best] if f1_per_candidate[best] > 0.0 else 0.0


ths = np.array(
    [_pick_threshold(logits[:, k], y_va[:, k]) for k in range(logits.shape[1])],
    dtype=float,
)

# Apply each label's own threshold column-wise (broadcast over rows).
pred = (logits >= ths).astype(int)
micro_f1 = f1_score(y_va, pred, average="micro")
macro_f1 = f1_score(y_va, pred, average="macro")
print("micro-F1:", micro_f1)
print("macro-F1:", macro_f1)

# --- Save artifacts for inference ---
# Everything the prediction step needs: both fitted vectorizers, the fitted
# classifier, the label names in column order, and the per-label thresholds.
# (joblib and json are imported in the notebook's import cell.)
joblib.dump(tfidf_word, "tfidf_word.joblib")
joblib.dump(tfidf_char, "tfidf_char.joblib")
joblib.dump(clf, "ovr_logreg.joblib")
with open("labels.json", "w") as f:
    json.dump(mlb.classes_.tolist(), f)
np.save("thresholds.npy", ths)

micro-F1: 0.5952936949883241
macro-F1: 0.5550593125663562


In [191]:
# predict_classic.py
# NOTE(review): this cell appears to double as a standalone inference script
# (per the header above), so it keeps its own imports and reloads every
# artifact from disk instead of reusing the in-memory objects.
import json

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack

tfidf_word = joblib.load("tfidf_word.joblib")
tfidf_char = joblib.load("tfidf_char.joblib")
clf = joblib.load("ovr_logreg.joblib")
# Use a context manager so the file handle is closed right after reading.
with open("labels.json") as f:
    labels = json.load(f)
ths = np.load("thresholds.npy")

def predict(input_csv, output_csv):
    """Predict genres for every row of ``input_csv`` and write them to ``output_csv``.

    The input CSV must contain ``movie_name`` and ``description`` columns. The
    output CSV has two columns: ``id`` (the DataFrame index of the input row)
    and ``genre`` (the predicted label names joined by commas; empty when no
    label reaches its threshold).
    """
    frame = pd.read_csv(input_csv)
    titles = frame["movie_name"].fillna("")
    plots = frame["description"].fillna("")
    text = titles + " [SEP] " + plots
    features = hstack([tfidf_word.transform(text), tfidf_char.transform(text)])
    # A label is predicted when its decision score reaches that label's threshold.
    hits = clf.decision_function(features) >= ths
    # Format: comma-separated list of genres
    genre_strings = [
        ",".join(labels[j] for j, hit in enumerate(flags) if hit)
        for flags in hits
    ]
    result = pd.DataFrame({"id": frame.index, "genre": genre_strings})
    result.to_csv(output_csv, index=False)


# NOTE(review): cell In [2] points test_dir at "../dataset_test.csv"; confirm
# that this relative path is the intended input file.
predict("dataset_test.csv", "predictions.csv")


In [192]:
from validator import compute_metrics

# Score the thresholded validation predictions with compute_metrics and print the result.
val_metrics = compute_metrics(y_va, pred)
print(val_metrics)

{'accuracy': 0.05070754716981132, 'f1': 0.5550593125663562, 'precision': 0.5076310749633357, 'recall': 0.6910690970951362, 'hamming_loss': 0.14760220125786164}
