In [1]:
import json
import sys
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack as sp_hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

### **Load data**

In [2]:
# CSV locations, relative to the notebook's working directory.
# (Despite the "_dir" suffix these point at files, not directories.)
train_dir = Path("..") / "dataset_train.csv"
test_dir = Path("..") / "dataset_test.csv"

In [3]:
# Read the labelled training set, then show its row count as the cell output.
df = pd.read_csv(train_dir)
df.shape[0]

8475

In [4]:
# Show the raw training frame (the rendered output is truncated by pandas).
df

Unnamed: 0,movie_name,genre,description
0,Silent Hill,"Horror, Mystery","Rose, a desperate mother takes her adopted dau..."
1,Breaking the Waves,"Drama, Romance","In a small and conservative Scottish village, ..."
2,Wind Chill,"Drama, Horror, Thriller",Two college students share a ride home for the...
3,Godmothered,"Family, Fantasy, Comedy",A young and unskilled fairy godmother that ven...
4,Donkey Skin,"Fantasy, Comedy, Music, Romance",A fairy godmother helps a princess disguise he...
...,...,...,...
8470,Infested,"Horror, Thriller",Residents of a rundown French apartment buildi...
8471,The Tailor of Panama,"Drama, Thriller",A British spy is banished to Panama after havi...
8472,Bad Education,"Drama, Crime",An examination on the effect of Franco-era rel...
8473,From Dusk Till Dawn,"Horror, Action, Thriller, Crime","After kidnapping a father and his two kids, th..."


## **Text to sparse vector**

In [5]:
# Model input: title and synopsis joined by a literal " [SEP] " marker.
# Missing titles/descriptions are treated as empty strings.
df["text"] = df["movie_name"].fillna("") + " [SEP] " + df["description"].fillna("")

# One list of genre names per movie, split on commas and stripped of whitespace.
# fillna("") comes first so that a missing genre yields an empty list; without it
# str(NaN) == "nan" would be parsed as a genuine genre called "nan".
y_list = df["genre"].fillna("").apply(
    lambda s: [g.strip() for g in str(s).split(",") if g.strip()]
)

In [6]:
# Re-display the frame to check the newly added "text" column.
df

Unnamed: 0,movie_name,genre,description,text
0,Silent Hill,"Horror, Mystery","Rose, a desperate mother takes her adopted dau...","Silent Hill [SEP] Rose, a desperate mother tak..."
1,Breaking the Waves,"Drama, Romance","In a small and conservative Scottish village, ...",Breaking the Waves [SEP] In a small and conser...
2,Wind Chill,"Drama, Horror, Thriller",Two college students share a ride home for the...,Wind Chill [SEP] Two college students share a ...
3,Godmothered,"Family, Fantasy, Comedy",A young and unskilled fairy godmother that ven...,Godmothered [SEP] A young and unskilled fairy ...
4,Donkey Skin,"Fantasy, Comedy, Music, Romance",A fairy godmother helps a princess disguise he...,Donkey Skin [SEP] A fairy godmother helps a pr...
...,...,...,...,...
8470,Infested,"Horror, Thriller",Residents of a rundown French apartment buildi...,Infested [SEP] Residents of a rundown French a...
8471,The Tailor of Panama,"Drama, Thriller",A British spy is banished to Panama after havi...,The Tailor of Panama [SEP] A British spy is ba...
8472,Bad Education,"Drama, Crime",An examination on the effect of Franco-era rel...,Bad Education [SEP] An examination on the effe...
8473,From Dusk Till Dawn,"Horror, Action, Thriller, Crime","After kidnapping a father and his two kids, th...",From Dusk Till Dawn [SEP] After kidnapping a f...


In [7]:
# Parsed genre list of the first movie, as a quick sanity check.
y_list[0]

['Horror', 'Mystery']

In [8]:
# Encode each genre list as a fixed-width 0/1 indicator row (one column per genre).
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_list)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
split_options = {"test_size": 0.1, "random_state": 42}
X_tr, X_va, y_tr, y_va = train_test_split(df["text"], Y, **split_options)

In [9]:
# First training text followed by its indicator row, one per line.
print(X_tr.iloc[0], y_tr[0], sep="\n")

Scooby-Doo! Camp Scare [SEP] Scooby and the gang experience outdoor fun as they go back to Fred's old summer camp. As summer goes on, it becomes increasingly clear that the spooky camp stories told by the fireplace, are more real than they've though and soon, it's up to the gang to try and solve the mystery of camp scare.
[0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0]


In [10]:
# --- Sparse text features: word n-grams + character n-grams ---
# Both vectorizers share the same frequency cut-offs and normalisation;
# they differ only in the unit they build n-grams from.
shared_tfidf_opts = dict(
    min_df=2,                  # drop terms seen in fewer than 2 documents
    max_df=0.85,               # drop terms present in more than 85% of documents
    max_features=500_000,      # upper bound on the vocabulary size
    sublinear_tf=True,         # sublinear (log-scaled) term frequency
    strip_accents="unicode",   # fold accented characters
)

tfidf_word = TfidfVectorizer(
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    stop_words="english",      # remove common English words
    lowercase=True,
    **shared_tfidf_opts,
)

tfidf_char = TfidfVectorizer(
    analyzer="char_wb",        # character n-grams inside word boundaries
    ngram_range=(3, 6),        # character patterns of length 3-6
    **shared_tfidf_opts,
)

# Vocabularies are fitted on the training texts only; validation texts are
# merely transformed with the fitted vectorizers.
Xw_tr = tfidf_word.fit_transform(X_tr)
Xw_va = tfidf_word.transform(X_va)
Xc_tr = tfidf_char.fit_transform(X_tr)
Xc_va = tfidf_char.transform(X_va)

In [11]:
# A small slice of each fitted vocabulary, then the two training matrix shapes.
for vectorizer in (tfidf_word, tfidf_char):
    print(vectorizer.get_feature_names_out()[10:20])

print(Xw_tr.shape, Xc_tr.shape)

['10 million' '10 minutes' '10 year' '10 year old' '10 years'
 '10 years later' '10 years prison' '100' '100 years' '10th']
[' "b' ' "ba' ' "bat' ' "batt' ' "be' ' "bi' ' "big' ' "big ' ' "bl'
 ' "bla']
(7627, 29337) (7627, 128989)


In [12]:
# Put the word-level and char-level TF-IDF blocks side by side, in CSR format.
XTR, XVA = (
    sp_hstack(parts, format="csr")
    for parts in ([Xw_tr, Xc_tr], [Xw_va, Xc_va])
)

# --- One-vs-rest logistic regression (one binary model per genre) ---
base_logreg = LogisticRegression(
    penalty="l2",               # L2 regularization
    C=8.0,                      # larger C = weaker regularization
    solver="saga",
    max_iter=4000,              # generous iteration budget for convergence
    class_weight="balanced",    # reweight classes to counter label imbalance
    random_state=42,            # reproducibility
    warm_start=False,
)
clf = OneVsRestClassifier(base_logreg, n_jobs=-1)

print("Training improved model...")
clf.fit(XTR, y_tr)
print("Training complete!")

# --- Per-class decision-threshold calibration ---
def calibrate_thresholds(scores, y_true, extra_candidates=(0.0, -0.5, 0.5),
                         n_quantiles=50, quantile_range=(0.01, 0.99)):
    """Pick, for every class, the score threshold that maximises binary F1.

    Args:
        scores: 2-D array of decision scores, one row per sample and one
            column per class.
        y_true: 2-D 0/1 indicator array with the same layout as ``scores``.
        extra_candidates: Fixed thresholds tried for every class in addition
            to the data-driven ones.
        n_quantiles: Number of evenly spaced quantile levels to try.
        quantile_range: (low, high) quantile levels bounding that grid.

    Returns:
        1-D float array with one threshold per class. A class keeps the
        default 0.0 when no candidate reaches an F1 above zero. Candidates
        are visited in ascending order and only a strictly better F1
        replaces the current best, so ties resolve to the lowest threshold.
    """
    scores = np.asarray(scores)
    y_true = np.asarray(y_true)
    quantile_grid = np.linspace(quantile_range[0], quantile_range[1], n_quantiles)
    thresholds = np.zeros(scores.shape[1])

    for k in range(scores.shape[1]):
        s = scores[:, k]
        # Candidate set: quantiles of the observed scores, their mean and
        # median, and the fixed extras; np.unique sorts and de-duplicates.
        candidates = np.unique(np.concatenate([
            np.quantile(s, quantile_grid),
            [s.mean(), np.median(s), *extra_candidates],
        ]))

        best_f1, best_t = 0.0, 0.0
        for t in candidates:
            f1 = f1_score(y_true[:, k], (s >= t).astype(int), zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        thresholds[k] = best_t

    return thresholds


logits = clf.decision_function(XVA)

print("Calibrating thresholds per class...")
# NOTE: the thresholds are tuned on the validation split itself, so metrics
# computed from `pred` on that same split are optimistic.
ths = calibrate_thresholds(logits, y_va)

pred = (logits >= ths).astype(int)

# Validation F1 of the thresholded single model.
# Caveat: `ths` was selected on this same validation split, so these numbers
# are an optimistic estimate.
banner = "=" * 50
micro_f1 = f1_score(y_va, pred, average="micro")
macro_f1 = f1_score(y_va, pred, average="macro")

print("\n" + banner)
print("IMPROVED MODEL PERFORMANCE")
print(banner)
print(f"micro-F1: {micro_f1:.4f}")
print(f"macro-F1: {macro_f1:.4f}")
print(banner)

# --- Save artifacts for inference ---
# joblib and json come from the notebook's top import cell.
joblib.dump(tfidf_word, "tfidf_word.joblib")
joblib.dump(tfidf_char, "tfidf_char.joblib")
joblib.dump(clf, "ovr_logreg.joblib")

# The label order is stored next to the model: column j of the indicator /
# score matrices corresponds to entry j of this list.
with open("labels.json", "w") as f:
    json.dump(mlb.classes_.tolist(), f)

np.save("thresholds.npy", ths)
print("Model artifacts saved successfully!")


Training improved model...
Training complete!
Calibrating thresholds per class...
Training complete!
Calibrating thresholds per class...

IMPROVED MODEL PERFORMANCE
micro-F1: 0.5622
macro-F1: 0.5602

IMPROVED MODEL PERFORMANCE
micro-F1: 0.5622
macro-F1: 0.5602
Model artifacts saved successfully!
Model artifacts saved successfully!


## 🚀 Advanced Approach: Ensemble Model for Even Higher F1

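This alternative approach uses multiple models and combines their predictions for potentially better performance. The cell directly below first writes test-set predictions with the single model; the ensemble itself is trained in a later cell.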

In [13]:
# Updated prediction script with improved model
import pandas as pd, numpy as np, json, joblib
from scipy.sparse import hstack

# Load improved model artifacts
tfidf_word = joblib.load("tfidf_word.joblib")
tfidf_char = joblib.load("tfidf_char.joblib")
clf = joblib.load("ovr_logreg.joblib")
# Use a context manager so the file handle is closed right after parsing
# instead of being left to the garbage collector.
with open("labels.json") as labels_file:
    labels = json.load(labels_file)
ths = np.load("thresholds.npy")

def predict(input_csv, output_csv, use_ensemble=False,
            ensemble_weights=(0.5, 0.35, 0.15)):
    """
    Predict movie genres for every row of a CSV and write the result to disk.

    Args:
        input_csv: Path to input CSV file; it must contain the columns
            "movie_name" and "description".
        output_csv: Path to save predictions
        use_ensemble: If True, uses ensemble model (requires ensemble models
            saved). If False, uses the module-level ``clf`` and ``ths``.
        ensemble_weights: (logreg, svc, nb) weights used to blend the three
            ensemble score matrices. The default reproduces the previously
            hard-coded blend; the saved ensemble thresholds are only
            meaningful for the weights they were calibrated with.

    Returns:
        DataFrame with columns "movie_name", "genre" (predicted genres joined
        by ", "; empty string when no class passes its threshold) and
        "description". The same frame is written to ``output_csv``.
    """
    df = pd.read_csv(input_csv)
    text = df["movie_name"].fillna("") + " [SEP] " + df["description"].fillna("")
    # Build the feature matrix in CSR format, the same sparse format the
    # models were trained on.
    X = hstack(
        [tfidf_word.transform(text), tfidf_char.transform(text)],
        format="csr",
    )

    if use_ensemble:
        w_logreg, w_svc, w_nb = ensemble_weights

        # Load ensemble models
        clf_logreg = joblib.load("ovr_logreg_ensemble.joblib")
        clf_svc = joblib.load("ovr_svc_ensemble.joblib")
        clf_nb = joblib.load("ovr_nb_ensemble.joblib")
        ths_ens = np.load("thresholds_ensemble.npy")

        # Get predictions from all models
        logits_logreg = clf_logreg.decision_function(X)
        logits_svc = clf_svc.decision_function(X)
        # Naive Bayes is scored through its probabilities; the small epsilon
        # keeps log() finite when a probability is exactly 0.
        probs_nb = clf_nb.predict_proba(X)
        logits_nb = np.log(probs_nb + 1e-10)

        # Weighted ensemble
        logits = w_logreg * logits_logreg + w_svc * logits_svc + w_nb * logits_nb
        pred = (logits >= ths_ens).astype(int)
    else:
        # Single model prediction
        logits = clf.decision_function(X)
        pred = (logits >= ths).astype(int)

    # Convert predictions to genre strings
    pred_labels = [
        ", ".join([labels[j] for j, v in enumerate(row) if v == 1])
        for row in pred
    ]

    # Save predictions
    result_df = pd.DataFrame({
        "movie_name": df["movie_name"],
        "genre": pred_labels,
        "description": df["description"]
    })
    result_df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")
    return result_df

# Generate predictions on test set.
# Reuses the `test_dir` path defined in the data-loading section instead of
# repeating the "../dataset_test.csv" literal.
print("Generating predictions on test set...")
predict(test_dir, "dataset_test_preds.csv", use_ensemble=False)

Generating predictions on test set...
Predictions saved to dataset_test_preds.csv
Predictions saved to dataset_test_preds.csv


Unnamed: 0,movie_name,genre,description
0,Opposites Attract,"Comedy, Drama, Romance, TV Movie","She's a divorce lawyer, single mother and perp..."
1,A Turtle's Tale: Sammy's Adventures,"Adventure, Animation, Comedy, Drama, Family, F...",A sea turtle who was hatched in 1959 spends th...
2,My Stepmother Is an Alien,"Drama, Romance, Science Fiction, TV Movie",Trying to rescue her home planet from destruct...
3,You've Got Mail,"Comedy, Family, Music, TV Movie","Book superstore magnate, Joe Fox and independe..."
4,The Thing,"Action, Horror, Science Fiction, TV Movie","In the winter of 1982, a twelve-man research t..."
...,...,...,...
937,Olympus Has Fallen,"Action, Comedy, Thriller","When the White House (Secret Service Code: ""Ol..."
938,Flashdance,"Comedy, Drama, Music, Romance, TV Movie","Alex Owens, a young woman juggling between two..."
939,Hands of Stone,"Drama, Family, Fantasy, History, TV Movie",The legendary Roberto Duran and his equally le...
940,La matassa,"Action, Comedy, TV Movie",Two cousins haven't spoken for almost twenty y...


In [14]:
# The validator module lives one directory above the notebook. Add that
# directory to the import path only once, so re-running this cell does not
# keep appending duplicate entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")
from validator import compute_metrics

# `pred` holds the thresholded validation predictions of the single model.
print(compute_metrics(y_va, pred))

{'accuracy': 0.003537735849056604, 'f1': 0.5601842477716726, 'precision': 0.5315817417436101, 'recall': 0.6564379944797808, 'hamming_loss': 0.1669287211740042}


In [15]:
# OPTIONAL: Advanced Ensemble Approach for Maximum F1 Score
# Trains three one-vs-rest linear models on the same stacked TF-IDF features
# so that their scores can be blended afterwards.
# LinearSVC and MultinomialNB come from the notebook's top import cell.

print("\n" + "="*50)
print("TRAINING ENSEMBLE MODELS")
print("="*50)

# Model 1: Logistic Regression, same settings as the single model
clf_logreg = OneVsRestClassifier(
    LogisticRegression(C=8.0, solver="saga", max_iter=4000,
                       class_weight='balanced', random_state=42),
    n_jobs=-1
)

# Model 2: LinearSVC
clf_svc = OneVsRestClassifier(
    LinearSVC(C=2.0, max_iter=4000, class_weight='balanced',
              dual='auto', random_state=42),
    n_jobs=-1
)

# Model 3: Multinomial Naive Bayes with light smoothing (alpha=0.05)
clf_nb = OneVsRestClassifier(
    MultinomialNB(alpha=0.05),
    n_jobs=-1
)

# Train all models, announcing each one before its fit starts
for model_name, model in (
    ("Logistic Regression", clf_logreg),
    ("LinearSVC", clf_svc),
    ("Naive Bayes", clf_nb),
):
    print(f"Training {model_name}...")
    model.fit(XTR, y_tr)

# Score the validation split with every ensemble member.
print("\nGenerating ensemble predictions...")
logits_logreg = clf_logreg.decision_function(XVA)
logits_svc = clf_svc.decision_function(XVA)

# Naive Bayes is scored through predict_proba; its log-probability is used as
# the decision score (the epsilon keeps log() finite at probability 0).
probs_nb = clf_nb.predict_proba(XVA)
logits_nb = np.log(probs_nb + 1e-10)

# Weighted blend of the three score matrices; the weights are hand-picked and
# can be tuned on validation performance.
weighted_members = (
    (0.5, logits_logreg),
    (0.35, logits_svc),
    (0.15, logits_nb),
)
ensemble_logits = sum(weight * member for weight, member in weighted_members)

# Per-class threshold search on the blended scores.
ths_ensemble = np.zeros(ensemble_logits.shape[1])

print("Calibrating ensemble thresholds...")
quantile_levels = np.linspace(0.005, 0.995, 60)
fixed_candidates = [0.0, -1.0, -0.5, 0.5, 1.0]

for k, s in enumerate(ensemble_logits.T):
    # Candidates: score quantiles, mean/median plus fixed values, and an even
    # grid over the observed score range; sorted and de-duplicated.
    candidates = np.unique(np.concatenate([
        np.quantile(s, quantile_levels),
        [s.mean(), np.median(s)] + fixed_candidates,
        np.linspace(s.min(), s.max(), 20),
    ]))

    # Keep the first (lowest) threshold that achieves the highest F1;
    # the threshold stays at 0.0 if no candidate scores above zero.
    best_f1, best_t = 0.0, 0.0
    for t in candidates:
        f1 = f1_score(y_va[:, k], (s >= t).astype(int), zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t

    ths_ensemble[k] = best_t

pred_ensemble = (ensemble_logits >= ths_ensemble).astype(int)

# Validation F1 of the thresholded ensemble.
# Caveat: `ths_ensemble` was selected on this same validation split, so these
# numbers are an optimistic estimate.
banner = "=" * 50
ens_micro_f1 = f1_score(y_va, pred_ensemble, average="micro")
ens_macro_f1 = f1_score(y_va, pred_ensemble, average="macro")

print("\n" + banner)
print("ENSEMBLE MODEL PERFORMANCE")
print(banner)
print(f"micro-F1: {ens_micro_f1:.4f}")
print(f"macro-F1: {ens_macro_f1:.4f}")
print(banner)

# Persist the ensemble members and their thresholds under the file names that
# predict(..., use_ensemble=True) loads.
print("\nSaving ensemble models...")
ensemble_artifacts = {
    "ovr_logreg_ensemble.joblib": clf_logreg,
    "ovr_svc_ensemble.joblib": clf_svc,
    "ovr_nb_ensemble.joblib": clf_nb,
}
for artifact_path, fitted_model in ensemble_artifacts.items():
    joblib.dump(fitted_model, artifact_path)
np.save("thresholds_ensemble.npy", ths_ensemble)
print("Ensemble models saved!")



TRAINING ENSEMBLE MODELS
Training Logistic Regression...
Training LinearSVC...
Training LinearSVC...
Training Naive Bayes...
Training Naive Bayes...

Generating ensemble predictions...
Calibrating ensemble thresholds...

Generating ensemble predictions...
Calibrating ensemble thresholds...

ENSEMBLE MODEL PERFORMANCE
micro-F1: 0.6285
macro-F1: 0.5787

Saving ensemble models...
Ensemble models saved!

ENSEMBLE MODEL PERFORMANCE
micro-F1: 0.6285
macro-F1: 0.5787

Saving ensemble models...
Ensemble models saved!


## 📊 Performance Comparison

In [16]:
# Side-by-side summary of the single model and the ensemble on the validation split.
heavy_rule = "=" * 60
light_rule = "-" * 60


def _print_f1(y_hat):
    """Print micro- and macro-averaged F1 of `y_hat` against the validation labels."""
    print(f"   micro-F1: {f1_score(y_va, y_hat, average='micro'):.4f}")
    print(f"   macro-F1: {f1_score(y_va, y_hat, average='macro'):.4f}")


print("\n" + heavy_rule)
print("PERFORMANCE COMPARISON")
print(heavy_rule)

# Headline F1 scores: single model first, then the ensemble
print("\n1. IMPROVED SINGLE MODEL (Logistic Regression):")
_print_f1(pred)

print("\n2. ENSEMBLE MODEL (LogReg + SVC + NB):")
_print_f1(pred_ensemble)

# Full metric dictionaries from the project's validator module
from validator import compute_metrics

detailed_sections = (
    ("DETAILED METRICS (Single Model):", pred),
    ("DETAILED METRICS (Ensemble):", pred_ensemble),
)
for section_title, y_hat in detailed_sections:
    print("\n" + light_rule)
    print(section_title)
    print(compute_metrics(y_va, y_hat))
print(heavy_rule)


PERFORMANCE COMPARISON

1. IMPROVED SINGLE MODEL (Logistic Regression):
   micro-F1: 0.5622
   macro-F1: 0.5602

2. ENSEMBLE MODEL (LogReg + SVC + NB):
   micro-F1: 0.6285
   macro-F1: 0.5787

------------------------------------------------------------
DETAILED METRICS (Single Model):
{'accuracy': 0.003537735849056604, 'f1': 0.5601842477716726, 'precision': 0.5315817417436101, 'recall': 0.6564379944797808, 'hamming_loss': 0.1669287211740042}

------------------------------------------------------------
DETAILED METRICS (Ensemble):
{'accuracy': 0.08608490566037735, 'f1': 0.5787466415612805, 'precision': 0.5699927428215142, 'recall': 0.6358995845428329, 'hamming_loss': 0.1292583857442348}
