In [2]:
import warnings, json, ast
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier,
)
from sklearn.neighbors import KNeighborsClassifier

try:
    import xgboost as xgb
except Exception:
    xgb = None
    print("xgboost not installed. Run: pip install xgboost")

try:
    from lightgbm import LGBMClassifier
except Exception:
    LGBMClassifier = None
    print("lightgbm not installed. Run: pip install lightgbm")

try:
    from catboost import CatBoostClassifier
except Exception:
    CatBoostClassifier = None
    print("catboost not installed. Run: pip install catboost")

try:
    import optuna
except Exception:
    optuna = None
    print("optuna not installed. Run: pip install optuna")


def load_and_preprocess(train_path, val_path, test_path):
    """
    Load train/val/test datasets and perform all preprocessing steps.
    This includes:
    - Merge train + validation
    - Parse release_date to year / month
    - Extract primary_genre
    - Compute ROI and 3-class label label_roi3: Hit / Average / Flop
    """
    train = pd.read_csv(train_path)
    val = pd.read_csv(val_path)
    test = pd.read_csv(test_path)

    train = pd.concat([train, val], ignore_index=True)
    print("Loaded datasets — Train+Val:", train.shape, " Test:", test.shape)

    if "release_date" in train.columns:
        train["release_date"] = pd.to_datetime(train["release_date"], errors="coerce")
        test["release_date"] = pd.to_datetime(test["release_date"], errors="coerce")

        train["year"] = train["release_date"].dt.year
        train["month"] = train["release_date"].dt.month

        test["year"] = test["release_date"].dt.year
        test["month"] = test["release_date"].dt.month

    for col in ["year", "month"]:
        train[col] = pd.to_numeric(train.get(col, 2015), errors="coerce").fillna(2015)
        test[col] = pd.to_numeric(test.get(col, 2016), errors="coerce").fillna(2016)

    def parse_primary_genre(genres_cell):
        if pd.isna(genres_cell):
            return "Unknown"
        try:
            if isinstance(genres_cell, str):
                try:
                    val = json.loads(genres_cell)
                except Exception:
                    val = ast.literal_eval(genres_cell)
            else:
                val = genres_cell

            if isinstance(val, list) and len(val) > 0:
                first = val[0]
                if isinstance(first, dict) and "name" in first:
                    return first["name"]
                if isinstance(first, str):
                    return first
            if isinstance(val, dict) and "name" in val:
                return val["name"]
            return str(val)
        except Exception:
            return "Unknown"

    if "primary_genre" not in train.columns:
        if "genres" in train.columns:
            train["primary_genre"] = train["genres"].apply(parse_primary_genre)
            if "genres" in test.columns:
                test["primary_genre"] = test["genres"].apply(parse_primary_genre)
            else:
                test["primary_genre"] = "Unknown"
        else:
            train["primary_genre"] = "Unknown"
            test["primary_genre"] = "Unknown"

    if "label_roi3" not in train.columns:
        print("Creating label_roi3 from budget & revenue.")
        for df in [train, test]:
            df["budget"] = pd.to_numeric(df["budget"], errors="coerce")
            df["revenue"] = pd.to_numeric(df["revenue"], errors="coerce")
            df["ROI"] = (df["revenue"] - df["budget"]) / df["budget"]
            df["ROI"] = df["ROI"].replace([np.inf, -np.inf], np.nan)

            def roi_to_class(roi):
                if pd.isna(roi):
                    return np.nan
                if roi >= 2.5567:
                    return "Hit"
                if roi < 0.0049:
                    return "Flop"
                return "Average"

            df["label_roi3"] = df["ROI"].apply(roi_to_class)

    train = train.dropna(subset=["label_roi3"])
    test = test.dropna(subset=["label_roi3"])

    for col in ["budget", "revenue", "year", "month"]:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col] = pd.to_numeric(test[col], errors="coerce")

    print("Finished preprocessing. Label distribution (train):")
    print(train["label_roi3"].value_counts())

    return train, test


def train_models(train, test, random_state=42, use_optuna=False, n_optuna_trials=20):
    """
    Train:
    - Tang 2024 Optimized XGBoost (RandomizedSearchCV)  [baseline]
    - Gupta-style Voting & Stacking Ensembles            [baseline]
    - NEW: LightGBM                                      [novelty]
    - NEW: CatBoost                                      [novelty]
    - OPTIONAL NEW: Optuna-optimized XGBoost             [novelty]

    Returns:
        results: dict of model_name -> metrics dict
    """
    num_features = ["budget", "revenue", "year", "month"]
    cat_features = ["primary_genre"]
    target = "label_roi3"

    le = LabelEncoder()
    y_train_enc = le.fit_transform(train[target])
    y_test_enc = le.transform(test[target])

    X_train = train[num_features + cat_features].copy()
    X_test = test[num_features + cat_features].copy()

    num_tf = Pipeline([("imp", SimpleImputer(strategy="median"))])
    cat_tf = Pipeline(
        [
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    pre = ColumnTransformer(
        [
            ("num", num_tf, num_features),
            ("cat", cat_tf, cat_features),
        ]
    )

    results = {}

    if xgb is not None:
        print("\n[Baseline] Tang 2024 Optimized XGBoost (RandomizedSearchCV)")
        clf_xgb = xgb.XGBClassifier(
            objective="multi:softprob",
            num_class=len(le.classes_),
            eval_metric="mlogloss",
            random_state=random_state,
            n_estimators=300,
        )

        pipe_xgb = Pipeline([("pre", pre), ("xgb", clf_xgb)])

        param_dist = {
            "xgb__max_depth": [3, 4, 5, 6],
            "xgb__learning_rate": [0.02, 0.05, 0.1],
            "xgb__subsample": [0.8, 1.0],
            "xgb__colsample_bytree": [0.8, 1.0],
            "xgb__min_child_weight": [1, 3, 5],
        }

        search = RandomizedSearchCV(
            pipe_xgb,
            param_distributions=param_dist,
            n_iter=10,
            scoring="f1_macro",
            cv=3,
            n_jobs=-1,
            verbose=1,
            random_state=random_state,
        )
        search.fit(X_train, y_train_enc)
        best_xgb = search.best_estimator_

        pred_xgb_enc = best_xgb.predict(X_test)

        results["Tang_XGBoost"] = {
            "Accuracy": accuracy_score(y_test_enc, pred_xgb_enc),
            "F1_macro": f1_score(y_test_enc, pred_xgb_enc, average="macro"),
        }
        print("Tang XGBoost Results:", results["Tang_XGBoost"])
    else:
        print("Skip XGBoost baseline (xgboost not installed).")

    print("\n[Baseline] Gupta-style Ensemble Models (Voting & Stacking)")
    lr = LogisticRegression(max_iter=200)
    svm = SVC(probability=True, kernel="rbf", C=2.0, gamma="scale", random_state=random_state)
    rf = RandomForestClassifier(n_estimators=300, random_state=random_state)
    gb = GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, random_state=random_state
    )
    knn = KNeighborsClassifier(n_neighbors=15)

    voters = [("lr", lr), ("svm", svm), ("rf", rf), ("gb", gb), ("knn", knn)]

    soft_vote = VotingClassifier(estimators=voters, voting="soft")
    pipe_vote = Pipeline([("pre", pre), ("ens", soft_vote)])
    pipe_vote.fit(X_train, train[target])
    pred_vote = pipe_vote.predict(X_test)
    results["Voting"] = {
        "Accuracy": accuracy_score(test[target], pred_vote),
        "F1_macro": f1_score(test[target], pred_vote, average="macro"),
    }
    print("Voting Results:", results["Voting"])

    stack = StackingClassifier(
        estimators=voters,
        final_estimator=LogisticRegression(max_iter=200),
        stack_method="predict_proba",
    )
    pipe_stack = Pipeline([("pre", pre), ("ens", stack)])
    pipe_stack.fit(X_train, train[target])
    pred_stack = pipe_stack.predict(X_test)
    results["Stacking"] = {
        "Accuracy": accuracy_score(test[target], pred_stack),
        "F1_macro": f1_score(test[target], pred_stack, average="macro"),
    }
    print("Stacking Results:", results["Stacking"])

    if LGBMClassifier is not None:
        print("\n[Novelty] LightGBM Model")
        lgbm = LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=random_state,
        )
        pipe_lgbm = Pipeline([("pre", pre), ("lgbm", lgbm)])
        pipe_lgbm.fit(X_train, y_train_enc)
        pred_lgbm = pipe_lgbm.predict(X_test)
        results["LightGBM"] = {
            "Accuracy": accuracy_score(y_test_enc, pred_lgbm),
            "F1_macro": f1_score(y_test_enc, pred_lgbm, average="macro"),
        }
        print("LightGBM Results:", results["LightGBM"])
    else:
        print("Skip LightGBM (lightgbm not installed).")

    if CatBoostClassifier is not None:
        print("\n[Novelty] CatBoost Model")
        cb = CatBoostClassifier(
            depth=6,
            learning_rate=0.05,
            iterations=500,
            loss_function="MultiClass",
            verbose=False,
            random_state=random_state,
        )
        pipe_cb = Pipeline([("pre", pre), ("cb", cb)])
        pipe_cb.fit(X_train, y_train_enc)
        pred_cb = pipe_cb.predict(X_test)
        results["CatBoost"] = {
            "Accuracy": accuracy_score(y_test_enc, pred_cb),
            "F1_macro": f1_score(y_test_enc, pred_cb, average="macro"),
        }
        print("CatBoost Results:", results["CatBoost"])
    else:
        print("Skip CatBoost (catboost not installed).")

    if use_optuna and (optuna is not None) and (xgb is not None):
        print("\n[Novelty] Optuna-optimized XGBoost (this may take some time)...")

        def objective(trial):
            params = {
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
                "n_estimators": trial.suggest_int("n_estimators", 200, 500),
            }

            clf = xgb.XGBClassifier(
                objective="multi:softprob",
                num_class=len(le.classes_),
                eval_metric="mlogloss",
                random_state=random_state,
                **params,
            )
            pipe = Pipeline([("pre", pre), ("xgb", clf)])
            scores = cross_val_score(
                pipe,
                X_train,
                y_train_enc,
                cv=3,
                scoring="f1_macro",
                n_jobs=-1,
            )
            return scores.mean()

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_optuna_trials)

        best_params = study.best_params
        print("Best Optuna XGBoost params:", best_params)

        clf_opt = xgb.XGBClassifier(
            objective="multi:softprob",
            num_class=len(le.classes_),
            eval_metric="mlogloss",
            random_state=random_state,
            **best_params,
        )
        pipe_opt = Pipeline([("pre", pre), ("xgb", clf_opt)])
        pipe_opt.fit(X_train, y_train_enc)
        pred_opt = pipe_opt.predict(X_test)
        results["Optuna_XGBoost"] = {
            "Accuracy": accuracy_score(y_test_enc, pred_opt),
            "F1_macro": f1_score(y_test_enc, pred_opt, average="macro"),
        }
        print("Optuna_XGBoost Results:", results["Optuna_XGBoost"])
    elif use_optuna:
        print("Optuna XGBoost requested but optuna/xgboost not installed — skipping.")

    return results


def evaluate_results(results_dict):
    """Print and compare all model results in a table."""
    print("\n FINAL COMPARISON ")
    df = pd.DataFrame(results_dict).T
    if "F1_macro" in df.columns:
        df = df.sort_values("F1_macro", ascending=False)
    print(df)
    if "F1_macro" in df.columns:
        print(
            "\nBest model based on F1_macro:",
            df["F1_macro"].idxmax(),
            "→ F1 =",
            round(df["F1_macro"].max(), 4),
        )
    return df

if __name__ == "__main__":
    TRAIN_PATH = "train_movies.csv"
    VAL_PATH = "validation_movies.csv"
    TEST_PATH = "test_movies.csv"

    train, test = load_and_preprocess(TRAIN_PATH, VAL_PATH, TEST_PATH)

    results = train_models(train, test, random_state=42, use_optuna=False)
    summary = evaluate_results(results)


catboost not installed. Run: pip install catboost
optuna not installed. Run: pip install optuna
Loaded datasets — Train+Val: (36373, 24)  Test: (9093, 24)
Creating label_roi3 from budget & revenue.
Finished preprocessing. Label distribution (train):
label_roi3
Flop       4205
Average    1681
Hit        1320
Name: count, dtype: int64

[Baseline] Tang 2024 Optimized XGBoost (RandomizedSearchCV)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Tang XGBoost Results: {'Accuracy': 0.9845605700712589, 'F1_macro': 0.978974825891488}

[Baseline] Gupta-style Ensemble Models (Voting & Stacking)
Voting Results: {'Accuracy': 0.9946555819477435, 'F1_macro': 0.9935412312633184}
Stacking Results: {'Accuracy': 1.0, 'F1_macro': 1.0}

[Novelty] LightGBM Model
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 651
[LightGBM] [Info] Number of data points in