In [2]:
pip install xgboost



In [3]:
pip install shap



In [2]:
import warnings, json, ast
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

try:
    import xgboost as xgb
except Exception:
    xgb = None
    print("xgboost not installed. Run: pip install xgboost")

def load_and_preprocess(train_path, val_path, test_path):
    """Load train/val/test datasets and perform all preprocessing steps."""
    train = pd.read_csv(train_path)
    val   = pd.read_csv(val_path)
    test  = pd.read_csv(test_path)
    train = pd.concat([train, val], ignore_index=True)
    print("Loaded datasets — Train+Val:", train.shape, " Test:", test.shape)

    if "release_date" in train.columns:
        train["release_date"] = pd.to_datetime(train["release_date"], errors="coerce")
        test["release_date"]  = pd.to_datetime(test["release_date"], errors="coerce")
        train["year"]  = train["release_date"].dt.year
        train["month"] = train["release_date"].dt.month
        test["year"]   = test["release_date"].dt.year
        test["month"]  = test["release_date"].dt.month

    for col in ["year", "month"]:
        train[col] = pd.to_numeric(train.get(col, 2015), errors="coerce").fillna(2015)
        test[col]  = pd.to_numeric(test.get(col, 2016), errors="coerce").fillna(2016)
    def parse_primary_genre(genres_cell):
        if pd.isna(genres_cell): return "Unknown"
        try:
            if isinstance(genres_cell, str):
                try:
                    val = json.loads(genres_cell)
                except Exception:
                    val = ast.literal_eval(genres_cell)
            else:
                val = genres_cell
            if isinstance(val, list) and len(val) > 0:
                first = val[0]
                if isinstance(first, dict) and "name" in first:
                    return first["name"]
                if isinstance(first, str):
                    return first
            if isinstance(val, dict) and "name" in val:
                return val["name"]
            return str(val)
        except Exception:
            return "Unknown"

    if "primary_genre" not in train.columns:
        if "genres" in train.columns:
            train["primary_genre"] = train["genres"].apply(parse_primary_genre)
            test["primary_genre"]  = test["genres"].apply(parse_primary_genre) if "genres" in test.columns else "Unknown"
        else:
            train["primary_genre"] = "Unknown"
            test["primary_genre"]  = "Unknown"

    if "label_roi3" not in train.columns:
        print("Creating label_roi3 from budget & revenue.")
        for df in [train, test]:
            df["budget"] = pd.to_numeric(df["budget"], errors="coerce")
            df["revenue"] = pd.to_numeric(df["revenue"], errors="coerce")
            df["ROI"] = (df["revenue"] - df["budget"]) / df["budget"]
            df["ROI"] = df["ROI"].replace([np.inf, -np.inf], np.nan)

            def roi_to_class(roi):
                if pd.isna(roi): return np.nan
                if roi >= 2.5567: return "Hit"
                if roi < 0.0049: return "Flop"
                return "Average"
            df["label_roi3"] = df["ROI"].apply(roi_to_class)

    train = train.dropna(subset=["label_roi3"])
    test  = test.dropna(subset=["label_roi3"])

    for col in ["budget","revenue","year","month"]:
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col]  = pd.to_numeric(test[col], errors="coerce")

    print("Finished preprocessing. Label distribution:")
    print(train["label_roi3"].value_counts())

    return train, test


def train_models(train, test, random_state=42):
    """Train Tang 2024 Optimized XGBoost and Gupta Ensemble models."""
    num_features = ["budget","revenue","year","month"]
    cat_features = ["primary_genre"]
    target = "label_roi3"

    le = LabelEncoder()
    y_train_enc = le.fit_transform(train[target])
    y_test_enc  = le.transform(test[target])

    X_train = train[num_features + cat_features].copy()
    X_test  = test[num_features + cat_features].copy()

    num_tf = Pipeline([("imp", SimpleImputer(strategy="median"))])
    cat_tf = Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                       ("oh", OneHotEncoder(handle_unknown="ignore"))])
    pre = ColumnTransformer([("num", num_tf, num_features),
                             ("cat", cat_tf, cat_features)])

    results = {}


    print("\n Tang 2024 Optimized XGBoost")
    clf = xgb.XGBClassifier(
        objective="multi:softprob",
        num_class=len(le.classes_),
        eval_metric="mlogloss",
        random_state=random_state,
        n_estimators=300
    )
    param_dist = {
        "xgb__max_depth": [3,4,5,6],
        "xgb__learning_rate": [0.02,0.05,0.1],
        "xgb__subsample": [0.8,1.0],
        "xgb__colsample_bytree": [0.8,1.0],
        "xgb__min_child_weight": [1,3,5],
    }
    pipe_xgb = Pipeline([("pre", pre), ("xgb", clf)])
    search = RandomizedSearchCV(pipe_xgb, param_dist, n_iter=10,
                                scoring="f1_macro", cv=3, n_jobs=-1,
                                verbose=1, random_state=random_state)
    search.fit(X_train, y_train_enc)
    best_xgb = search.best_estimator_
    pred_xgb_enc = best_xgb.predict(X_test)
    pred_xgb = le.inverse_transform(pred_xgb_enc)

    results["Tang_XGBoost"] = {
        "Accuracy": accuracy_score(test[target], pred_xgb),
        "F1_macro": f1_score(test[target], pred_xgb, average="macro")
    }
    print("Tang XGBoost Results:", results["Tang_XGBoost"])

    print("\n Gupta Ensemble Models")
    lr  = LogisticRegression(max_iter=200)
    svm = SVC(probability=True, kernel="rbf", C=2.0, gamma="scale", random_state=random_state)
    rf  = RandomForestClassifier(n_estimators=300, random_state=random_state)
    gb  = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, random_state=random_state)
    knn = KNeighborsClassifier(n_neighbors=15)
    voters = [("lr", lr), ("svm", svm), ("rf", rf), ("gb", gb), ("knn", knn)]

    soft_vote = VotingClassifier(estimators=voters, voting="soft")
    pipe_vote = Pipeline([("pre", pre), ("ens", soft_vote)])
    pipe_vote.fit(X_train, train[target])
    pred_vote = pipe_vote.predict(X_test)
    results["Voting"] = {
        "Accuracy": accuracy_score(test[target], pred_vote),
        "F1_macro": f1_score(test[target], pred_vote, average="macro")
    }
    print("Voting Results:", results["Voting"])

    stack = StackingClassifier(
        estimators=voters,
        final_estimator=LogisticRegression(max_iter=200),
        stack_method="predict_proba"
    )
    pipe_stack = Pipeline([("pre", pre), ("ens", stack)])
    pipe_stack.fit(X_train, train[target])
    pred_stack = pipe_stack.predict(X_test)
    results["Stacking"] = {
        "Accuracy": accuracy_score(test[target], pred_stack),
        "F1_macro": f1_score(test[target], pred_stack, average="macro")
    }
    print("Stacking Results:", results["Stacking"])

    return results


def evaluate_results(results_dict):
    """Print and compare all model results."""
    print("\n FINAL COMPARISON")
    df = pd.DataFrame(results_dict).T
    print(df)
    print("\n Best model based on F1_macro:",
          df["F1_macro"].idxmax(),
          "→ F1 =", round(df["F1_macro"].max(), 4))
    return df

if __name__ == "__main__":
    TRAIN_PATH = "train_movies.csv"
    VAL_PATH   = "validation_movies.csv"
    TEST_PATH  = "test_movies.csv"

    train, test = load_and_preprocess(TRAIN_PATH, VAL_PATH, TEST_PATH)
    results = train_models(train, test, random_state=42)
    summary = evaluate_results(results)


Loaded datasets — Train+Val: (36373, 24)  Test: (9093, 24)
Creating label_roi3 from budget & revenue.
Finished preprocessing. Label distribution:
label_roi3
Flop       4205
Average    1681
Hit        1320
Name: count, dtype: int64

 Tang 2024 Optimized XGBoost
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Tang XGBoost Results: {'Accuracy': 0.9845605700712589, 'F1_macro': 0.978974825891488}

 Gupta Ensemble Models
Voting Results: {'Accuracy': 0.9946555819477435, 'F1_macro': 0.9935412312633184}
Stacking Results: {'Accuracy': 1.0, 'F1_macro': 1.0}

 FINAL COMPARISON
              Accuracy  F1_macro
Tang_XGBoost  0.984561  0.978975
Voting        0.994656  0.993541
Stacking      1.000000  1.000000

 Best model based on F1_macro: Stacking → F1 = 1.0
