# Start

In [0]:
# Essential Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import warnings

# Data preparation

In [0]:
# read data
# The data directory defaults to the original workspace location. Set the
# MARCH_MANIA_DATA_DIR environment variable to point the notebook at another copy
# of the competition files without editing this cell (an empty value falls back
# to the default).
data_route = (
    os.environ.get('MARCH_MANIA_DATA_DIR')
    or '/Workspace/Repos/aeo-chiyou-ip@aeondic.onmicrosoft.com/kaggles/March_Machine_Learning_Mania_2025/data'
)
# m_dfs collects the tables read from the M*-prefixed CSV files below;
# f_dfs is created empty in this cell.
m_dfs, f_dfs = {}, {}
m_dfs['season'] = pd.read_csv(os.path.join(data_route, 'MRegularSeasonDetailedResults.csv'))
m_dfs['tournament'] = pd.read_csv(os.path.join(data_route, 'MNCAATourneyDetailedResults.csv'))
m_dfs['seed'] = pd.read_csv(os.path.join(data_route, 'MNCAATourneySeeds.csv'))

# Training Model

In [0]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb

In [0]:
# Sample submission: read from the same data directory as the other CSVs.
submission_csv = os.path.join(data_route, "SampleSubmissionStage1.csv")
df_submission = pd.read_csv(submission_csv)

# Independent copy of the regular-season table, so edits made to df_season
# do not write back into m_dfs['season'].
df_season = m_dfs['season'].copy()

In [0]:
# 2. Compute average team statistics from regular season games
stat_cols = ["WScore", "LScore", "WFGM", "WFGA", "WFGM3", "WFGA3", "WFTM", "WFTA", "WAst", "WTO", "WOR", "WDR"]


def _team_perspective(games, own, opp):
    """Return one row per game seen from the `own` side of the result.

    Parameters
    ----------
    games : pandas.DataFrame
        Game-level results with W*/L* prefixed columns.
    own : str
        Prefix ("W" or "L") of the side whose statistics are extracted.
    opp : str
        Prefix of the opposing side; only its score is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``TeamID`` plus every name in ``stat_cols``. The W-prefixed
        names hold the `own` side's statistics and ``LScore`` holds the
        opponent's score, whichever side actually won.
    """
    own_stat_names = [name for name in stat_cols if name != "LScore"]
    # Swap the leading "W" of each statistic for the requested prefix (e.g. WFGM -> LFGM).
    source_cols = [own + "TeamID"] + [own + name[1:] for name in own_stat_names] + [opp + "Score"]
    view = games[source_cols].copy()
    view.columns = ["TeamID"] + own_stat_names + ["LScore"]
    return view


# Average over EVERY game a team played, not only the games it won.
# Grouping by WTeamID alone would describe each team by its wins only and would
# leave teams without a win with no row at all. The output keeps the same
# column names (TeamID + stat_cols) so the downstream merges are unchanged:
# W-prefixed columns are the team's own averages, LScore is average points allowed.
# NOTE(review): assumes df_season carries the L-prefixed counterparts of the
# W-prefixed statistics (LFGM, LFGA, ...) — confirm against the CSV header.
team_stats = (
    pd.concat(
        [_team_perspective(df_season, "W", "L"), _team_perspective(df_season, "L", "W")],
        ignore_index=True,
    )
    .groupby("TeamID")[stat_cols]
    .mean()
    .reset_index()
)

In [0]:
# 3. Build the training frame from the historical games.
# Every game gets a canonical ordering of its two teams:
# Team1 is the smaller of (WTeamID, LTeamID), Team2 the larger.
df_train = df_season.copy().assign(
    Team1=lambda games: games[["WTeamID", "LTeamID"]].min(axis=1),
    Team2=lambda games: games[["WTeamID", "LTeamID"]].max(axis=1),
)

## Create features

In [0]:
# Target: 1 when the winner is Team1 (the lower-ID side), 0 when the winner is Team2.
df_train["Label"] = df_train["WTeamID"].eq(df_train["Team1"]).astype(int)

# Remove the raw per-game columns named in stat_cols so they cannot collide with
# the team-average columns of the same names that are merged in afterwards.
# errors="ignore" skips any listed column that is not present.
df_train = df_train.drop(columns=stat_cols, errors="ignore")

In [0]:
# Column renames that tag the merged team averages with the side they belong to.
rename_dict_T1 = {name: f"{name}_T1" for name in stat_cols}
rename_dict_T2 = {name: f"{name}_T2" for name in stat_cols}

# Attach the team averages twice: once keyed on Team1, once on Team2.
# After each merge the freshly added columns are suffixed and the join key
# ("TeamID") is discarded, so the second merge meets no duplicate names.
df_train = (
    df_train
    .merge(team_stats, left_on="Team1", right_on="TeamID", how="left")
    .rename(columns=rename_dict_T1)
    .drop("TeamID", axis=1)
    .merge(team_stats, left_on="Team2", right_on="TeamID", how="left", suffixes=("", "_T2"))
    .rename(columns=rename_dict_T2)
    .drop("TeamID", axis=1)
)

# One difference feature per statistic: Team1 average minus Team2 average.
df_train = df_train.assign(
    **{"Diff_" + name: df_train[name + "_T1"] - df_train[name + "_T2"] for name in stat_cols}
)

feature_cols = [f"Diff_{name}" for name in stat_cols]
y = df_train["Label"]

# Mean-impute the feature matrix in case some values are missing
# (a left merge leaves NaN for any team absent from team_stats).
# The fitted imputer is reused later on the submission features.
imputer = SimpleImputer(strategy="mean")
X = imputer.fit_transform(df_train[feature_cols])

# Train

In [0]:
# 4. Stacking ensemble: XGBoost and LightGBM as base learners, with a
#    Logistic Regression meta-model on top of their cross-validated predictions.

# Hold out 20% of the games for validation (fixed seed for repeatability).
# NOTE(review): the team averages behind the features are computed from the same
# games that are split here, so this validation score may be optimistic — confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Base learners.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
lgb_model = lgb.LGBMClassifier(random_state=42)
estimators = [("xgb", xgb_model), ("lgb", lgb_model)]

# Meta-learner stacked over 5-fold out-of-fold predictions; n_jobs=-1 uses all cores.
meta_learner = LogisticRegression(max_iter=1000, random_state=42)
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Validation metric: Brier score of the predicted probability for class 1.
y_val_proba = stacking_model.predict_proba(X_val)[:, 1]
brier = brier_score_loss(y_val, y_val_proba)
print("Brier score loss on validation:", brier)

In [0]:
# 5. Build features for, and score, every matchup in the submission file.
# Each "ID" has the form "2025_Team1_Team2" (with Team1 < Team2); split it into
# its three parts and turn the two team IDs into integers so they can be joined
# against team_stats. "Season" is left as the split string.
df_submission[["Season", "Team1", "Team2"]] = df_submission["ID"].str.split("_", expand=True)
for side in ("Team1", "Team2"):
    df_submission[side] = df_submission[side].astype(int)

# Same two-step join as for the training frame: team averages for Team1, then
# for Team2, suffixing the new columns and dropping the join key after each merge.
df_sub = (
    df_submission
    .merge(team_stats, left_on="Team1", right_on="TeamID", how="left")
    .rename(columns=rename_dict_T1)
    .drop("TeamID", axis=1)
    .merge(team_stats, left_on="Team2", right_on="TeamID", how="left", suffixes=("", "_T2"))
    .rename(columns=rename_dict_T2)
    .drop("TeamID", axis=1)
)

# Difference features (Team1 average minus Team2 average), one per statistic.
df_sub = df_sub.assign(
    **{"Diff_" + name: df_sub[name + "_T1"] - df_sub[name + "_T2"] for name in stat_cols}
)

# Reuse the imputer fitted on the training features; it is not refitted here.
X_sub = imputer.transform(df_sub[feature_cols])

# Probability of class 1 (Team1 wins) for each matchup, written back as "Pred".
submission_proba = stacking_model.predict_proba(X_sub)[:, 1]
df_submission["Pred"] = submission_proba

# Importance

In [0]:
# Feature importance taken from the fitted LightGBM base learner inside the stack.
# This requires an estimator registered under the name "lgb" in the stacking model.
lgb_model = stacking_model.named_estimators_["lgb"]
lgb_importance = lgb_model.feature_importances_

# Pair each importance with its feature name, most important first.
importance_df = pd.DataFrame(
    {"Feature": feature_cols, "Importance": lgb_importance}
).sort_values(by="Importance", ascending=False)

# Horizontal bar chart drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=importance_df, palette="viridis", ax=ax)
ax.set_title("Feature Importance from LightGBM")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()

# Output

In [0]:
# Write only the two columns the submission format needs, without the index,
# then echo the output file name.
submission_columns = ["ID", "Pred"]
df_submission[submission_columns].to_csv("submission.csv", index=False)
print("submission.csv")