In [None]:
import os
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import wandb
import xgboost
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

from Modules import plots


In [93]:
# Location of the enriched test-set CSV, relative to the notebook's working directory.
file_path = "dataframe/test_data_enriched.csv"
# Load the whole file into `df`; the later cells split and preprocess this frame.
df = pd.read_csv(file_path)

In [None]:
#preprocessing for xgboost
def preprocess_df_xgboost(df):
    """Build the XGBoost feature matrix and target from a shots frame.

    NOTE: missing values are imputed directly in ``df`` (numeric columns with
    their column median, categorical columns with the string 'unknown'), so
    the caller's frame is modified and keeps the filled values afterwards.

    Args:
        df: DataFrame holding the numeric and categorical columns listed
            below plus an 'is_goal' column.

    Returns:
        (X, y): X is the numeric columns followed by the one-hot encoded
        categorical columns (first level of each dropped); y is 'is_goal'
        with missing values replaced by 0.
    """
    numerical_features = [
        'game_seconds', 'game_period', 'x_coord', 'y_coord',
        'shot_distance', 'shot_angle', 'distance_from_last_event',
        'friendly_skaters', 'opposing_skaters', 'shot_angle_change', 'speed',
    ]
    categorical_features = [
        'shot_type', 'empty_net', 'last_event_type', 'rebound',
        'attacking_team_name', 'home_team',
    ]

    # Imputation is written back into df on purpose (see the NOTE above).
    numeric_part = df[numerical_features]
    df[numerical_features] = numeric_part.fillna(numeric_part.median())
    df[categorical_features] = df[categorical_features].fillna('unknown')

    # The set of dummy columns depends on the categories present in df.
    one_hot = pd.get_dummies(df[categorical_features], drop_first=True)

    X = pd.concat([df[numerical_features], one_hot], axis=1)
    y = df['is_goal'].fillna(0)
    return X, y


In [None]:
#preprocessing for lightgbm
def preprocess_lightgbm(df):
    """Build the LightGBM feature matrix and target from a shots frame.

    Object columns (except the two team-name columns) are label-encoded,
    then joined with the float64/int64 and bool columns, and the fixed
    feature list below is selected. ``df`` itself is not modified.

    Args:
        df: DataFrame containing every column named in ``features`` plus
            'is_goal'. Any index is accepted.

    Returns:
        (X, y): X holds the selected feature columns on ``df``'s index;
        y is ``df['is_goal']`` unchanged.
    """
    data_categ = df.select_dtypes(include=['object']).drop(
        ['attacking_team_name', 'home_team'], axis=1, errors='ignore'
    )
    data_numer = df.select_dtypes(include=['float64', 'int64']).drop(
        ['game_id', 'game_seconds', 'game_period', 'time_since_last_event', 'attacking_team_id'],
        axis=1, errors='ignore'
    )
    data_bool = df.select_dtypes(include=['bool'])

    # Encode each column with its own encoder and rebuild the frame on the
    # original index. DataFrame.apply with an array-returning function hands
    # back a fresh 0..n-1 index, which made the concat below misalign (extra
    # rows, NaNs) whenever df was not already indexed 0..n-1.
    data_categ = pd.DataFrame(
        {col: LabelEncoder().fit_transform(data_categ[col]) for col in data_categ.columns},
        index=data_categ.index,
    )

    df_final = pd.concat([data_categ, data_numer, data_bool], axis=1)
    features = [
        'speed', 'distance_from_last_event', 'shot_distance', 'shot_angle',
        'y_coord', 'last_event_y', 'last_event_x', 'shot_type', 'x_coord',
        'shot_angle_change', 'opposing_skaters', 'last_event_type', 'empty_net',
        'friendly_skaters', 'rebound'
    ]
    X = df_final[features]
    y = df['is_goal']

    return X, y

**Create a dataframe for each model**

In [None]:
# Columns used by the logistic-regression baselines (two features + target)
colonnes_utiles = [
    'shot_distance', 'shot_angle', 'is_goal',
]

# Ensure game_id is a string so its game-type characters can be sliced out
df['game_id'] = df['game_id'].astype(str)

# Characters 4-5 of game_id select the split: '02' -> regular season, '03' -> playoffs.
# Both splits are re-indexed from 0 so that frames derived from them (including
# ones rebuilt from plain arrays during preprocessing) line up row-for-row, and
# so that they are independent frames rather than flagged slices of `df`.
game_type = df['game_id'].str[4:6]
regular_df = df[game_type == '02'].reset_index(drop=True)
playoff_df = df[game_type == '03'].reset_index(drop=True)

# Select only relevant columns for logistic regression.
# These are taken BEFORE the preprocessing calls below, so they still contain
# the raw missing values (handled with dropna further down).
regular_logistic = regular_df[colonnes_utiles]
playoffs_logistic = playoff_df[colonnes_utiles]

# NOTE: preprocess_df_xgboost imputes missing values directly in the frame it
# receives. Everything read from regular_df / playoff_df after these two calls
# (X_playoff and the distance/angle frames below) therefore sees imputed values.
# Keep this ordering in mind before moving lines around.
X_xgboost_regular, y_xgboost_regular = preprocess_df_xgboost(regular_df)
X_xgboost_playoffs, y_xgboost_playoffs = preprocess_df_xgboost(playoff_df)

X_lightgbm_regular, y_lightgbm_regular = preprocess_lightgbm(regular_df)
X_lightgbm_playoffs, y_lightgbm_playoffs = preprocess_lightgbm(playoff_df)

# Feature and target separation for playoffs
X_playoff = playoff_df.drop('is_goal', axis=1)
y_playoff = playoff_df['is_goal']

# Logistic regression features and targets for regular season
# (rows with a missing feature are dropped; the target is re-aligned on the kept index)
X_log_regular = regular_logistic.drop('is_goal', axis=1).dropna()
y_log_regular = regular_logistic.loc[X_log_regular.index, 'is_goal']

# Logistic regression features and targets for playoffs
X_log_playoff = playoffs_logistic.drop('is_goal', axis=1).dropna()
y_log_playoff = playoffs_logistic.loc[X_log_playoff.index, 'is_goal']


# Distance-based features and targets
X_distance_regular = regular_df[['shot_distance']]
y_distance_regular = regular_df['is_goal']

X_distance_playoff = playoff_df[['shot_distance']]
y_distance_playoff = playoff_df['is_goal']

# Angle-based features and targets
X_angle_regular = regular_df[['shot_angle']]
y_angle_regular = regular_df['is_goal']

X_angle_playoff = playoff_df[['shot_angle']]
y_angle_playoff = playoff_df['is_goal']

In [43]:
# Connect to Wandb project
# Starts the run named "Test_Evaluation"; the artifact-loading cell further down
# calls wandb.use_artifact and then closes the run with wandb.finish().
wandb.init(project="IFT6758.2024-A", name="Test_Evaluation")



**Download models from wandb**

In [None]:
# Map each local model key to the W&B artifact ("name:version") that stores it
model_artifacts = {
    "distance_model": "distance_model:v5",
    "angle_model": "angle_model:v5",
    "combined_model": "combined_model:v5",
    "XGBoost": "Best_XGBoost_Q5_2:v0",
    "LightGBM": "lightgbm_model:v1"
}

# Load models from Wandb into `models`, keyed like `model_artifacts`
models = {}
for model_name, artifact_name in model_artifacts.items():
    print(f"Loading artifact: {artifact_name}...")
    artifact = wandb.use_artifact(artifact_name, type="model")
    artifact_dir = artifact.download()

    # List all files in the downloaded directory
    downloaded_files = os.listdir(artifact_dir)
    print(f"Files in artifact '{artifact_name}': {downloaded_files}")

    # XGBoost is stored as a native .json booster; every other model
    # (LightGBM and the logistic regressions) is a pickle.
    extension = ".json" if "XGBoost" in model_name else ".pkl"

    # sorted() makes the choice deterministic if an artifact ever ships
    # several matching files (os.listdir order is arbitrary).
    model_file = next(
        (f for f in sorted(downloaded_files) if f.endswith(extension)), None
    )
    if model_file is None:
        raise FileNotFoundError(f"No {extension} file found for {model_name}")
    model_path = os.path.join(artifact_dir, model_file)

    if extension == ".json":
        booster = xgb.Booster()
        booster.load_model(model_path)
        models[model_name] = booster
    else:
        # SECURITY: pickle.load runs arbitrary code from the file. Only load
        # artifacts from a W&B project you control and trust.
        with open(model_path, "rb") as f:
            models[model_name] = pickle.load(f)

wandb.finish()


**Evaluate models and plot**

In [78]:
# Evaluation helper function for separate plotting
def evaluate_and_generate_separate_plots(models, X_test, y_test, dataset_name):
    """Score every model in ``models`` on one dataset and save its four plots.

    Args:
        models: dict mapping a model name to a fitted model. xgboost and
            lightgbm Boosters are scored with ``predict``; anything else must
            provide ``predict_proba``.
        X_test: feature matrix passed to every model as-is.
        y_test: true labels.
        dataset_name: label used in log messages and output file names.

    Returns:
        None. For each model, ROC, goal-rate, cumulative-goals and calibration
        plots are produced through the ``plots`` helpers, with file names of
        the form ``<plot>_<model_name>_<dataset_name>.png``.
    """
    for model_name, model in models.items():
        print(f"Evaluating {model_name} on {dataset_name}...")

        # Positive-class scores, obtained through whichever API the model exposes
        if isinstance(model, xgb.Booster):
            y_pred_proba = model.predict(xgb.DMatrix(X_test))
        elif isinstance(model, lgb.Booster):
            y_pred_proba = model.predict(X_test, raw_score=False)
        else:
            y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Generate data for plots
        roc_data = plots.generate_roc_auc_data(y_test, y_pred_proba)
        goal_rate_x, goal_rate_y = plots.generate_goal_rate_data(y_test, y_pred_proba)
        cumulative_x, cumulative_y = plots.generate_cumulative_goal_data(y_test, y_pred_proba)

        # Generate individual plots
        print(f"Generating plots for {model_name}...")

        # ROC/AUC Plot
        plots.plot_roc_auc([roc_data], [model_name], f"roc_{model_name}_{dataset_name}.png")

        # Goal Rate Plot
        plots.plot_goal_rate([(goal_rate_x, goal_rate_y)], [model_name], f"goal_rate_{model_name}_{dataset_name}.png")

        # Cumulative Goals Plot
        plots.plot_cumulative_goals([(cumulative_x, cumulative_y)], [model_name], f"cumulative_goals_{model_name}_{dataset_name}.png")

        # Calibration Curve: pass the raw probabilities, exactly as the other
        # evaluation code in this notebook does (previously a tuple of binned
        # curve data, unpacked in swapped order, was passed here instead).
        plots.plot_calibration([y_test], [y_pred_proba], [model_name], f"calibration_{model_name}_{dataset_name}.png")



In [None]:
# Define a function to evaluate a model
def evaluate_model(model, X, y, model_name, dataset_name):
    """Score one model on one dataset and save its four diagnostic plots.

    Args:
        model: fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the positive-class probability.
        X: feature matrix for ``model``.
        y: true labels.
        model_name: label used in the plots and in output file names.
        dataset_name: dataset label used in output file names.

    Returns:
        None. Plots are produced through the ``plots`` helpers, with file
        names of the form ``<plot>_<model_name>_<dataset_name>.png``.
    """
    positive_proba = model.predict_proba(X)[:, 1]

    # Every output file shares this "<model>_<dataset>.png" tail
    file_tail = f"{model_name}_{dataset_name}.png"

    # Plot inputs derived from the labels and the predicted probabilities
    roc_curve_data = plots.generate_roc_auc_data(y, positive_proba)
    rate_x, rate_y = plots.generate_goal_rate_data(y, positive_proba)
    cum_x, cum_y = plots.generate_cumulative_goal_data(y, positive_proba)
    # The calibration helper is still invoked, but its result is not used:
    # plot_calibration below receives the raw probabilities.
    _prob_true, _prob_pred = plots.generate_calibration_data(y, positive_proba)

    plots.plot_roc_auc([roc_curve_data], [model_name], f"roc_{file_tail}")
    plots.plot_goal_rate([(rate_x, rate_y)], [model_name], f"goal_rate_{file_tail}")
    plots.plot_cumulative_goals([(cum_x, cum_y)], [model_name], f"cumulative_goals_{file_tail}")
    plots.plot_calibration([y], [positive_proba], [model_name], f"calibration_{file_tail}")


In [136]:
# One-hot columns to strip from the XGBoost feature frames
# (presumably columns the trained XGBoost model does not expect - TODO confirm)
columns_to_drop = [
    "last_event_type_period-end",
    "last_event_type_period-start",
    "last_event_type_delayed-penalty",
    "last_event_type_stoppage",
    "shot_type_unknown",
    "last_event_type_game-end"
]

# errors="ignore" skips any listed column a frame does not have, which also
# makes this cell safe to run more than once.
X_xgboost_regular, X_xgboost_playoffs = (
    frame.drop(columns=columns_to_drop, errors="ignore")
    for frame in (X_xgboost_regular, X_xgboost_playoffs)
)


In [None]:
class _BoosterProbaAdapter:
    """Give a native xgboost Booster the ``predict_proba`` interface used by evaluate_model."""

    def __init__(self, booster):
        self._booster = booster

    def predict_proba(self, X):
        # As in the rest of this notebook, Booster.predict output is treated as
        # the positive-class probability (assumes a binary-logistic objective -
        # TODO confirm against the training notebook).
        positive = self._booster.predict(xgboost.DMatrix(X))
        # Two columns [P(class 0), P(class 1)], so that [:, 1] selects `positive`
        return np.column_stack((1.0 - positive, positive))


# (model key in `models`, features, target, dataset label) for every evaluation run
model_datasets = [
    ('distance_model', X_distance_regular, y_distance_regular, "Regular Season"),
    ('distance_model', X_distance_playoff, y_distance_playoff, "Playoffs"),
    ('angle_model', X_angle_regular, y_angle_regular, "Regular Season"),
    ('angle_model', X_angle_playoff, y_angle_playoff, "Playoffs"),
    ('combined_model', X_log_regular, y_log_regular, "Regular Season"),
    ('combined_model', X_log_playoff, y_log_playoff, "Playoffs"),
    ('XGBoost', X_xgboost_regular, y_xgboost_regular, "Regular Season"),
    # NOTE(review): XGBoost on playoffs is disabled - presumably the one-hot columns
    # of the playoff frame do not match the trained model's features; TODO confirm.
    #('XGBoost', X_xgboost_playoffs, y_xgboost_playoffs, "Playoffs"),
    ('LightGBM', X_lightgbm_regular, y_lightgbm_regular, "Regular Season"),
    ('LightGBM', X_lightgbm_playoffs, y_lightgbm_playoffs, "Playoffs")
]

for model_name, X, y, dataset_name in model_datasets:
    model = models[model_name]  # Retrieve the model from the dictionary

    print(f"Evaluating {model_name} on {dataset_name} dataset...")

    # The XGBoost model is loaded as a native Booster, which has no predict_proba;
    # wrap it so every model goes through the same evaluation and plotting path.
    if isinstance(model, xgboost.Booster):
        model = _BoosterProbaAdapter(model)

    evaluate_model(model, X, y, model_name, dataset_name)



Evaluating distance_model on Regular Season dataset...
Shape of y (true labels): (94320,)
Shape of calibration_prob_pred (positive class probabilities): (2,)
Generating plots for distance_model on Regular Season...
Evaluating distance_model on Playoffs dataset...
Shape of y (true labels): (9996,)
Shape of calibration_prob_pred (positive class probabilities): (2,)
Generating plots for distance_model on Playoffs...
Evaluating angle_model on Regular Season dataset...
Shape of y (true labels): (94320,)
Shape of calibration_prob_pred (positive class probabilities): (1,)
Generating plots for angle_model on Regular Season...
Evaluating angle_model on Playoffs dataset...
Shape of y (true labels): (9996,)
Shape of calibration_prob_pred (positive class probabilities): (1,)
Generating plots for angle_model on Playoffs...
Evaluating combined_model on Regular Season dataset...
Shape of y (true labels): (94095,)
Shape of calibration_prob_pred (positive class probabilities): (2,)
Generating plots for

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>