In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_curve, auc, classification_report
import matplotlib.pyplot as plt
import wandb
from sklearn.preprocessing import MinMaxScaler
import torch
from Modules import plots

In [2]:
file_path = "dataframe/train_data_enriched.csv"  
df = pd.read_csv(file_path)

# Identify numerical and categorical features
numerical_features = ['game_seconds', 'game_period', 'x_coord', 'y_coord', 
                      'shot_distance', 'shot_angle', 'distance_from_last_event', 
                      'friendly_skaters', 'opposing_skaters', 'shot_angle_change', 'speed']
categorical_features = [
    'shot_type', 'empty_net', 'last_event_type', 'rebound', 
    'attacking_team_name', 'home_team'
]

# Handle missing values
df[numerical_features] = df[numerical_features].fillna(df[numerical_features].median())
df[categorical_features] = df[categorical_features].fillna('unknown')

# Encode categorical features using One-Hot Encoding
df_encoded = pd.get_dummies(df[categorical_features], drop_first=True)

# Combine numerical and encoded categorical features
X = pd.concat([df[numerical_features], df_encoded], axis=1)
y = df['is_goal']

# Ensure target variable has no missing values
y = y.fillna(0)

# Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val= scaler.transform(X_val)
X_test = scaler.transform(X_test)
# Convert data to NumPy arrays for TabNet

y_train_np = y_train.values
y_val_np = y_val.values
y_test_np = y_test.values


In [3]:
wandb.init(
    project="IFT6758.2024-A",
    name="TabNet Question 6",
    notes="Modele tabnet pour la question 6"
)


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: hicham-mazouzi (hicham-mazouzi-university-of-montreal). Use `wandb login --relogin` to force relogin


In [5]:
# Define and train the TabNet classifier
tabnet_clf = TabNetClassifier(
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type="sparsemax"  # "entmax" can also be used
)

tabnet_clf.fit(
    X_train=X_train, y_train=y_train_np,
    eval_set=[(X_val, y_val_np)],
    eval_name=["validation"],
    eval_metric=["auc", "accuracy", "logloss"],  # Add multiple metrics here
    max_epochs=50,
    patience=5,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)






epoch 0  | loss: 0.17642 | validation_auc: 0.8156  | validation_accuracy: 0.95014 | validation_logloss: 0.16577 |  0:01:08s
epoch 1  | loss: 0.16737 | validation_auc: 0.82583 | validation_accuracy: 0.9503  | validation_logloss: 0.16377 |  0:02:16s
epoch 2  | loss: 0.1658  | validation_auc: 0.82686 | validation_accuracy: 0.95008 | validation_logloss: 0.16263 |  0:03:25s
epoch 3  | loss: 0.16513 | validation_auc: 0.82706 | validation_accuracy: 0.95032 | validation_logloss: 0.16228 |  0:04:35s
epoch 4  | loss: 0.16471 | validation_auc: 0.82809 | validation_accuracy: 0.95026 | validation_logloss: 0.16212 |  0:05:59s
epoch 5  | loss: 0.16436 | validation_auc: 0.82936 | validation_accuracy: 0.95022 | validation_logloss: 0.16273 |  0:07:23s
epoch 6  | loss: 0.16447 | validation_auc: 0.82977 | validation_accuracy: 0.95023 | validation_logloss: 0.16248 |  0:08:47s
epoch 7  | loss: 0.16383 | validation_auc: 0.82726 | validation_accuracy: 0.95021 | validation_logloss: 0.16617 |  0:10:07s
epoch 8 



In [9]:
# Log metrics to Wandb dynamically
for epoch in range(len(tabnet_clf.history['loss'])):
    wandb.log({
        "epoch": epoch + 1,  # Epoch index starts at 1
        "train_loss": tabnet_clf.history['loss'][epoch],
        "learning_rate": tabnet_clf.history['lr'][epoch],
        "validation_auc": tabnet_clf.history['validation_auc'][epoch],
        "validation_accuracy": tabnet_clf.history['validation_accuracy'][epoch],
        "validation_logloss": tabnet_clf.history['validation_logloss'][epoch],
    })


# Log the best validation AUC for reference
wandb.log({"best_validation_auc": tabnet_clf.best_cost})

In [10]:
# Generate predictions
y_pred_proba = tabnet_clf.predict_proba(X_test)[:, 1]

# Compute ROC data
roc_data_tabnet = plots.generate_roc_auc_data(y_test, y_pred_proba)

# Generate data for other plots
goal_rate_x, goal_rate_y = plots.generate_goal_rate_data(y_test, y_pred_proba)
cumulative_x, cumulative_y = plots.generate_cumulative_goal_data(y_test, y_pred_proba)
calibration_prob_pred, calibration_prob_true = plots.generate_calibration_data(y_test, y_pred_proba)


In [11]:
# Plot ROC/AUC
plots.plot_roc_auc([roc_data_tabnet], ['XGBoost '],"roc_tabnet.png")

# Plot Goal Rate vs Percentile
plots.plot_goal_rate([(goal_rate_x, goal_rate_y)], ['XGBoost'],"goal_rate_tabnet.png")

# Plot Cumulative Goals vs Percentile
plots.plot_cumulative_goals([(cumulative_x, cumulative_y)], ['XGBoost'],"cumulative_goals_tabnet.png")

# Plot Calibration Curve
plots.plot_calibration([y_val], [y_pred_proba], ['XGBoost'],"calibration_tabnet.png")

<Figure size 1200x800 with 0 Axes>

In [None]:

# Log the image to W&B
wandb.log({"roc_curve": wandb.Image("roc_tabnet.png", caption="ROC Curve for Tabnet")})
wandb.log({"goal_rate": wandb.Image("goal_rate_tabnet.png", caption="Goal Rate Curve for Tabnet")})
wandb.log({"cumulative_goals": wandb.Image("cumulative_goals_tabnet.png", caption="Cumulative Goald Curve for Tabnet")})
wandb.log({"calibration": wandb.Image("calibration_tabnet.png", caption="Calibration Curve for Tabnet")})

tabnet_clf.save_model("Models/tabnet")  # Saves as Models/tabnet.zip
best_model_path = "Models/tabnet.zip"

# Log the model as a Wandb artifact
artifact = wandb.Artifact(
    name="TabNet_Q6",
    type="model",
    description=" TabNet model "
)
artifact.add_file(best_model_path)
wandb.log_artifact(artifact)

# Finish the Wandb run
wandb.finish()


Successfully saved model at Models/tabnet.zip


VBox(children=(Label(value='0.253 MB of 0.253 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_validation_auc,▁
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
learning_rate,██████████▁▁▁▁▁▁▁▁▁
train_loss,█▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁
validation_accuracy,▇█▆██▇▇▇█▆▁▆▄▄▄▁███
validation_auc,▁▅▅▅▆▆▆▅▆▇▇█▇██▇▇██
validation_logloss,▆▄▃▃▂▃▃▆▃▂▂█▂▁▂▃▅▁▁

0,1
best_validation_auc,0.1604
epoch,19.0
learning_rate,0.018
train_loss,0.1614
validation_accuracy,0.95032
validation_auc,0.83327
validation_logloss,0.16092
