# **Libraries**

In [1]:
!pip install timm



In [2]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch_tabnet
Successfully installed pytorch_tabnet-4.1.0


In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
!pip install warnings

[0m

In [5]:
# Data Processing n' Visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Compute
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Data
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Random
import os
import random as rand
import timm

In [6]:
torch.cuda.empty_cache()

In [7]:
def set_seed(seed):
  rand.seed(seed)
  np.random.seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

seed = 123
set_seed(123)

--------------------
# **Data Sample**

In [8]:
data_dir = '/kaggle/input/wharton-basketball-dataset/games_2022.xlsx'

In [9]:
df = pd.read_excel(data_dir)
print(f"Dataset Type: {type(df)}")

Dataset Type: <class 'pandas.core.frame.DataFrame'>


In [10]:
df_ts = df

----------------------------
# **Data Preprocessing**

In [11]:
df_ts = df_ts.drop(columns = ['OT_length_min_tot', 'attendance', 'tz_dif_H_E', 'opponent_team_score', 
                        'team_score', 'home_away', 'notD1_incomplete', 'largest_lead'])
df_ts = df_ts.dropna()
df_ts['home_away_NS'] = df_ts['home_away_NS'].replace({
    1: 1, -1: 0, 0: 2
})

from sklearn.preprocessing import MinMaxScaler

# List of columns to normalize
stats_to_normalize = ['FGA_2', 'FGM_2', 'FGA_3', 'FGM_3', 
                      'FTA', 'FTM', 'AST', 'BLK', 'STL', 'TOV', 
                      'TOV_team', 'DREB', 'OREB', 'F_tech', 'F_personal', 
                      'rest_days', 'prev_game_dist', 'travel_dist']

# Initialize MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))

# Apply MinMaxScaler only to the selected stats
df_ts[stats_to_normalize] = scaler.fit_transform(df_ts[stats_to_normalize])

print(df_ts.head())  # Check normalized values

          game_id  game_date                       team     FGA_2     FGM_2  \
0  game_2022_2011 2021-12-30      georgia_lady_bulldogs  0.661290  0.513514   
1  game_2022_2011 2021-12-30                 lsu_tigers  0.661290  0.567568   
2  game_2022_2012 2021-12-30            missouri_tigers  0.548387  0.405405   
3  game_2022_2012 2021-12-30   south_carolina_gamecocks  0.741935  0.540541   
4  game_2022_2013 2021-12-30  tennessee_lady_volunteers  0.516129  0.459459   

      FGA_3     FGM_3       FTA    FTM       AST  ...       TOV  TOV_team  \
0  0.196078  0.227273  0.125000  0.075  0.361111  ...  0.390244  0.000000   
1  0.196078  0.181818  0.312500  0.200  0.388889  ...  0.292683  0.333333   
2  0.274510  0.318182  0.333333  0.325  0.250000  ...  0.146341  0.166667   
3  0.392157  0.272727  0.187500  0.125  0.388889  ...  0.146341  0.000000   
4  0.274510  0.181818  0.312500  0.250  0.416667  ...  0.317073  0.166667   

       DREB     OREB  F_tech  F_personal  rest_days  prev_game

In [12]:
Data = {}
X = []
y = []
start_col = 'FGA_2'
end_col = 'F_personal'

start_col_nx = 'rest_days'
end_col_nx = 'travel_dist'
# Loop through all Teams in Dataset
for i ,team in enumerate(df_ts['team'].unique()):
    team_data = df_ts[df_ts['team'] == team]
    for idx in range(len(team_data) - 1):
        # Features = past game (FGA_2 to F_personal) + current game stats (rest_days to travel_dist)
        past_game = team_data.iloc[idx]
        next_game = team_data.iloc[idx + 1]
        
        past_stats = past_game.loc[start_col : end_col].values
        next_stats = next_game.loc[start_col_nx:end_col_nx].values

        # We predict next game FGA_2 to F_personal
        label = next_game.loc[start_col:end_col].values

        if idx == 0 and i == 0:
            combined_stats_x = np.concatenate((past_stats, next_stats))
            print(combined_stats_x)
            print(f"Type of combined_stats: {type(combined_stats_x)}")
            print(f"First Index: {combined_stats_x[0]}")
            print("-"*59)
            print(f"Label: {label}")
            print(f"Type of Label: {type(label)}")
            print(f"First Index: {label[0]}")

        combined_stats = np.concatenate((past_stats, next_stats))
        X.append(combined_stats)
        y.append(label)

[0.6612903225806451 0.5135135135135136 0.19607843137254904
 0.2272727272727273 0.125 0.07500000000000001 0.36111111111111105
 0.38888888888888884 0.25925925925925924 0.3902439024390244 0.0
 0.41025641025641024 0.34375 0.0 0.4516129032258064 0.05263157894736842
 0.06209905055170644 0 0.07113462669018225]
Type of combined_stats: <class 'numpy.ndarray'>
First Index: 0.6612903225806451
-----------------------------------------------------------
Label: [0.7580645161290323 0.6216216216216217 0.09803921568627451
 0.045454545454545456 0.47916666666666663 0.45 0.25 0.1111111111111111
 0.5555555555555556 0.3658536585365854 0.3333333333333333
 0.2564102564102564 0.5 0.0 0.3548387096774194]
Type of Label: <class 'numpy.ndarray'>
First Index: 0.7580645161290323


In [13]:
print(f"First Features: {X[0]}")
print(f"Number of Features: {len(X[0])}")

print("-"*59)
print(f"First Labels: {y[0]}")
print(f"Number of Features to Predict: {len(y[0])}")

print(f"Type: {type(X), type(y)}")

First Features: [0.6612903225806451 0.5135135135135136 0.19607843137254904
 0.2272727272727273 0.125 0.07500000000000001 0.36111111111111105
 0.38888888888888884 0.25925925925925924 0.3902439024390244 0.0
 0.41025641025641024 0.34375 0.0 0.4516129032258064 0.05263157894736842
 0.06209905055170644 0 0.07113462669018225]
Number of Features: 19
-----------------------------------------------------------
First Labels: [0.7580645161290323 0.6216216216216217 0.09803921568627451
 0.045454545454545456 0.47916666666666663 0.45 0.25 0.1111111111111111
 0.5555555555555556 0.3658536585365854 0.3333333333333333
 0.2564102564102564 0.5 0.0 0.3548387096774194]
Number of Features to Predict: 15
Type: (<class 'list'>, <class 'list'>)


In [14]:
X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)

# **Dataset**

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

print(X_train.shape)  # Should be (num_samples, num_features) → (N, 19)
print(y_train.shape)  # Should be (num_samples, num_outputs) → (N, 15)
print(f"Type of X_train and X_test: {type(X_train)} | {type(X_test)}")
print(X_train.dtype, y_train.dtype)
print(X_val.dtype, y_val.dtype)
print(X_test.dtype, y_test.dtype)

(6139, 19)
(6139, 15)
Type of X_train and X_test: <class 'numpy.ndarray'> | <class 'numpy.ndarray'>
float32 float32
float32 float32
float32 float32


In [16]:
import torch
from torch.utils.data import Dataset

class bkb_dataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature = torch.tensor(self.data[idx], dtype=torch.float32)  # Use float32
        label = torch.tensor(self.label[idx], dtype=torch.float32)  # Use float32

        return {"input_ids": feature, "labels": label}

In [17]:
train_set = bkb_dataset(
    X_train,
    y_train,
)

val_set = bkb_dataset(
    X_val,
    y_val,
)

test_set = bkb_dataset(
    X_test,
    y_test,
)
print(f"Length of train_set: {len(train_set)}")
print(f"Length of test_set: {len(test_set)}")
print(f"Length of val_set: {len(val_set)}")

Length of train_set: 6139
Length of test_set: 1763
Length of val_set: 869


In [18]:
train_batch = 256
test_batch = 32

train_loader = DataLoader(
    train_set,
    batch_size = train_batch,
    shuffle = True
)

val_loader = DataLoader(
    val_set,
    batch_size = test_batch,
    shuffle = False
)

test_loader = DataLoader(
    test_set,
    batch_size = test_batch,
    shuffle = False
)
print(f"Length train_loader: {len(train_loader)}")
print(f"Length test_loader: {len(test_loader)}")
print(f"Length val_loader: {len(val_loader)}")

Length train_loader: 24
Length test_loader: 56
Length val_loader: 28


# **Model**

In [19]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Login to Hugging Face
login(HUGGINGFACE_TOKEN)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
import torch.nn.functional as F

class FeatureGrouping(nn.Module):
    def __init__(self, num_features=19, max_groups=3, embed_dim=4, output_dim=15, resnet_model="resnet50", finetune=False):
        super().__init__()
        self.num_features = num_features
        self.max_groups = max_groups
        self.embed_dim = embed_dim
        self.output_dim = output_dim

        # Embedding for home_away (3 categories: Home, Away, Neutral)
        self.home_away_embed = nn.Embedding(3, embed_dim)

        # Adjust feature count after embedding replacement
        self.adjusted_num_features = num_features - 1 + embed_dim  # 22 total features

        # Calculate the maximum features per group to ensure consistent dimensions
        self.max_features_per_group = self.adjusted_num_features

        # Learnable logits for feature assignment
        self.assignment_logits = nn.Parameter(torch.randn(self.adjusted_num_features, max_groups))

        # Self-Attention layers for each possible number of groups
        self.attention_layers = nn.ModuleDict({
            f"attn_{g}": nn.MultiheadAttention(
                embed_dim=self.max_features_per_group,
                num_heads=1,
                batch_first=True
            )
            for g in range(1, max_groups + 1)
        })

        # Reduce channels before ResNet
        self.channel_reducer = nn.Conv2d(in_channels=max_groups, out_channels=3, kernel_size=1)

        # Pretrained ResNet model
        self.resnet = timm.create_model(resnet_model, pretrained=True)
        in_features = self.resnet.get_classifier().in_features
        self.resnet.reset_classifier(0)

        # Final regression head
        self.fc = nn.Linear(in_features, output_dim)

        # Set ResNet layers to trainable or frozen based on finetune flag
        if finetune:
            for param in self.resnet.parameters():
                param.requires_grad = True
        else:
            for param in self.resnet.parameters():
                param.requires_grad = False

    def forward(self, x):
        batch_size = x.shape[0]

        # Extract home_away index and convert to embeddings
        home_away_idx = x[:, 17].long().clamp(0, 2)
        home_away_embed = self.home_away_embed(home_away_idx)
        x = torch.cat([x[:, :17], home_away_embed, x[:, 18:]], dim=1)

        # Hard feature assignment
        assignment_hard = torch.argmax(self.assignment_logits, dim=1)

        all_group_outputs = []

        # Process different group configurations
        for num_groups in range(1, self.max_groups + 1):
            # Split features into groups
            groups = []
            features_per_group = self.adjusted_num_features // num_groups
            
            for g in range(num_groups):
                start_idx = g * features_per_group
                end_idx = min(start_idx + features_per_group, self.adjusted_num_features)
                group_features = x[:, start_idx:end_idx]
                
                # Pad to match max_features_per_group
                if group_features.shape[1] < self.max_features_per_group:
                    pad_size = self.max_features_per_group - group_features.shape[1]
                    padding = torch.zeros(batch_size, pad_size, device=x.device)
                    group_features = torch.cat([group_features, padding], dim=1)
                
                groups.append(group_features)

            # Process each group with attention
            processed_groups = []
            for g in range(num_groups):
                group_features = groups[g].unsqueeze(1)
                attn_output, _ = self.attention_layers[f"attn_{num_groups}"](
                    group_features, group_features, group_features)
                processed_groups.append(attn_output)

            # Combine processed groups
            group_output = torch.cat(processed_groups, dim=1)
            
            # Pad to match max_groups if necessary
            if num_groups < self.max_groups:
                padding = torch.zeros(
                    batch_size,
                    self.max_groups - num_groups,
                    self.max_features_per_group,
                    device=x.device
                )
                group_output = torch.cat([group_output, padding], dim=1)
            
            all_group_outputs.append(group_output)

        # Stack all configurations
        x_final = torch.stack(all_group_outputs, dim=1)  # [B, max_groups, max_groups, Features]
        
        # Reshape for channel reducer
        x_final = x_final.mean(dim=1)  # [B, max_groups, Features]
        x_final = x_final.permute(0, 2, 1)  # [B, Features, max_groups]
        x_final = x_final.mean(dim=1).unsqueeze(-1).unsqueeze(-1)  # [B, max_groups, 1, 1]
        
        # Apply channel reduction
        x_final = self.channel_reducer(x_final)  # [B, 3, 1, 1]
        
        # Prepare for ResNet
        x_final = x_final.expand(-1, -1, 224, 224)  # [B, 3, 224, 224]

        # Process through ResNet and final layer
        x_final = self.resnet(x_final)
        output = self.fc(x_final)

        return output

# Example usage
batch_size = 1
x = torch.randn(batch_size, 19)  # Example input features
model = FeatureGrouping(resnet_model="resnet50", finetune=False)  # Finetune ResNet
output = model(x)
print(output.shape)  # Expected: (B, 15)
print(output)

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

torch.Size([1, 15])
tensor([[-0.1114,  0.0481, -0.0171,  0.0011, -0.0177, -0.0382, -0.0536,  0.0398,
         -0.1094,  0.0083, -0.0136,  0.0496, -0.0190, -0.0583,  0.0661]],
       grad_fn=<AddmmBackward0>)


In [22]:
from transformers import PreTrainedModel, PretrainedConfig

class FeatureGroupingConfig(PretrainedConfig):
    model_type = "feature_grouping"

    def __init__(self, num_features=19, output_dim=15, **kwargs):
        super().__init__(**kwargs)
        self.num_features = num_features
        self.output_dim = output_dim

class FeatureGroupingModel(PreTrainedModel):
    config_class = FeatureGroupingConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = FeatureGrouping(
            num_features=config.num_features, 
            output_dim=config.output_dim
        )

    def forward(self, input_ids, labels=None):
        output = self.model(input_ids)

        loss = None
        if labels is not None:
            loss = F.mse_loss(output, labels)  # Mean Squared Error for regression

        return {"loss": loss, "logits": output} if loss is not None else {"logits": output}

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [23]:
import evaluate

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.array(predictions)
    labels = np.array(labels)

    mse_per_feature = ((predictions - labels) ** 2).mean(axis=0)  # (15,)
    mse_mean = mse_per_feature.mean()  # Overall MSE

    print(f"\n📢 Epoch MSE: {mse_mean:.4f}")  # Print MSE for each epoch

    return {"mse_mean": mse_mean}

In [24]:
from transformers import TrainerCallback

class LossLoggerCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            print(f"📢 Epoch {state.epoch:.0f}: Train Loss = {logs['loss']:.4f}")

In [25]:
import warnings
from transformers import Trainer, TrainingArguments

# Ignore all warnings
warnings.filterwarnings("ignore")

training_args = TrainingArguments(
    output_dir="./WhartonDS_RegressionModel",
    learning_rate=1e-5,
    eval_strategy="epoch",  # Evaluate at each epoch
    save_strategy="epoch",
    logging_strategy="epoch",  # Log every epoch
    logging_dir="./logs",
    logging_steps=1,  # Log at every step
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    num_train_epochs=60,
    load_best_model_at_end = True,
    push_to_hub=True,
    optim='adamw_torch',
    report_to="none"
)

# Initialize Model
model = FeatureGroupingModel(FeatureGroupingConfig())

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    #callbacks=[LossLoggerCallback()]  # Add the callback here
)

# Train the Model
trainer.train()

# Save Model
trainer.save_model("./WhartonDS_RegressionModel")

# Save Model to Hugging Face Hub
trainer.push_to_hub("KanWasTaken/WhartonDS_RegressionModel")

Epoch,Training Loss,Validation Loss
1,0.112,0.11392
2,0.0986,0.101506
3,0.0868,0.070105
4,0.077,0.077421
5,0.0688,0.066761
6,0.0616,0.06104
7,0.0552,0.053954
8,0.0503,0.048525
9,0.0458,0.045428
10,0.0422,0.041822


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/KanWasTaken/WhartonDS_RegressionModel/commit/04a50fa2fdac51ef95405f54b7f3fd03e64d04fe', commit_message='KanWasTaken/WhartonDS_RegressionModel', commit_description='', oid='04a50fa2fdac51ef95405f54b7f3fd03e64d04fe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KanWasTaken/WhartonDS_RegressionModel', endpoint='https://huggingface.co', repo_type='model', repo_id='KanWasTaken/WhartonDS_RegressionModel'), pr_revision=None, pr_num=None)

# **Test**

In [26]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.3, random_state=42)
X_val1, X_test1, y_val1, y_test1 = train_test_split(X_test1, y_test1, test_size=0.33, random_state=42)

print(X_train1.shape)  # Should be (num_samples, num_features) → (N, 19)
print(y_train1.shape)  # Should be (num_samples, num_outputs) → (N, 15)

(6139, 19)
(6139, 15)


In [27]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# Wrap XGBRegressor to handle multi-output regression
model = MultiOutputRegressor(XGBRegressor(
    objective='reg:squarederror',  
    n_estimators=100,  
    learning_rate=0.1  
))

# Train model
model.fit(X_train1, y_train1)  

y_pred = model.predict(X_test1)
print(y_pred.shape)  # Should be (N_test, 15)

from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test1, y_pred, multioutput='raw_values')  # RMSE for each label
print("RMSE per output:", rmse)
print("Average RMSE:", rmse.mean())  # Average RMSE across all 15 labels

(869, 15)
RMSE per output: [0.01580971 0.01553993 0.01405026 0.01751576 0.02033821 0.01815271
 0.0154973  0.01604958 0.01456579 0.01404391 0.02238351 0.01987477
 0.01867983 0.00606631 0.02056576]
Average RMSE: 0.016608888


In [28]:
X_pred_home = model.predict(np.array([X_train1[0]]))  # Ensures a 2D shape (1, 19)
X_pred_away = model.predict(np.array([X_train1[1]]))
print(X_pred_home)
print(X_pred_away)

[[0.44560626 0.2505317  0.35928938 0.27473179 0.35134402 0.24747607
  0.28863877 0.10484435 0.25987056 0.33177465 0.07613    0.41842663
  0.38874078 0.01177177 0.33863005]]
[[0.5044346  0.37466002 0.33185732 0.25413755 0.3480368  0.2988583
  0.3174013  0.15391321 0.26297748 0.31009695 0.07529958 0.3895041
  0.35768604 0.0132556  0.3995734 ]]


In [29]:
from pytorch_tabnet.tab_model import TabNetRegressor

X_train1 = X_train1.astype(np.float32)
y_train1 = y_train1.astype(np.float32)
X_val1 = X_val1.astype(np.float32)
y_val1 = y_val1.astype(np.float32)

X_test1 = X_test1.astype(np.float32)
y_test1 = y_test1.astype(np.float32)

if y_test1.ndim == 1:
    y_test1 = y_test1.reshape(-1, 1)

if y_train1.ndim == 1:
    y_train1 = y_train1.reshape(-1, 1)
if y_val1.ndim == 1:
    y_val1 = y_val1.reshape(-1, 1)
    
model = TabNetRegressor()
model.fit(X_train1, y_train1, eval_set=[(X_val1, y_val1)])
preds = model.predict(X_test1)

epoch 0  | loss: 1.12782 | val_0_mse: 0.1220100000500679|  0:00:00s
epoch 1  | loss: 0.22737 | val_0_mse: 0.12026000022888184|  0:00:01s
epoch 2  | loss: 0.12084 | val_0_mse: 0.11929000169038773|  0:00:01s
epoch 3  | loss: 0.10212 | val_0_mse: 0.1088699996471405|  0:00:01s
epoch 4  | loss: 0.08596 | val_0_mse: 0.0903099998831749|  0:00:01s
epoch 5  | loss: 0.07088 | val_0_mse: 0.07384999841451645|  0:00:02s
epoch 6  | loss: 0.05865 | val_0_mse: 0.05849999934434891|  0:00:02s
epoch 7  | loss: 0.04666 | val_0_mse: 0.044840000569820404|  0:00:02s
epoch 8  | loss: 0.03785 | val_0_mse: 0.03044999949634075|  0:00:02s
epoch 9  | loss: 0.03169 | val_0_mse: 0.023420000448822975|  0:00:03s
epoch 10 | loss: 0.02778 | val_0_mse: 0.02078000083565712|  0:00:03s
epoch 11 | loss: 0.02527 | val_0_mse: 0.018799999728798866|  0:00:03s
epoch 12 | loss: 0.02378 | val_0_mse: 0.018309999257326126|  0:00:03s
epoch 13 | loss: 0.02261 | val_0_mse: 0.0181099995970726|  0:00:04s
epoch 14 | loss: 0.02187 | val_0_m