# **Libraries**

In [1]:
!pip install timm



In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
# Data Processing n' Visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Compute
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Data
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Random
import os
import random as rand
import timm

In [4]:
torch.cuda.empty_cache()

In [5]:
def set_seed(seed):
  rand.seed(seed)
  np.random.seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

seed = 59
set_seed(59)

-------------------------
# **Data Sample**

In [6]:
data_dir = '/kaggle/input/wharton-bkb-dataset/games_2022 (1).xlsx'

In [7]:
df = pd.read_excel(data_dir)
print(f"Dataset Type: {type(df)}")

Dataset Type: <class 'pandas.core.frame.DataFrame'>


In [8]:
df_cls = df

-----------------------------
# **Data Preprocessing**

In [9]:
df_cls = df_cls.drop(columns = ['OT_length_min_tot', 'attendance', 'tz_dif_H_E',
                        'home_away', 'notD1_incomplete', 'largest_lead'])
df_cls = df_cls.dropna()
df_cls['home_away_NS'] = df_cls['home_away_NS'].replace({
    1: 1, -1: 0, 0: 2
})

from sklearn.preprocessing import MinMaxScaler

# List of columns to normalize
stats_to_normalize = ['FGA_2', 'FGM_2', 'FGA_3', 'FGM_3', 
                      'FTA', 'FTM', 'AST', 'BLK', 'STL', 'TOV', 
                      'TOV_team', 'DREB', 'OREB', 'F_tech', 'F_personal', 
                      'rest_days', 'prev_game_dist', 'travel_dist']

# Initialize MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))

# Apply MinMaxScaler only to the selected stats
df_cls[stats_to_normalize] = scaler.fit_transform(df_cls[stats_to_normalize])

print(df_cls.head())  # Check normalized values

          game_id  game_date                       team     FGA_2     FGM_2  \
0  game_2022_2011 2021-12-30      georgia_lady_bulldogs  0.661290  0.513514   
1  game_2022_2011 2021-12-30                 lsu_tigers  0.661290  0.567568   
2  game_2022_2012 2021-12-30            missouri_tigers  0.548387  0.405405   
3  game_2022_2012 2021-12-30   south_carolina_gamecocks  0.741935  0.540541   
4  game_2022_2013 2021-12-30  tennessee_lady_volunteers  0.516129  0.459459   

      FGA_3     FGM_3       FTA    FTM       AST  ...      DREB     OREB  \
0  0.196078  0.227273  0.125000  0.075  0.361111  ...  0.410256  0.34375   
1  0.196078  0.181818  0.312500  0.200  0.388889  ...  0.410256  0.34375   
2  0.274510  0.318182  0.333333  0.325  0.250000  ...  0.564103  0.18750   
3  0.392157  0.272727  0.187500  0.125  0.388889  ...  0.461538  0.62500   
4  0.274510  0.181818  0.312500  0.250  0.416667  ...  0.641026  0.37500   

   F_tech  F_personal  team_score  opponent_team_score  rest_days  \

In [10]:
def preprocess_data_diff(data):
    
    """
    Preprocessed Data (1st Step)
        Input: 
        - data: Dataset File -> csv

        Output:
        - processed_df: Processed Dataset File -> pd
    """
    
    processed_data = []
    
    stats_to_diff = [
        'FGA_2', 'FGM_2', 'FGA_3', 'FGM_3', 'FTA',
        'FTM', 'AST', 'BLK', 'STL', 'TOV', 'TOV_team',
        'DREB', 'OREB', 'F_tech', 'F_personal', 'rest_days',
        'prev_game_dist', 'travel_dist'
    ]
    
    # Process each game
    for game_id in data['game_id'].unique():
        game_data = data[data['game_id'] == game_id]

        # Ensure the game has exactly 2 teams
        if len(game_data) != 2:
            print(f"Skipping game {game_id} due to missing teams.")
            continue

        # Extract teams
        teamA = game_data.iloc[0]
        teamB = game_data.iloc[1]

        entry = {
            'teamA': teamA['team'],
            'teamB': teamB['team'],
            'teamA_score': teamA['team_score'],
            'teamB_score': teamB['team_score'],

            # Embedding for Home/Away/Neutral
            'A_H/W/N': teamA['home_away_NS'], 
            'B_H/W/N': teamB['home_away_NS'],

            # 0: Lost | 1: Won | 2: Draw
            'W/L/D (teamA)': 0 if teamA['team_score'] < teamB['team_score']
                            else 1 if teamA['team_score'] > teamB['team_score']
                            else 2
        }

        # Compute stat differences
        for stat in stats_to_diff:
            # Handle NA values
            if pd.isna(teamB[stat]) and pd.isna(teamA[stat]):
                teamA[stat], teamB[stat] = 0, 0
                
            elif pd.isna(teamA[stat]):
                print(f"Team A ({teamA['team']}) {stat} has NA. Using Team B's value.")
                teamA[stat] = teamB[stat]
                
            elif pd.isna(teamB[stat]):
                print(f"Team B ({teamB['team']}) {stat} has NA. Using Team A's value.")
                teamB[stat] = teamA[stat]
                
            # Compute difference
            entry[f'{stat}_diff (A - B)'] = teamA[stat] - teamB[stat]
        
        processed_data.append(entry)

    # Convert to DataFrame
    processed_df = pd.DataFrame(processed_data)

    return processed_df

In [11]:
df_cls_diff = preprocess_data_diff(df_cls)

Skipping game game_2022_1320 due to missing teams.
Skipping game game_2022_2198 due to missing teams.
Skipping game game_2022_2621 due to missing teams.
Skipping game game_2022_3347 due to missing teams.
Skipping game game_2022_3744 due to missing teams.
Skipping game game_2022_4049 due to missing teams.
Skipping game game_2022_4745 due to missing teams.
Skipping game game_2022_181 due to missing teams.
Skipping game game_2022_1994 due to missing teams.
Skipping game game_2022_3906 due to missing teams.
Skipping game game_2022_4264 due to missing teams.
Skipping game game_2022_2441 due to missing teams.
Skipping game game_2022_182 due to missing teams.
Skipping game game_2022_183 due to missing teams.
Skipping game game_2022_219 due to missing teams.
Skipping game game_2022_220 due to missing teams.
Skipping game game_2022_221 due to missing teams.
Skipping game game_2022_222 due to missing teams.
Skipping game game_2022_320 due to missing teams.
Skipping game game_2022_468 due to miss

In [12]:
df_cls_diff = df_cls_diff.drop(columns = ['teamA', 'teamB', 'teamA_score', 'teamB_score'])

A_HWN = df_cls_diff['A_H/W/N']
B_HWN = df_cls_diff['B_H/W/N']
df_cls_diff = df_cls_diff.drop(columns = ['A_H/W/N', 'B_H/W/N'])
df_cls_diff['A_H/W/N'] = A_HWN
df_cls_diff['B_H/W/N'] = B_HWN

travel_dist = df_cls_diff['travel_dist_diff (A - B)']
df_cls_diff = df_cls_diff.drop(columns = ['travel_dist_diff (A - B)'])
df_cls_diff['travel_dist_diff (A - B)'] = travel_dist

In [13]:
df_cls_diff

Unnamed: 0,W/L/D (teamA),FGA_2_diff (A - B),FGM_2_diff (A - B),FGA_3_diff (A - B),FGM_3_diff (A - B),FTA_diff (A - B),FTM_diff (A - B),AST_diff (A - B),BLK_diff (A - B),STL_diff (A - B),...,TOV_team_diff (A - B),DREB_diff (A - B),OREB_diff (A - B),F_tech_diff (A - B),F_personal_diff (A - B),rest_days_diff (A - B),prev_game_dist_diff (A - B),A_H/W/N,B_H/W/N,travel_dist_diff (A - B)
0,0,0.000000,-0.054054,0.000000,0.045455,-0.187500,-0.125,-0.027778,0.277778,-0.296296,...,-0.333333,0.000000,0.00000,0.0,0.354839,0.157895,-0.105722,1,0,-0.121105
1,1,-0.193548,-0.135135,-0.117647,0.045455,0.145833,0.200,-0.138889,-0.388889,0.037037,...,0.166667,0.102564,-0.43750,0.0,-0.129032,-0.026316,-0.100462,1,0,-0.169606
2,1,-0.016129,0.216216,-0.274510,-0.045455,0.145833,0.125,0.222222,0.166667,0.000000,...,0.166667,0.282051,-0.18750,0.0,-0.096774,-0.368421,-0.056325,1,0,-0.064521
3,0,0.080645,0.027027,-0.058824,-0.045455,-0.187500,-0.050,-0.055556,-0.055556,-0.037037,...,-0.166667,-0.051282,-0.03125,-0.2,-0.032258,0.263158,-0.031050,0,1,0.028954
4,1,0.161290,0.189189,-0.647059,-0.318182,0.520833,0.350,0.194444,-0.111111,-0.259259,...,-0.166667,0.410256,-0.06250,0.0,-0.258065,-0.236842,0.118040,0,1,0.135215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4345,1,-0.129032,0.189189,0.039216,-0.045455,-0.083333,-0.025,0.222222,0.333333,-0.111111,...,-0.166667,0.230769,-0.15625,0.0,-0.064516,0.026316,-0.083269,1,0,-0.046884
4346,1,0.322581,0.243243,-0.039216,0.090909,0.062500,0.075,0.111111,-0.166667,0.222222,...,0.000000,0.051282,0.28125,0.2,-0.354839,0.000000,-0.080575,0,1,0.084950
4347,1,-0.064516,0.054054,0.254902,0.272727,0.000000,-0.025,0.277778,0.055556,0.222222,...,0.000000,0.102564,0.15625,0.0,0.000000,0.052632,0.051706,0,1,0.131393
4348,1,-0.032258,0.108108,0.254902,0.409091,0.062500,-0.050,0.361111,0.055556,0.333333,...,-0.333333,-0.025641,0.00000,0.0,0.000000,0.184211,-0.012959,0,1,0.025426


In [14]:
start_col = 'FGA_2_diff (A - B)'
end_col_test = 'F_personal_diff (A - B)'
df_test = df_cls_diff.loc[:,start_col : end_col_test]

In [15]:
# Our Regression Model Output 15 Labels.
len(df_test.columns) # => Should be 15

15

In [16]:
X = df_cls_diff.loc[:, start_col : 'travel_dist_diff (A - B)'].values

y = df_cls_diff.loc[:, 'W/L/D (teamA)'].values

X = np.array(X, dtype=np.float32)
y = np.array(y)

In [17]:
print(X[0])

print("-"*59)

print(X[0][17])
print(X[0][18])

print(f"Number of Features: {len(X[0])}")
A_HWN_idx = 17
B_HWN_idx = 18

[ 0.         -0.05405406  0.          0.04545455 -0.1875     -0.125
 -0.02777778  0.2777778  -0.2962963   0.09756097 -0.33333334  0.
  0.          0.          0.3548387   0.15789473 -0.10572235  1.
  0.         -0.12110523]
-----------------------------------------------------------
1.0
0.0
Number of Features: 20


------------------------
# **Dataset**

In [18]:
# 7/3 Train/Val
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_train.shape)  # Should be (num_samples, num_features) → (N, 19)
print(y_train.shape)  # Should be (num_samples, num_outputs) → (N, 15)
print(f"Type of X_train and X_test: {type(X_train)} | {type(X_val)}")

(3045, 20)
(3045,)
Type of X_train and X_test: <class 'numpy.ndarray'> | <class 'numpy.ndarray'>


In [19]:
import torch
from torch.utils.data import Dataset

class bkb_dataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature = torch.tensor(self.data[idx], dtype=torch.float32)  # Use float32
        label = torch.tensor(self.label[idx], dtype=torch.long)  # Use float32

        return {"input_ids": feature, "labels": label}

In [20]:
train_set = bkb_dataset(
    X_train,
    y_train,
)

val_set = bkb_dataset(
    X_val,
    y_val,
)

print(f"Length of train_set: {len(train_set)}")
print(f"Length of val_set: {len(val_set)}")

Length of train_set: 3045
Length of val_set: 1305


In [21]:
train_batch = 256
test_batch = 32

train_loader = DataLoader(
    train_set,
    batch_size = train_batch,
    shuffle = True
)

val_loader = DataLoader(
    val_set,
    batch_size = test_batch,
    shuffle = False
)

print(f"Length train_loader: {len(train_loader)}")
print(f"Length val_loader: {len(val_loader)}")

Length train_loader: 12
Length val_loader: 41


-------------------
# **Model**

In [22]:
from huggingface_hub import login

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Login to Hugging Face
login(HUGGINGFACE_TOKEN)

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
import torch.nn.functional as F

class FeatureGrouping_cls(nn.Module):
    def __init__(self, num_features=20, max_groups=3, embed_dim=4, output_dim=2, resnet_model="resnet50", finetune = True):
        super().__init__()
        self.num_features = num_features
        self.max_groups = max_groups
        self.embed_dim = embed_dim
        self.output_dim = output_dim

        # Embedding for home_away (3 categories: Home, Away, Neutral)
        self.home_away_embed = nn.Embedding(3, embed_dim)

        # Adjust feature count after embedding replacement
        self.adjusted_num_features = num_features - 2 + 2*embed_dim  # 20 - 2 + 2*4 = 26

        # Calculate the maximum features per group to ensure consistent dimensions
        self.max_features_per_group = self.adjusted_num_features

        # Learnable logits for feature assignment
        self.assignment_logits = nn.Parameter(torch.randn(self.adjusted_num_features, max_groups))

        # Self-Attention layers for each possible number of groups
        self.attention_layers = nn.ModuleDict({
            f"attn_{g}": nn.MultiheadAttention(
                embed_dim=self.max_features_per_group,
                num_heads=1,
                batch_first=True
            )
            for g in range(1, max_groups + 1)
        })

        # Reduce channels before ResNet
        self.channel_reducer = nn.Conv2d(in_channels=max_groups, out_channels=3, kernel_size=1)

        # Pretrained ResNet model
        self.resnet = timm.create_model(resnet_model, pretrained=True)
        in_features = self.resnet.get_classifier().in_features
        self.resnet.reset_classifier(0)

        # Final regression head
        self.fc = nn.Linear(in_features, output_dim)
        
        if finetune:
            for param in self.resnet.parameters():
                param.requires_grad = True
        else:
            for param in self.resnet.parameters():
                param.requires_grad = False
                
    def forward(self, x):
        batch_size = x.shape[0]

        # Extract home_away index and convert to embeddings
        A_home_away_idx = x[:, 17].long().clamp(0, 2)
        B_home_away_idx = x[:, 18].long().clamp(0, 2)
        
        A_home_away_embed = self.home_away_embed(A_home_away_idx)
        B_home_away_embed = self.home_away_embed(B_home_away_idx)
        
        x = torch.cat([x[:, :17], A_home_away_embed, B_home_away_embed, x[:, 19:]], dim=1)

        # Hard feature assignment
        assignment_hard = torch.argmax(self.assignment_logits, dim=1)

        all_group_outputs = []

        # Process different group configurations
        for num_groups in range(1, self.max_groups + 1):
            # Split features into groups
            groups = []
            features_per_group = self.adjusted_num_features // num_groups
            
            for g in range(num_groups):
                start_idx = g * features_per_group
                end_idx = min(start_idx + features_per_group, self.adjusted_num_features)
                group_features = x[:, start_idx:end_idx]
                
                # Pad to match max_features_per_group
                if group_features.shape[1] < self.max_features_per_group:
                    pad_size = self.max_features_per_group - group_features.shape[1]
                    padding = torch.zeros(batch_size, pad_size, device=x.device)
                    group_features = torch.cat([group_features, padding], dim=1)
                
                groups.append(group_features)

            # Process each group with attention
            processed_groups = []
            for g in range(num_groups):
                group_features = groups[g].unsqueeze(1)
                attn_output, _ = self.attention_layers[f"attn_{num_groups}"](
                    group_features, group_features, group_features)
                processed_groups.append(attn_output)

            # Combine processed groups
            group_output = torch.cat(processed_groups, dim=1)
            
            # Pad to match max_groups if necessary
            if num_groups < self.max_groups:
                padding = torch.zeros(
                    batch_size,
                    self.max_groups - num_groups,
                    self.max_features_per_group,
                    device=x.device
                )
                group_output = torch.cat([group_output, padding], dim=1)
            
            all_group_outputs.append(group_output)

        # Stack all configurations
        x_final = torch.stack(all_group_outputs, dim=1)  # [B, max_groups, max_groups, Features]
        
        # Reshape for channel reducer
        x_final = x_final.mean(dim=1)  # [B, max_groups, Features]
        x_final = x_final.permute(0, 2, 1)  # [B, Features, max_groups]
        x_final = x_final.mean(dim=1).unsqueeze(-1).unsqueeze(-1)  # [B, max_groups, 1, 1]
        
        # Apply channel reduction
        x_final = self.channel_reducer(x_final)  # [B, 3, 1, 1]
        
        # Prepare for ResNet
        x_final = x_final.expand(-1, -1, 224, 224)  # [B, 3, 224, 224]

        # Process through ResNet and final layer
        x_final = self.resnet(x_final)
        output = self.fc(x_final)

        return output

batch_size = 1
x = torch.randn(batch_size, 20)  # Example input features
y = torch.randint(0, 2, (batch_size,))  # Shape: (batch_size,)
model = FeatureGrouping_cls(resnet_model="resnet50")  # Using ResNet50
output = model(x)

print(output.shape)  # Expected: (B, 2)
print(output)

sm = nn.Softmax(dim = -1)
print(sm(output))

loss = nn.CrossEntropyLoss()
pred = loss(output, y)
print(pred)

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

torch.Size([1, 2])
tensor([[-0.0300,  0.1020]], grad_fn=<AddmmBackward0>)
tensor([[0.4671, 0.5329]], grad_fn=<SoftmaxBackward0>)
tensor(0.7613, grad_fn=<NllLossBackward0>)


----------------------
# **Train**

In [25]:
from transformers import PreTrainedModel, PretrainedConfig

class FeatureGroupingConfig(PretrainedConfig):
    model_type = "feature_grouping"

    def __init__(self, num_features=20, output_dim=2, **kwargs):
        super().__init__(**kwargs)
        self.num_features = num_features
        self.output_dim = output_dim

class FeatureGroupingModel(PreTrainedModel):
    config_class = FeatureGroupingConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = FeatureGrouping_cls(
            num_features=config.num_features, 
            output_dim=config.output_dim
        )

    def forward(self, input_ids, labels=None):
        output = self.model(input_ids)

        loss = None
        if labels is not None:
            CE_Loss = nn.CrossEntropyLoss()
            loss = CE_Loss(output, labels)  # Mean Squared Error for regression

        return {"loss": loss, "logits": output} if loss is not None else {"logits": output}

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [26]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./WhartonDS_ClsModel",
    learning_rate = 1e-5,
    eval_strategy="epoch",  # Evaluate at each epoch
    save_strategy="epoch",
    logging_strategy="epoch",  # Log every epoch
    logging_dir="./logs",
    logging_steps=1,  # Log at every step
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    num_train_epochs=30,
    weight_decay=0.0005,
    report_to="none",
    push_to_hub=True,
    optim = 'adamw_torch'
)

# Initialize Model
model = FeatureGroupingModel(FeatureGroupingConfig())

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set
)

# Train the Model
trainer.train()

# Save Model
trainer.save_model("./WhartonDS_ClsModel")

# Save Model to Hugging Face Hub
trainer.push_to_hub("KanWasTaken/WhartonDS_ClsModel")



Epoch,Training Loss,Validation Loss
1,0.6878,0.693287
2,0.6732,0.6894
3,0.6619,0.691235
4,0.6517,0.688288
5,0.6406,0.685896
6,0.6335,0.689111
7,0.6272,0.689971
8,0.6179,0.685507
9,0.6119,0.66762
10,0.6038,0.622965


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/KanWasTaken/WhartonDS_ClsModel/commit/7088f044b76ce7ff1578fd10238830d4498a06f0', commit_message='KanWasTaken/WhartonDS_ClsModel', commit_description='', oid='7088f044b76ce7ff1578fd10238830d4498a06f0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KanWasTaken/WhartonDS_ClsModel', endpoint='https://huggingface.co', repo_type='model', repo_id='KanWasTaken/WhartonDS_ClsModel'), pr_revision=None, pr_num=None)