# **Libraries**

In [1]:
!pip install torch==2.2.0 torchaudio torchsummary torchtext torchvision
!pip install numpy==1.24.3
!pip install timm
!pip install pytorch_tabnet

Collecting torch==2.2.0
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)


In [2]:
# Data Processing n' Visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Compute
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Data
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision import transforms
from torchtext.vocab import build_vocab_from_iterator

# Random
import os
import random as rand
import timm
import spacy

In [3]:
def set_seed(seed):
  rand.seed(seed)
  np.random.seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

seed = 59
set_seed(59)

--------------------
# **Data Sample**

In [4]:
data_dir = '/kaggle/input/wharton-basketball-dataset/games_2022.xlsx'

In [5]:
df = pd.read_excel(data_dir)
print(f"Dataset Type: {type(df)}")

Dataset Type: <class 'pandas.core.frame.DataFrame'>


In [6]:
df
df_ts = df

----------------------------
# **Data Preprocessing**

In [7]:
df_ts = df_ts.drop(columns = ['OT_length_min_tot', 'attendance', 'tz_dif_H_E', 'opponent_team_score', 
                        'team_score', 'home_away', 'notD1_incomplete', 'largest_lead'])
df_ts = df_ts.dropna()

df_ts['home_away_NS'] = df_ts['home_away_NS'].replace({1: 'Home', -1: 'Away', 0: 'Neutral'})

from sklearn.preprocessing import MinMaxScaler

# List of columns to normalize
stats_to_normalize = ['FGA_2', 'FGM_2', 'FGA_3', 'FGM_3', 
                      'FTA', 'FTM', 'AST', 'BLK', 'STL', 'TOV', 
                      'TOV_team', 'DREB', 'OREB', 'F_tech', 'F_personal', 
                      'rest_days', 'prev_game_dist', 'travel_dist']

# Initialize MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))

# Apply MinMaxScaler only to the selected stats
df_ts[stats_to_normalize] = scaler.fit_transform(df_ts[stats_to_normalize])

print(df_ts.head())  # Check normalized values

          game_id  game_date                       team     FGA_2     FGM_2  \
0  game_2022_2011 2021-12-30      georgia_lady_bulldogs  0.661290  0.513514   
1  game_2022_2011 2021-12-30                 lsu_tigers  0.661290  0.567568   
2  game_2022_2012 2021-12-30            missouri_tigers  0.548387  0.405405   
3  game_2022_2012 2021-12-30   south_carolina_gamecocks  0.741935  0.540541   
4  game_2022_2013 2021-12-30  tennessee_lady_volunteers  0.516129  0.459459   

      FGA_3     FGM_3       FTA    FTM       AST  ...       TOV  TOV_team  \
0  0.196078  0.227273  0.125000  0.075  0.361111  ...  0.390244  0.000000   
1  0.196078  0.181818  0.312500  0.200  0.388889  ...  0.292683  0.333333   
2  0.274510  0.318182  0.333333  0.325  0.250000  ...  0.146341  0.166667   
3  0.392157  0.272727  0.187500  0.125  0.388889  ...  0.146341  0.000000   
4  0.274510  0.181818  0.312500  0.250  0.416667  ...  0.317073  0.166667   

       DREB     OREB  F_tech  F_personal  rest_days  prev_game

In [8]:
# Step 1: Function to yield unique words
def get_tokens(data_iter):
    for word in data_iter:
        yield [word]  # Each word is a token

# Step 2: Build Vocabulary from Column
vocab = build_vocab_from_iterator(
    get_tokens(df_ts['home_away_NS']),
    specials=['<pad>', '<unk>'],  # Special tokens
    special_first=True
)
vocab.set_default_index(vocab['<unk>'])  # Handle unknown words

# Step 3: Convert Column to Integer Indices
df_ts['home_away_NS'] = df_ts['home_away_NS'].map(lambda x: vocab[x])

# Check if mapping worked
print("Vocabulary:", vocab.get_stoi())  # Prints word-to-index mapping

Vocabulary: {'Neutral': 4, 'Home': 3, 'Away': 2, '<unk>': 1, '<pad>': 0}


In [9]:
display(df_ts)
print(df_ts.columns)

Unnamed: 0,game_id,game_date,team,FGA_2,FGM_2,FGA_3,FGM_3,FTA,FTM,AST,...,TOV,TOV_team,DREB,OREB,F_tech,F_personal,rest_days,prev_game_dist,home_away_NS,travel_dist
0,game_2022_2011,2021-12-30,georgia_lady_bulldogs,0.661290,0.513514,0.196078,0.227273,0.125000,0.075,0.361111,...,0.390244,0.000000,0.410256,0.34375,0.0,0.451613,0.210526,0.000000,3,0.000000
1,game_2022_2011,2021-12-30,lsu_tigers,0.661290,0.567568,0.196078,0.181818,0.312500,0.200,0.388889,...,0.292683,0.333333,0.410256,0.34375,0.0,0.096774,0.052632,0.105722,2,0.121105
2,game_2022_2012,2021-12-30,missouri_tigers,0.548387,0.405405,0.274510,0.318182,0.333333,0.325,0.250000,...,0.146341,0.166667,0.564103,0.18750,0.0,0.225806,0.184211,0.047601,3,0.000000
3,game_2022_2012,2021-12-30,south_carolina_gamecocks,0.741935,0.540541,0.392157,0.272727,0.187500,0.125,0.388889,...,0.146341,0.000000,0.461538,0.62500,0.0,0.354839,0.210526,0.148063,2,0.169606
4,game_2022_2013,2021-12-30,tennessee_lady_volunteers,0.516129,0.459459,0.274510,0.181818,0.312500,0.250,0.416667,...,0.317073,0.166667,0.641026,0.37500,0.0,0.290323,0.052632,0.000000,3,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10433,game_2022_4795,2022-03-01,xavier_musketeers,0.564516,0.432432,0.215686,0.090909,0.250000,0.225,0.388889,...,0.195122,0.000000,0.410256,0.25000,0.0,0.258065,0.026316,0.000000,3,0.000000
10434,game_2022_4968,2022-03-04,harvard_crimson,0.338710,0.432432,0.745098,0.681818,0.208333,0.050,0.527778,...,0.097561,0.000000,0.487179,0.31250,0.0,0.129032,0.315789,0.048499,2,0.025426
10435,game_2022_4968,2022-03-04,dartmouth_big_green,0.370968,0.324324,0.490196,0.272727,0.145833,0.100,0.166667,...,0.463415,0.333333,0.512821,0.31250,0.0,0.129032,0.131579,0.061458,3,0.000000
10436,game_2022_5067,2022-03-06,harvard_crimson,0.467742,0.270270,0.666667,0.272727,0.208333,0.225,0.277778,...,0.219512,0.000000,0.358974,0.56250,0.0,0.516129,0.026316,0.022197,3,0.000000


Index(['game_id', 'game_date', 'team', 'FGA_2', 'FGM_2', 'FGA_3', 'FGM_3',
       'FTA', 'FTM', 'AST', 'BLK', 'STL', 'TOV', 'TOV_team', 'DREB', 'OREB',
       'F_tech', 'F_personal', 'rest_days', 'prev_game_dist', 'home_away_NS',
       'travel_dist'],
      dtype='object')


In [10]:
Data = {}
X = []
y = []
start_col = 'FGA_2'
end_col = 'F_personal'

start_col_nx = 'rest_days'
end_col_nx = 'travel_dist'
# Loop through all Teams in Dataset
for i ,team in enumerate(df_ts['team'].unique()):
    team_data = df_ts[df_ts['team'] == team]
    for idx in range(len(team_data) - 1):
        # Features = past game (FGA_2 to F_personal) + current game stats (rest_days to travel_dist)
        past_game = team_data.iloc[idx]
        next_game = team_data.iloc[idx + 1]
        
        past_stats = past_game.loc[start_col : end_col].values
        next_stats = next_game.loc[start_col_nx:end_col_nx].values

        # We predict next game FGA_2 to F_personal
        label = next_game.loc[start_col:end_col].values

        if idx == 0 and i == 0:
            combined_stats_x = np.concatenate((past_stats, next_stats))
            print(combined_stats_x)
            print(f"Type of combined_stats: {type(combined_stats_x)}")
            print(f"First Index: {combined_stats_x[0]}")
            print("-"*59)
            print(f"Label: {label}")
            print(f"Type of Label: {type(label)}")
            print(f"First Index: {label[0]}")

        combined_stats = np.concatenate((past_stats, next_stats))
        X.append(combined_stats)
        y.append(label)

[0.6612903225806451 0.5135135135135136 0.19607843137254904
 0.2272727272727273 0.125 0.07500000000000001 0.36111111111111105
 0.38888888888888884 0.25925925925925924 0.3902439024390244 0.0
 0.41025641025641024 0.34375 0.0 0.4516129032258064 0.05263157894736842
 0.06209905055170644 2 0.07113462669018225]
Type of combined_stats: <class 'numpy.ndarray'>
First Index: 0.6612903225806451
-----------------------------------------------------------
Label: [0.7580645161290323 0.6216216216216217 0.09803921568627451
 0.045454545454545456 0.47916666666666663 0.45 0.25 0.1111111111111111
 0.5555555555555556 0.3658536585365854 0.3333333333333333
 0.2564102564102564 0.5 0.0 0.3548387096774194]
Type of Label: <class 'numpy.ndarray'>
First Index: 0.7580645161290323


In [11]:
print(f"First 5 Games Features: {X[:5]}")
print(f"Number of Features: {len(X[0])}")

print("-"*59)
print(f"First 5 Games Labels: {y[:5]}")
print(f"Number of Features to Predict: {len(y[0])}")

print(f"Type: {type(X), type(y)}")

First 5 Games Features: [array([0.6612903225806451, 0.5135135135135136, 0.19607843137254904,
       0.2272727272727273, 0.125, 0.07500000000000001,
       0.36111111111111105, 0.38888888888888884, 0.25925925925925924,
       0.3902439024390244, 0.0, 0.41025641025641024, 0.34375, 0.0,
       0.4516129032258064, 0.05263157894736842, 0.06209905055170644, 2,
       0.07113462669018225], dtype=object), array([0.7580645161290323, 0.6216216216216217, 0.09803921568627451,
       0.045454545454545456, 0.47916666666666663, 0.45, 0.25,
       0.1111111111111111, 0.5555555555555556, 0.3658536585365854,
       0.3333333333333333, 0.2564102564102564, 0.5, 0.0,
       0.3548387096774194, 0.07894736842105263, 0.1216320246343341, 2,
       0.06819517930629042], dtype=object), array([0.6612903225806451, 0.6216216216216217, 0.1764705882352941,
       0.09090909090909091, 0.41666666666666663, 0.45,
       0.38888888888888884, 0.2222222222222222, 0.2962962962962963,
       0.41463414634146345, 0.1666666666

# **Dataset**

In [12]:
X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

print(X_train.shape)  # Should be (num_samples, num_features) → (N, 19)
print(y_train.shape)  # Should be (num_samples, num_outputs) → (N, 15)

(6139, 19)
(6139, 15)


In [13]:
class bkb_dataset(Dataset):
    def __init__(self, data, label, max_seq_len = 1, transform=None):
        self.data = data
        self.label = label
        self.max_len = max_seq_len

    # Returns: Number of samples
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature = self.data[idx] # Get question at index  
        feature = torch.tensor(feature, dtype = torch.long) # Converts to tensor as long type
        
        label = self.label[idx]
        label = torch.tensor(label, dtype = torch.long) # Converts to tensor as long type

        return feature, label

In [14]:
train_set = bkb_dataset(
    X_train,
    y_train,
)

val_set = bkb_dataset(
    X_val,
    y_val,
)

test_set = bkb_dataset(
    X_test,
    y_test,
)
print(f"Length of train_set:{len(train_set)}")
print(f"Length of test_set:{len(test_set)}")
print(f"Length of val_set:{len(val_set)}")

Length of train_set:6139
Length of test_set:1763
Length of val_set:869


In [15]:
train_batch = 256
test_batch = 32

train_loader = DataLoader(
    train_set,
    batch_size = train_batch,
    shuffle = True
)

val_loader = DataLoader(
    val_set,
    batch_size = test_batch,
    shuffle = False
)

test_loader = DataLoader(
    test_set,
    batch_size = test_batch,
    shuffle = False
)
print(f"Length train_loader: {len(train_loader)}")
print(f"Length test_loader: {len(test_loader)}")
print(f"Length val_loader: {len(val_loader)}")

Length train_loader: 24
Length test_loader: 56
Length val_loader: 28


# **Model**

In [16]:
class FeatureGrouping(nn.Module):
    def __init__(self, num_features=19, num_groups=[3, 2, 1], embed_dim=4, output_dim=15, resnet_model="resnet18"):
        super().__init__()
        self.num_features = num_features
        self.num_groups = num_groups
        self.output_dim = output_dim
        self.embed_dim = embed_dim

        # Learnable weight matrix for feature grouping
        self.group_weights = nn.ParameterDict({
            f'group_{g}': nn.Parameter(torch.randn(num_features + embed_dim, g))
            for g in num_groups
        })
        
        # Embedding for home_away (3 categories: Home, Away, Neutral)
        self.home_away_embed = nn.Embedding(3, embed_dim)

        # Self-Attention layers
        self.attention_layers = nn.ModuleDict({
            f'attn_{g}_{i}': nn.MultiheadAttention(embed_dim=1, num_heads=1, batch_first=True)
            for g in num_groups for i in range(g)
        })

        # Reduce channel dimension (6 → 3)
        self.channel_reducer = nn.Conv2d(in_channels=6, out_channels=3, kernel_size=1)

        # Pretrained ResNet
        self.resnet = timm.create_model(resnet_model, pretrained=True)
        in_features = self.resnet.get_classifier().in_features
        self.resnet.reset_classifier(0)

        # Final regression head
        self.fc = nn.Linear(in_features, output_dim)

    def forward(self, x, home_away_idx):
        batch_size = x.shape[0]

        # Embed home_away
        home_away_embed = self.home_away_embed(home_away_idx)
        x = torch.cat([x, home_away_embed], dim=1)

        grouped_features = []
        for g in self.num_groups:
            weights = F.softmax(self.group_weights[f'group_{g}'], dim=0)
            grouped_x = torch.matmul(x, weights)

            attn_outputs = []
            for i in range(g):
                attn_out, _ = self.attention_layers[f'attn_{g}_{i}'](
                    grouped_x[:, i].unsqueeze(1).unsqueeze(2),
                    grouped_x[:, i].unsqueeze(1).unsqueeze(2),
                    grouped_x[:, i].unsqueeze(1).unsqueeze(2)
                )
                attn_outputs.append(attn_out.squeeze(2))

            grouped_features.append(torch.cat(attn_outputs, dim=1))

        # Stack grouped features (Shape: B, 6)
        x_final = torch.cat(grouped_features, dim=1).unsqueeze(-1).unsqueeze(-1)  # (B, 6, 1, 1)

        # Reduce channel dimension (6 → 3)
        x_final = self.channel_reducer(x_final)  # (B, 3, 1, 1)
        x_final = x_final.expand(-1, 3, 224, 224)  # Resize for ResNet

        # Process through ResNet
        x_final = self.resnet(x_final)
        output = self.fc(x_final)

        return output

# **Test**

In [17]:
# Ensure X and y are not lists
X = np.array(X)  # If X was a list
y = np.array(y)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.3, random_state=42)
X_val1, X_test1, y_val1, y_test1 = train_test_split(X_test1, y_test1, test_size=0.33, random_state=42)

print(X_train1.shape)  # Should be (num_samples, num_features) → (N, 19)
print(y_train1.shape)  # Should be (num_samples, num_outputs) → (N, 15)

(6139, 19)
(6139, 15)


In [18]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
# Wrap XGBRegressor to handle multi-output regression
model = MultiOutputRegressor(XGBRegressor(
    objective='reg:squarederror',  
    n_estimators=100,  
    learning_rate=0.1  
))

# Train model
model.fit(X_train1, y_train1)  

y_pred = model.predict(X_test1)
print(y_pred.shape)  # Should be (N_test, 15)

from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test1, y_pred, multioutput='raw_values')  # RMSE for each label
print("RMSE per output:", rmse)
print("Average RMSE:", rmse.mean())  # Average RMSE across all 15 labels

(869, 15)
RMSE per output: [0.01580971 0.01553993 0.01405026 0.01751577 0.02033822 0.01815269
 0.01549731 0.01604957 0.01456579 0.01404391 0.02238352 0.01987478
 0.01867983 0.00606631 0.02056576]
Average RMSE: 0.016608892413439234


In [19]:
X_pred_home = model.predict(np.array([X_train1[0]]))  # Ensures a 2D shape (1, 19)
X_pred_away = model.predict(np.array([X_train1[1]]))
print(X_pred_home)
print(X_pred_away)

[[0.44560626 0.2505317  0.35928938 0.27473179 0.35134402 0.24747607
  0.28863877 0.10484435 0.25987056 0.33177465 0.07613    0.41842663
  0.38874078 0.01177177 0.33863005]]
[[0.5044346  0.37466002 0.33185732 0.25413755 0.3480368  0.2988583
  0.3174013  0.15391321 0.26297748 0.31009695 0.07529958 0.3895041
  0.35768604 0.0132556  0.3995734 ]]


In [20]:
from pytorch_tabnet.tab_model import TabNetRegressor

X_train1 = X_train1.astype(np.float32)
y_train1 = y_train1.astype(np.float32)
X_val1 = X_val1.astype(np.float32)
y_val1 = y_val1.astype(np.float32)

X_test1 = X_test1.astype(np.float32)
y_test1 = y_test1.astype(np.float32)

if y_test1.ndim == 1:
    y_test1 = y_test1.reshape(-1, 1)

if y_train1.ndim == 1:
    y_train1 = y_train1.reshape(-1, 1)
if y_val1.ndim == 1:
    y_val1 = y_val1.reshape(-1, 1)
    
model = TabNetRegressor()
model.fit(X_train1, y_train1, eval_set=[(X_val1, y_val1)])
preds = model.predict(X_test1)
print(preds)



epoch 0  | loss: 1.12782 | val_0_mse: 0.12161999940872192|  0:00:00s
epoch 1  | loss: 0.22737 | val_0_mse: 0.12221000343561172|  0:00:00s
epoch 2  | loss: 0.12084 | val_0_mse: 0.12197999656200409|  0:00:01s
epoch 3  | loss: 0.10212 | val_0_mse: 0.11307000368833542|  0:00:01s
epoch 4  | loss: 0.08596 | val_0_mse: 0.0919099971652031|  0:00:01s
epoch 5  | loss: 0.07088 | val_0_mse: 0.07126999646425247|  0:00:01s
epoch 6  | loss: 0.05865 | val_0_mse: 0.05177000164985657|  0:00:02s
epoch 7  | loss: 0.04666 | val_0_mse: 0.039489999413490295|  0:00:02s
epoch 8  | loss: 0.03785 | val_0_mse: 0.027580000460147858|  0:00:02s
epoch 9  | loss: 0.03169 | val_0_mse: 0.02191000059247017|  0:00:02s
epoch 10 | loss: 0.02778 | val_0_mse: 0.019840000197291374|  0:00:03s
epoch 11 | loss: 0.02527 | val_0_mse: 0.018479999154806137|  0:00:03s
epoch 12 | loss: 0.02378 | val_0_mse: 0.01834000088274479|  0:00:03s
epoch 13 | loss: 0.02261 | val_0_mse: 0.018120000138878822|  0:00:03s
epoch 14 | loss: 0.02187 | val

