In [1]:
# Data Processing n' Visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Compute
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Random
import os

-------------------------
# **Miscs**

In [2]:
data_dir = '/kaggle/input/wharton-dataset-basketball/data.csv'
dict_dir = '/kaggle/input/wharton-dataset-basketball/dict.csv'
infer_dir = '/kaggle/input/wharton-dataset-basketball/inference.csv'
region_dir = '/kaggle/input/wharton-dataset-basketball/region.csv'

In [3]:
df = pd.read_csv(data_dir)
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,game_id,game_date,team,FGA_2,FGM_2,FGA_3,FGM_3,FTA,FTM,AST,...,largest_lead,notD1_incomplete,OT_length_min_tot,rest_days,attendance,tz_dif_H_E,prev_game_dist,home_away,home_away_NS,travel_dist
0,game_2022_2011,2021-12-30,georgia_lady_bulldogs,61,27,11,5,6,3,14,...,1.0,False,,9.0,3241.0,0.0,0.0,home,1,0.0
1,game_2022_2011,2021-12-30,lsu_tigers,61,28,11,4,15,8,15,...,14.0,False,,3.0,3241.0,0.0,824.0,away,-1,824.0
2,game_2022_2012,2021-12-30,missouri_tigers,58,25,15,7,16,13,10,...,8.0,False,5.0,8.0,6139.0,0.0,371.0,home,1,0.0
3,game_2022_2012,2021-12-30,south_carolina_gamecocks,76,29,21,6,9,5,15,...,6.0,False,5.0,9.0,6139.0,0.0,1154.0,away,-1,1154.0
4,game_2022_2013,2021-12-30,tennessee_lady_volunteers,56,24,15,4,15,10,16,...,19.0,False,,3.0,8124.0,0.0,0.0,home,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10433,game_2022_4795,2022-03-01,xavier_musketeers,56,21,12,2,12,9,15,...,0.0,False,,2.0,156.0,0.0,0.0,home,1,0.0
10434,game_2022_4968,2022-03-04,harvard_crimson,69,34,39,15,10,2,20,...,40.0,False,,13.0,373.0,0.0,378.0,away,-1,173.0
10435,game_2022_4968,2022-03-04,dartmouth_big_green,58,21,26,6,7,4,7,...,3.0,False,,6.0,373.0,0.0,479.0,home,1,0.0
10436,game_2022_5067,2022-03-06,harvard_crimson,73,19,35,6,10,9,11,...,0.0,False,,2.0,757.0,0.0,173.0,home,1,0.0


-----------------------------
# **Data Preprocessing**

In [4]:
"""
    Example Code for the Cell Below.

    Data Structure:
        mock_data -> list
        mock_data[idx] -> dict
"""

game_id = df['game_id'].unique()
print(f"This is the game_id of the first game: {game_id[0]}")
print("-"*59)

game_data = df[df['game_id'] == game_id[0]]
print("Stats of teams in game_id: ")
print(game_data)
print("-"*59)

mock_data = []

teamA = game_data.iloc[0]
teamB = game_data.iloc[1]

home_away = {
    -1: 'A',
    0: 'N',
    1: 'H'
}

mock_entry = {
    'teamA': teamA['team'],
    'teamB': teamB['team'],
    'teamA_score': teamA['team_score'],
    'teamB_score': teamB['team_score'],

    # We will perform Embedding for Home/Away/Neutral
    # Similar to NLP 
    # Input -> List of words -> Embedding
    'A_B_home': [home_away[teamA['home_away_NS']],
                 home_away[teamB['home_away_NS']]],

    # 0: Lost | 1: Won | 2: Draw
    'W/L/D (teamA)': 0 if teamA['team_score'] - teamB['team_score'] < 0
                    else 1 if teamA['team_score'] - teamB['team_score'] > 0
                    else 2
}
stats_to_diff = [
    'FGA_2', 'FGM_2', 'FGA_3', 'FGM_3', 'FTA',
    'FTM', 'AST', 'BLK', 'STL', 'TOV', 'TOV_team',
    'DREB', 'OREB', 'F_tech', 'F_personal', 'rest_days',
    'prev_game_dist', 'travel_dist'
]

for stat in stats_to_diff:
    
    if teamA[stat] == 'NA':
        print(f'Team A ({teamA["team"]}) {stat} has NA value. Converting to Team B ({teamB["team"]})...')
        teamA[stat] = teamB[stat]
    elif teamB[stat] == 'NA':
        print(f'Team B ({teamB["team"]}) {stat} has NA value. Converting to Team A (({teamA["team"]}))...')
        teamB[stat] = teamA[stat]
    else:
        print("No abnormalities")
        
    # Compute difference (A to B)
    mock_entry[f'{stat}_diff (A - B)'] = teamA[stat] - teamB[stat]
    
# Append to List
mock_data.append(mock_entry)

# Change to type pd
pd.DataFrame(mock_data)

display(mock_data)
print("-"*59)

This is the game_id of the first game: game_2022_2011
-----------------------------------------------------------
Stats of teams in game_id: 
          game_id   game_date                   team  FGA_2  FGM_2  FGA_3  \
0  game_2022_2011  2021-12-30  georgia_lady_bulldogs     61     27     11   
1  game_2022_2011  2021-12-30             lsu_tigers     61     28     11   

   FGM_3  FTA  FTM  AST  ...  largest_lead  notD1_incomplete  \
0      5    6    3   14  ...           1.0             False   
1      4   15    8   15  ...          14.0             False   

   OT_length_min_tot  rest_days  attendance  tz_dif_H_E  prev_game_dist  \
0                NaN        9.0      3241.0         0.0             0.0   
1                NaN        3.0      3241.0         0.0           824.0   

   home_away  home_away_NS  travel_dist  
0       home             1          0.0  
1       away            -1        824.0  

[2 rows x 30 columns]
----------------------------------------------------------

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


[{'teamA': 'georgia_lady_bulldogs',
  'teamB': 'lsu_tigers',
  'teamA_score': 62,
  'teamB_score': 68,
  'A_B_home': ['H', 'A'],
  'W/L/D (teamA)': 0,
  'FGA_2_diff (A - B)': 0,
  'FGM_2_diff (A - B)': -1,
  'FGA_3_diff (A - B)': 0,
  'FGM_3_diff (A - B)': 1,
  'FTA_diff (A - B)': -9,
  'FTM_diff (A - B)': -5,
  'AST_diff (A - B)': -1,
  'BLK_diff (A - B)': 5,
  'STL_diff (A - B)': -8,
  'TOV_diff (A - B)': 4,
  'TOV_team_diff (A - B)': -2,
  'DREB_diff (A - B)': 0,
  'OREB_diff (A - B)': 0,
  'F_tech_diff (A - B)': 0,
  'F_personal_diff (A - B)': 11,
  'rest_days_diff (A - B)': 6.0,
  'prev_game_dist_diff (A - B)': -824.0,
  'travel_dist_diff (A - B)': -824.0}]

-----------------------------------------------------------


In [5]:
from sklearn.preprocessing import RobustScaler, StandardScaler

def normalize_wrt_category(data, stats_to_diff):
    """
    Normalize each category of features separately using Robust Scaling + Z-score normalization.
    """
    for stat in stats_to_diff:
        # Apply Robust Scaling first
        scaler_robust = RobustScaler()
        data[f'{stat}_diff (A - B)'] = scaler_robust.fit_transform(data[[f'{stat}_diff (A - B)']])

        # Apply Z-score normalization
        scaler_z = StandardScaler()
        data[f'{stat}_diff (A - B)'] = scaler_z.fit_transform(data[[f'{stat}_diff (A - B)']])
    
    return data


def preprocess_data(data):
    processed_data = []
    
    # Drop irrelevant columns
    data = data.drop(columns=[
        'notD1_incomplete', 'OT_length_min_tot', 'tz_dif_H_E', 'attendance'
    ])
    
    home_away = {-1: 'A', 0: 'N', 1: 'H'}

    stats_to_diff = [
        'FGA_2', 'FGM_2', 'FGA_3', 'FGM_3', 'FTA',
        'FTM', 'AST', 'BLK', 'STL', 'TOV', 'TOV_team',
        'DREB', 'OREB', 'F_tech', 'F_personal', 'rest_days',
        'prev_game_dist', 'travel_dist'
    ]

    # Process each game
    for game_id in data['game_id'].unique():
        game_data = data[data['game_id'] == game_id]

        # Ensure the game has exactly 2 teams
        if len(game_data) != 2:
            print(f"Skipping game {game_id} due to missing teams.")
            continue

        # Extract teams
        teamA = game_data.iloc[0]
        teamB = game_data.iloc[1]

        entry = {
            'teamA': teamA['team'],
            'teamB': teamB['team'],
            'teamA_score': teamA['team_score'],
            'teamB_score': teamB['team_score'],

            # Embedding for Home/Away/Neutral
            'A_B_home': [home_away[teamA['home_away_NS']], home_away[teamB['home_away_NS']]],

            # 0: Lost | 1: Won | 2: Draw
            'W/L/D (teamA)': 0 if teamA['team_score'] < teamB['team_score']
                            else 1 if teamA['team_score'] > teamB['team_score']
                            else 2
        }

        # Compute stat differences
        for stat in stats_to_diff:
            # Handle NA values
            if pd.isna(teamA[stat]):
                print(f"Team A ({teamA['team']}) {stat} has NA. Using Team B's value.")
                teamA[stat] = teamB[stat]
            elif pd.isna(teamB[stat]):
                print(f"Team B ({teamB['team']}) {stat} has NA. Using Team A's value.")
                teamB[stat] = teamA[stat]
            
            # Compute difference
            entry[f'{stat}_diff (A - B)'] = teamA[stat] - teamB[stat]
        
        processed_data.append(entry)

    # Convert to DataFrame
    processed_df = pd.DataFrame(processed_data)

    # Normalize the features
    processed_df = normalize_wrt_category(processed_df, stats_to_diff)

    return processed_df

In [6]:
data = preprocess_data(df)

Team B (iowa_hawkeyes) travel_dist has NA. Using Team A's value.
Team A (iowa_hawkeyes) travel_dist has NA. Using Team B's value.
Team B (iowa_hawkeyes) travel_dist has NA. Using Team A's value.
Team B (iowa_hawkeyes) travel_dist has NA. Using Team A's value.
Team B (iowa_hawkeyes) travel_dist has NA. Using Team A's value.
Team B (iowa_hawkeyes) travel_dist has NA. Using Team A's value.
Team B (iowa_hawkeyes) travel_dist has NA. Using Team A's value.
Team A (colorado_state_rams) prev_game_dist has NA. Using Team B's value.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teamB[stat] = teamA[stat]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teamA[stat] = teamB[stat]


Team A (texas_am_aggies) rest_days has NA. Using Team B's value.
Team A (texas_am_aggies) prev_game_dist has NA. Using Team B's value.
Team B (southern_jaguars) rest_days has NA. Using Team A's value.
Team B (southern_jaguars) prev_game_dist has NA. Using Team A's value.
Team A (kent_state_golden_flashes) prev_game_dist has NA. Using Team B's value.
Team A (little_rock_trojans) prev_game_dist has NA. Using Team B's value.
Team A (little_rock_trojans) prev_game_dist has NA. Using Team B's value.
Team B (santa_clara_broncos) prev_game_dist has NA. Using Team A's value.
Team A (nc_state_wolfpack) rest_days has NA. Using Team B's value.
Team A (nc_state_wolfpack) prev_game_dist has NA. Using Team B's value.
Team A (duke_blue_devils) rest_days has NA. Using Team B's value.
Team A (duke_blue_devils) prev_game_dist has NA. Using Team B's value.
Team A (ohio_bobcats) rest_days has NA. Using Team B's value.
Team A (ohio_bobcats) prev_game_dist has NA. Using Team B's value.
Team A (boston_colleg

In [7]:
data

Unnamed: 0,teamA,teamB,teamA_score,teamB_score,A_B_home,W/L/D (teamA),FGA_2_diff (A - B),FGM_2_diff (A - B),FGA_3_diff (A - B),FGM_3_diff (A - B),...,STL_diff (A - B),TOV_diff (A - B),TOV_team_diff (A - B),DREB_diff (A - B),OREB_diff (A - B),F_tech_diff (A - B),F_personal_diff (A - B),rest_days_diff (A - B),prev_game_dist_diff (A - B),travel_dist_diff (A - B)
0,georgia_lady_bulldogs,lsu_tigers,62,68,"[H, A]",0,0.003834,-0.112506,0.005967,0.233130,...,-1.574011,0.588502,-1.562092,0.017398,0.025632,-0.01107,2.019331,1.699250,-1.046795,-0.956785
1,missouri_tigers,south_carolina_gamecocks,70,69,"[H, A]",1,-1.671288,-0.469738,-0.636107,0.233130,...,0.177621,0.007899,0.781495,0.511561,-2.088183,-0.01107,-0.789341,-0.267125,-0.994518,-1.337002
2,tennessee_lady_volunteers,alabama_crimson_tide,62,44,"[H, A]",1,-1.392101,0.840111,-1.492206,-0.237003,...,-0.017005,0.588502,0.781495,1.376346,-0.880288,-0.01107,-0.602097,-3.918963,-0.555903,-0.513199
3,auburn_tigers,alabama_crimson_tide,53,56,"[A, H]",0,0.189958,0.006571,-0.315070,-0.237003,...,-0.211631,0.443351,-0.780896,-0.229683,-0.125355,-2.07443,-0.227607,2.822893,-0.304720,0.219583
4,tennessee_lady_volunteers,arkansas_razorbacks,70,63,"[A, H]",1,-2.136600,0.006571,-3.525440,-1.647404,...,-1.379385,1.604558,-0.780896,1.994050,-0.276341,-0.01107,-1.538321,-2.514410,1.176881,1.052603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5214,binghamton_bearcats,umbc_retrievers,58,48,"[H, A]",1,-0.554540,0.721034,0.219991,-0.237003,...,-0.600882,0.443351,-0.780896,1.129265,-0.729302,-0.01107,-0.414852,0.294697,-0.823663,-0.374938
5215,south_dakota_coyotes,western_illinois_leathernecks,75,48,"[A, H]",1,1.678956,1.316420,-0.208058,0.468197,...,1.150749,-1.588761,0.000299,0.264480,1.384513,2.05229,-2.100055,0.013786,-0.796887,0.658560
5216,seton_hall_pirates,xavier_musketeers,74,53,"[A, H]",1,0.841395,0.959188,1.397127,1.408464,...,1.150749,-0.282403,0.000299,0.511561,0.780566,-0.01107,-0.040362,0.575607,0.517683,1.022647
5217,harvard_crimson,dartmouth_big_green,85,52,"[A, H]",1,1.027520,1.554574,1.397127,2.113664,...,1.734627,-2.169364,-1.562092,-0.106142,0.025632,-0.01107,-0.040362,1.980161,-0.124939,0.191930
