# Kaggle Men's 2019 March Madness

## Data preparation

Load the data.

In [17]:
import pandas as pd

pd.options.display.max_columns = 999

io_params = {'dtype': {'WLoc': 'category'}}
regular_results = pd.read_csv('data/women/WRegularSeasonDetailedResults.csv', **io_params)
tourney_results = pd.read_csv('data/women/WNCAATourneyDetailedResults.csv', **io_params)
team_names = pd.read_csv('data/women/WTeams.csv', index_col='TeamID')['TeamName'].to_dict()

Some sanity checks.

In [18]:
assert all(regular_results.columns == tourney_results.columns)

Augment results.

In [19]:
def augment_results(results):
    
    results = results.rename(columns={'WLoc': 'location'})
    win_cols = [col for col in results.columns if col.startswith('W')]
    lose_cols = [col for col in results.columns if col.startswith('L')]
    extra_cols = [col for col in results.columns if not col.startswith(('W', 'L'))]
    extra_cols.remove('location')
    
    return pd.concat(
        (
            pd.concat(
                (
                    results[extra_cols],
                    results['location'],
                    results[win_cols].rename(columns=lambda x: f'T1_{x[1:]}'),
                    results[lose_cols].rename(columns=lambda x: f'T2_{x[1:]}')
                ),
                axis='columns'
            ),
            pd.concat(
                (
                    results[extra_cols],
                    results['location'].map({'H': 'A', 'A': 'H', 'N': 'N'}).astype('category'),
                    results[lose_cols].rename(columns=lambda x: f'T1_{x[1:]}'),
                    results[win_cols].rename(columns=lambda x: f'T2_{x[1:]}')
                ),
                axis='columns'
            )
        ),
        axis='rows',
        ignore_index=True
    ).rename(columns={
        'T1_TeamID': 'T1',
        'T2_TeamID': 'T2'
    })

regular = augment_results(regular_results)
tourney = augment_results(tourney_results)

Initialize a dataframe to which we will append features. We will then use this dataframe for training.

In [20]:
df = tourney[['Season', 'T1', 'T1_Score', 'T2', 'T2_Score']].copy()
df = df.set_index(['Season', 'T1', 'T2'])
df['Victory'] = (df['T1_Score'] > df['T2_Score']).astype(float)
df = df.drop(columns=['T1_Score', 'T2_Score'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Victory
Season,T1,T2,Unnamed: 3_level_1
2010,3124,3201,1.0
2010,3173,3395,1.0
2010,3181,3214,1.0
2010,3199,3256,1.0
2010,3207,3265,1.0


Add all the 2019 tournament games for which a prediction has to be made.

In [22]:
sub = pd.read_csv('data/women/WSampleSubmissionStage2.csv')
sub = sub['ID'].str.split('_', expand=True).astype(int)
sub.columns = ['Season', 'T1', 'T2']
sub['Victory'] = None
sub = sub.set_index(['Season', 'T1', 'T2'])
df = pd.concat((df, sub))

## Feature extraction

Seeds.

In [23]:
seeds = pd.read_csv('data/women/WNCAATourneySeeds.csv', index_col=['Season', 'TeamID'])
seeds['seed'] = seeds['Seed'].map(lambda x: int(x[1:3]))

df = df.join(seeds['seed'].rename('T1_seed'), on=['Season', 'T1'])
df = df.join(seeds['seed'].rename('T2_seed'), on=['Season', 'T2'])
df['seed_diff'] = df['T1_seed'] - df['T2_seed']
df = df.join(df.groupby('seed_diff').size().rename('n_seed_diff_occurrences'), on='seed_diff')

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Victory,T1_seed,T2_seed,seed_diff,n_seed_diff_occurrences
Season,T1,T2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010,3124,3201,1.0,4,13,-9,97
2010,3173,3395,1.0,8,9,-1,234
2010,3181,3214,1.0,2,15,-13,65
2010,3199,3256,1.0,3,14,-11,79
2010,3207,3265,1.0,5,12,-7,119


Regular season aggregate statistics.

In [26]:
stats = [
    'WFGM', 
    'WFGA',
    'WFGM3',
    'WFGA3',
    'WFTM',
    'WFTA',
    'WOR',
    'WDR',
    'WAst',
    'WTO', 'WStl',
    'WBlk', 
    'WPF',
]

stats = regular.assign(
                    point_differential=lambda x: x['T1_Score'] - x['T2_Score'],
                    FGM=lambda x: x['T1_FGM'],
                    FGA=lambda x: x['T1_FGA'],
                    FTM=lambda x: x['T1_FTM'],
                    Stl=lambda x: x['T1_Stl'],
                    score=lambda x: x['T1_Score']
                )\
               .groupby(['Season', 'T1'])\
               .agg({
                   'point_differential': ['mean', 'std'],
                   'FGM': ['median'],
                   'FGA': ['median'],
                   'FTM': ['mean'],
                   'Stl': ['median']
               })

stats.columns = ['_'.join(combo) for combo in stats.columns]

stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,point_differential_mean,point_differential_std,FGM_median,FGA_median,FTM_mean,Stl_median
Season,T1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010,3102,-19.964286,14.662833,19.0,53.0,8.964286,6.0
2010,3103,2.6,15.477681,23.0,54.5,13.833333,8.0
2010,3104,-2.862069,18.502363,24.0,61.0,10.344828,7.0
2010,3105,-3.962963,14.46875,20.0,50.0,17.0,9.0
2010,3106,-2.275862,11.572432,19.0,53.0,15.551724,8.0


In [27]:
t1_stats = stats.add_prefix('T1_')
t2_stats = stats.add_prefix('T2_')

df = df.drop(columns=t1_stats.columns.intersection(df.columns)).join(t1_stats, on=['Season', 'T1'])
df = df.drop(columns=t2_stats.columns.intersection(df.columns)).join(t2_stats, on=['Season', 'T2'])

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Victory,T1_seed,T2_seed,seed_diff,n_seed_diff_occurrences,T1_point_differential_mean,T1_point_differential_std,T1_FGM_median,T1_FGA_median,T1_FTM_mean,T1_Stl_median,T2_point_differential_mean,T2_point_differential_std,T2_FGM_median,T2_FGA_median,T2_FTM_mean,T2_Stl_median
Season,T1,T2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2010,3124,3201,1.0,4,13,-9,97,15.25,24.166092,26.0,56.0,17.4375,6.0,12.878788,17.129575,26.0,60.0,12.939394,10.0
2010,3173,3395,1.0,8,9,-1,234,11.269231,14.191709,25.5,61.0,13.538462,8.0,12.0,19.280149,26.0,60.0,13.766667,9.0
2010,3181,3214,1.0,2,15,-13,65,16.53125,23.107022,25.5,64.0,13.34375,13.0,7.7,14.985395,22.5,58.0,13.533333,9.5
2010,3199,3256,1.0,3,14,-11,79,14.366667,19.893654,26.0,58.5,15.566667,8.0,9.935484,15.196349,27.0,63.0,16.0,7.0
2010,3207,3265,1.0,5,12,-7,119,9.666667,12.844785,24.0,59.0,14.033333,14.0,10.272727,12.597799,25.0,56.0,14.787879,8.0


Regular season recent win ratios of both teams.

In [28]:
recent_win_ratio = regular.query('DayNum > 118')\
                          .assign(win=lambda x: x['T1_Score'] > x['T2_Score'])\
                          .groupby(['Season', 'T1'])['win']\
                          .mean()

df = df.drop(columns='T1_RecentWinRatio', errors='ignore')\
       .join(recent_win_ratio.rename('T1_RecentWinRatio'), on=['Season', 'T1'])

df = df.drop(columns='T2_RecentWinRatio', errors='ignore')\
       .join(recent_win_ratio.rename('T2_RecentWinRatio'), on=['Season', 'T2'])

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Victory,T1_seed,T2_seed,seed_diff,n_seed_diff_occurrences,T1_point_differential_mean,T1_point_differential_std,T1_FGM_median,T1_FGA_median,T1_FTM_mean,T1_Stl_median,T2_point_differential_mean,T2_point_differential_std,T2_FGM_median,T2_FGA_median,T2_FTM_mean,T2_Stl_median,T1_RecentWinRatio,T2_RecentWinRatio
Season,T1,T2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010,3124,3201,1.0,4,13,-9,97,15.25,24.166092,26.0,56.0,17.4375,6.0,12.878788,17.129575,26.0,60.0,12.939394,10.0,0.5,0.75
2010,3173,3395,1.0,8,9,-1,234,11.269231,14.191709,25.5,61.0,13.538462,8.0,12.0,19.280149,26.0,60.0,13.766667,9.0,0.5,0.333333
2010,3181,3214,1.0,2,15,-13,65,16.53125,23.107022,25.5,64.0,13.34375,13.0,7.7,14.985395,22.5,58.0,13.533333,9.5,1.0,1.0
2010,3199,3256,1.0,3,14,-11,79,14.366667,19.893654,26.0,58.5,15.566667,8.0,9.935484,15.196349,27.0,63.0,16.0,7.0,0.0,0.8
2010,3207,3265,1.0,5,12,-7,119,9.666667,12.844785,24.0,59.0,14.033333,14.0,10.272727,12.597799,25.0,56.0,14.787879,8.0,0.5,1.0


Rating percentage index (RPI).

In [29]:
victory = lambda x: x['T1_Score'] > x['T2_Score']
win_rates = regular.assign(victory=victory).groupby(['Season', 'T1'])['victory'].agg(['mean', 'count']).to_dict()
matchups = regular.assign(victory=victory).groupby(['Season', 'T1', 'T2'])['victory'].agg(['sum', 'count']).to_dict()
n_matches = regular.groupby(['Season', 'T1']).size().to_dict()
opponents = regular.groupby(['Season', 'T1'])['T2'].unique().to_dict()


def update_mean(mean, count, removed_sum, removed_count):
    return (mean * count - removed_sum) / (count - removed_count)


def calc_wp(season, team):
    return win_rates['mean'][(season, team)]


def calc_owp(season, team):
    
    return 1 / n_matches[(season, team)] * sum(
        update_mean(
            mean=win_rates['mean'][(season, opponent)],
            count=win_rates['count'][(season, opponent)],
            removed_sum=matchups['sum'][(season, opponent, team)],
            removed_count=matchups['count'][(season, opponent, team)]
        ) * matchups['count'][(season, opponent, team)]
        for opponent in opponents[(season, team)]
    )


def calc_oowp(season, team, owps):
    return 1 / n_matches[(season, team)] * sum(
        owps[opponent] * matchups['count'][(season, opponent, team)]
        for opponent in opponents[(season, team)]
    )


def calc_rpi(wp, owp, oowp):
    return wp * .25 + owp * .5 + oowp * .25
    

seasons = regular['Season'].unique()

wps = {
    season: {
        team: calc_wp(season, team)
        for team in regular.query(f'Season == {season}')['T1'].unique()
    }
    for season in seasons
}
owps = {
    season: {
        team: calc_owp(season, team)
        for team in regular.query(f'Season == {season}')['T1'].unique()
    }
    for season in seasons
}
oowps = {
    season: {
        team: calc_oowp(season, team, owps[season])
        for team in regular.query(f'Season == {season}')['T1'].unique()
    }
    for season in seasons
}
rpis = {
    season: {
        team: calc_rpi(wps[season][team], owps[season][team], oowps[season][team])
        for team in regular.query(f'Season == {season}')['T1'].unique()
    }
    for season in seasons
}

rpis = pd.DataFrame.from_dict(rpis, orient='columns').stack()
df = df.drop(columns='T1_rpi', errors='ignore').join(rpis.rename('T1_rpi'), on=['T1', 'Season'])
df = df.drop(columns='T2_rpi', errors='ignore').join(rpis.rename('T2_rpi'), on=['T2', 'Season'])

Simple rating sytem (SRS).

In [30]:
from scipy import optimize

srss = {}

for season in regular['Season'].unique():
    
    season_results = regular_results.query(f'Season == {season}')
    
    teams = season_results['WTeamID'].unique()
    
    G = pd.concat(
        [
            (
                (season_results['WTeamID'] == team).astype(int) - \
                (season_results['LTeamID'] == team).astype(int)
            ).rename(team)
            for team in teams
        ],
        axis='columns'
    )
    
    S = season_results['WScore'] - season_results['LScore']
    
    R = optimize.lsq_linear(G, S).x
    
    srss[season] = pd.Series(R, index=teams)
    
srss = pd.DataFrame.from_dict(srss).stack()
df = df.drop(columns='T1_srs', errors='ignore').join(srss.rename('T1_srs'), on=['T1', 'Season'])
df = df.drop(columns='T2_srs', errors='ignore').join(srss.rename('T2_srs'), on=['T2', 'Season'])

## Machine learning

Prepare the train and test sets.

In [31]:
is_train = df['Victory'].notnull()
train_seasons = df[is_train].index.get_level_values('Season')

X_train = df[is_train].drop(columns='Victory')
y_train = df.loc[is_train, 'Victory'].astype(bool)
X_test = df[~is_train].drop(columns='Victory')

Do the LightGBM dance zzz.

In [34]:
import lightgbm as lgb


class LGBModel(lgb.LGBMClassifier):
    
    def fit(self, X, y, **fit_params):
        return super().fit(X, y, **fit_params)
    
    def predict_proba(self, X):
        
        y_pred = super().predict_proba(X)[:, 1]
        
        # Upsets don't happen
        y_pred[X['seed_diff'] >= 11] = 0
        y_pred[X['seed_diff'] <= -11] = 1
        
        return y_pred

In [39]:
import numpy as np
from sklearn import metrics
from sklearn import model_selection
from sklearn import utils


def bake_model(random_state):
    return LGBModel(
        objective='binary',
        num_leaves=16,
        learning_rate=0.1,
        colsample_bytree=0.8,
        n_estimators=3000,
        min_child_samples=42,
        importance_type='gain',
        random_state=random_state
    )


rng = utils.check_random_state(42)
seasons = X_train.index.get_level_values('Season').unique()
n_repeats = 10

oof = pd.Series(0, index=X_train.index)
oof_scores = pd.DataFrame(index=seasons, columns=range(n_repeats))
sub_stage_2 = pd.DataFrame(0, index=X_test.index, columns=seasons)
importances = pd.DataFrame(index=X_train.columns)

for i in range(n_repeats):
    
    model = bake_model(rng.randint(10e10))

    for j, season in enumerate(seasons):
        
        fit_mask = X_train.index.get_level_values('Season') != season
        val_mask = X_train.index.get_level_values('Season') == season

        X_fit = X_train.loc[fit_mask]
        y_fit = y_train.loc[fit_mask]
        X_val = X_train.loc[val_mask]
        y_val = y_train.loc[val_mask]

        model = model.fit(
            X=X_fit,
            y=y_fit,
            eval_set=[(X_fit, y_fit), (X_val, y_val)],
            eval_names=('fit', 'val'),
            eval_metric='logloss',
            early_stopping_rounds=30,
            verbose=0
        ) 

        oof.loc[val_mask] += model.predict_proba(X_val)
        sub_stage_2[season] += model.predict_proba(X_test)
        importances[i * len(seasons) + j] = model.feature_importances_
        oof_scores.loc[season, i] = metrics.log_loss(y_val, model.predict_proba(X_val))
    
oof /= n_repeats
oof_scores = oof_scores.mean(axis='columns')
sub_stage_2 /= n_repeats
importances = importances.median(axis='columns')

print(oof_scores.to_string())
print()
print(f'Average OOF logloss: {oof_scores.mean():.5f} (±{oof_scores.std():.5f})')

Season
2010    0.434137
2011    0.384342
2012    0.370176
2013    0.416947
2014    0.355030
2015    0.366646
2016    0.461712
2017    0.421719
2018    0.437770

Average OOF logloss: 0.40539 (±0.03736)


In [41]:
print(importances.sort_values(0, ascending=False).to_string())

seed_diff                     1517.121554
n_seed_diff_occurrences        449.563922
T1_rpi                         323.534896
T2_rpi                         287.564114
T2_point_differential_mean     287.155131
T1_point_differential_mean     284.185965
T1_seed                        206.038950
T2_seed                        196.084978
T2_point_differential_std      113.642950
T1_point_differential_std      109.215772
T2_RecentWinRatio               86.110645
T1_FTM_mean                     84.786880
T1_RecentWinRatio               84.083296
T1_FGM_median                   83.552210
T2_FTM_mean                     80.486005
T2_FGM_median                   77.281927
T2_FGA_median                   67.627282
T1_FGA_median                   65.483986
T2_Stl_median                   62.028304
T1_srs                          61.569112
T1_Stl_median                   60.216832
T2_srs                          58.957683


Stage 1 submission.

In [44]:
sub_stage_1 = pd.read_csv('data/women/WSampleSubmissionStage1.csv')
id_parts = sub_stage_1['ID'].str.split('_', expand=True).astype(int)

sub_stage_1['Season'] = id_parts[0]
sub_stage_1['T1'] = id_parts[1]
sub_stage_1['T2'] = id_parts[2]
sub_stage_1 = sub_stage_1.set_index(['Season', 'T1', 'T2'])

sub_stage_1['Pred'].update(oof.rename('Pred'))

sub_stage_1.to_csv('subs/women_stage_1.csv', index=False)
!head subs/women_stage_1.csv

ID,Pred
2014_3103_3107,0.5
2014_3103_3113,0.5
2014_3103_3119,0.5
2014_3103_3124,0.5
2014_3103_3140,0.5
2014_3103_3143,0.5
2014_3103_3151,0.5
2014_3103_3163,0.5
2014_3103_3169,0.5


Stage 2 submission.

In [45]:
sub = sub_stage_2.mean(axis='columns')
sub = sub.to_frame('Pred')
sub['ID'] = ['_'.join(map(str, idx)) for idx in sub.index]
sub = sub[reversed(sub.columns)]
sub.to_csv('subs/women_stage_2.csv', index=False)
!head subs/men_stage_2.csv

ID,Pred
2019_1101_1113,0.24750658533170297
2019_1101_1120,0.2443348343648902
2019_1101_1124,0.49906228531415536
2019_1101_1125,0.5529687525100264
2019_1101_1133,0.6708233231818702
2019_1101_1138,0.2040019330391345
2019_1101_1153,0.23001297608815877
2019_1101_1159,0.6550857502094999
2019_1101_1181,0.05568959886909992
