# Kaggle Men's 2019 March Madness

## Data preparation

Load the data.

In [1]:
import pandas as pd

# Show (almost) every column when a wide frame is displayed.
pd.set_option('display.max_columns', 999)

# The game-location flag is parsed as a categorical column.
io_params = dict(dtype={'WLoc': 'category'})
regular_results = pd.read_csv('data/men/RegularSeasonDetailedResults.csv', **io_params)
tourney_results = pd.read_csv('data/men/NCAATourneyDetailedResults.csv', **io_params)

# TeamID -> TeamName lookup.
team_names = (
    pd.read_csv('data/men/Teams.csv', index_col='TeamID')
    .loc[:, 'TeamName']
    .to_dict()
)

Some sanity checks.

In [2]:
# Both result files must expose the same columns in the same order, because
# they go through the same augmentation step. Comparing plain lists also fails
# cleanly (AssertionError) when the two files have a different number of
# columns, where an element-wise `==` on the indexes would raise instead.
assert list(regular_results.columns) == list(tourney_results.columns), (
    'Regular season and tournament results have different columns'
)

Augment results.

In [3]:
def augment_results(results):
    """Return every game twice, once from each team's point of view.

    The ``W*`` (winner) and ``L*`` (loser) columns of ``results`` are relabelled
    ``T1_*`` / ``T2_*``. The first half of the output has the winner as T1, the
    second half has the loser as T1 with the location flag swapped
    (``H`` <-> ``A``, ``N`` unchanged). ``T1_TeamID`` / ``T2_TeamID`` are
    shortened to ``T1`` / ``T2``.

    Parameters
    ----------
    results : pd.DataFrame
        Game results with a ``WLoc`` column plus ``W``/``L``-prefixed columns.

    Returns
    -------
    pd.DataFrame
        Twice as many rows as ``results``, with a fresh integer index.
    """
    games = results.rename(columns={'WLoc': 'location'})

    # Split the columns into winner, loser and shared (neither prefix) groups.
    winner_cols, loser_cols, shared_cols = [], [], []
    for name in games.columns:
        if name.startswith('W'):
            winner_cols.append(name)
        elif name.startswith('L'):
            loser_cols.append(name)
        else:
            shared_cols.append(name)
    # The location is re-inserted explicitly because it differs per view.
    shared_cols.remove('location')

    def relabel(frame, slot):
        # Replace the one-letter W/L prefix with the slot prefix (T1 or T2).
        return frame.rename(columns=lambda name: f'{slot}_{name[1:]}')

    swapped_location = games['location'].map({'H': 'A', 'A': 'H', 'N': 'N'}).astype('category')

    winner_first = pd.concat(
        [
            games[shared_cols],
            games['location'],
            relabel(games[winner_cols], 'T1'),
            relabel(games[loser_cols], 'T2'),
        ],
        axis='columns'
    )
    loser_first = pd.concat(
        [
            games[shared_cols],
            swapped_location,
            relabel(games[loser_cols], 'T1'),
            relabel(games[winner_cols], 'T2'),
        ],
        axis='columns'
    )

    both_views = pd.concat([winner_first, loser_first], axis='rows', ignore_index=True)
    return both_views.rename(columns={'T1_TeamID': 'T1', 'T2_TeamID': 'T2'})

# Build the two-sided (T1/T2) view of both result sets.
regular, tourney = (
    augment_results(results)
    for results in (regular_results, tourney_results)
)

Initialize a dataframe to which we will append features. We will then use this dataframe for training.

In [4]:
# One row per (Season, T1, T2) tournament game; Victory is 1.0 when T1 outscored T2.
df = (
    tourney[['Season', 'T1', 'T1_Score', 'T2', 'T2_Score']]
    .assign(Victory=lambda games: (games['T1_Score'] > games['T2_Score']).astype(float))
    .drop(columns=['T1_Score', 'T2_Score'])
    .set_index(['Season', 'T1', 'T2'])
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Victory
Season,T1,T2,Unnamed: 3_level_1
2003,1421,1411,1.0
2003,1112,1436,1.0
2003,1113,1272,1.0
2003,1141,1166,1.0
2003,1143,1301,1.0


Add all the 2019 tournament games for which a prediction has to be made.

In [5]:
# Submission IDs have the form "<Season>_<T1>_<T2>"; split them into integer keys.
sub = (
    pd.read_csv('data/men/SampleSubmissionStage2.csv')['ID']
    .str.split('_', expand=True)
    .astype(int)
)
sub.columns = ['Season', 'T1', 'T2']

# These games have no known outcome yet; a missing Victory marks them as test rows.
sub = sub.assign(Victory=None).set_index(['Season', 'T1', 'T2'])

# NOTE: re-running this cell appends the same rows to `df` again.
df = pd.concat([df, sub])

## Feature extraction

Seeds.

In [6]:
seeds = pd.read_csv('data/men/NCAATourneySeeds.csv', index_col=['Season', 'TeamID'])
# Characters 1-2 of the seed label hold the numeric seed; the first character
# and anything after the number are dropped.
seeds['seed'] = seeds['Seed'].map(lambda x: int(x[1:3]))

# Attach the seed of each side. Dropping a stale column first keeps the cell
# re-runnable (a second plain join would fail on overlapping column names),
# matching what the stats and RPI cells do.
for team_col in ('T1', 'T2'):
    seed_col = f'{team_col}_seed'
    df = df.drop(columns=seed_col, errors='ignore').join(
        seeds['seed'].rename(seed_col),
        on=['Season', team_col]
    )

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Victory,T1_seed,T2_seed
Season,T1,T2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2003,1421,1411,1.0,16,16
2003,1112,1436,1.0,1,16
2003,1113,1272,1.0,10,7
2003,1141,1166,1.0,11,6
2003,1143,1301,1.0,8,9


Regular season aggregate statistics.

In [7]:
# Box-score statistic -> aggregation applied over each team's regular-season games.
stat_aggs = {
    'FGM': 'mean',
    'FGA': 'mean',
    'FGM3': 'mean',
    'DR': 'mean',
    'Blk': 'median',
    'PF': 'mean',
}

# `regular` already holds every game from each team's perspective, so grouping
# on (Season, T1) and aggregating the T1_* columns gives per-team season stats.
stats = (
    regular
    .groupby(['Season', 'T1'])
    .agg({f'T1_{stat}': func for stat, func in stat_aggs.items()})
)

# Rename e.g. 'T1_FGM' -> 'FGM_mean'. The name is derived from each column
# itself, so the result does not depend on the order of the aggregated columns.
stats.columns = [
    f"{col[len('T1_'):]}_{stat_aggs[col[len('T1_'):]]}"
    for col in stats.columns
]

stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FGM_mean,FGA_mean,FGM3_mean,DR_mean,Blk_median,PF_mean
Season,T1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003,1102,19.142857,39.785714,7.821429,16.821429,1.0,18.75
2003,1103,27.148148,55.851852,5.444444,19.925926,2.0,19.851852
2003,1104,24.035714,57.178571,6.357143,23.928571,4.0,18.035714
2003,1105,24.384615,61.615385,7.576923,23.115385,2.0,20.230769
2003,1106,23.428571,55.285714,6.107143,23.857143,3.0,18.178571


In [8]:
# The same per-team statistics are attached twice: once for T1, once for T2.
t1_stats, t2_stats = (stats.add_prefix(prefix) for prefix in ('T1_', 'T2_'))

for team_col, team_stats in (('T1', t1_stats), ('T2', t2_stats)):
    # Drop columns left over from a previous run so the join never collides.
    stale_cols = team_stats.columns.intersection(df.columns)
    df = df.drop(columns=stale_cols).join(team_stats, on=['Season', team_col])

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Victory,T1_seed,T2_seed,T1_FGM_mean,T1_FGA_mean,T1_FGM3_mean,T1_DR_mean,T1_Blk_median,T1_PF_mean,T2_FGM_mean,T2_FGA_mean,T2_FGM3_mean,T2_DR_mean,T2_Blk_median,T2_PF_mean
Season,T1,T2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2003,1421,1411,1.0,16,16,24.37931,56.793103,6.482759,23.172414,3.0,19.103448,24.733333,55.266667,5.933333,24.8,2.0,18.3
2003,1112,1436,1.0,1,16,30.321429,65.714286,7.035714,27.642857,4.0,17.75,24.827586,55.862069,5.275862,25.724138,3.0,15.896552
2003,1113,1272,1.0,10,7,27.206897,56.896552,4.0,23.310345,4.0,19.413793,26.275862,60.0,7.0,25.965517,4.0,18.758621
2003,1141,1166,1.0,11,6,26.62069,52.689655,6.827586,23.275862,3.0,20.965517,28.69697,57.454545,7.969697,23.181818,5.0,17.272727
2003,1143,1301,1.0,8,9,27.344828,58.724138,6.413793,24.37931,3.0,17.103448,24.333333,53.333333,7.966667,22.033333,2.0,18.666667


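Rating percentage index (RPI): $\text{RPI} = 0.25 \cdot \text{WP} + 0.5 \cdot \text{OWP} + 0.25 \cdot \text{OOWP}$, where WP is a team's regular-season winning percentage, OWP is the winning percentage of its opponents (excluding their games against the team itself), and OOWP is the average OWP of its opponents.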

In [9]:
def victory(x):
    """Flag the rows in which T1 outscored T2."""
    return x['T1_Score'] > x['T2_Score']


# Lookup tables used by the calc_* helpers, keyed by tuples of group keys.
outcomes = regular.assign(victory=victory)

# {'mean': {(season, team): win rate}, 'count': {(season, team): games}}
win_rates = outcomes.groupby(['Season', 'T1'])['victory'].agg(['mean', 'count']).to_dict()

# {'sum': {(season, t1, t2): wins of t1 over t2}, 'count': {(season, t1, t2): games}}
matchups = outcomes.groupby(['Season', 'T1', 'T2'])['victory'].agg(['sum', 'count']).to_dict()

games_by_team = regular.groupby(['Season', 'T1'])
n_matches = games_by_team.size().to_dict()
opponents = games_by_team['T2'].unique().to_dict()


def update_mean(mean, count, removed_sum, removed_count):
    """Return the mean that remains after removing some observations.

    ``mean`` is the average of ``count`` values; ``removed_count`` values whose
    total is ``removed_sum`` are taken out. Raises ``ZeroDivisionError`` for
    plain Python numbers when every observation is removed.
    """
    remaining_total = mean * count - removed_sum
    remaining_count = count - removed_count
    return remaining_total / remaining_count


def calc_wp(season, team):
    """Winning percentage of ``team`` in ``season``, read from ``win_rates``."""
    key = (season, team)
    return win_rates['mean'][key]


def calc_owp(season, team):
    """Opponents' winning percentage of ``team`` in ``season``.

    Each opponent's win rate is recomputed without its games against ``team``
    and weighted by the number of games played against ``team``; the weighted
    sum is divided by the team's total number of games.
    """

    def weighted_rate(opponent):
        pairing = (season, opponent, team)
        games_vs_team = matchups['count'][pairing]
        rate_without_team = update_mean(
            mean=win_rates['mean'][(season, opponent)],
            count=win_rates['count'][(season, opponent)],
            removed_sum=matchups['sum'][pairing],
            removed_count=games_vs_team
        )
        return rate_without_team * games_vs_team

    return 1 / n_matches[(season, team)] * sum(map(weighted_rate, opponents[(season, team)]))


def calc_oowp(season, team, owps):
    """Opponents' opponents' winning percentage of ``team`` in ``season``.

    ``owps`` maps each team of that season to its OWP. Every opponent's OWP is
    weighted by the number of games played against ``team`` and the weighted
    sum is divided by the team's total number of games.
    """

    def weighted_owp(opponent):
        games_vs_team = matchups['count'][(season, opponent, team)]
        return owps[opponent] * games_vs_team

    return 1 / n_matches[(season, team)] * sum(map(weighted_owp, opponents[(season, team)]))


def calc_rpi(wp, owp, oowp):
    """Combine WP, OWP and OOWP into the RPI with weights 0.25 / 0.5 / 0.25."""
    own_part = wp * .25
    opponents_part = owp * .5
    opponents_opponents_part = oowp * .25
    return own_part + opponents_part + opponents_opponents_part
    

seasons = regular['Season'].unique()

# Resolve the teams of each season once; the four passes below all iterate
# over the same teams, so there is no need to re-filter `regular` every time.
teams_by_season = {
    season: regular.loc[regular['Season'] == season, 'T1'].unique()
    for season in seasons
}

# Winning percentage per season and team.
wps = {
    season: {team: calc_wp(season, team) for team in teams}
    for season, teams in teams_by_season.items()
}
# Opponents' winning percentage.
owps = {
    season: {team: calc_owp(season, team) for team in teams}
    for season, teams in teams_by_season.items()
}
# Opponents' opponents' winning percentage; needs the OWPs of the same season.
oowps = {
    season: {team: calc_oowp(season, team, owps[season]) for team in teams}
    for season, teams in teams_by_season.items()
}
rpis = {
    season: {
        team: calc_rpi(wps[season][team], owps[season][team], oowps[season][team])
        for team in teams
    }
    for season, teams in teams_by_season.items()
}

# Seasons become columns and teams the rows, so stacking yields a Series
# indexed by (team, season) - hence the (team, Season) key order in the joins.
rpis = pd.DataFrame.from_dict(rpis, orient='columns').stack()
df = df.drop(columns='T1_rpi', errors='ignore').join(rpis.rename('T1_rpi'), on=['T1', 'Season'])
df = df.drop(columns='T2_rpi', errors='ignore').join(rpis.rename('T2_rpi'), on=['T2', 'Season'])

Massey AP ratings.

In [10]:
massey = pd.read_csv('data/men/MasseyOrdinals.csv')

# Last listed rank per (Season, TeamID, SystemName), one column per system.
# NOTE(review): "last" follows the row order of the CSV - presumably
# chronological; confirm against the data file.
ratings = (
    massey
    .groupby(['Season', 'TeamID', 'SystemName'])['OrdinalRank']
    .last()
    .unstack()
)

for col in ['AP']:
    for team_col in ('T1', 'T2'):
        feature = f'{team_col}_{col}'
        # Drop a stale copy first so the cell can be re-run, as in the
        # stats and RPI cells.
        df = df.drop(columns=feature, errors='ignore').join(
            ratings[col].rename(feature),
            on=['Season', team_col]
        )

## Machine learning

Prepare the train and test sets.

In [11]:
# Rows with a known outcome are training data; the rest are the games to predict.
is_train = df['Victory'].notna()

X_train = df.loc[is_train].drop(columns='Victory')
y_train = df.loc[is_train, 'Victory'].astype(bool)
X_test = df.loc[~is_train].drop(columns='Victory')

# Season of every training row, aligned with X_train.
train_seasons = X_train.index.get_level_values('Season')

Train LightGBM with leave-one-season-out cross-validation, repeated over several random seeds.

In [104]:
import lightgbm as lgb


class Model(lgb.LGBMClassifier):
    """LightGBM classifier whose ``predict_proba`` returns a single column.

    ``predict_proba`` keeps only column 1 of the parent's output, so callers
    receive a 1-D array instead of the parent's 2-D array.
    """

    def fit(self, X, y, **fit_params):
        """Fit the underlying classifier; extra keyword arguments are forwarded."""
        fitted = super().fit(X, y, **fit_params)
        return fitted

    def predict_proba(self, X):
        """Return column 1 of the parent's ``predict_proba`` output."""
        return super().predict_proba(X)[:, 1]

In [154]:
import numpy as np
from sklearn import metrics
from sklearn import model_selection
from sklearn import utils


def bake_model(random_state):
    """Build an unfitted ``Model`` with fixed hyper-parameters and the given seed."""
    hyper_params = dict(
        objective='binary',
        num_leaves=16,
        learning_rate=0.1,
        colsample_bytree=1,
        n_estimators=3000,
        min_child_samples=20,
        importance_type='gain',
    )
    return Model(random_state=random_state, **hyper_params)


rng = utils.check_random_state(42)
seasons = X_train.index.get_level_values('Season').unique()
n_repeats = 10

# Accumulators are created as floats: they receive probabilities and log
# losses, and writing floats into integer/object containers relies on an
# implicit upcast that recent pandas versions warn about.
oof = pd.Series(0.0, index=X_train.index)
oof_scores = pd.DataFrame(index=seasons, columns=range(n_repeats), dtype=float)
sub_stage_2 = pd.DataFrame(0.0, index=X_test.index, columns=seasons)
# Feature importances per fitted model, keyed by a running fit number. They
# are collected in a dict and turned into a frame once, instead of inserting
# 160 columns one by one.
fold_importances = {}

for i in range(n_repeats):

    # One seed per repeat; 10e10 is 1e11, the exclusive upper bound of the draw.
    model = bake_model(rng.randint(10e10))

    for j, season in enumerate(seasons):

        # Leave-one-season-out: fit on every other season, validate on this one.
        fit_mask = X_train.index.get_level_values('Season') != season
        val_mask = X_train.index.get_level_values('Season') == season

        X_fit = X_train.loc[fit_mask]
        y_fit = y_train.loc[fit_mask]
        X_val = X_train.loc[val_mask]
        y_val = y_train.loc[val_mask]

        # NOTE(review): the held-out season is part of `eval_set` while early
        # stopping is enabled, and is also the fold that is scored below, so
        # the reported out-of-fold log loss may be optimistic.
        model = model.fit(
            X=X_fit,
            y=y_fit,
            eval_set=[(X_fit, y_fit), (X_val, y_val)],
            eval_names=('fit', 'val'),
            eval_metric='logloss',
            early_stopping_rounds=30,
            verbose=0
        )

        # Predict the validation fold once and reuse it for both the
        # out-of-fold accumulator and the score.
        val_pred = model.predict_proba(X_val)

        oof.loc[val_mask] += val_pred
        sub_stage_2[season] += model.predict_proba(X_test)
        fold_importances[i * len(seasons) + j] = model.feature_importances_
        oof_scores.loc[season, i] = metrics.log_loss(y_val, val_pred)

# Average over the repeats (importances: median over every fitted model).
oof /= n_repeats
oof_scores = oof_scores.mean(axis='columns')
sub_stage_2 /= n_repeats
importances = pd.DataFrame(fold_importances, index=X_train.columns).median(axis='columns')

print(oof_scores.to_string())
print()
print(f'Average OOF logloss: {oof_scores.mean():.5f} (±{oof_scores.std():.5f})')

Season
2003    0.504321
2004    0.521618
2005    0.478160
2006    0.561826
2007    0.437973
2008    0.457525
2009    0.422723
2010    0.565463
2011    0.598038
2012    0.550500
2013    0.585996
2014    0.544790
2015    0.475425
2016    0.561269
2017    0.485388
2018    0.521766

Average OOF logloss: 0.51705 (±0.05332)


Average OOF logloss: 0.51705 (±0.05332)

In [127]:
# `importances` is a Series (median gain per feature), so no sort key is
# needed. The former positional `0` was the `axis` argument, which is
# keyword-only in pandas >= 2.0.
print(importances.sort_values(ascending=False).to_string())

T1_AP            1227.106918
T2_AP            1155.727992
T2_rpi            750.972355
T1_rpi            727.936260
T2_DR_mean        332.975040
T1_DR_mean        326.885168
T2_FGM3_mean      275.738374
T1_FGM3_mean      243.557310
T1_FGM_mean       207.384340
T1_FGA_mean       206.197935
T2_FGM_mean       203.392761
T2_FGA_mean       195.094840
T2_PF_mean        191.185730
T1_PF_mean        178.805110
T2_seed           167.050512
T1_seed           149.072140
T2_Blk_median     102.294145
T1_Blk_median      87.019746


Stage 1 submission.

In [155]:
sub_stage_1 = pd.read_csv('data/men/SampleSubmissionStage1.csv')
# Submission IDs have the form "<Season>_<T1>_<T2>".
id_parts = sub_stage_1['ID'].str.split('_', expand=True).astype(int)

sub_stage_1['Season'] = id_parts[0]
sub_stage_1['T1'] = id_parts[1]
sub_stage_1['T2'] = id_parts[2]
sub_stage_1 = sub_stage_1.set_index(['Season', 'T1', 'T2'])

# Use the out-of-fold prediction wherever one exists and keep the sample
# file's value elsewhere. The column is assigned explicitly because updating
# `sub_stage_1['Pred']` in place is chained assignment, which does not write
# back to the frame under pandas copy-on-write.
sub_stage_1['Pred'] = oof.reindex(sub_stage_1.index).fillna(sub_stage_1['Pred'])

# The ID column is still present, so the (Season, T1, T2) index is not written.
sub_stage_1.to_csv('subs/men_stage_1.csv', index=False)
!head subs/men_stage_1.csv

ID,Pred
2014_1107_1110,0.5
2014_1107_1112,0.5
2014_1107_1113,0.5
2014_1107_1124,0.5
2014_1107_1140,0.5
2014_1107_1142,0.5
2014_1107_1153,0.5
2014_1107_1157,0.5
2014_1107_1160,0.5


Stage 2 submission.

In [160]:
# Average the per-season model predictions into a single probability per game.
sub = sub_stage_2.mean(axis='columns').to_frame('Pred')

# Rebuild the "<Season>_<T1>_<T2>" identifier from the index and put it first.
game_ids = ['_'.join(map(str, idx)) for idx in sub.index]
sub.insert(0, 'ID', game_ids)

sub.to_csv('subs/men_stage_2.csv', index=False)
!head subs/men_stage_2.csv

ID,Pred
2019_1101_1113,0.24750658533170297
2019_1101_1120,0.2443348343648902
2019_1101_1124,0.49906228531415536
2019_1101_1125,0.5529687525100264
2019_1101_1133,0.6708233231818702
2019_1101_1138,0.2040019330391345
2019_1101_1153,0.23001297608815877
2019_1101_1159,0.6550857502094999
2019_1101_1181,0.05568959886909992
