In [96]:
from sportsdataverse import nba, wnba
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from scipy.stats.mstats import winsorize
from xgboost import XGBClassifier
from scipy.stats import ks_2samp
import pandas as pd
import xgboost
import warnings

# Suppress every FutureWarning so cell outputs stay readable.
# NOTE(review): this also hides deprecation notices raised by later cells — confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Cap the number of rows pandas renders when a frame or series is displayed.
pd.set_option('display.max_rows', 50)

In [70]:
# Pull team box scores for both leagues over the same span of seasons (2010 through 2021).
seasons = range(2010, 2022)
nba_bs = nba.nba_loaders.load_nba_team_boxscore(seasons)
wnba_bs = wnba.wnba_loaders.load_wnba_team_boxscore(seasons)

100%|██████████| 12/12 [00:03<00:00,  3.43it/s]
100%|██████████| 12/12 [00:03<00:00,  3.79it/s]


#### Examine frames

In [71]:
# Columns that exist in one league's box-score frame but not in the other's.
nba_only = nba_bs.columns[~nba_bs.columns.isin(wnba_bs.columns)]
wnba_only = wnba_bs.columns[~wnba_bs.columns.isin(nba_bs.columns)]

print("NBA Exclusives:")
print(list(nba_only))
print("WNBA Exclusives:")
print(list(wnba_only))

NBA Exclusives:
['turnover_points', 'fast_break_points', 'points_in_paint']
WNBA Exclusives:
[]


Apparently the WNBA box scores don't report those three stats? Weird :unamused: We'll restrict the NBA frame to the columns both leagues share.

In [72]:
# Keep only the columns the WNBA frame also has, in the WNBA frame's column order.
nba_bs = nba_bs.loc[:, wnba_bs.columns]

In [73]:
# List the column names of the WNBA frame.
print(wnba_bs.columns)

Index(['team_id', 'team_uid', 'team_slug', 'team_location', 'team_name',
       'team_abbreviation', 'team_display_name', 'team_short_display_name',
       'team_color', 'team_alternate_color', 'team_logo',
       'field_goals_made_field_goals_attempted', 'field_goal_pct',
       'three_point_field_goals_made_three_point_field_goals_attempted',
       'three_point_field_goal_pct', 'free_throws_made_free_throws_attempted',
       'free_throw_pct', 'total_rebounds', 'offensive_rebounds',
       'defensive_rebounds', 'assists', 'steals', 'blocks', 'turnovers',
       'team_turnovers', 'total_turnovers', 'technical_fouls',
       'total_technical_fouls', 'flagrant_fouls', 'fouls', 'home_away',
       'opponent_id', 'opponent_name', 'opponent_mascot', 'opponent_abbrev',
       'game_id', 'season', 'season_type', 'game_date', 'largest_lead'],
      dtype='object')


In [74]:
# For each league: non-null count after dropping incomplete rows divided by the
# non-null count before, averaged over columns (WNBA first, then NBA).
for frame in (wnba_bs, nba_bs):
    kept_share = frame.dropna().count() / frame.count()
    print(kept_share.mean())


0.48091236095085055
0.7240686602308063


While we're here, let's go ahead and figure out where the coverage gap is coming from.

In [75]:
# Fraction of missing values in each WNBA column (1 - non-null count / row count).
wnba_missing_share = 1 - wnba_bs.count() / len(wnba_bs)
print(wnba_missing_share)

team_id                                                           0.000000
team_uid                                                          0.000000
team_slug                                                         0.000000
team_location                                                     0.000000
team_name                                                         0.000000
team_abbreviation                                                 0.000000
team_display_name                                                 0.000000
team_short_display_name                                           0.000000
team_color                                                        0.003490
team_alternate_color                                              0.003490
team_logo                                                         0.095967
field_goals_made_field_goals_attempted                            0.000000
field_goal_pct                                                    0.000000
three_point_field_goals_m

In [76]:
# Per-season missing share for every WNBA column: non-null counts against the
# counts obtained after filling NaNs with 0 (so every row is counted).
wnba_present = wnba_bs.groupby('season').count()
wnba_rows = wnba_bs.fillna(0).groupby('season').count()
1 - wnba_present / wnba_rows


Unnamed: 0_level_0,team_id,team_uid,team_slug,team_location,team_name,team_abbreviation,team_display_name,team_short_display_name,team_color,team_alternate_color,...,fouls,home_away,opponent_id,opponent_name,opponent_mascot,opponent_abbrev,game_id,season_type,game_date,largest_lead
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004525,0.004525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004444,0.004444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.959459
2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004505,0.004505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004484,0.004484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.995516
2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004425,0.004425,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013636
2017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004545,0.004545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004525,0.004525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004525
2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004525,0.004525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004525


It appears that the problem is in `largest_lead`: it is almost entirely missing through 2015, mostly populated in 2016, and fully populated from 2017 onward.

In [77]:
# Fraction of missing values in each NBA column (1 - non-null count / row count).
nba_missing_share = 1 - nba_bs.count() / len(nba_bs)
print(nba_missing_share)

team_id                                                           0.000000
team_uid                                                          0.000000
team_slug                                                         0.000000
team_location                                                     0.000000
team_name                                                         0.000000
team_abbreviation                                                 0.000000
team_display_name                                                 0.000000
team_short_display_name                                           0.000000
team_color                                                        0.000790
team_alternate_color                                              0.000790
team_logo                                                         0.031211
field_goals_made_field_goals_attempted                            0.000000
field_goal_pct                                                    0.000000
three_point_field_goals_m

In [78]:
# Per-season missing share for every NBA column: non-null counts against the
# counts obtained after filling NaNs with 0 (so every row is counted).
nba_present = nba_bs.groupby('season').count()
nba_rows = nba_bs.fillna(0).groupby('season').count()
1 - nba_present / nba_rows

Unnamed: 0_level_0,team_id,team_uid,team_slug,team_location,team_name,team_abbreviation,team_display_name,team_short_display_name,team_color,team_alternate_color,...,fouls,home_away,opponent_id,opponent_name,opponent_mascot,opponent_abbrev,game_id,season_type,game_date,largest_lead
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002288
2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001862
2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000763,0.000763,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99084
2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000758,0.000758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.997726
2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000762,0.000762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999238
2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000759,0.000759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000759
2017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000764,0.000764,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000764
2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001527,0.001527,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000763
2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001523,0.001523,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Slice data to minimize nans

In [79]:
# Keep only seasons after 2016, where the per-season tables above show little missing data.
wnba_bs = wnba_bs.loc[wnba_bs['season'] > 2016]
nba_bs = nba_bs.loc[nba_bs['season'] > 2016]

In [80]:
# Column-averaged share of values that a full dropna would discard (WNBA first, then NBA).
for frame in (wnba_bs, nba_bs):
    lost_share = 1 - frame.dropna().count() / frame.count()
    print(lost_share.mean())

0.022752848540991223
0.0018447843114764717


In [81]:
# Discard every row that has at least one missing value (dropna's default is how='any').
wnba_bs, nba_bs = wnba_bs.dropna(), nba_bs.dropna()

In [82]:
# Sanity check: after dropping incomplete rows, a second dropna should remove nothing.
for frame in (wnba_bs, nba_bs):
    still_lost = 1 - frame.dropna().count() / frame.count()
    print(still_lost.mean())

0.0
0.0


In [83]:
# Average non-null count per column for each league (WNBA first, then NBA).
for frame in (wnba_bs, nba_bs):
    print(frame.count().mean())

1992.0
12444.0


#### Preparation of frames for comparison training

In [84]:
def _to_numeric_or_keep(col):
    """Return `col` converted with `pd.to_numeric`, or `col` unchanged if it cannot be converted.

    Explicit stand-in for the deprecated `pd.to_numeric(..., errors='ignore')`.
    Kept in this cell so the cell is self-contained.
    """
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# Divisor that puts counting stats on a per-minute basis.
# NOTE(review): presumably the regulation game length in minutes; overtime is not accounted for.
WNBA_GAME_MINUTES = 40

print(wnba_bs.apply(_to_numeric_or_keep).dtypes)

# Identifier / descriptive columns that are not game statistics.
X_w = wnba_bs.drop(
    columns=[
        'team_uid',
        'team_id',
        'team_slug',
        'team_location',
        'team_name',
        'team_abbreviation',
        'team_display_name',
        'team_short_display_name',
        'team_color',
        'team_alternate_color',
        'team_logo',
        'game_date',
        'season',
        'season_type',
        'opponent_name',
        'opponent_id',
        'opponent_mascot',
        'opponent_abbrev',
    ]
)

print(X_w.dtypes)

# "made-attempted" strings that get split into two separate columns each.
dash_cols = [
    'field_goals_made_field_goals_attempted',
    'three_point_field_goals_made_three_point_field_goals_attempted',
    'free_throws_made_free_throws_attempted',
    ]

split_targets = [
    ['field_goals_made', 'field_goals_attempted'],
    ['three_point_field_goals_made', 'three_point_field_goals_attempted'],
    ['free_throws_made', 'free_throws_attempted'],
]

for combined_col, part_cols in zip(dash_cols, split_targets):
    # expand=True yields one column per piece, aligned on X_w's index.
    X_w[part_cols] = X_w[combined_col].str.split('-', expand=True)

# The split pieces are still strings here; convert whatever parses as a number.
X_w = X_w.drop(columns=dash_cols).apply(_to_numeric_or_keep)

# Counting stats to rescale per minute (percentages and largest_lead are left as-is).
pm_cols = ['total_rebounds', 'offensive_rebounds', 'defensive_rebounds', 'assists',
           'steals', 'blocks', 'turnovers', 'team_turnovers', 'total_turnovers',
           'technical_fouls', 'total_technical_fouls', 'flagrant_fouls', 'fouls',
           'field_goals_made', 'field_goals_attempted', 'three_point_field_goals_made',
           'three_point_field_goals_attempted', 'free_throws_made', 'free_throws_attempted']

X_w[pm_cols] = X_w[pm_cols] / WNBA_GAME_MINUTES
# we're lucky in that there's almost no asymmetry from dropnas. aka nans affected both home and away equally.
# this can be verified with >> X_w.groupby('home_away').count() and by counting the merge below vs the merging frame lengths

# One row per game: home-team stats and away-team stats side by side.
# The inner join drops games where either side is missing.
X_w = (
    pd.merge(X_w[X_w.home_away == 'HOME'], X_w[X_w.home_away == 'AWAY'], how='inner', on='game_id', suffixes=('_home', '_away'))
    .drop(columns=['game_id', 'home_away_home', 'home_away_away'])
)

team_id                                                             int64
team_uid                                                           object
team_slug                                                          object
team_location                                                      object
team_name                                                          object
team_abbreviation                                                  object
team_display_name                                                  object
team_short_display_name                                            object
team_color                                                         object
team_alternate_color                                               object
team_logo                                                          object
field_goals_made_field_goals_attempted                             object
field_goal_pct                                                    float64
three_point_field_goals_made_three_poi

In [85]:
#repeat for men's

def _to_numeric_or_keep(col):
    """Return `col` converted with `pd.to_numeric`, or `col` unchanged if it cannot be converted.

    Explicit stand-in for the deprecated `pd.to_numeric(..., errors='ignore')`.
    Kept in this cell so the cell is self-contained.
    """
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# Divisor that puts counting stats on a per-minute basis.
# NOTE(review): presumably the regulation game length in minutes; overtime is not accounted for.
NBA_GAME_MINUTES = 48

print(nba_bs.apply(_to_numeric_or_keep).dtypes)

# Identifier / descriptive columns that are not game statistics.
X_m = nba_bs.drop(
    columns=[
        'team_uid',
        'team_id',
        'team_slug',
        'team_location',
        'team_name',
        'team_abbreviation',
        'team_display_name',
        'team_short_display_name',
        'team_color',
        'team_alternate_color',
        'team_logo',
        'game_date',
        'season',
        'season_type',
        'opponent_name',
        'opponent_id',
        'opponent_mascot',
        'opponent_abbrev',
    ]
)

print(X_m.dtypes)

# "made-attempted" strings that get split into two separate columns each.
dash_cols = [
    'field_goals_made_field_goals_attempted',
    'three_point_field_goals_made_three_point_field_goals_attempted',
    'free_throws_made_free_throws_attempted',
    ]

split_targets = [
    ['field_goals_made', 'field_goals_attempted'],
    ['three_point_field_goals_made', 'three_point_field_goals_attempted'],
    ['free_throws_made', 'free_throws_attempted'],
]

for combined_col, part_cols in zip(dash_cols, split_targets):
    # expand=True yields one column per piece, aligned on X_m's index.
    X_m[part_cols] = X_m[combined_col].str.split('-', expand=True)

# The split pieces are still strings here; convert whatever parses as a number.
X_m = X_m.drop(columns=dash_cols).apply(_to_numeric_or_keep)

# Counting stats to rescale per minute (percentages and largest_lead are left as-is).
pm_cols = ['total_rebounds', 'offensive_rebounds', 'defensive_rebounds', 'assists',
           'steals', 'blocks', 'turnovers', 'team_turnovers', 'total_turnovers',
           'technical_fouls', 'total_technical_fouls', 'flagrant_fouls', 'fouls',
           'field_goals_made', 'field_goals_attempted', 'three_point_field_goals_made',
           'three_point_field_goals_attempted', 'free_throws_made', 'free_throws_attempted']

X_m[pm_cols] = X_m[pm_cols] / NBA_GAME_MINUTES
# we're lucky in that there's almost no asymmetry from dropnas. aka nans affected both home and away equally.
# this can be verified with >> X_m.groupby('home_away').count() and by counting the merge below vs the merging frame lengths

# One row per game: home-team stats and away-team stats side by side.
# The inner join drops games where either side is missing.
X_m = (
    pd.merge(X_m[X_m.home_away == 'HOME'], X_m[X_m.home_away == 'AWAY'], how='inner', on='game_id', suffixes=('_home', '_away'))
    .drop(columns=['game_id', 'home_away_home', 'home_away_away'])
)

team_id                                                             int64
team_uid                                                           object
team_slug                                                          object
team_location                                                      object
team_name                                                          object
team_abbreviation                                                  object
team_display_name                                                  object
team_short_display_name                                            object
team_color                                                         object
team_alternate_color                                               object
team_logo                                                          object
field_goals_made_field_goals_attempted                             object
field_goal_pct                                                    float64
three_point_field_goals_made_three_poi

In [86]:
# Column means side by side: column 0 is WNBA, column 1 is NBA.
league_means = [frame.mean() for frame in (X_w, X_m)]
pd.concat(league_means, axis=1)

Unnamed: 0,0,1
field_goal_pct_home,43.440552,52.235744
three_point_field_goal_pct_home,34.093156,35.485969
free_throw_pct_home,79.962921,68.562761
total_rebounds_home,0.85046,1.059085
offensive_rebounds_home,0.214019,0.205772
defensive_rebounds_home,0.63644,0.702634
assists_home,0.464837,0.406247
steals_home,0.178805,0.185709
blocks_home,0.096936,0.165826
turnovers_home,0.327017,0.223983


In [87]:
# Average absolute NBA-vs-WNBA gap across features, first using column means,
# then using column medians. `df_tmp` ends up holding the median comparison.
for stat in ('mean', 'median'):
    df_tmp = pd.concat([getattr(X_w, stat)(), getattr(X_m, stat)()], axis=1)
    league_gap = (df_tmp[1] - df_tmp[0]).abs()
    print(league_gap.mean())

1.0633250446032505
0.6541666666666666


That's a big enough dispersion for me between mean and median. Let's winsorize.

In [88]:
def _clip_tails(col):
    """Winsorize one column with limits of 0.2 on each tail."""
    return winsorize(col, limits=(0.2, 0.2))


# Winsorize every feature column independently, per league.
X_m_t = X_m.apply(_clip_tails, axis=0)
X_w_t = X_w.apply(_clip_tails, axis=0)

# Re-measure the average absolute NBA-vs-WNBA gap on the winsorized frames,
# first with means and then with medians. `df_tmp` ends up holding the median comparison.
for stat in ('mean', 'median'):
    df_tmp = pd.concat([getattr(X_w_t, stat)(), getattr(X_m_t, stat)()], axis=1)
    league_gap = (df_tmp[1] - df_tmp[0]).abs()
    print(league_gap.mean())

0.8788560237094053
0.6541666666666666


Better. Now class balancing and training

In [89]:
SEED = 42069            # shared seed for sampling and the train/test split
N_PER_LEAGUE = 800      # rows sampled from each league so the classes are balanced

# The scaler must exist before it is used; it was previously created further down
# the cell, which raises NameError on a fresh kernel.
# Normalizer rescales each row (sample) to unit norm.
norm = Normalizer()

# NOTE(review): these frames are built from the raw X_m / X_w and overwrite the
# winsorized X_m_t / X_w_t from the previous cell, so the winsorization never reaches
# the model. Left unchanged to keep the displayed results valid — confirm intent.
X_m_t = pd.DataFrame(norm.fit_transform(X_m), index=X_m.index, columns=X_m.columns)
X_w_t = pd.DataFrame(norm.fit_transform(X_w), index=X_w.index, columns=X_w.columns)

# Target: 0 = NBA game, 1 = WNBA game.
X_m_t['label'] = 0
X_w_t['label'] = 1

# Balanced sample, NBA rows first. After reset_index, rows 0..N_PER_LEAGUE-1 are NBA
# and the rest are WNBA.
X = pd.concat(
    [
        X_m_t.sample(N_PER_LEAGUE, random_state=SEED),
        X_w_t.sample(N_PER_LEAGUE, random_state=SEED)
    ]
    ).reset_index(drop=True)
X, Y = X.drop('label', axis=1), X.label

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=SEED)

In [90]:
# Display the training labels (0 / 1 league indicator) to eyeball the shuffle.
Y_train

542     0
1031    1
919     1
1491    1
1182    1
       ..
527     0
846     1
752     0
1209    1
1385    1
Name: label, Length: 1200, dtype: int64

In [91]:
# Fit a default-configured XGBoost classifier to tell the two leagues apart,
# then show its accuracy on the held-out split.
clf = XGBClassifier()
clf.fit(X_train, Y_train)

test_predictions = clf.predict(X_test)
accuracy_score(Y_test, test_predictions)

0.95

Woah! What's up?

In [92]:
# Model-reported feature importances, labelled by feature name, smallest first.
feature_names = X_w_t.columns.drop('label')
pd.Series(clf.feature_importances_, index=feature_names).sort_values()

flagrant_fouls_away                       0.000000
total_technical_fouls_away                0.000000
total_technical_fouls_home                0.000000
flagrant_fouls_home                       0.000000
largest_lead_away                         0.001676
three_point_field_goal_pct_home           0.003016
field_goals_attempted_away                0.003096
free_throws_made_away                     0.003941
defensive_rebounds_home                   0.004310
blocks_away                               0.004407
free_throws_attempted_home                0.004631
largest_lead_home                         0.006438
defensive_rebounds_away                   0.007368
field_goals_attempted_home                0.008617
free_throw_pct_home                       0.008662
offensive_rebounds_home                   0.008955
free_throws_attempted_away                0.009130
assists_home                              0.009204
blocks_home                               0.009664
offensive_rebounds_away        

In [93]:
# Permutation importance of each feature, averaged over 10 shuffles, smallest first.
# Note: this is computed on the full balanced sample (X, Y), which includes the training rows.
perm_result = permutation_importance(clf, X, Y, n_repeats=10, random_state=42069)
pd.Series(perm_result['importances_mean'], index=X.columns).sort_values()

free_throws_attempted_home               -5.625000e-04
assists_home                             -5.625000e-04
three_point_field_goals_attempted_away   -5.625000e-04
field_goals_attempted_away               -5.625000e-04
blocks_away                              -5.000000e-04
largest_lead_away                        -3.125000e-04
free_throws_made_home                    -3.125000e-04
free_throws_attempted_away               -2.500000e-04
field_goals_attempted_home               -2.500000e-04
offensive_rebounds_away                  -1.875000e-04
largest_lead_home                        -1.250000e-04
free_throws_made_away                    -6.250000e-05
steals_home                               0.000000e+00
three_point_field_goal_pct_away           0.000000e+00
flagrant_fouls_away                       0.000000e+00
total_technical_fouls_away                0.000000e+00
flagrant_fouls_home                       0.000000e+00
total_technical_fouls_home                0.000000e+00
fouls_away

In [94]:
# Features removed before refitting, to see how much accuracy depends on them.
dropped_features = [
    'field_goals_made_home',
    'field_goals_made_away',
    'total_rebounds_away',
    'total_rebounds_home',
    'free_throw_pct_home',
    'free_throw_pct_away',
]

X_train_t = X_train.drop(columns=dropped_features)
X_test_t = X_test.drop(columns=dropped_features)

# Same default classifier, trained on the reduced feature set.
clf_t = XGBClassifier()
clf_t.fit(X_train_t, Y_train)

reduced_predictions = clf_t.predict(X_test_t)
print(accuracy_score(Y_test, reduced_predictions))

pd.Series(clf_t.feature_importances_, index=X_train_t.columns).sort_values()

0.9175


flagrant_fouls_away                       0.000000
total_technical_fouls_away                0.000000
total_technical_fouls_home                0.000000
flagrant_fouls_home                       0.000000
largest_lead_away                         0.003616
free_throws_attempted_away                0.004482
three_point_field_goal_pct_home           0.004939
blocks_home                               0.005374
free_throws_made_home                     0.007381
free_throws_made_away                     0.007407
fouls_away                                0.007689
steals_away                               0.008035
steals_home                               0.008273
offensive_rebounds_home                   0.008495
blocks_away                               0.009037
largest_lead_home                         0.009641
free_throws_attempted_home                0.010073
assists_home                              0.011217
three_point_field_goal_pct_away           0.012804
technical_fouls_away           

#### Time to apply K-S test

In [101]:
# Peek at the first 800 rows of the features and of the labels.
# Only the last expression (`Y.iloc[0:800]`) is rendered as the cell output;
# the `X` slice is evaluated and discarded.
X.iloc[0:800]
Y.iloc[0:800]

0      0
1      0
2      0
3      0
4      0
      ..
796    0
797    0
798    0
799    0
800    1
Name: label, Length: 801, dtype: int64

Unnamed: 0,field_goal_pct_home,three_point_field_goal_pct_home,offensive_rebounds_home,defensive_rebounds_home,assists_home,steals_home,blocks_home,turnovers_home,team_turnovers_home,total_turnovers_home,...,technical_fouls_away,total_technical_fouls_away,flagrant_fouls_away,fouls_away,largest_lead_away,field_goals_attempted_away,three_point_field_goals_made_away,three_point_field_goals_attempted_away,free_throws_made_away,free_throws_attempted_away
3,0.343361,0.256114,0.001026,0.004398,0.003371,0.001466,0.000733,0.002052,0.000000,0.002052,...,0.000000,0.000000,0.000000,0.002785,0.140722,0.010994,0.001906,0.004837,0.006303,0.007622
5,0.350273,0.263804,0.002290,0.005649,0.002748,0.001527,0.000458,0.001527,0.000000,0.001527,...,0.000000,0.000000,0.000000,0.004275,0.043967,0.013282,0.002137,0.005649,0.003206,0.003969
8,0.361344,0.253917,0.001110,0.006289,0.004069,0.001295,0.000555,0.002774,0.000185,0.002959,...,0.000185,0.000185,0.000000,0.002589,0.195321,0.015537,0.002405,0.006844,0.001665,0.003329
9,0.505709,0.379092,0.001106,0.005055,0.000790,0.002527,0.003633,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.001580,0.000000,0.113728,0.014058,0.002211,0.005213,0.002685,0.004265
17,0.295608,0.239480,0.001715,0.005145,0.003118,0.001715,0.000780,0.002495,0.000000,0.002495,...,0.000000,0.000000,0.000000,0.004054,0.089805,0.014812,0.001715,0.005301,0.002650,0.003586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782,0.309364,0.278354,0.001231,0.006153,0.003384,0.001077,0.001384,0.002769,0.000000,0.002769,...,0.000000,0.000000,0.000000,0.001384,0.059067,0.014921,0.001846,0.004615,0.002769,0.003384
786,0.567739,0.344191,0.001478,0.005027,0.000591,0.002070,0.002957,0.000000,0.000296,0.000296,...,0.000148,0.000148,0.001035,0.000000,0.042580,0.011828,0.001626,0.004288,0.003401,0.004583
787,0.560313,0.346635,0.001484,0.006925,0.000660,0.003627,0.004122,0.000000,0.000165,0.000165,...,0.000000,0.000000,0.001979,0.000000,0.245335,0.015993,0.001979,0.005936,0.003298,0.005276
794,0.274822,0.238561,0.000530,0.003446,0.003181,0.000795,0.000663,0.002386,0.000000,0.002386,...,0.000000,0.000000,0.000000,0.002651,0.260826,0.012591,0.002121,0.004506,0.001988,0.002783


In [117]:
# Split the held-out rows by their position in the balanced sample: index 0-799 were
# the first-concatenated league (label 0), 800-1599 the second (label 1).
# reindex + dropna keeps only the test rows whose index falls in each range.
label0_test_rows = X_test_t.reindex(range(0, 800)).dropna()
label1_test_rows = X_test_t.reindex(range(800, 1600)).dropna()

# Two-sample Kolmogorov-Smirnov test on the predicted labels of the two groups.
ks_2samp(
    clf_t.predict(label0_test_rows),
    clf_t.predict(label1_test_rows)
)

KstestResult(statistic=0.8360241295587094, pvalue=1.1487511551024986e-71, statistic_location=0, statistic_sign=1)

### TODO: Break down numbers per possession and re-run