## Load libraries

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import brier_score_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from custom_utils import transform_seeds, transform_data, enrich_data
from IPython.display import display

## Load and prepare data

In [2]:
wregularseason = pd.read_csv("data/WRegularSeasonCompactResults.csv")
wtourneyseason = pd.read_csv("data/WNCAATourneyCompactResults.csv")

mregularseason = pd.read_csv("data/MRegularSeasonCompactResults.csv")
mtourneyseason = pd.read_csv("data/MNCAATourneyCompactResults.csv")

wregularseason["isTourney"] = np.zeros(wregularseason.shape[0], dtype=int)
wtourneyseason["isTourney"] = np.ones(wtourneyseason.shape[0], dtype=int)

mregularseason["isTourney"] = np.zeros(mregularseason.shape[0], dtype=int)
mtourneyseason["isTourney"] = np.ones(mtourneyseason.shape[0], dtype=int)

wprep = pd.concat([transform_data(wregularseason), transform_data(wtourneyseason)], axis=0).reset_index(drop=True)
mprep = pd.concat([transform_data(mregularseason), transform_data(mtourneyseason)], axis=0).reset_index(drop=True)

mseeds = pd.read_csv("data/MNCAATourneySeeds.csv")
wseeds = pd.read_csv("data/WNCAATourneySeeds.csv")

In [14]:
mprep_enh = enrich_data(mprep, "M")
wprep_enh = enrich_data(wprep, "W")
display(mprep_enh.head())
wprep_enh.head()


Unnamed: 0,Season,DayNum,WTeamID,LTeamID,isTourney,Result,SeedW,SeedL,SeedDiff
0,1985,20,1228,1328,0,1,3,1,2
1,1985,25,1106,1354,0,1,16,16,0
2,1985,25,1112,1223,0,1,10,16,-6
3,1985,25,1165,1432,0,1,16,16,0
4,1985,25,1192,1447,0,1,16,16,0


Unnamed: 0,Season,DayNum,WTeamID,LTeamID,isTourney,Result,SeedW,SeedL,SeedDiff
0,1998,18,3104,3202,0,1,2,16,-14
1,1998,18,3163,3221,0,1,2,14,-12
2,1998,18,3222,3261,0,1,16,16,0
3,1998,18,3307,3365,0,1,8,14,-6
4,1998,18,3349,3411,0,1,16,16,0


In [15]:
X_featm = mprep_enh.drop("Result", axis=1)
ym = mprep_enh.Result

X_featw = wprep_enh.drop("Result", axis=1)
yw = wprep_enh.Result

X_trainm, X_testm, y_trainm, y_testm = train_test_split(X_featm, ym, test_size=0.1)
X_trainw, X_testw, y_trainw, y_testw = train_test_split(X_featw, yw, test_size=0.1)

In [16]:
dt_clfm = DecisionTreeClassifier()
dt_clfm.fit(X_trainm, y_trainm)

dt_clfw = DecisionTreeClassifier()
_ = dt_clfw.fit(X_trainw, y_trainw)

In [17]:
pred_probsm = dt_clfm.predict_proba(X_testm)[:, 1]
pred_probsw = dt_clfw.predict_proba(X_testw)[:, 1]

print(f"Brier score for mens data: {np.round(brier_score_loss(y_testm, pred_probsm), 3).item()}")
print(f"Brier score for womens data: {np.round(brier_score_loss(y_testw, pred_probsw), 3).item()}")

Brier score for mens data: 0.393
Brier score for womens data: 0.366


In [27]:
values = dt_clfm.feature_importances_
feature = dt_clfm.feature_names_in_

pd.DataFrame({"Feature" : feature, "Importance": values}).sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
1,DayNum,0.284271
0,Season,0.248337
3,LTeamID,0.168085
2,WTeamID,0.147792
7,SeedDiff,0.136363
6,SeedL,0.007594
5,SeedW,0.007237
4,isTourney,0.000321


In [28]:
values = dt_clfw.feature_importances_
feature = dt_clfw.feature_names_in_

pd.DataFrame({"Feature" : feature, "Importance": values}).sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
1,DayNum,0.288202
3,LTeamID,0.219739
0,Season,0.206967
7,SeedDiff,0.158517
2,WTeamID,0.113574
5,SeedW,0.006409
6,SeedL,0.006293
4,isTourney,0.000299
