In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
pd.set_option("display.max_columns",75)

In [2]:
## FEATURE FUNCS

def strip_and_make_0_float(x):

    return x.str.split(expand=True)[0].astype(float)

def featurize(table,process_name=True):

    table.columns = [
        'rk','team','conf','g','rec','adjoe','adjde','barthag','efg_pct','efgd_pct',
        'tor','tord','orb','drb','ftr','ftrd','2p_pct','2pd_pct','3p_pct','3pd_pct','adj_t','wab'
    ]

    table = table[table['rk']!='Rk'].copy()
    table[['w','l']]=table['rec'].str.split(expand=True)[0].str.split("–",expand=True).astype(int)
    table['win_perc'] = table['w']/(table['w']+table['l'])

    if process_name:
        table[['team','tourney']]=table['team'].str.split(",",expand=True)
        table['team'] = table['team'].str.replace(" seed","").apply(
            lambda x: ''.join([i for i in x if (i.isalpha())|(i==" ")])
        ).str.strip()
        table['tourney'] = table['tourney'].fillna('no tourney').str.strip()

        tourney_values = {
            'no tourney':9,
            'R68':8,
            'R64':7,
            'R32':6,
            'Sweet Sixteen':5,
            'Elite Eight':4,
            'Final Four':3,
            'Finals':2,
            'CHAMPS':1,
        }

        table['OUTCOME'] = table['tourney'].map(tourney_values)

    table['adjoe'] = strip_and_make_0_float(table['adjoe'])
    table['adjde'] = strip_and_make_0_float(table['adjde'])
    table['barthag'] = strip_and_make_0_float(table['barthag'])
    table['efg_pct'] = strip_and_make_0_float(table['efg_pct'])
    table['efgd_pct'] = strip_and_make_0_float(table['efgd_pct'])
    table['tor'] = strip_and_make_0_float(table['tor'])
    table['tord'] = strip_and_make_0_float(table['tord'])
    table['orb'] = strip_and_make_0_float(table['orb'])
    table['drb'] = strip_and_make_0_float(table['drb'])
    table['ftr'] = strip_and_make_0_float(table['ftr'])
    table['ftrd'] = strip_and_make_0_float(table['ftrd'])
    table['2p_pct'] = strip_and_make_0_float(table['2p_pct'])
    table['2pd_pct'] = strip_and_make_0_float(table['2pd_pct'])
    table['3p_pct'] = strip_and_make_0_float(table['3p_pct'])
    table['3pd_pct'] = strip_and_make_0_float(table['3pd_pct'])
    table['adj_t'] = strip_and_make_0_float(table['adj_t'])
    table['wab'] = strip_and_make_0_float(table['wab'])


    conferences = ['WCC', 'Amer', 'B12', 'ACC', 'SEC', 'BE', 'P12', 'B10', 'MWC',
        'MVC', 'A10', 'OVC', 'CUSA', 'AE', 'SC', 'WAC', 'Sum', 'CAA',
        'MAAC', 'MAC', 'Ivy', 'ASun', 'Pat', 'SB', 'BW', 'BSth', 'BSky',
        'NEC', 'Horz', 'SWAC', 'MEAC', 'Slnd']


    for c in conferences:
        table[c] = np.where(table['conf']==c,1,0)

    return table

In [3]:
years = [
    2008,
    2009,
    2010,
    2011,
    2012,
    2013,
    2014,
    2015,
    2016,
    2017,
    2018,
    2019,
    2021,
    2022
         ]

out = pd.DataFrame()
for year in years:
    print(year)

    url = f"https://barttorvik.com/trank.php?year={year}&type=R"

    tables = pd.read_html(url)

    table = tables[0]

    tmp = featurize(table)

    tmp['year']=year

    out = pd.concat([out,tmp])

2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2021
2022


In [4]:
out

Unnamed: 0,rk,team,conf,g,rec,adjoe,adjde,barthag,efg_pct,efgd_pct,tor,tord,orb,drb,ftr,ftrd,2p_pct,2pd_pct,3p_pct,3pd_pct,adj_t,wab,w,l,win_perc,tourney,OUTCOME,WCC,Amer,B12,ACC,SEC,BE,P12,B10,MWC,MVC,A10,OVC,CUSA,AE,SC,WAC,Sum,CAA,MAAC,MAC,Ivy,ASun,Pat,SB,BW,BSth,BSky,NEC,Horz,SWAC,MEAC,Slnd,year
0,1,Kansas,B12,33,30–3 13–3,121.0,85.6,0.9816,56.3,44.8,18.7,22.9,38.0,29.0,37.5,30.8,54.8,40.9,39.9,34.0,69.5,9.9,30,3,0.909091,CHAMPS,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
1,2,Memphis,CUSA,34,33–1 16–0,113.3,83.8,0.9697,53.0,42.5,17.2,23.3,37.8,29.3,38.6,32.7,53.3,41.6,35.0,29.8,70.7,8.9,33,1,0.970588,Finals,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
2,3,UCLA,P10,33,30–3 16–2,116.4,86.9,0.9664,52.3,48.0,18.8,22.7,39.2,25.9,38.0,25.7,52.9,46.8,33.8,34.2,66.2,10.8,30,3,0.909091,Final Four,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
3,4,Wisconsin,B10,33,29–4 16–2,112.2,84.9,0.9609,50.6,43.1,19.4,22.2,36.2,28.5,39.3,25.4,49.2,41.3,35.9,31.1,63.5,8.3,29,4,0.878788,Sweet Sixteen,5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
4,5,Duke,ACC,32,27–5 13–3,117.2,88.8,0.9607,54.1,47.6,18.2,24.9,34.3,33.5,40.4,32.0,51.8,47.1,38.4,32.8,73.7,8.9,27,5,0.843750,R32,6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,354,Maine,AE,26,3–23 3–15,91.3,113.6,0.0745,46.9,54.9,19.3,18.3,24.0,30.4,20.4,32.8,47.0,53.3,31.0,38.6,67.1,-18.3,3,23,0.115385,no tourney,9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022
367,355,Mississippi Valley St,SWAC,28,2–26 2–16,92.7,115.7,0.0724,44.5,56.7,18.1,20.7,23.4,33.9,31.4,36.5,43.8,55.1,30.5,40.0,71.8,-21.3,2,26,0.071429,no tourney,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2022
368,356,Delaware St,MEAC,25,0–25 0–14,85.8,110.7,0.0509,45.0,51.1,25.6,19.3,27.0,34.7,32.4,34.2,43.4,48.6,32.2,36.1,69.1,-21.8,0,25,0.000000,no tourney,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2022
369,357,Eastern Illinois,OVC,29,3–26 3–15,82.5,107.8,0.0444,44.2,53.4,24.1,18.2,20.6,32.6,31.2,24.2,42.3,55.8,31.8,33.5,65.9,-20.3,3,26,0.103448,no tourney,9,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022


In [5]:
training_mask = out['year'].isin([2008,
    2009,
    2010,
    2011,
    2012,
    2013,
    2014,
    2015,
    2016,
    2017,
    2018,
    2019,
    2021,])
validation_mask = out['year'].isin([2022])


In [6]:
features = ['adjoe', 'adjde', 'barthag',
       'efg_pct', 'efgd_pct', 'tor', 'tord', 'orb', 'drb', 'ftr', 'ftrd',
       '2p_pct', '2pd_pct', '3p_pct', '3pd_pct',
       'win_perc', 
       'WCC', 'Amer', 'B12', 'ACC', 'SEC',
       'BE', 'P12', 'B10', 'MWC', 'MVC', 'A10', 'OVC', 'CUSA', 'AE', 'SC',
       'WAC', 'Sum', 'CAA', 'MAAC', 'MAC', 'Ivy', 'ASun', 'Pat', 'SB', 'BW',
       'BSth', 'BSky', 'NEC', 'Horz', 'SWAC', 'MEAC', 'Slnd'
       ]
target = 'OUTCOME'

In [7]:
X = out[training_mask][features].copy()
y = out[training_mask][target].copy()

In [8]:
model = Pipeline(
    steps=[
    ('scaler',MinMaxScaler()),
    ('learner',RandomForestRegressor(n_estimators=500,random_state=50,#max_depth=8
                                     ))
    ]
)

In [9]:
model.fit(X,y)

In [10]:
yfit = model.predict(X)

In [11]:
Xx = out[validation_mask][features].copy()
yy = out[validation_mask][target].copy()
yval = model.predict(Xx)

In [12]:
val_result = pd.DataFrame({
    'team':out[validation_mask]['team'],
    'OUTCOME':yy})
val_result['PREDICTION_NUMERIC'] = yval
val_result['PREDICTION_RANK']=val_result['PREDICTION_NUMERIC'].rank(ascending=True,)

In [13]:
val_result.sort_values(by='PREDICTION_RANK',ascending=True).head(50)

Unnamed: 0,team,OUTCOME,PREDICTION_NUMERIC,PREDICTION_RANK
0,Gonzaga,5,3.486,1.0
1,Houston,4,3.544,2.0
2,Baylor,6,4.468,3.0
3,Kansas,1,4.508,4.5
4,Arizona,5,4.508,4.5
13,Iowa,7,4.57,6.0
5,Duke,3,4.602,7.0
12,UCLA,5,4.63,8.0
7,Kentucky,7,4.832,9.0
8,Tennessee,6,4.976,10.0


In [17]:
def rank_to_round(x):
    round_made = pd.Series(np.where(
        x==1,1,
    np.where(
        x==2,2,
    np.where(
        x<=4,3,
    np.where(
        x<=8,4,
    np.where(
        x<=16,5,
    np.where(
        x<=32,6,
    np.where(
        x<=64,7,
    np.where(
        x<=68,8,
    9
    )
    )
    )
    )
    )
    )
    )
    ),index = x.index)
    return round_made

val_result['OUTCOME_ROUND'] = val_result['OUTCOME']
val_result['PREDICTION_ROUND'] = rank_to_round(val_result['PREDICTION_RANK'])

tourney_val = val_result[val_result['OUTCOME']<=8].copy()

correlation = tourney_val[
    ['OUTCOME_ROUND','PREDICTION_ROUND']
    ].corr()['PREDICTION_ROUND'].values[0]

In [18]:
tourney_val

Unnamed: 0,team,OUTCOME,PREDICTION_NUMERIC,PREDICTION_RANK,OUTCOME_ROUND,PREDICTION_ROUND
0,Gonzaga,5,3.486,1.0,5,1
1,Houston,4,3.544,2.0,4,2
2,Baylor,6,4.468,3.0,6,3
3,Kansas,1,4.508,4.5,1,4
4,Arizona,5,4.508,4.5,5,4
...,...,...,...,...,...,...
191,Bryant,8,8.442,90.0,8,9
197,Texas Southern,7,8.784,143.0,7,9
204,Norfolk St,7,7.808,57.0,7,7
209,Wright St,7,8.796,146.0,7,9


In [19]:
correlation

0.49845408806645064

In [20]:
1-sum((tourney_val['OUTCOME_ROUND']-tourney_val['PREDICTION_ROUND'])**2)/sum((tourney_val['OUTCOME_ROUND']-np.mean(tourney_val['OUTCOME_ROUND']))**2)

-0.517616404214339

In [21]:
testyear = 2023

url = f"https://barttorvik.com/trank.php?year={testyear}&type=R"

tables = pd.read_html(url)

table = tables[0]

tmp = featurize(table,process_name=False)

tmp['year']=year


In [24]:
prediction_df = pd.DataFrame({
    'TEAM':tmp['team'],
    'OUTPUT_PREDICTION':model.predict(tmp[features])
})
    

In [25]:
prediction_df.sort_values(by='OUTPUT_PREDICTION',ascending=True).head(50)

Unnamed: 0,TEAM,OUTPUT_PREDICTION
0,Houston,2.962
1,Alabama,4.58
2,UCLA (A) 80 Colorado,4.752
4,Purdue,4.922
7,Arizona,4.95
14,Marquette,5.256
9,San Diego St.,5.304
3,Tennessee,5.314
11,Gonzaga,5.346
8,Kansas,5.432
