In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
pd.set_option("display.max_columns",75)

In [2]:
## FEATURE FUNCS

def strip_and_make_0_float(x):
    """Return the first whitespace-separated token of each string as a float.

    Parameters
    ----------
    x : pandas.Series
        String values such as ``"121.0 1"``; only the part before the first
        run of whitespace is kept (the trailing token looks like a rank in
        the displayed tables — not verified here).

    Returns
    -------
    pandas.Series
        Float series on the same index as ``x``.
    """
    # expand=True yields one column per token; column 0 is the leading token.
    tokens = x.str.split(expand=True)
    leading_token = tokens[0]
    return leading_token.astype(float)

def featurize(table,process_name=True):
    """Turn a raw 24-column ratings table into a numeric feature frame.

    The caller's frame is left untouched; a renamed, filtered copy is returned.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame whose columns are, positionally, the ones named in
        ``column_names`` below. Existing column labels are discarded.
    process_name : bool, default True
        If true, ``team`` is split on its first comma into a cleaned team
        name (letters and spaces only, " seed" removed) and a ``tourney``
        label, and ``OUTCOME`` is added by mapping that label to a number
        (rows without a comma get ``'no tourney'`` / 128). If false, ``team``
        is kept as-is and neither ``tourney`` nor ``OUTCOME`` is created.

    Returns
    -------
    pandas.DataFrame
        Copy of the non-header rows with added ``w``, ``l``, ``win_perc``,
        float-converted stat columns, and one 0/1 indicator column per entry
        in ``conferences``. A ``conf`` value that is not in that list (the
        displayed data contains e.g. ``'P10'`` and ``'ind'``) gets 0 in every
        indicator column.

    Raises
    ------
    ValueError
        If ``table`` does not have exactly ``len(column_names)`` columns.
    """
    column_names = [
        'rk','team','conf','g','rec','adjoe','adjde','barthag','efg_pct','efgd_pct',
        'tor','tord','orb','drb','ftr','ftrd','2p_pct','2pd_pct','3p_pct','3pd_pct',
        '3pr','3prd','adj_t','wab'
    ]

    if table.shape[1] != len(column_names):
        raise ValueError(
            f"Length mismatch: expected {len(column_names)} columns, "
            f"got {table.shape[1]}"
        )

    # Drop rows whose first cell is the literal 'Rk' (header rows repeated in
    # the body). Filtering by position and copying *before* renaming keeps the
    # caller's DataFrame and its column labels unmodified.
    table = table.loc[table.iloc[:, 0] != 'Rk'].copy()
    table.columns = column_names

    # 'rec' looks like "30–3 13–3": keep the first token and split it on the
    # en dash into wins and losses.
    overall_record = table['rec'].str.split(expand=True)[0]
    wins_losses = overall_record.str.split("–",expand=True).astype(int)
    table['w'] = wins_losses[0]
    table['l'] = wins_losses[1]
    table['win_perc'] = table['w']/(table['w']+table['l'])

    if process_name:
        # n=1 splits on the first comma only, and reindex guarantees that both
        # columns exist even when no row contains a comma at all.
        name_parts = (
            table['team']
            .str.split(",",n=1,expand=True)
            .reindex(columns=[0, 1])
        )
        table['team'] = name_parts[0].str.replace(" seed","",regex=False).apply(
            lambda name: ''.join(ch for ch in name if ch.isalpha() or ch == " ")
        ).str.strip()
        # astype(object) so the string fill value never has to upcast a
        # float column made entirely of NaN.
        table['tourney'] = (
            name_parts[1].astype(object).fillna('no tourney').str.strip()
        )

        tourney_values = {
            'no tourney':128,
            'R68':68,
            'R64':64,
            'R32':32,
            'Sweet Sixteen':16,
            'Elite Eight':8,
            'Final Four':4,
            'Finals':2,
            'CHAMPS':1,
        }

        table['OUTCOME'] = table['tourney'].map(tourney_values)

    # Every column after 'rec' ('adjoe' .. 'wab') holds "<value> <token>"
    # strings; keep the leading value as a float.
    for stat_column in column_names[5:]:
        table[stat_column] = strip_and_make_0_float(table[stat_column])

    conferences = ['WCC', 'Amer', 'B12', 'ACC', 'SEC', 'BE', 'P12', 'B10', 'MWC',
        'MVC', 'A10', 'OVC', 'CUSA', 'AE', 'SC', 'WAC', 'Sum', 'CAA',
        'MAAC', 'MAC', 'Ivy', 'ASun', 'Pat', 'SB', 'BW', 'BSth', 'BSky',
        'NEC', 'Horz', 'SWAC', 'MEAC', 'Slnd']

    # One 0/1 indicator column per listed conference.
    for c in conferences:
        table[c] = np.where(table['conf']==c,1,0)

    return table

In [3]:
# Seasons to scrape: 2008-2019 and 2021-2024 (2020 is not in the list).
years = [*range(2008, 2020), *range(2021, 2025)]

# Collect one featurized frame per season and concatenate once at the end.
# This avoids re-copying the growing frame on every iteration and avoids
# concatenating onto an empty DataFrame.
season_frames = []
for year in years:

    url = f"https://barttorvik.com/trank.php?year={year}&type=R"

    tables = pd.read_html(url)

    # The first table on the page is the one that gets featurized.
    table = tables[0]

    tmp = featurize(table)

    tmp['year']=year

    season_frames.append(tmp)

    print(year)

# Each season keeps its own row index, exactly as the incremental concat did.
out = pd.concat(season_frames)

ValueError: Length mismatch: Expected axis has 24 elements, new values have 22 elements

In [4]:

# Re-download the ratings page for the current value of `year`
# (assigned by an earlier cell) and keep the first table found.
url = f"https://barttorvik.com/trank.php?year={year}&type=R"
tables = pd.read_html(url)
table = tables[0]

# Replace the scraped headers with short lowercase names, positionally.
table.columns = [
    'rk', 'team', 'conf', 'g', 'rec',
    'adjoe', 'adjde', 'barthag',
    'efg_pct', 'efgd_pct',
    'tor', 'tord',
    'orb', 'drb',
    'ftr', 'ftrd',
    '2p_pct', '2pd_pct',
    '3p_pct', '3pd_pct',
    '3pr', '3prd',
    'adj_t', 'wab',
]


In [5]:
# Show the fetched table for inspection.
table

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,D-I Avg:,101.4,101.4,0.493,Eff. FG% 49.8,Eff. FG% 49.8,Turnover% 21.2,Turnover% 21.2,Rebound% 32.8,Rebound% 32.8,FT Rate 36.4,FT Rate 36.4,2-Pt % 48.4,2-Pt % 48.4,3-Pt % 35.1,3-Pt % 35.1,3P Rate 34.4,3P Rate 34.4,67.3,Unnamed: 23_level_0
Unnamed: 0_level_1,Rk,Team,Conf,G,Rec,AdjOE,AdjDE,Barthag,EFG%,EFGD%,TOR,TORD,ORB,DRB,FTR,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,Adj T.,WAB
0,1,"Kansas 1 seed, CHAMPS",B12,33,30–3 13–3,121.0 1,85.6 3,.9816 1,56.3 4,44.8 18,18.7 41,22.9 70,38.0 17,29.0 34,37.5 140,30.8 56,54.8 7,40.9 4,39.9 13,34.0 106,29.2 287,38.1 283,69.5 91,+9.9 3
1,2,"Memphis 1 seed, Finals",CUSA,34,33–1 16–0,113.3 24,83.8 1,.9697 2,53.0 47,42.5 4,17.2 11,23.3 57,37.8 23,29.3 39,38.6 113,32.7 90,53.3 22,41.6 6,35.0 170,29.8 6,36.7 108,28.7 26,70.7 46,+8.9 5
2,3,"UCLA 1 seed, Final Four",P10,33,30–3 16–2,116.4 7,86.9 4,.9664 3,52.3 74,48.0 82,18.8 44,22.7 80,39.2 13,25.9 3,38.0 130,25.7 12,52.9 29,46.8 103,33.8 216,34.2 116,28.1 303,27.9 17,66.2 207,+10.8 2
3,4,"Wisconsin 3 seed, Sweet Sixteen",B10,33,29–4 16–2,112.2 33,84.9 2,.9609 4,50.6 142,43.1 6,19.4 61,22.2 105,36.2 55,28.5 26,39.3 94,25.4 9,49.2 132,41.3 5,35.9 128,31.1 24,31.1 243,32.8 117,63.5 299,+8.3 8
4,5,"Duke 2 seed, R32",ACC,32,27–5 13–3,117.2 6,88.8 9,.9607 5,54.1 28,47.6 69,18.2 28,24.9 19,34.3 111,33.5 202,40.4 65,32.0 81,51.8 54,47.1 113,38.4 40,32.8 55,39.1 63,24.8 1,73.7 13,+8.9 6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,338,Grambling St.,SWAC,24,7–17 7–11,89.3 324,115.1 340,.0510 338,46.6 287,48.2 94,23.4 281,16.4 337,32.9 163,33.0 171,31.4 289,42.2 282,45.1 286,47.7 134,33.4 232,32.9 60,28.6 295,28.4 23,70.3 68,-14.1 279
351,339,North Carolina Central,ind,26,3–23 0–0,81.7 340,106.2 260,.0466 339,41.5 338,53.9 311,29.1 341,21.0 171,27.8 306,39.1 338,31.8 279,39.3 223,40.0 337,54.3 323,29.7 330,35.5 189,32.3 212,35.6 217,71.6 31,-13.8 275
352,340,Maryland Eastern Shore,MEAC,32,4–28 2–14,84.4 337,112.6 332,.0351 340,39.4 341,53.4 302,22.2 231,22.6 85,34.5 103,39.3 339,37.8 134,40.7 256,39.2 340,51.4 279,26.6 341,38.3 311,28.4 298,33.8 151,69.8 79,-22.9 340
353,341,NJIT,ind,29,0–29 0–0,76.6 341,107.8 287,.0194 341,40.2 340,53.7 307,27.9 339,20.4 204,30.7 249,32.9 167,40.1 79,45.8 319,39.0 341,50.7 256,28.4 336,40.1 333,32.3 212,31.4 75,71.1 41,-25.2 341


In [4]:
# Show the combined multi-season feature frame.
out

Unnamed: 0,rk,team,conf,g,rec,adjoe,adjde,barthag,efg_pct,efgd_pct,tor,tord,orb,drb,ftr,ftrd,2p_pct,2pd_pct,3p_pct,3pd_pct,adj_t,wab,w,l,win_perc,tourney,OUTCOME,WCC,Amer,B12,ACC,SEC,BE,P12,B10,MWC,MVC,A10,OVC,CUSA,AE,SC,WAC,Sum,CAA,MAAC,MAC,Ivy,ASun,Pat,SB,BW,BSth,BSky,NEC,Horz,SWAC,MEAC,Slnd,year
0,1,Kansas,B12,33,30–3 13–3,121.0,85.6,0.9816,56.3,44.8,18.7,22.9,38.0,29.0,37.5,30.8,54.8,40.9,39.9,34.0,69.5,9.9,30,3,0.909091,CHAMPS,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
1,2,Memphis,CUSA,34,33–1 16–0,113.3,83.8,0.9697,53.0,42.5,17.2,23.3,37.8,29.3,38.6,32.7,53.3,41.6,35.0,29.8,70.7,8.9,33,1,0.970588,Finals,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
2,3,UCLA,P10,33,30–3 16–2,116.4,86.9,0.9664,52.3,48.0,18.8,22.7,39.2,25.9,38.0,25.7,52.9,46.8,33.8,34.2,66.2,10.8,30,3,0.909091,Final Four,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
3,4,Wisconsin,B10,33,29–4 16–2,112.2,84.9,0.9609,50.6,43.1,19.4,22.2,36.2,28.5,39.3,25.4,49.2,41.3,35.9,31.1,63.5,8.3,29,4,0.878788,Sweet Sixteen,16,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
4,5,Duke,ACC,32,27–5 13–3,117.2,88.8,0.9607,54.1,47.6,18.2,24.9,34.3,33.5,40.4,32.0,51.8,47.1,38.4,32.8,73.7,8.9,27,5,0.843750,R32,32,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,354,Maine,AE,26,3–23 3–15,91.3,113.6,0.0745,46.9,54.9,19.3,18.3,24.0,30.4,20.4,32.8,47.0,53.3,31.0,38.6,67.1,-18.3,3,23,0.115385,no tourney,128,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022
367,355,Mississippi Valley St,SWAC,28,2–26 2–16,92.7,115.7,0.0724,44.5,56.7,18.1,20.7,23.4,33.9,31.4,36.5,43.8,55.1,30.5,40.0,71.8,-21.3,2,26,0.071429,no tourney,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2022
368,356,Delaware St,MEAC,25,0–25 0–14,85.8,110.7,0.0509,45.0,51.1,25.6,19.3,27.0,34.7,32.4,34.2,43.4,48.6,32.2,36.1,69.1,-21.8,0,25,0.000000,no tourney,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2022
369,357,Eastern Illinois,OVC,29,3–26 3–15,82.5,107.8,0.0444,44.2,53.4,24.1,18.2,20.6,32.6,31.2,24.2,42.3,55.8,31.8,33.5,65.9,-20.3,3,26,0.103448,no tourney,128,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022


In [5]:
# Seasons used for fitting (2008-2019 plus 2021); 2022 is held out.
training_years = [*range(2008, 2020), 2021]
validation_years = [2022]

# Boolean row selectors over `out`, one per split.
training_mask = out['year'].isin(training_years)
validation_mask = out['year'].isin(validation_years)


In [6]:
# Numeric team statistics used as model inputs.
team_stat_features = [
    'adjoe', 'adjde', 'barthag',
    'efg_pct', 'efgd_pct',
    'tor', 'tord',
    'orb', 'drb',
    'ftr', 'ftrd',
    '2p_pct', '2pd_pct',
    '3p_pct', '3pd_pct',
]

# 0/1 conference indicator columns.
conference_features = [
    'WCC', 'Amer', 'B12', 'ACC', 'SEC', 'BE', 'P12', 'B10',
    'MWC', 'MVC', 'A10', 'OVC', 'CUSA', 'AE', 'SC', 'WAC',
    'Sum', 'CAA', 'MAAC', 'MAC', 'Ivy', 'ASun', 'Pat', 'SB',
    'BW', 'BSth', 'BSky', 'NEC', 'Horz', 'SWAC', 'MEAC', 'Slnd',
]

# Final column order: stats, then win percentage, then conference indicators.
features = team_stat_features + ['win_perc'] + conference_features
target = 'OUTCOME'

In [7]:
# Training inputs and target, restricted to the training-season rows.
train_rows = out.loc[training_mask]
X = train_rows[features].copy()
y = train_rows[target].copy()

In [8]:
# Two-stage pipeline: min-max scaling followed by a 500-tree random forest
# with a fixed seed. max_depth is left at its default (a max_depth=8 option
# exists only as a commented-out setting).
scaler_step = ('scaler', MinMaxScaler())
learner_step = (
    'learner',
    RandomForestRegressor(n_estimators=500, random_state=50),
)
model = Pipeline(steps=[scaler_step, learner_step])

In [9]:
# Fit the pipeline on the training inputs X and target y.
model.fit(X,y)

In [10]:
# In-sample predictions on the training inputs.
yfit = model.predict(X)

In [11]:
# Held-out rows: inputs, true target, and the model's predictions for them.
validation_rows = out.loc[validation_mask]
Xx = validation_rows[features].copy()
yy = validation_rows[target].copy()
yval = model.predict(Xx)

In [12]:
# Side-by-side view of actual outcome and prediction for the validation rows.
validation_teams = out.loc[validation_mask, 'team']
val_result = pd.DataFrame({'team': validation_teams, 'OUTCOME': yy})
val_result['PREDICTION_NUMERIC'] = yval

# Rank 1 is the smallest predicted value.
predicted = val_result['PREDICTION_NUMERIC']
val_result['PREDICTION_RANK'] = predicted.rank(ascending=True)

In [13]:
# Show the 50 validation rows with the lowest prediction rank.
val_result.sort_values(by='PREDICTION_RANK',ascending=True).head(50)

Unnamed: 0,team,OUTCOME,PREDICTION_NUMERIC,PREDICTION_RANK
3,Kansas,1,10.674,1.0
0,Gonzaga,16,10.864,2.0
1,Houston,8,11.408,3.0
2,Baylor,32,14.876,4.0
7,Kentucky,64,17.504,5.0
4,Arizona,16,18.58,6.0
5,Duke,4,18.692,7.0
13,Iowa,64,19.974,8.0
9,Villanova,4,21.376,9.0
12,UCLA,16,21.802,10.0


In [14]:
# Season to score with the fitted model.
testyear = 2023

url = f"https://barttorvik.com/trank.php?year={testyear}&type=R"

tables = pd.read_html(url)

table = tables[0]

# process_name=False: team strings are kept as scraped and no
# tourney/OUTCOME columns are derived for this season.
tmp = featurize(table,process_name=False)

# Label the rows with the season that was actually fetched. The previous
# code used `year`, a variable left over from an earlier cell.
tmp['year']=testyear


In [15]:
# Score every test-season row and pair each rating with its team string.
ratings = model.predict(tmp[features])
prediction_df = pd.DataFrame({'team': tmp['team'], 'rating': ratings})

prediction_df

Unnamed: 0,team,rating
0,Houston,10.376
1,Alabama,14.460
2,UCLA (A) 80 Colorado,35.278
3,Tennessee,29.768
4,Purdue,22.216
...,...,...
371,Lamar,128.000
372,IUPUI,128.000
373,Green Bay,128.000
374,Hartford,128.000


In [16]:
# Show the 50 rows with the lowest predicted rating.
prediction_df.sort_values(by='rating',ascending=True).head(50)

Unnamed: 0,team,rating
0,Houston,10.376
1,Alabama,14.46
7,Arizona,19.266
8,Kansas,21.73
4,Purdue,22.216
5,Saint Mary's,26.458
9,San Diego St.,29.23
3,Tennessee,29.768
11,Gonzaga,31.312
14,Marquette,34.864
