In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from io import StringIO
import requests
from sklearn.preprocessing import MinMaxScaler
import time
pd.set_option("display.max_columns",75)

In [4]:
## FEATURE FUNCS
    

def strip_and_make_0_float(x):
    """Convert a column of scraped cell text to floats using its first token.

    Each value is split on whitespace and only the first token is kept, so a
    cell such as ``"121.1 5"`` becomes ``121.1``.

    Parameters
    ----------
    x : pandas.Series
        Column to convert. String cells are tokenised; a column that already
        has a numeric dtype is simply cast to float.

    Returns
    -------
    pandas.Series
        Float series on the same index as ``x`` (the input's name is kept).
        Missing or blank cells become NaN.

    Raises
    ------
    ValueError
        If the first token of a cell cannot be parsed as a float.
    """
    # A column that was already parsed as numbers has no .str accessor.
    if pd.api.types.is_numeric_dtype(x):
        return x.astype(float)

    # .str[0] (rather than expand=True followed by [0]) also works when the
    # column is empty or every cell is blank, where no column 0 would exist.
    return x.str.split().str[0].astype(float)

def featurize(table,process_name=True):
    """Turn a raw scraped ratings table into a numeric feature table.

    Steps, in order:

    1. Drop rows whose ``rk`` cell is the literal ``'Rk'`` and work on a copy,
       so the caller's frame is not modified.
    2. Parse wins/losses from the first whitespace-separated token of ``rec``
       (an en dash is normalised to ``-``) into ``w``, ``l`` and ``win_perc``.
    3. If ``process_name`` is true, split ``team`` at its first comma into the
       team name and a ``tourney`` label, clean the name (remove ``" seed"``
       and every character that is neither a letter nor a space), and map the
       label to a numeric ``OUTCOME``. Rows without a comma get
       ``'no tourney'``; labels missing from the mapping get NaN.
    4. Convert the rating/stat columns to floats with
       ``strip_and_make_0_float``.
    5. Add one 0/1 indicator column per conference in a fixed list; a ``conf``
       value outside that list yields zeros in every indicator.

    Parameters
    ----------
    table : pandas.DataFrame
        Raw table with at least the columns ``rk``, ``team``, ``conf``,
        ``rec`` and the stat columns listed in the body.
    process_name : bool, default True
        Whether to derive ``tourney``/``OUTCOME`` and clean ``team``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added and converted columns.
    """
    table = table[table['rk']!='Rk'].copy()

    # 'rec' looks like "30–3 13–3"; only the first token (overall record) is used.
    overall_record = table['rec'].str.split(expand=True)[0].str.replace('–','-',regex=False)
    table[['w','l']] = overall_record.str.split("-",expand=True).astype(int)
    table['win_perc'] = table['w']/(table['w']+table['l'])

    if process_name:
        # partition() always yields (before, separator, after), so this works
        # when no team carries a tournament label and when a label itself
        # contains a comma; split(expand=True) produced the wrong number of
        # columns in both cases.
        name_parts = table['team'].str.partition(",")
        # Keep the text after the comma only where a comma was actually found.
        table['tourney'] = name_parts[2].where(name_parts[1] == ",")
        table['team'] = name_parts[0].str.replace(" seed","",regex=False).apply(
            lambda name: ''.join([ch for ch in name if ch.isalpha() or ch == " "])
        ).str.strip()
        table['tourney'] = table['tourney'].fillna('no tourney').str.strip()

        # Smaller value = deeper run; the number is the size of the round reached.
        tourney_values = {
            'no tourney':128,
            'R68':68,
            'R64':64,
            'R32':32,
            'Sweet Sixteen':16,
            'Elite Eight':8,
            'Final Four':4,
            'Finals':2,
            'CHAMPS':1,
        }

        table['OUTCOME'] = table['tourney'].map(tourney_values)

    stat_columns = [
        'adjoe','adjde','barthag','efg_pct','efgd_pct',
        'tor','tord','orb','drb','ftr','ftrd',
        '2p_pct','2pd_pct','3p_pct','3pd_pct',
        '3pr','3prd','adj_t','wab',
    ]
    for col in stat_columns:
        table[col] = strip_and_make_0_float(table[col])

    conferences = ['WCC', 'Amer', 'B12', 'ACC', 'SEC', 'BE', 'P12', 'B10', 'MWC',
        'MVC', 'A10', 'OVC', 'CUSA', 'AE', 'SC', 'WAC', 'Sum', 'CAA',
        'MAAC', 'MAC', 'Ivy', 'ASun', 'Pat', 'SB', 'BW', 'BSth', 'BSky',
        'NEC', 'Horz', 'SWAC', 'MEAC', 'Slnd']

    for c in conferences:
        table[c] = np.where(table['conf']==c,1,0)

    return table

In [5]:
# Seasons to download. 2020 is not in the list (presumably because no
# tournament outcome exists for it — confirm).
years = [
    2008,
    2009,
    2010,
    2011,
    2012,
    2013,
    2014,
    2015,
    2016,
    2017,
    2018,
    2019,
    2021,
    2022,
    2023,
    2024,
    2025
         ]

# Names applied positionally to the first HTML table on each page.
column_names = [
    'rk','team','conf','g','rec','adjoe','adjde','barthag','efg_pct','efgd_pct',
    'tor','tord','orb','drb','ftr','ftrd','2p_pct','2pd_pct','3p_pct','3pd_pct',
    '3pr','3prd','adj_t','wab'
]

# Collect one frame per season and concatenate once at the end, instead of
# re-copying a growing frame on every iteration.
season_frames = []
for year in years:

    url = f"https://barttorvik.com/trank.php?year={year}&type=R"

    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status surfaces an HTTP error here rather than as a confusing
    # parse failure in read_html.
    response = requests.get(url,headers={'User-Agent': 'Mozilla/5.0'},timeout=30)
    response.raise_for_status()

    tables = pd.read_html(StringIO(response.text))

    table = tables[0]
    table.columns = column_names
    # Drop rows whose 'rec' cell is the literal header text.
    table = table[table['rec']!='Rec'].copy()
    tmp = featurize(table)

    tmp['year']=year

    season_frames.append(tmp)

    n_teams = len(tmp)
    n_unlabeled = len(tmp[tmp['OUTCOME'].isna()])

    # Progress line: season, rows kept, rows whose OUTCOME could not be mapped.
    print(year,n_teams,n_unlabeled)
    # Pause between requests.
    time.sleep(1)

# Each season keeps its own row labels, so index values repeat across years.
out = pd.concat(season_frames) if season_frames else pd.DataFrame()

2008 341 0
2009 344 0
2010 347 0
2011 345 0
2012 345 0
2013 347 0
2014 351 0
2015 351 0
2016 351 0
2017 351 0
2018 351 0
2019 353 0
2021 347 0
2022 358 0
2023 363 0
2024 362 0
2025 364 68


In [6]:
# Seasons used for fitting: 2008-2019 plus 2021-2023. 2024 is held out for validation.
training_years = list(range(2008, 2020)) + [2021, 2022, 2023]

# Boolean row selectors over `out`.
training_mask = out['year'].isin(training_years)
validation_mask = out['year'] == 2024


In [7]:
# Numeric rating/stat columns fed to the model, plus the season win percentage.
stat_features = [
    'adjoe', 'adjde', 'barthag',
    'efg_pct', 'efgd_pct',
    'tor', 'tord',
    'orb', 'drb',
    'ftr', 'ftrd',
    '2p_pct', '2pd_pct',
    '3p_pct', '3pd_pct',
    'win_perc',
]

# 0/1 conference indicator columns.
conference_features = [
    'WCC', 'Amer', 'B12', 'ACC', 'SEC', 'BE', 'P12', 'B10',
    'MWC', 'MVC', 'A10', 'OVC', 'CUSA', 'AE', 'SC', 'WAC',
    'Sum', 'CAA', 'MAAC', 'MAC', 'Ivy', 'ASun', 'Pat', 'SB',
    'BW', 'BSth', 'BSky', 'NEC', 'Horz', 'SWAC', 'MEAC', 'Slnd',
]

features = stat_features + conference_features

# Column the model is trained to predict.
target = 'OUTCOME'

In [8]:
# Training design matrix and target, copied so later edits cannot touch `out`.
training_rows = out.loc[training_mask]
X = training_rows[features].copy()
y = training_rows[target].copy()

In [9]:
# Two-step pipeline: min-max scaling, then a 500-tree random forest regressor
# with a fixed seed so refits give the same trees.
# NOTE(review): tree ensembles are generally insensitive to monotonic feature
# scaling, so the scaler probably does not change predictions — confirm before removing.
scaler_step = ('scaler', MinMaxScaler())
learner_step = ('learner', RandomForestRegressor(n_estimators=500, random_state=50))

model = Pipeline([scaler_step, learner_step])

In [10]:
# Fit the scaler and the forest on the training seasons selected into X / y.
model.fit(X,y)

In [11]:
# In-sample predictions on the training rows (not referenced by the later visible cells).
yfit = model.predict(X)

In [12]:
# Held-out season: features, true outcomes, and the model's predictions for them.
validation_rows = out.loc[validation_mask]
Xx = validation_rows[features].copy()
yy = validation_rows[target].copy()
yval = model.predict(Xx)

In [13]:
# Side-by-side view of actual vs. predicted outcome for the validation season.
# A lower predicted value means a deeper predicted run, so rank 1 is the model's favourite.
val_result = pd.DataFrame({
    'team': out.loc[validation_mask, 'team'],
    'OUTCOME': yy,
}).assign(PREDICTION_NUMERIC=yval)
val_result['PREDICTION_RANK'] = val_result['PREDICTION_NUMERIC'].rank()

In [14]:
# Top 50 validation teams by predicted rank (best first).
val_result.sort_values('PREDICTION_RANK').head(50)

Unnamed: 0,team,OUTCOME,PREDICTION_NUMERIC,PREDICTION_RANK
1,Connecticut,1.0,8.266,1.0
5,Iowa St,16.0,12.864,2.0
0,Houston,16.0,13.308,3.0
3,Auburn,64.0,13.314,4.0
4,Arizona,16.0,15.638,5.0
6,Tennessee,8.0,21.83,6.0
2,Purdue,2.0,21.968,7.0
10,Duke,8.0,27.368,8.0
8,Marquette,16.0,27.498,9.0
14,Gonzaga,16.0,27.988,10.0


In [22]:
# Season to score; its rows are copied so columns can be added without touching `out`.
testyear = 2025

is_test_season = out['year'] == testyear
tmp = out.loc[is_test_season].copy()

In [21]:
# Show the combined all-seasons frame (pandas truncates the middle rows in the display).
out

Unnamed: 0,rk,team,conf,g,rec,adjoe,adjde,barthag,efg_pct,efgd_pct,tor,tord,orb,drb,ftr,ftrd,2p_pct,2pd_pct,3p_pct,3pd_pct,3pr,3prd,adj_t,wab,w,l,win_perc,tourney,OUTCOME,WCC,Amer,B12,ACC,SEC,BE,P12,B10,MWC,MVC,A10,OVC,CUSA,AE,SC,WAC,Sum,CAA,MAAC,MAC,Ivy,ASun,Pat,SB,BW,BSth,BSky,NEC,Horz,SWAC,MEAC,Slnd,year
0,1,Kansas,B12,33,30–3 13–3,121.1,85.9,0.9810,56.3,44.8,18.7,22.9,38.0,29.0,37.5,30.8,54.8,40.9,39.9,34.0,29.2,38.1,69.5,9.9,30,3,0.909091,CHAMPS,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
1,2,Memphis,CUSA,34,33–1 16–0,113.4,84.0,0.9692,53.0,42.5,17.2,23.3,37.8,29.3,38.6,32.7,53.3,41.6,35.0,29.8,36.7,28.7,70.7,9.0,33,1,0.970588,Finals,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
2,3,UCLA,P10,33,30–3 16–2,116.4,87.2,0.9653,52.3,48.0,18.8,22.7,39.2,25.9,38.0,25.7,52.9,46.8,33.8,34.2,28.1,27.9,66.2,10.7,30,3,0.909091,Final Four,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
3,4,Wisconsin,B10,33,29–4 16–2,112.3,85.2,0.9599,50.6,43.1,19.4,22.2,36.2,28.5,39.3,25.4,49.2,41.3,35.9,31.1,31.1,32.8,63.5,8.2,29,4,0.878788,Sweet Sixteen,16.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
4,5,Duke,ACC,32,27–5 13–3,117.3,89.0,0.9597,54.1,47.6,18.2,24.9,34.3,33.5,40.4,32.0,51.8,47.1,38.4,32.8,39.1,24.8,73.7,8.9,27,5,0.843750,R32,32.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,360,Chicago St,NEC,32,4–28 4–12,91.8,114.7,0.0719,44.4,54.9,18.5,18.2,22.6,33.6,26.8,36.9,43.4,55.7,30.6,35.8,40.9,39.4,69.3,-21.8,4,28,0.125000,no tourney,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2025
373,361,The Citadel,SC,26,1–25 0–18,93.5,117.1,0.0697,46.9,54.5,19.5,15.2,27.2,32.0,32.8,32.8,49.5,56.3,29.2,34.7,45.2,42.7,65.2,-21.5,1,25,0.038462,no tourney,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025
374,362,Arkansas Pine Bluff,SWAC,29,4–25 3–15,95.9,122.4,0.0573,50.3,56.3,20.4,16.8,26.3,34.6,32.8,40.5,52.6,58.5,31.0,35.7,37.9,44.9,72.3,-20.1,4,25,0.137931,no tourney,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2025
375,363,Coppin St,MEAC,29,5–24 4–10,87.7,112.6,0.0535,44.0,55.6,21.6,20.5,26.5,33.9,36.4,37.1,44.2,56.3,29.0,36.3,32.1,38.8,68.7,-19.0,5,24,0.172414,no tourney,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2025


In [29]:
# Score every team of the test season; a lower rating means a deeper predicted run.
test_ratings = model.predict(tmp[features])

prediction_df = tmp[['team']].copy()
prediction_df['rating'] = test_ratings
prediction_df['rank'] = prediction_df['rating'].rank()
prediction_df

Unnamed: 0,team,rating,rank
0,Houston,10.990,4.0
1,Duke,7.792,1.0
2,Auburn,11.178,5.0
3,Florida,8.512,2.0
4,Alabama,24.482,10.0
...,...,...,...
372,Chicago St,127.808,227.0
373,The Citadel,128.000,307.0
374,Arkansas Pine Bluff,121.856,145.0
375,Coppin St,128.000,307.0


In [30]:
# 50 best-rated teams of the test season, lowest rating first.
prediction_df.sort_values('rating').head(50)

Unnamed: 0,team,rating,rank
1,Duke,7.792,1.0
3,Florida,8.512,2.0
5,Tennessee,10.686,3.0
0,Houston,10.99,4.0
2,Auburn,11.178,5.0
21,Louisville,18.24,6.0
7,Iowa St,18.47,7.0
6,Texas Tech,19.466,8.0
9,Maryland,19.976,9.0
4,Alabama,24.482,10.0


In [37]:
# Attach the model output and its rank to the full test-season frame.
test_predictions = model.predict(tmp[features])
tmp['PP'] = test_predictions
tmp['PP_RANK'] = tmp['PP'].rank()

In [32]:
# Write team / rating / rank to predictions.csv in the working directory, without the index column.
prediction_df.to_csv('predictions.csv',index=False)

In [39]:
# Full feature rows for the 50 teams with the lowest (best) predicted value.
tmp.sort_values('PP').iloc[:50].reset_index(drop=True)

Unnamed: 0,rk,team,conf,g,rec,adjoe,adjde,barthag,efg_pct,efgd_pct,tor,tord,orb,drb,ftr,ftrd,2p_pct,2pd_pct,3p_pct,3pd_pct,3pr,3prd,adj_t,wab,w,l,win_perc,tourney,OUTCOME,WCC,Amer,B12,ACC,SEC,BE,P12,B10,MWC,MVC,A10,OVC,CUSA,AE,SC,WAC,Sum,CAA,MAAC,MAC,Ivy,ASun,Pat,SB,BW,BSth,BSky,NEC,Horz,SWAC,MEAC,Slnd,year,PP,PP_RANK
0,2,Duke,ACC,34,31–3 19–1,127.5,90.7,0.9805,57.4,44.5,14.4,17.7,35.2,26.5,32.1,25.4,58.0,43.4,37.7,30.9,45.4,37.9,66.5,9.6,31,3,0.911765,✅,,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,7.792,1.0
1,4,Florida,SEC,34,30–4 14–4,127.0,94.0,0.9694,55.0,45.3,15.0,17.0,38.1,28.8,32.6,33.0,56.4,45.9,35.5,29.6,43.6,37.3,70.3,11.1,30,4,0.882353,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,8.512,2.0
2,6,Tennessee,SEC,34,27–7 12–6,118.8,90.4,0.9589,52.8,44.4,16.0,17.8,35.4,30.1,35.4,29.4,54.1,46.7,34.0,27.8,43.0,45.1,64.6,9.3,27,7,0.794118,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,10.686,3.0
3,1,Houston,B12,34,30–4 19–1,124.6,87.6,0.983,52.7,44.9,14.6,21.7,36.1,29.3,28.2,34.1,49.0,43.9,39.8,30.9,34.5,43.1,62.2,11.6,30,4,0.882353,✅,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,10.99,4.0
4,3,Auburn,SEC,33,28–5 15–3,129.7,93.4,0.9775,55.7,46.0,13.4,17.4,34.3,30.3,33.5,39.2,56.1,47.2,36.8,29.2,40.6,34.8,68.5,12.5,28,5,0.848485,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,11.178,5.0
5,22,Louisville,ACC,34,27–7 18–2,118.1,95.2,0.9225,53.4,49.2,16.5,18.6,32.2,27.2,34.5,26.8,57.0,48.3,33.0,33.7,48.1,39.7,69.6,5.5,27,7,0.794118,✅,,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,18.24,6.0
6,8,Iowa St,B12,33,24–9 13–7,119.5,92.6,0.9496,54.6,48.4,16.9,21.8,32.2,28.4,38.5,28.9,55.1,47.5,35.8,33.2,36.9,40.7,69.3,5.4,24,9,0.727273,✅,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,18.47,7.0
7,7,Texas Tech,B12,33,25–8 15–5,125.7,96.9,0.9524,55.5,48.0,14.8,17.4,34.3,28.5,29.7,33.5,54.3,48.2,37.9,31.7,44.6,34.1,66.3,5.8,25,8,0.757576,✅,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,19.466,8.0
8,10,Maryland,B10,33,25–8 14–6,118.0,92.4,0.9437,53.8,47.0,14.5,20.2,31.0,27.2,32.9,25.9,52.7,47.6,37.2,30.7,35.5,35.8,70.6,4.6,25,8,0.757576,✅,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,19.976,9.0
9,5,Alabama,SEC,33,25–8 13–5,127.0,96.2,0.9605,56.3,47.9,16.7,13.5,34.7,29.2,40.1,33.9,59.7,48.8,35.0,30.8,46.2,35.1,75.4,9.8,25,8,0.757576,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,24.482,10.0


In [40]:
# Same frame ordered by the 'wab' column, highest first, for comparison with the model's ranking.
tmp.sort_values('wab', ascending=False).iloc[:50].reset_index(drop=True)

Unnamed: 0,rk,team,conf,g,rec,adjoe,adjde,barthag,efg_pct,efgd_pct,tor,tord,orb,drb,ftr,ftrd,2p_pct,2pd_pct,3p_pct,3pd_pct,3pr,3prd,adj_t,wab,w,l,win_perc,tourney,OUTCOME,WCC,Amer,B12,ACC,SEC,BE,P12,B10,MWC,MVC,A10,OVC,CUSA,AE,SC,WAC,Sum,CAA,MAAC,MAC,Ivy,ASun,Pat,SB,BW,BSth,BSky,NEC,Horz,SWAC,MEAC,Slnd,year,PP,PP_RANK
0,3,Auburn,SEC,33,28–5 15–3,129.7,93.4,0.9775,55.7,46.0,13.4,17.4,34.3,30.3,33.5,39.2,56.1,47.2,36.8,29.2,40.6,34.8,68.5,12.5,28,5,0.848485,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,11.178,5.0
1,1,Houston,B12,34,30–4 19–1,124.6,87.6,0.983,52.7,44.9,14.6,21.7,36.1,29.3,28.2,34.1,49.0,43.9,39.8,30.9,34.5,43.1,62.2,11.6,30,4,0.882353,✅,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,10.99,4.0
2,4,Florida,SEC,34,30–4 14–4,127.0,94.0,0.9694,55.0,45.3,15.0,17.0,38.1,28.8,32.6,33.0,56.4,45.9,35.5,29.6,43.6,37.3,70.3,11.1,30,4,0.882353,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,8.512,2.0
3,5,Alabama,SEC,33,25–8 13–5,127.0,96.2,0.9605,56.3,47.9,16.7,13.5,34.7,29.2,40.1,33.9,59.7,48.8,35.0,30.8,46.2,35.1,75.4,9.8,25,8,0.757576,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,24.482,10.0
4,2,Duke,ACC,34,31–3 19–1,127.5,90.7,0.9805,57.4,44.5,14.4,17.7,35.2,26.5,32.1,25.4,58.0,43.4,37.7,30.9,45.4,37.9,66.5,9.6,31,3,0.911765,✅,,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,7.792,1.0
5,6,Tennessee,SEC,34,27–7 12–6,118.8,90.4,0.9589,52.8,44.4,16.0,17.8,35.4,30.1,35.4,29.4,54.1,46.7,34.0,27.8,43.0,45.1,64.6,9.3,27,7,0.794118,✅,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,10.686,3.0
6,14,St Johns,BE,34,30–4 18–2,112.8,89.7,0.9334,49.7,46.6,15.6,21.9,37.3,28.9,30.2,30.4,51.5,45.8,30.4,31.8,30.3,40.8,70.9,7.8,30,4,0.882353,✅,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,25.506,12.0
7,12,Michigan St,B10,33,27–6 17–3,116.9,92.4,0.9369,51.2,46.1,16.6,16.4,35.3,24.6,37.8,33.1,53.6,49.1,30.8,27.9,32.7,41.4,68.6,7.8,27,6,0.818182,✅,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,31.972,16.0
8,26,Michigan,B10,34,25–9 14–6,116.2,94.4,0.9155,54.5,46.3,19.8,16.0,33.2,29.8,35.0,28.5,57.7,45.9,33.4,31.3,42.5,38.9,70.2,6.2,25,9,0.735294,✅,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,36.602,24.0
9,13,Wisconsin,B10,35,26–9 13–7,122.3,96.9,0.9357,53.6,47.9,14.1,14.5,28.0,26.6,33.8,28.1,54.7,46.8,34.9,33.0,48.1,39.1,68.2,5.9,26,9,0.742857,✅,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,33.95,20.0
