In [50]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
pd.set_option("display.max_columns",75)

from xgboost import XGBRegressor

from sklearn.feature_selection import (SelectKBest,VarianceThreshold)
from sklearn.preprocessing import PolynomialFeatures

from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

import joblib


In [51]:
conn = sqlite3.Connection("../artifacts/data/db.sqlite3")

dat = pd.read_sql('select * from FEATURES',
con=conn)

In [52]:
features = ['adjoe', 'adjde', 'barthag',
       'efg_pct', 'efgd_pct', 'tor', 'tord', 'orb', 'drb', 'ftr', 'ftrd',
       '2p_pct', '2pd_pct', '3p_pct', '3pd_pct','3pr', '3prd',
       'win_perc', 
       'WCC', 'Amer', 'B12', 'ACC', 'SEC',
       'BE', 'P12', 'B10', 'MWC', 'MVC', 'A10', 'OVC', 'CUSA', 'AE', 'SC',
       'WAC', 'Sum', 'CAA', 'MAAC', 'MAC', 'Ivy', 'ASun', 'Pat', 'SB', 'BW',
       'BSth', 'BSky', 'NEC', 'Horz', 'SWAC', 'MEAC', 'Slnd'
       ]
target = 'OUTCOME'

In [53]:
training_mask = dat['year'].isin([2008,
    2009,
    2010,
    2011,
    2012,
    2013,
    2014,
    2015,
    2016,
    2017,
    2018,
    2019,
    2021,
    2022,])
validation_mask = dat['year'].isin([2023])
X = dat[training_mask][features].copy()
y = dat[training_mask][target].copy()

In [87]:
model = Pipeline(
    steps=[
    ('poly',PolynomialFeatures(degree=2,interaction_only=False)),
    ('variance_selector',VarianceThreshold(threshold=0)),
    ('selector',SelectKBest(k=10)),
    ('scaler',MinMaxScaler()),
    ('learner',RandomForestRegressor(n_estimators=200,random_state=42,max_depth=5)),
    #('learner',XGBRegressor()),
    #('learner',MLPRegressor(hidden_layer_sizes=(20,20,20),activation='relu',solver='sgd',
    #                        alpha=0,learning_rate='constant',learning_rate_init=1e-5,
    #                        max_iter=10000,verbose=True,random_state=42,
    #                        validation_fraction=0,early_stopping=False,
    #                        shuffle=False,batch_size=len(X),n_iter_no_change=150))
    ]
)

In [88]:
model.fit(X,y)

In [89]:
yfit = model.predict(X)

In [90]:
Xx = dat[validation_mask][features].copy()
yy = dat[validation_mask][target].copy()
yval = model.predict(Xx)

In [91]:
val_result = pd.DataFrame({
    'team':dat[validation_mask]['team'],
    'OUTCOME':yy})
val_result['PREDICTION_NUMERIC'] = yval
val_result['PREDICTION_RANK']=val_result['PREDICTION_NUMERIC'].rank(ascending=True,)

In [92]:

def rank_to_round(x):
    round_made = pd.Series(np.where(
        x<=1,1,
    np.where(
        x<=2,2,
    np.where(
        x<=4,3,
    np.where(
        x<=8,4,
    np.where(
        x<=16,5,
    np.where(
        x<=32,6,
    np.where(
        x<=64,7,
    np.where(
        x<=68,8,
    9
    )
    )
    )
    )
    )
    )
    )
    ),index = x.index)
    return round_made

In [93]:
val_result['OUTCOME_ROUND'] = rank_to_round(val_result['OUTCOME'])
val_result['PREDICTION_ROUND'] = rank_to_round(val_result['PREDICTION_RANK'])


In [94]:
tourney_val = val_result[val_result['OUTCOME']<=68].copy()

In [95]:
correlation = tourney_val[['OUTCOME_ROUND','PREDICTION_ROUND']].corr()['PREDICTION_ROUND'].values[0]

In [96]:
val_result.sort_values(by='OUTCOME',ascending=True).head(32)

Unnamed: 0,team,OUTCOME,PREDICTION_NUMERIC,PREDICTION_RANK,OUTCOME_ROUND,PREDICTION_ROUND
4886,Connecticut,1,22.786105,5.0,1,4
4893,San Diego St,2,26.056828,9.0,2,5
4921,Miami FL,4,53.652237,27.0,3,6
4906,Florida Atlantic,4,50.778975,21.0,3,6
4905,Kansas St,8,53.051704,26.0,4,6
4890,Texas,8,26.148872,11.0,4,5
4891,Gonzaga,8,22.922439,6.0,4,4
4895,Creighton,8,48.172052,19.0,4,6
4882,Houston,16,10.86237,1.0,5,1
4911,Michigan St,16,60.612624,33.0,5,7


In [97]:
correlation

0.47368011639579555

In [98]:
joblib.dump(model,"../artifacts/models/model.joblib")

['../artifacts/models/model.joblib']

In [99]:
testset = dat[dat['year']==2024].copy()
pred = model.predict(testset[features])

In [102]:
test_result = pd.DataFrame(
    {
        'team':testset['team'],
    }
)
test_result['OUTCOME_PREDICTED'] = pred


In [106]:
test_result.to_csv("../artifacts/predictions/result.csv",index=False)