In [270]:
import pandas as pd
import sys
import numpy as np

from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [271]:
# Notebook-wide pandas display options so that tables render without truncation.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', -1)  # No column width truncation
pd.set_option('display.max_colwidth', 1000)  # Show full content of each column

# Display dataframe in Jupyter with horizontal scrolling.
# `display` and `HTML` are imported from the public IPython.display module:
# importing `display` from IPython.core.display is deprecated and is no longer
# available on recent IPython releases.
from IPython.display import display, HTML

In [272]:
# Build the training frame from one CSV per season.
# Frames are collected in a list and concatenated once at the end; growing `df`
# with pd.concat inside the loop re-copies all earlier rows on every pass and
# starts from an empty DataFrame, which pandas warns about when inferring dtypes.
season_frames = []
for year in ['2021', '2022', '2023', '2024']:
    file = year + '_men.csv'
    year_data = pd.read_csv(file)
    # Keep only the rows where berth == 1
    year_data = year_data.loc[year_data['berth'] == 1]
    # Remove extra index column (first column of the CSV)
    year_data = year_data.iloc[:, 1:]

    # Optional: remove rows with certain NaNs (e.g. didn't play in the 2020-2021 season)
    #year_data = year_data.dropna(subset=['Rank_SOS_NET'])

    season_frames.append(year_data)

# Original per-file row labels are kept (no ignore_index), as before.
df = pd.concat(season_frames)

# No WAB values exist before 2025; fill the missing entries with 0 as a default
df['wab'] = df['wab'].fillna(0)

print(df.shape)
display(HTML(df.to_html(notebook=True)))

(272, 23)


Unnamed: 0,Team,NET Rank,SOS Rank,Non-Conf SOS Rank,kpi,sor,wab,bpi,kenpom,trank,q1_wins,q1_losses,q2_wins,q2_losses,q3_wins,q3_losses,q4_wins,q4_losses,berth,seed,overall_seed,autobid,year
0,Gonzaga,1,79,46,1,1,0.0,1,1,1,12,1,6,0,6,0,7,0,1,1,1,1,2021
1,Baylor,2,50,211,2,3,0.0,2,4,2,13,2,2,0,8,0,5,0,1,1,2,0,2021
2,Michigan,3,19,261,3,4,0.0,6,2,4,10,4,6,1,4,0,3,0,1,1,4,0,2021
3,Illinois,4,7,36,5,2,0.0,4,3,3,12,6,5,1,5,0,2,0,1,1,3,1,2021
4,Houston,5,133,157,9,13,0.0,3,6,6,4,2,7,1,11,1,5,0,1,2,8,1,2021
5,USC,6,75,136,14,15,0.0,16,14,27,6,5,6,3,10,0,3,0,1,6,21,0,2021
6,Iowa,7,15,165,25,7,0.0,5,5,5,8,7,5,2,4,0,5,0,1,2,7,0,2021
7,Alabama,8,33,108,4,5,0.0,10,8,9,9,5,9,1,8,1,0,0,1,2,5,1,2021
8,Colorado,9,78,200,23,26,0.0,18,17,24,3,5,8,1,9,3,3,0,1,5,20,0,2021
9,Loyola-Chicago,10,143,102,29,40,0.0,21,9,37,4,2,4,3,4,0,12,0,1,8,30,1,2021


In [273]:
# Columns at positions 1..17 of the training frame are the model inputs;
# the regression target is the committee's overall seed.
feature_columns = list(df.columns[1:18])
print(feature_columns)

X = df.loc[:, feature_columns]
y = df.loc[:, 'overall_seed']

# Disabled experiment: standardize the features, then reduce them with PCA.
#scaler = StandardScaler()
#scaler.fit(X)
#scaled_X = scaler.transform(X)

#pca = PCA(n_components=8)
#pca_X = pca.fit_transform(scaled_X)
#print(pca_X)
#pca_X.shape

['NET Rank', 'SOS Rank', 'Non-Conf SOS Rank', 'kpi', 'sor', 'wab', 'bpi', 'kenpom', 'trank', 'q1_wins', 'q1_losses', 'q2_wins', 'q2_losses', 'q3_wins', 'q3_losses', 'q4_wins', 'q4_losses']


In [274]:
# Ordinary least-squares fit of overall seed on the selected features.
reg = LinearRegression()
reg.fit(X, y)
# Disabled alternative: ridge-regularized fit.
#reg_1 = Ridge(alpha=100).fit(X, y)

In [275]:
#reg = LinearRegression().fit(pca_X, y)
#reg_1 = Ridge(alpha=100).fit(pca_X, y)

In [276]:
# WAB is unavailable for the training seasons (it was filled with 0 when the
# data was loaded), so the fitted WAB coefficient is not learned from real
# values. Replace it with the average of the other resume-metric coefficients
# (KPI and SOR).
# Positions are looked up by column name rather than hard-coded, so the
# override stays correct if feature_columns is reordered or resliced.
kpi_pos = feature_columns.index('kpi')
sor_pos = feature_columns.index('sor')
wab_pos = feature_columns.index('wab')
reg.coef_[wab_pos] = 0.5 * (reg.coef_[kpi_pos] + reg.coef_[sor_pos])
#reg_1.coef_[wab_pos] = 0.5 * (reg_1.coef_[kpi_pos] + reg_1.coef_[sor_pos])

reg.coef_

array([-0.03725836,  0.05105249,  0.00661117, -0.01104781,  0.05585554,
        0.02240387,  0.08035262,  0.04895403,  0.02626276, -1.8238066 ,
        1.43022199, -1.16890806,  1.90787367,  0.26436158,  0.49015272,
       -0.06995636, -1.17514756])

In [277]:
# Input for the season being predicted.
# Alternative source (previously assigned and immediately overwritten): '2025_men.csv'
file = 'field_predictions.csv'

# Number of leading rows to keep -- presumably the 68-team tournament field;
# this relies on the file listing the field first (TODO confirm).
FIELD_SIZE = 68

current_year = pd.read_csv(file)
# Remove extra index column (first column of the CSV)
current_year = current_year.iloc[:, 1:]

# Take an explicit copy of the slice so that columns added to `current_year`
# later do not raise SettingWithCopyWarning or write into a view.
current_year = current_year.iloc[:FIELD_SIZE, :].copy()

# WAB is present in the current-year file, so the 0-fill applied to the
# training seasons is left disabled here.
#current_year['wab'] = current_year['wab'].fillna(0)
display(HTML(current_year.to_html(notebook=True)))

Unnamed: 0,Team,NET Rank,SOS Rank,Non-Conf SOS Rank,kpi,sor,wab,bpi,kenpom,trank,q1_wins,q1_losses,q2_wins,q2_losses,q3_wins,q3_losses,q4_wins,q4_losses,berth,seed,overall_seed,autobid,year,predicted_proba,predictions
0,Duke,1,57,12,6,4,6,1,1,2,9,3,7,0,10,0,5,0,0,0,0,1,2025,1.0,1
1,Lipscomb,84,253,98,104,115,92,106,83,102,0,2,1,2,6,3,16,2,0,0,0,1,2025,1.0,1
2,UC San Diego,35,197,226,40,46,45,57,36,44,2,1,2,1,9,2,15,0,0,0,0,1,2025,1.0,1
3,Colorado State,47,84,139,44,49,47,53,42,38,3,5,7,2,4,2,11,0,0,0,0,1,2025,1.0,1
4,Memphis,50,90,4,14,14,23,46,52,61,6,1,5,2,10,2,7,0,0,0,0,1,2025,1.0,1
5,Drake,56,200,258,33,32,41,73,58,58,2,0,4,0,11,3,11,0,0,0,0,1,2025,1.0,1
6,McNeese,58,217,44,77,64,59,66,59,63,0,2,1,2,6,2,18,0,0,0,0,1,2025,1.0,1
7,Liberty,60,222,294,49,62,60,85,62,56,0,0,3,2,14,4,8,0,0,0,0,1,2025,1.0,1
8,Yale,72,263,216,74,75,77,67,74,79,0,1,2,1,6,3,13,2,0,0,0,1,2025,1.0,1
9,High Point,82,354,356,69,67,66,81,84,85,0,0,1,0,8,3,18,2,0,0,0,1,2025,1.0,1


In [278]:
# Score the current-year teams using the same feature columns the model was fit on.
predicting_values = current_year[feature_columns]
# Disabled experiment: apply the scaler / PCA transforms before predicting.
#scaled_predicting = scaler.transform(predicting_values)
#pca_predicting = pca.transform(scaled_predicting)

raw_seed_scores = reg.predict(predicting_values)
current_year["raw_predicted_seed"] = raw_seed_scores
# Disabled alternatives: ridge model and PCA-based inputs.
#current_year["raw_predicted_seed_1"] = reg_1.predict(predicting_values)
#current_year["raw_predicted_seed"] = reg.predict(pca_predicting)
#current_year["raw_predicted_seed_1"] = reg_1.predict(pca_predicting)

In [279]:
# Order teams from lowest to highest raw prediction. reset_index() keeps the
# previous row labels in an 'index' column and renumbers the rows from 0.
sorted_seeds = current_year.sort_values("raw_predicted_seed")
sorted_seeds = sorted_seeds.reset_index()
# Shift the row labels so they start at 1 instead of 0.
sorted_seeds.index = sorted_seeds.index + 1
display(HTML(sorted_seeds.to_html(notebook=True)))
# Disabled: rank column for the ridge model's predictions.
#sorted_seeds["reg_1_predicted"] = sorted_seeds["raw_predicted_seed_1"].rank()


Unnamed: 0,index,Team,NET Rank,SOS Rank,Non-Conf SOS Rank,kpi,sor,wab,bpi,kenpom,trank,q1_wins,q1_losses,q2_wins,q2_losses,q3_wins,q3_losses,q4_wins,q4_losses,berth,seed,overall_seed,autobid,year,predicted_proba,predictions,raw_predicted_seed
1,30,Auburn,2,2,8,1,1,1,3,4,3,16,5,6,0,2,0,4,0,0,0,0,0,2025,1.0,1,-8.03293
2,22,Houston,3,24,74,5,2,2,2,3,1,14,3,6,1,5,0,5,0,0,0,0,1,2025,1.0,1,-3.240289
3,23,Florida,4,23,262,3,3,3,4,2,4,11,4,9,0,4,0,6,0,0,0,0,1,2025,1.0,1,-0.64208
4,32,Alabama,6,1,9,2,6,4,6,6,5,11,8,8,0,4,0,2,0,0,0,0,0,2025,0.999999,1,4.241047
5,31,Michigan State,11,30,135,7,8,7,11,8,13,13,5,5,1,5,0,4,0,0,0,0,0,2025,1.0,1,4.802509
6,0,Duke,1,57,12,6,4,6,1,1,2,9,3,7,0,10,0,5,0,0,0,0,1,2025,1.0,1,5.511407
7,34,Tennessee,5,8,129,4,5,5,5,5,6,11,7,5,0,5,0,6,0,0,0,0,0,2025,0.999995,1,7.33146
8,25,Saint John's,13,65,167,10,7,8,12,12,15,6,4,11,0,7,0,6,0,0,0,0,1,2025,1.0,1,9.792398
9,37,Wisconsin,14,31,114,15,9,10,18,11,12,8,8,10,1,2,0,6,0,0,0,0,0,2025,0.99999,1,11.952865
10,35,Iowa State,9,36,94,23,13,18,7,10,8,10,7,6,2,3,0,5,0,0,0,0,0,2025,0.999994,1,13.378572


In [280]:
# Reduce the ranked frame to the team names only (the 1-based row label is the
# predicted overall position), show it, and write it to disk.
sorted_seeds = sorted_seeds.loc[:, ['Team']]
team_table_html = sorted_seeds.to_html(notebook=True)
display(HTML(team_table_html))
sorted_seeds.to_csv("overall_seed_predictions.csv")

Unnamed: 0,Team
1,Auburn
2,Houston
3,Florida
4,Alabama
5,Michigan State
6,Duke
7,Tennessee
8,Saint John's
9,Wisconsin
10,Iowa State
