In [1]:
import numpy as np
import os
import pandas as pd

from os.path import join
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

pd.set_option('display.max_columns', None)

In [2]:
# load data - sourced from (https://www.kaggle.com/datasets/nishaanamin/march-madness-data?resource=download)
cwd = os.getcwd()
data_dir = join(cwd, "data", "2024")

matchups = pd.read_csv(join(data_dir, "Tournament Matchups.csv")) # seeding and achieved round for each year
results = pd.read_csv(join(data_dir, "Team Results.csv")) # tournament round appearance history
shooting = pd.read_csv(join(data_dir, "Shooting Splits.csv"))
coach = pd.read_csv(join(data_dir, "Coach Results.csv"))

In [3]:
# join datasets
index_cols = ["TEAM NO", "TEAM", "YEAR"]
data = pd.merge(
    matchups.drop(["CURRENT ROUND", "SCORE"], axis=1).drop_duplicates(index_cols),
    shooting,
    on=index_cols
)

# drop non numeric columns
non_numeric_columns = ["TEAM", "TEAM ID", "CONF"]
data = data.drop(non_numeric_columns, axis=1)

# drop 
non_predictive_cols = ["BY YEAR NO", "BY ROUND NO"]
data = data.drop(non_predictive_cols, axis=1)

data = data[data["YEAR"] < 2024]

In [4]:
# set up training/test split
selection_vector = data["YEAR"] % 4 == 0
train = data[~selection_vector]
test = data[selection_vector]

train_y = train["ROUND"]
train_x = train.drop("ROUND", axis=1)
test_y = train["ROUND"]
test_x = train.drop("ROUND", axis=1)

### basic regression model

In [5]:
train_x

Unnamed: 0,YEAR,BY YEAR NO,BY ROUND NO,TEAM NO,SEED,DUNKS FG%,DUNKS SHARE,DUNKS FG%D,DUNKS D SHARE,CLOSE TWOS FG%,CLOSE TWOS SHARE,CLOSE TWOS FG%D,CLOSE TWOS D SHARE,FARTHER TWOS FG%,FARTHER TWOS SHARE,FARTHER TWOS FG%D,FARTHER TWOS D SHARE,THREES FG%,THREES SHARE,THREES FG%D,THREES D SHARE,DUNKS FG% RANK,DUNKS SHARE RANK,DUNKS FG%D RANK,DUNKS D SHARE RANK,CLOSE TWOS FG% RANK,CLOSE TWOS SHARE RANK,CLOSE TWOS FG%D RANK,CLOSE TWOS D SHARE RANK,FARTHER TWOS FG% RANK,FARTHER TWOS SHARE RANK,FARTHER TWOS FG%D RANK,FARTHER TWOS D SHARE RANK,THREES FG% RANK,THREES SHARE RANK,THREES FG%D RANK,THREES D SHARE RANK
68,2023,1888,1888,1011,1,88.7,13.0,85.2,5.3,60.7,38.9,49.7,32.0,34.6,13.9,33.4,38.1,33.5,47.2,28.3,30.0,160,5,92,145,115,95,3,50,310,361,24,363,198,9,4,10
69,2023,1887,1887,955,16,90.0,1.6,97.6,4.1,56.5,41.5,62.6,34.0,34.1,24.0,42.3,24.5,36.4,34.5,33.8,41.5,123,355,362,50,261,35,317,101,321,249,324,118,59,273,170,312
70,2023,1886,1886,979,8,80.2,6.7,86.7,5.8,59.9,36.4,55.9,37.0,40.8,26.8,37.3,31.1,32.8,36.9,32.3,31.9,330,116,137,191,127,175,75,221,81,175,135,328,245,198,82,31
71,2023,1885,1885,945,9,89.7,6.3,92.8,6.9,60.3,33.3,58.5,39.5,40.7,30.5,38.8,25.4,35.0,36.3,34.0,35.1,132,139,315,282,121,262,163,297,84,85,203,148,122,218,184,111
72,2023,1884,1884,961,5,87.5,7.8,75.0,5.4,61.5,31.3,56.8,34.7,36.7,34.8,38.9,25.3,34.0,33.9,28.3,40.0,200,78,5,156,96,315,110,125,243,20,209,144,175,284,4,280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,2010,319,1063,152,14,97.3,3.6,89.2,3.4,63.5,35.5,63.1,33.6,35.0,24.5,37.4,33.1,37.6,40.1,33.3,33.3,31,206,151,65,98,135,228,167,175,321,275,173,45,33,129,209
894,2010,318,1062,155,7,86.4,3.6,91.7,2.7,62.2,29.8,59.0,30.4,39.1,34.0,31.8,38.4,35.5,36.2,30.1,31.2,243,206,229,26,128,268,129,93,35,175,35,290,111,82,21,123
895,2010,317,1061,153,10,91.2,2.9,86.8,2.6,59.8,37.2,54.2,42.8,41.7,26.7,36.8,33.8,40.3,36.1,29.7,23.4,155,248,87,22,192,83,28,334,8,305,263,194,7,83,10,3
896,2010,316,1060,138,2,95.2,4.8,89.6,4.2,60.4,37.1,58.2,28.0,37.0,28.5,35.0,33.4,37.3,34.4,33.5,38.6,54,127,160,167,169,85,102,50,83,282,181,185,51,124,140,326


In [6]:
# fit model
model = LinearRegression()
model.fit(train_x, train_y)
pred = model.predict(test_x)

# Calculate the model performance
mse = mean_squared_error(test_y, pred)
r2 = r2_score(test_y, pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')

Mean Squared Error: 319.7268004364447
R² Score: 0.3837292775760208


### advanced ML model

In [7]:
# fit model

### Neural net model

In [8]:
# fit model