# Cleaning both Men and Women

In [1]:
import pandas as pd
season = pd.read_csv("./data/MRegularSeasonDetailedResults.csv")
wseason = pd.read_csv("./data/WRegularSeasonDetailedResults.csv")
season = pd.concat([season, wseason], axis=0)
tournament = pd.read_csv("./data/MNCAATourneyDetailedResults.csv")
wtournament = pd.read_csv("./data/WNCAATourneyDetailedResults.csv")
tournament = pd.concat([tournament, wtournament], axis=0)
conferences = pd.read_csv("./data/MTeamConferences.csv")
wconferences = pd.read_csv("./data/WTeamConferences.csv")
conferences = pd.concat([conferences, wconferences], axis=0)

In [2]:
all_matches = pd.concat([season, tournament], axis=0).sort_values(["Season", "DayNum"]).reset_index(drop=True)
all_matches.tail()
all_matches["LLoc"] = all_matches.WLoc
all_matches = all_matches.replace({"LLoc":{"H":"A", "A":"H", "N":"N"}})
all_matches["category"] = "men"
all_matches.loc[all_matches.WTeamID > 2000, "category"] = "women"
all_matches.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,LLoc,category
0,2003,10,1104,68,1328,62,N,0,27,58,...,22,10,22,8,18,9,2,20,N,men
1,2003,10,1272,70,1393,63,N,0,26,62,...,20,20,25,7,12,8,6,16,N,men
2,2003,11,1266,73,1437,61,N,0,24,58,...,23,31,22,9,12,2,5,23,N,men
3,2003,11,1296,56,1457,50,N,0,18,38,...,15,17,20,9,19,4,3,23,N,men
4,2003,11,1400,77,1208,71,N,0,30,61,...,27,21,15,12,10,7,1,14,N,men


In [3]:
cols = ["Season", "first_id", "second_id"]
all_matches["first_id"] = all_matches[['WTeamID','LTeamID']].min(axis=1)
all_matches["second_id"] = all_matches[['WTeamID','LTeamID']].max(axis=1)
all_matches["prob"] = 0
all_matches.loc[all_matches.first_id == all_matches.WTeamID, "prob"] = 1
all_matches["game_id"] = all_matches[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
all_matches.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LTO,LStl,LBlk,LPF,LLoc,category,first_id,second_id,prob,game_id
0,2003,10,1104,68,1328,62,N,0,27,58,...,18,9,2,20,N,men,1104,1328,1,2003_1104_1328
1,2003,10,1272,70,1393,63,N,0,26,62,...,12,8,6,16,N,men,1272,1393,1,2003_1272_1393
2,2003,11,1266,73,1437,61,N,0,24,58,...,12,2,5,23,N,men,1266,1437,1,2003_1266_1437
3,2003,11,1296,56,1457,50,N,0,18,38,...,19,4,3,23,N,men,1296,1457,1,2003_1296_1457
4,2003,11,1400,77,1208,71,N,0,30,61,...,10,7,1,14,N,men,1208,1400,0,2003_1208_1400


## Moving to first/second information and averages

In [4]:
adf = all_matches.drop(columns=["first_id", "second_id"])
winning_cols = [c for c in adf.columns if c.startswith("W")]
losing_cols = [c for c in adf.columns if c.startswith("L")]
neutral_cols = [c for c in adf.columns if not(c.startswith("W") or c.startswith("L"))]

#Figure out location for losing team

df_w = adf[neutral_cols+winning_cols+["LScore"]].copy()
df_l = adf[neutral_cols+losing_cols+["WScore"]].copy()

df_w = df_w.rename(columns=lambda x:x[1:] if x.startswith("W") else x)
df_l = df_l.rename(columns=lambda x:x[1:] if x.startswith("L") else x)
df_l = df_l.rename(columns={"WScore":"points_allowed"})
df_w = df_w.rename(columns={"LScore":"points_allowed"})

df_w["result"] = 1
df_l["result"] = 0

df = pd.concat([df_w, df_l], ignore_index=True)
df = df.sort_values(by=["Season", "TeamID", "DayNum"]).reset_index(drop=True)
display(df.isna().sum())
saved_df = df.copy()

Season            0
DayNum            0
NumOT             0
category          0
prob              0
game_id           0
TeamID            0
Score             0
Loc               0
FGM               0
FGA               0
FGM3              0
FGA3              0
FTM               0
FTA               0
OR                0
DR                0
Ast               0
TO                0
Stl               0
Blk               0
PF                0
points_allowed    0
result            0
dtype: int64

In [5]:
df = saved_df.copy()
df = df.sort_values(by=["Season", "TeamID", "DayNum"]).reset_index(drop=True)
stats = ["Score", "FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF", "points_allowed"]
cum_stats_cols = [f"cum_{s}" for s in stats]

for stat in stats:
    df[f"cum_{stat}"] = df.groupby(["Season", "TeamID"])[stat].cumsum().shift(fill_value=0)

df["games_won"] = df.groupby(["Season", "TeamID"])["result"].cumsum().shift(fill_value=0)

df["games_played"] = df.groupby(["Season", "TeamID"]).cumcount()

df["prev_TeamID"] = df["TeamID"].shift(1)
for stat in [*cum_stats_cols, "games_played", "games_won"]:
    df.loc[df["TeamID"] != df["prev_TeamID"], stat] = 0

df["games_lost"] = df["games_played"] - df["games_won"]

df["win_percentage"] = df["games_won"]/df["games_played"]

df = df.drop(columns=["prev_TeamID"])

df = pd.merge(df, conferences, how="left", left_on=["Season", "TeamID"], right_on=["Season", "TeamID"])

In [6]:
print(df.columns)
df.head()
cum_stats = df.drop(columns=stats)

averages = ["cum_Score", "cum_OR", "cum_DR", "cum_Ast", "cum_TO", "cum_Stl", "cum_Blk", "cum_PF", "cum_points_allowed"]
percentages = [("cum_FGM", "cum_FGA", "FG%"), ("cum_FGM3", "cum_FGA3", "FG3%"), ("cum_FTM", "cum_FTA", "FT%")]

print(cum_stats[cum_stats.games_played == 0].shape)

cum_stats = cum_stats[cum_stats.games_played != 0]

for col in averages:
    colname = "avg_" + col[4:]
    cum_stats[colname] = cum_stats[col] / cum_stats["games_played"]

for make, attempt, new_col in percentages:
    cum_stats[new_col] = cum_stats[make] / cum_stats[attempt]

averages = cum_stats.drop(columns=cum_stats_cols)

Index(['Season', 'DayNum', 'NumOT', 'category', 'prob', 'game_id', 'TeamID',
       'Score', 'Loc', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
       'Ast', 'TO', 'Stl', 'Blk', 'PF', 'points_allowed', 'result',
       'cum_Score', 'cum_FGM', 'cum_FGA', 'cum_FGM3', 'cum_FGA3', 'cum_FTM',
       'cum_FTA', 'cum_OR', 'cum_DR', 'cum_Ast', 'cum_TO', 'cum_Stl',
       'cum_Blk', 'cum_PF', 'cum_points_allowed', 'games_won', 'games_played',
       'games_lost', 'win_percentage', 'ConfAbbrev'],
      dtype='object')
(13583, 29)


## Pull this back to all_matches to have better historical data that we can calculate

In [7]:
unimportant_cols = ["prob", "NumOT", "result", "Loc", "category"]

averages_to_merge = averages.drop(columns=unimportant_cols)

In [8]:
safe = ['Season', 'DayNum', 'first_id', 'second_id', 'prob', 'game_id', 'NumOT', "category"]
fdf = all_matches[safe]
fdf.head()

df_merged = pd.merge(fdf, averages_to_merge, how="left", left_on=["Season", "DayNum", "game_id", "first_id"], right_on=["Season", "DayNum", "game_id", "TeamID"])
df_merged = df_merged.drop(columns=["TeamID"])
for col in df_merged.columns:
    if col not in safe:
        df_merged[f"first_{col}"] = df_merged[col]
        df_merged = df_merged.drop(columns=[col])

df_merged = pd.merge(df_merged, averages_to_merge, how="left", left_on=["Season", "DayNum", "game_id", "second_id"], right_on=["Season", "DayNum", "game_id", "TeamID"])
df_merged.drop(columns=["TeamID"])
for col in df_merged.columns:
    if col in safe or col.startswith("first_"):
        continue
    df_merged[f"second_{col}"] = df_merged[col]
    df_merged = df_merged.drop(columns=[col])

nfg_df = df_merged.dropna()
copy_dd = nfg_df.copy()
copy_dd.head()

Unnamed: 0,Season,DayNum,first_id,second_id,prob,game_id,NumOT,category,first_games_won,first_games_played,...,second_avg_DR,second_avg_Ast,second_avg_TO,second_avg_Stl,second_avg_Blk,second_avg_PF,second_avg_points_allowed,second_FG%,second_FG3%,second_FT%
7,2003,12,1186,1457,1,2003_1186_1457,0,men,0.0,1.0,...,20.0,9.0,19.0,4.0,3.0,23.0,56.0,0.367347,0.272727,0.533333
9,2003,12,1296,1458,0,2003_1296_1458,0,men,1.0,1.0,...,24.0,12.0,9.0,9.0,3.0,18.0,55.0,0.45614,0.5,0.851852
14,2003,14,1125,1135,1,2003_1125_1135,1,men,0.0,1.0,...,21.0,17.0,18.0,8.0,4.0,13.0,66.0,0.428571,0.315789,0.647059
15,2003,14,1156,1236,1,2003_1156_1236,0,men,0.0,1.0,...,21.0,11.0,30.0,10.0,4.0,28.0,80.0,0.463415,0.266667,0.714286
16,2003,14,1161,1194,1,2003_1161_1194,0,men,1.0,1.0,...,22.0,9.0,17.0,9.0,2.0,23.0,66.0,0.482759,0.454545,0.555556


# Modeling

## Cleaned to remove first games, so very beginner model

In [9]:
not_needed = ["Season", "first_id", "second_TeamID", "second_id", "game_id", "NumOT", "DayNum"]

nfg_df = copy_dd.copy()

nfg_df = nfg_df.drop(columns=not_needed)

display(nfg_df.head())
display(nfg_df.columns)

X = nfg_df.drop(columns=["prob"])
y = nfg_df["prob"]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
from xgboost import XGBClassifier

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=.2)

# Identify categorical columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# ColumnTransformer to apply OneHotEncoder only to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features)
    ],
    remainder="passthrough"  # Keep non-categorical columns as they are
)


Unnamed: 0,prob,category,first_games_won,first_games_played,first_games_lost,first_win_percentage,first_ConfAbbrev,first_avg_Score,first_avg_OR,first_avg_DR,...,second_avg_DR,second_avg_Ast,second_avg_TO,second_avg_Stl,second_avg_Blk,second_avg_PF,second_avg_points_allowed,second_FG%,second_FG3%,second_FT%
7,1,men,0.0,1.0,1.0,0.0,big_sky,55.0,6.0,22.0,...,20.0,9.0,19.0,4.0,3.0,23.0,56.0,0.367347,0.272727,0.533333
9,0,men,1.0,1.0,0.0,1.0,mac,56.0,6.0,19.0,...,24.0,12.0,9.0,9.0,3.0,18.0,55.0,0.45614,0.5,0.851852
14,1,men,0.0,1.0,1.0,0.0,a_sun,48.0,14.0,26.0,...,21.0,17.0,18.0,8.0,4.0,13.0,66.0,0.428571,0.315789,0.647059
15,1,men,0.0,1.0,1.0,0.0,horizon,66.0,13.0,26.0,...,21.0,11.0,30.0,10.0,4.0,28.0,80.0,0.463415,0.266667,0.714286
16,1,men,1.0,1.0,0.0,1.0,mwc,80.0,13.0,18.0,...,22.0,9.0,17.0,9.0,2.0,23.0,66.0,0.482759,0.454545,0.555556


Index(['prob', 'category', 'first_games_won', 'first_games_played',
       'first_games_lost', 'first_win_percentage', 'first_ConfAbbrev',
       'first_avg_Score', 'first_avg_OR', 'first_avg_DR', 'first_avg_Ast',
       'first_avg_TO', 'first_avg_Stl', 'first_avg_Blk', 'first_avg_PF',
       'first_avg_points_allowed', 'first_FG%', 'first_FG3%', 'first_FT%',
       'second_games_won', 'second_games_played', 'second_games_lost',
       'second_win_percentage', 'second_ConfAbbrev', 'second_avg_Score',
       'second_avg_OR', 'second_avg_DR', 'second_avg_Ast', 'second_avg_TO',
       'second_avg_Stl', 'second_avg_Blk', 'second_avg_PF',
       'second_avg_points_allowed', 'second_FG%', 'second_FG3%', 'second_FT%'],
      dtype='object')

In [10]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

param_grid = {
    'n_estimators': Integer(10, 1000),
    'max_depth': Integer(3, 25),
    'learning_rate': Real(0.01, 0.5, prior='log-uniform'),
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.5, 1.0)
}

classifier = XGBClassifier(device="cuda")

grid_search = BayesSearchCV(classifier, param_grid, scoring="neg_brier_score", cv=5, verbose=3, n_iter=25)

pipe = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", grid_search)  
])

pipe.fit(train_X, train_y)

preds = pipe.predict_proba(test_X)[:,1]

print("Testing set score: ", brier_score_loss(test_y, preds))
print("Best set of hyperparameters: ", pipe.named_steps["classifier"].best_params_)
print("Best score: ", -pipe.named_steps["classifier"].best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV 1/5] END colsample_bytree=0.5369647833578984, learning_rate=0.11457442192929342, max_depth=7, n_estimators=988, subsample=0.7804716737752281;, score=-0.198 total time=  24.6s
[CV 2/5] END colsample_bytree=0.5369647833578984, learning_rate=0.11457442192929342, max_depth=7, n_estimators=988, subsample=0.7804716737752281;, score=-0.198 total time=  17.9s
[CV 3/5] END colsample_bytree=0.5369647833578984, learning_rate=0.11457442192929342, max_depth=7, n_estimators=988, subsample=0.7804716737752281;, score=-0.200 total time=  21.5s
[CV 4/5] END colsample_bytree=0.5369647833578984, learning_rate=0.11457442192929342, max_depth=7, n_estimators=988, subsample=0.7804716737752281;, score=-0.199 total time=  18.2s
[CV 5/5] END colsample_bytree=0.5369647833578984, learning_rate=0.11457442192929342, max_depth=7, n_estimators=988, subsample=0.7804716737752281;, score=-0.199 total time=  21.3s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END colsample_bytree=0.70234700517733

In [22]:
import json
from datetime import datetime

with open(f"./model_params/{datetime.isoformat(datetime.now())}.json", "w") as f:
    temp = pipe.named_steps["classifier"].best_params_
    temp["score"] = -pipe.named_steps["classifier"].best_score_
    json.dump(temp, f)

## Trying with GPU

## Final Model

In [12]:
# params = pd.read_csv("./model_params/params.csv")
# #We want the best score, which is the smallest
# params.sort_values(by=["score"], inplace=True, ascending=True)
# params.drop(columns=["score"], inplace=True)
# best_params = params.iloc[0].to_dict()
# def convert_to_int_if_possible(d):
#     for k, v in d.items():
#         if isinstance(v, float) and v.is_integer():
#             d[k] = int(v)
#     return d
# best_params = convert_to_int_if_possible(best_params)
# best_params

In [13]:
# # Identify categorical columns
# categorical_features = X.select_dtypes(include=['object', 'category']).columns

# # ColumnTransformer to apply OneHotEncoder only to categorical columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features)
#     ],
#     remainder="passthrough"  # Keep non-categorical columns as they are
# )

# model = XGBClassifier(**best_params)

# # Pipeline with OneHotEncoder and XGBoost
# pipe = Pipeline([
#     ("preprocessing", preprocessor),
#     ("classifier", model)  # Add your hyperparameters here
# ])

# pipe.fit(train_X, train_y)

# preds = pipe.predict_proba(test_X)[:,1]

# onehot_columns = pipe.named_steps['preprocessing'].named_transformers_['onehot'].get_feature_names_out(categorical_features)
# all_columns = list(onehot_columns) + list(X.select_dtypes(exclude=['object', 'category']).columns)

# # Map feature importance to feature names
# feature_important = model.get_booster().get_score(importance_type='weight')
# feature_importance = {all_columns[int(k[1:])]: v for k, v in feature_important.items()}

# keys = list(feature_importance.keys())
# values = list(feature_importance.values())

# data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)
# display(data.head(25)) #Display the 25 most important features

# Predictions

## Making Predictions

In [23]:
import pandas as pd
submission = pd.read_csv("./data/SampleSubmissionStage2.csv")
teams = pd.DataFrame(submission["ID"].str.split("_").to_list())
cols = ["Season", "first_team_id", "second_team_id"]
teams.columns = cols
teams = pd.concat([teams, submission], axis=1)
for c in cols:
    teams[c] = teams[c].astype("int64")
teams["category"]  = "mens"
teams.loc[teams.first_team_id > 2001 , "category"]  = "women"
teams.tail()

Unnamed: 0,Season,first_team_id,second_team_id,ID,Pred,category
131402,2025,3477,3479,2025_3477_3479,0.5,women
131403,2025,3477,3480,2025_3477_3480,0.5,women
131404,2025,3478,3479,2025_3478_3479,0.5,women
131405,2025,3478,3480,2025_3478_3480,0.5,women
131406,2025,3479,3480,2025_3479_3480,0.5,women


In [24]:
averages.head()
temp_avgs = averages.drop(columns=["NumOT", "prob", "Loc", "result"])
data2025 = temp_avgs[temp_avgs.Season == 2025]
last_info = data2025.loc[data2025.groupby(["TeamID"])["DayNum"].idxmax()]
last_info = last_info.drop(columns=["category"])

In [25]:
d = pd.merge(teams, last_info, how="left", left_on="first_team_id", right_on="TeamID", suffixes=("", "_first"))
d=d.drop(columns=["Season_first", "DayNum", "TeamID", "game_id"])
d.columns = [f"first_{col}" if col not in teams.columns else col for col in d.columns ]
d.head()
d = pd.merge(d, last_info, how="left", left_on="second_team_id", right_on="TeamID", suffixes=("", "_second"))
d = d.drop(columns=["Season_second", "DayNum", "game_id", "TeamID"])
d.columns = [f"second_{col}" if col in last_info.columns else col for col in d.columns ]
d = d.drop(columns=["second_Season", "first_team_id", "second_team_id"])
d.head()

Unnamed: 0,ID,Pred,category,first_games_won,first_games_played,first_games_lost,first_win_percentage,first_ConfAbbrev,first_avg_Score,first_avg_OR,...,second_avg_DR,second_avg_Ast,second_avg_TO,second_avg_Stl,second_avg_Blk,second_avg_PF,second_avg_points_allowed,second_FG%,second_FG3%,second_FT%
0,2025_1101_1102,0.5,mens,11,25,14,0.44,wac,67.36,8.12,...,20.758621,13.37931,12.137931,5.689655,2.931034,17.724138,73.172414,0.424676,0.336705,0.640301
1,2025_1101_1103,0.5,mens,11,25,14,0.44,wac,67.36,8.12,...,25.518519,17.740741,12.185185,7.296296,3.703704,18.62963,75.592593,0.464675,0.355975,0.741573
2,2025_1101_1104,0.5,mens,11,25,14,0.44,wac,67.36,8.12,...,28.428571,16.964286,12.535714,6.0,4.607143,18.857143,80.142857,0.485187,0.350236,0.722071
3,2025_1101_1105,0.5,mens,11,25,14,0.44,wac,67.36,8.12,...,20.44,12.72,15.4,8.36,3.8,22.16,80.48,0.390707,0.30721,0.672213
4,2025_1101_1106,0.5,mens,11,25,14,0.44,wac,67.36,8.12,...,22.074074,11.037037,8.592593,7.296296,2.814815,18.703704,75.222222,0.398942,0.319728,0.69084


In [26]:
preds = pipe.predict_proba(d)[:,1]

teams["Pred"] = preds

teams.head()

copy_teams = teams.copy()

In [27]:
submission = teams[["ID", "Pred"]]


from datetime import datetime

submission.to_csv(f"./submissions/{datetime.isoformat(datetime.now())}.csv", index=False)

## Adding in Team Names from the actual brackets

In [28]:
teams = copy_teams.copy()

tms = pd.read_csv("./data/MTeams.csv")
tms["category"] = "mens"
wteams = pd.read_csv("./data/WTeams.csv")
wteams["category"] = "women"


tms = pd.concat([tms, wteams], axis=0)
cpy_tms = tms.copy()
tms = tms[["TeamID", "TeamName"]]

teams = teams.merge(tms, how="left", left_on="first_team_id", right_on="TeamID")
teams["first_team_name"] = teams.TeamName
teams = teams.drop(columns=["TeamName", "TeamID"])

teams = teams.merge(tms, how="left", left_on="second_team_id", right_on="TeamID")
display(teams.head())
teams["second_team_name"] = teams.TeamName
teams = teams.drop(columns=["TeamName", "TeamID"])

teams.head()

Unnamed: 0,Season,first_team_id,second_team_id,ID,Pred,category,first_team_name,TeamID,TeamName
0,2025,1101,1102,2025_1101_1102,0.724173,mens,Abilene Chr,1102,Air Force
1,2025,1101,1103,2025_1101_1103,0.17273,mens,Abilene Chr,1103,Akron
2,2025,1101,1104,2025_1101_1104,0.113784,mens,Abilene Chr,1104,Alabama
3,2025,1101,1105,2025_1101_1105,0.712637,mens,Abilene Chr,1105,Alabama A&M
4,2025,1101,1106,2025_1101_1106,0.559939,mens,Abilene Chr,1106,Alabama St


Unnamed: 0,Season,first_team_id,second_team_id,ID,Pred,category,first_team_name,second_team_name
0,2025,1101,1102,2025_1101_1102,0.724173,mens,Abilene Chr,Air Force
1,2025,1101,1103,2025_1101_1103,0.17273,mens,Abilene Chr,Akron
2,2025,1101,1104,2025_1101_1104,0.113784,mens,Abilene Chr,Alabama
3,2025,1101,1105,2025_1101_1105,0.712637,mens,Abilene Chr,Alabama A&M
4,2025,1101,1106,2025_1101_1106,0.559939,mens,Abilene Chr,Alabama St


In [29]:
import re

mgames = """
1 Auburn vs. 16 Alabama State/St. Francis PA
8 Louisville vs. 9 Creighton
5 Michigan vs. 12 UC San Diego
4 Texas A&M vs. 13 Yale
6 Ole Miss vs. 11 SDSU/North Carolina
3 Iowa State vs. 14 Lipscomb
7 Marquette vs. 10 New Mexico
2 Michigan State vs. 15 Bryant
1 Florida vs. 16 Norfolk State
8 UConn vs. 9 Oklahoma
5 Memphis vs. 12 Colorado State
4 Maryland vs. 13 Grand Canyon
6 Missouri vs. 11 Drake
3 Texas Tech vs. 14 UNC Wilmington
7 Kansas vs. 10 Arkansas
2 St. John’s vs. 15 Omaha
1 Duke vs. 16 American/Mount St. Mary’s
8 Mississippi State vs. 9 Baylor
5 Oregon vs. 12 Liberty
4 Arizona vs. 13 Akron
6 BYU vs. 11 VCU
3 Wisconsin vs. 14 Montana
7 St. Mary’s vs. 10 Vanderbilt
2 Alabama vs. 15 Robert Morris
1 Houston vs. 16 SIUE
8 Gonzaga vs. 9 Georgia
5 Clemson vs. 12 McNeese
4 Purdue vs. 13 High Point
6 Illinois vs. 11 Texas/Xavier
3 Kentucky vs. 14 Troy
7 UCLA vs. 10 Utah State
2 Tennessee vs. 15 Wofford
"""

matchups = mgames.split("\n")[1:-1]
m = [m.split(" vs. ") for m in matchups]
mbrck = [re.sub(r'^\d+\s+', '', team) for pair in m for team in pair]



mmap = {
    "Ole Miss":"Mississippi",
    "Iowa State":"Iowa St",
    "Michigan State":"Michigan St",
    "Norfolk State":"Norfolk St",
    "UConn":"Connecticut",
    "Colorado State":"Colorado St",
    "St. John’s":"St John's",
    "Omaha":"NE Omaha",
    "Mississippi State":"Mississippi St",
    "St. Mary’s":"St Mary's CA",
    "McNeese":"McNeese St",
    "Utah State":"Utah St",
    "SDSU":"South Dakota",
    "Mount St. Mary’s":"Mt St Mary's",
    "St. Francis PA":"St Francis PA",
    "Alabama State":"Alabama St",
    "American":"American Univ"}

wgames = """
No. 1 UCLA vs. No. 16 UC San Diego/Southern
No. 8 Richmond vs. No. 9 Georgia Tech
No. 4 Baylor vs. No. 13 Grand Canyon
No. 5 Ole Miss vs. No. 12 Ball State
No. 3 LSU vs. No. 14 San Diego State
No. 6 Florida State vs. No. 11 George Mason
No. 2 NC State vs. No. 15 Vermont
No. 7 Michigan State vs. No. 10 Harvard
No. 1 USC vs. No. 16 UNC Greensboro
No. 8 California vs. No. 9 Mississippi State
No. 4 Kentucky vs. No. 13 Liberty
No. 5 Kansas State vs. No. 12 Fairfield
No. 3 Oklahoma vs. No. 14 Florida Gulf Coast
No. 6 Iowa vs. No. 11 Murray State
No. 2 UConn vs. No. 15 Arkansas State
No. 7 Oklahoma State vs. No. 10 South Dakota State
No. 1 South Carolina vs. No. 16 Tennessee Tech
No. 8 Utah vs. No. 9 Indiana
No. 4 Maryland vs. No. 13 Norfolk State
No. 5 Alabama vs. No. 12 Green Bay
No. 3 North Carolina vs. No. 14 Oregon State
No. 6 West Virginia vs. No. 11 Columbia/Washington
No. 2 Duke vs. No. 15 Lehigh
No. 7 Vanderbilt vs. No. 10 Oregon
No. 1 Texas vs. No. 16 High Point/William & Mary
No. 8 Illinois vs. No. 9 Creighton
No. 4 Ohio State vs. No. 13 Montana State
No. 5 Tennessee vs. No. 12 South Florida
No. 3 Notre Dame vs. No. 14 Stephen F. Austin
No. 6 Michigan vs. No. 11 Iowa State/Princeton
No. 2 TCU vs. No. 15 Fairleigh Dickinson
No. 7 Louisville vs. No. 10 Nebraska
"""

matchups = wgames.split("\n")[1:-1]
m = [m.split(" vs. ") for m in matchups]
wbrck = [re.sub(r'^(No\.\s*)?\d+\s+', '', team).strip() for pair in m for team in pair]

cpy_tms[cpy_tms.category == "womens"]

wmap = {
    "Ole Miss":"Mississippi",
    "Ball State":"Ball St",
    "San Diego State":"San Diego St",
    "Florida State":"Florida St",
    "Michigan State":"Michigan St",
    "Mississippi State":"Mississippi St",
    "Kansas State":"Kansas St",
    "Florida Gulf Coast":"FGCU",
    "Murray State":"Murray St",
    "UConn":"Connecticut",
    "Arkansas State":"Arkansas St",
    "Oklahoma State":"Oklahoma St",
    "South Dakota State":"South Dakota",
    "Norfolk State":"Norfolk St",
    "Green Bay":"WI Green Bay",
    "Oregon State":"Oregon St",
    "Ohio State":"Ohio St",
    "Montana State":"Montana St",
    "Stephen F. Austin":"SF Austin",
    "Fairleigh Dickinson":"F Dickinson",
    "Iowa State":"Iowa St",
    "Southern":"Southern Univ"
}

In [30]:
def get_tm_id(cat, tm_name, cpy_tms):
    cpy_tms = cpy_tms[["category", "TeamName", "TeamID"]]
    mtms = cpy_tms[cpy_tms["category"] == cat]
    return mtms.loc[mtms.TeamName == tm_name, "TeamID"].iloc[0]

def get_result(round, team_df, map, cat, team_names):
    round = [
        [map.get(round[0], round[0]), 0], 
        [map.get(round[1], round[1]), 0]
    ]
    round[0][1] = get_tm_id(cat, round[0][0], team_names)
    round[1][1] = get_tm_id(cat, round[1][0], team_names)
    round.sort(key=lambda x:x[1])
    first = round[0][0]
    second = round[1][0]
    pred = team_df.loc[(team_df.first_team_name == first) & (team_df.second_team_name == second), "Pred"]
    # print(pred)
    pred = pred.iloc[0]
    print(first if pred > .5 else  second, "beat", second if pred > .5 else first, "with a prediction of", pred if pred > .5 else 1-pred)
    return first if pred > .5 else second


def team_prediction(bracket, cat, df, map, team_names):
    # One round for play ins:
    team_df = df[df.category == cat]
    next_round = []
    for tm in bracket:
        winner = tm
        if "/" in tm:
            winner = get_result(tm.split("/"), team_df, map, cat, team_names)
        next_round.append(winner)
    bracket = next_round

    #Loop through since they are all 0 when %2
    while len(bracket) != 1:
        next_round = []
        while len(bracket) != 0:
            curr_round = []
            for i in range(2):
                curr_round.append(bracket.pop(0))
            winner = get_result(curr_round, team_df, map, cat, team_names)
            next_round.append(winner)
        bracket = next_round

print("MENS BRACKET PREDICTIONS")
team_prediction(mbrck, "mens", teams, mmap, cpy_tms)
print("WOMENS BRACKET PREDICTIONS")
team_prediction(wbrck, "women", teams, wmap, cpy_tms)

MENS BRACKET PREDICTIONS
Alabama St beat St Francis PA with a prediction of 0.57993364
North Carolina beat South Dakota with a prediction of 0.7788093
American Univ beat Mt St Mary's with a prediction of 0.50358266
Xavier beat Texas with a prediction of 0.5242547
Auburn beat Alabama St with a prediction of 0.9574302
Louisville beat Creighton with a prediction of 0.5921706
Michigan beat UC San Diego with a prediction of 0.5298722
Yale beat Texas A&M with a prediction of 0.58564204
Mississippi beat North Carolina with a prediction of 0.5704852
Iowa St beat Lipscomb with a prediction of 0.6801576
Marquette beat New Mexico with a prediction of 0.5002581
Michigan St beat Bryant with a prediction of 0.8614514
Florida beat Norfolk St with a prediction of 0.84622407
Connecticut beat Oklahoma with a prediction of 0.7361547
Memphis beat Colorado St with a prediction of 0.5427419
Maryland beat Grand Canyon with a prediction of 0.7673852
Missouri beat Drake with a prediction of 0.60302055
Texas Te