In [39]:
#same as ELO.ipynb but for Womens
import pandas as pd
import numpy as np
import math
from sklearn.metrics import brier_score_loss
from collections import defaultdict
import warnings
from pandas.errors import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

In [40]:
tournament = "Womens"
if tournament == "Mens":
    elo = pd.read_csv("allELOSMens.csv")
    postseason = pd.read_csv("march-machine-learning-mania-2024/MNCAATourneyCompactResults.csv")
    seeds = pd.read_csv("march-machine-learning-mania-2024/MNCAATourneySeeds.csv")
else:
    elo = pd.read_csv("allELOSWomens.csv")
    postseason = pd.read_csv("march-machine-learning-mania-2024/WNCAATourneyCompactResults.csv")
    seeds = pd.read_csv("march-machine-learning-mania-2024/WNCAATourneySeeds.csv")
seeds["Seed"] = seeds["Seed"].str.extract('(\d+)').astype(int)

In [41]:
postseason["scorediff"] = postseason["WScore"] - postseason["LScore"]
postseasonseeds = pd.merge(postseason, seeds, left_on = ["WTeamID", "Season"], right_on = ["TeamID", "Season"]).rename(columns={"Season_x": "Season", "Seed": "WSeed"}).drop(columns = ["TeamID"])
postseasonseeds = pd.merge(postseasonseeds, seeds, left_on = ["LTeamID", "Season"], right_on = ["TeamID", "Season"]).rename(columns={"Season_x": "Season", "Seed": "LSeed"}).drop(columns = ["TeamID"])

In [42]:
def get_winning_percentage(winseed, loseseed):
    """
    Gets winning percentage of winseed against loseseed
    get_winning_percentage(winseed, loseseed) = 1 - get_winning_percentage(loseseed, winseed)
    """
    winsseeddf = postseasonseeds[(postseasonseeds["WSeed"] == winseed) & (postseasonseeds["LSeed"] == loseseed)]
    losesseeddf = postseasonseeds[(postseasonseeds["WSeed"] == loseseed) & (postseasonseeds["LSeed"] == winseed)]
    wins = winsseeddf.shape[0]
    loses = losesseeddf.shape[0]
    diff = winseed - loseseed
    if wins + loses == 0:
        if diff > 0: #loseseed higher
            return get_winning_percentage(winseed, loseseed + 1)
        else: #winseed higher
            return get_winning_percentage(winseed + 1, loseseed)
    if wins + loses >= 1:
        if wins / (wins + loses) == 1:
            return (wins / (wins + loses)) - (0.1 / get_num(winseed, loseseed))
        elif wins / (wins + loses) == 0:
            return (wins / (wins + loses)) + (0.1 / get_num(winseed, loseseed))
        else:
            return (wins / (wins + loses))
def get_num(winseed, loseseed):
    """
    Gets number of matchups between two seeds
    """
    winsseeddf = postseasonseeds[(postseasonseeds["WSeed"] == winseed) & (postseasonseeds["LSeed"] == loseseed)]
    losesseeddf = postseasonseeds[(postseasonseeds["WSeed"] == loseseed) & (postseasonseeds["LSeed"] == winseed)]
    wins = winsseeddf.shape[0]
    loses = losesseeddf.shape[0]
    return wins + loses
def expected_score(eloa, elob, isHome, home_advantage):
    """
    Gets the expected probabilty of winning for team A against team B
    eloa: the ELO of team A
    elob: the ELO of team B
    isHome: number which describes whether A is home(1), neutral(0), or away(-1)
    home_advantage: home advantage quantified in added winning odds
    """
    #from eloa pov
    expecta = 1/(1 + 10**((elob - eloa)/400)) #calculates expected
    homeadd = expecta + home_advantage
    awayloss = expecta - home_advantage
    if homeadd > 1: #if more than 1 or less than 0, set to 1 and 0, respectively(edge case)
        homeadd = 1
    if awayloss < 0:
        awayloss = 0
    
    if isHome == 1:
        return homeadd
    elif isHome == 0:
        return expecta
    else:
        return awayloss

In [43]:
postseason = postseason[["Season", "WTeamID", "LTeamID"]]
postseason = pd.merge(postseason, seeds, left_on = ["WTeamID", "Season"], right_on = ["TeamID", "Season"]).rename(columns = {"Seed": "WSeed"})
postseason = pd.merge(postseason, seeds, left_on = ["LTeamID", "Season"], right_on = ["TeamID", "Season"]).rename(columns = {"Seed": "LSeed"}).drop(columns = ["TeamID_x","TeamID_y"])
postseason = pd.merge(postseason, elo, left_on = ["WTeamID", "Season"], right_on = ["Team", "Season"]).rename(columns = {"ELO":"WELO", "VARS":"WVARS"})
postseason = pd.merge(postseason, elo, left_on = ["LTeamID", "Season"], right_on = ["Team", "Season"]).rename(columns = {"ELO":"LELO", "VARS":"LVARS"}).drop(columns = ["Team_x","Team_y","Unnamed: 0_x","Unnamed: 0_y"])


In [44]:
postseason["BasePercent"] = postseason.apply(lambda x: get_winning_percentage(x['WSeed'], x['LSeed']), axis=1)
postseason["NumTimes"] = postseason.apply(lambda x: get_num(x['WSeed'], x['LSeed']), axis=1)
postseason["EloPercent"] = postseason.apply(lambda x: expected_score(x['WELO'], x['LELO'], 0, 0), axis=1)
postseason["BaseVariance"] = postseason["BasePercent"] * (1 - postseason["BasePercent"])  #winner
postseason["BaseWeight"] = postseason["NumTimes"] / postseason["BaseVariance"] #winner
postseason.loc[postseason['BaseWeight'] > 2000, "BaseWeight"] = 2000
postseason["SDDiff"] = (postseason["WVARS"] + postseason["WVARS"]) / 2
postseason["SDDiffNorm"] = 1 / postseason["SDDiff"]
postseason["SDDiffNorm"] = (postseason["SDDiff"] - postseason["SDDiff"].min()) / (postseason["SDDiff"].max() - postseason["SDDiff"].min()) + 1
postseason["BaseWeightNorm"] = (postseason["BaseWeight"] - postseason["BaseWeight"].min()) / (postseason["BaseWeight"].max() - postseason["BaseWeight"].min()) + 1
postseason["RTTMPrediction"] = (postseason["BaseWeightNorm"] * postseason["BasePercent"] + postseason["SDDiffNorm"] * postseason["EloPercent"]) / (postseason["BaseWeightNorm"] + postseason["SDDiffNorm"])
#getting weights and values of model, making a prediction

In [45]:
postseason

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WELO,WVARS,LELO,LVARS,BasePercent,NumTimes,EloPercent,BaseVariance,BaseWeight,SDDiff,SDDiffNorm,BaseWeightNorm,RTTMPrediction
0,1998,3104,3422,2,15,1626.255674,14.287716,1550.419154,15.354211,0.999000,100,0.607437,0.000999,2000.000000,14.287716,1.738767,2.000000,0.816898
1,1998,3104,3417,2,7,1626.255674,14.287716,1574.344839,15.021610,0.787879,66,0.574155,0.167126,394.912088,14.287716,1.738767,1.194233,0.661177
2,1998,3112,3365,3,14,1596.114877,7.851472,1587.486950,4.805699,0.999000,100,0.512414,0.000999,2000.000000,7.851472,1.313418,2.000000,0.806120
3,1998,3112,3438,3,6,1596.114877,7.851472,1561.748158,14.446901,0.621212,66,0.549297,0.235308,280.483902,7.851472,1.313418,1.136789,0.582662
4,1998,3163,3193,2,15,1665.684849,3.993901,1550.560387,15.392165,0.999000,100,0.659869,0.000999,2000.000000,3.993901,1.058484,2.000000,0.881633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1578,2023,3405,3235,12,5,1731.947494,4.159656,1849.738741,13.903066,0.220000,100,0.336694,0.171600,582.750583,4.159656,1.069438,1.288529,0.272926
1579,2023,3417,3170,4,13,1848.113951,8.160966,1519.608262,6.531133,0.940000,100,0.868873,0.056400,1773.049645,8.160966,1.333871,1.886069,0.910535
1580,2023,3417,3328,4,5,1848.113951,8.160966,1834.929962,7.485608,0.541667,72,0.518964,0.248264,290.013986,8.160966,1.333871,1.141573,0.529434
1581,2023,3437,3156,4,13,1819.814432,6.015331,1616.268314,5.934049,0.940000,100,0.763453,0.056400,1773.049645,6.015331,1.192073,1.886069,0.871629


In [47]:
postseason["True"] = 1
from sklearn.metrics import brier_score_loss#Mens, Womens
print(brier_score_loss(postseason["True"], postseason["EloPercent"]))#0.1828579283664349, 0.1536095682291515
print(brier_score_loss(postseason["True"], postseason["RTTMPrediction"]))#0.176260171011094, 0.14178299013694134
print(brier_score_loss(postseason["True"], postseason["BasePercent"])) #0.18024775625696968, 0.14412701663242156

0.1536095682291515
0.14178299013694134
0.14412701663242156
