In [56]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import brier_score_loss
from collections import defaultdict
import warnings
from pandas.errors import SettingWithCopyWarning

# Silence these warning categories for the rest of the session. The filters are
# registered in the same order as before: SettingWithCopyWarning, RuntimeWarning,
# FutureWarning.
for _ignored_category in (SettingWithCopyWarning, RuntimeWarning, FutureWarning):
    warnings.simplefilter(action="ignore", category=_ignored_category)

In [57]:
def def_elo():
    """Return the rating (1500) given to a team that has no Elo rating yet."""
    starting_rating = 1500
    return starting_rating


# Multiplier applied to a game's Elo change, keyed by the game's "Tournament" label.
# Both "NIT" and "NIT " (trailing space) are listed so either spelling resolves.
tournamentchange = {
    "NCAA": 1.5,
    "NIT": 1.3,
    "CBI": 1.2,
    "NIT ": 1.3,
    "V16": 1.1,
    "TBC": 1.1,
    "CIT": 1.1,
    "Regular": 1,
}


In [58]:
# Load the compact game-result tables and the team/conference table from the
# march-machine-learning-mania-2024 folder.
regularseason = pd.read_csv("march-machine-learning-mania-2024/MRegularSeasonCompactResults.csv")
postseason = pd.read_csv("march-machine-learning-mania-2024/MNCAATourneyCompactResults.csv")
secondary = pd.read_csv("march-machine-learning-mania-2024/MSecondaryTourneyCompactResults.csv")
conferences = pd.read_csv("march-machine-learning-mania-2024/MTeamConferences.csv")

# Margin of victory: winner's score minus loser's score, added to every results table.
for _results in (regularseason, postseason, secondary):
    _results["scorediff"] = _results["WScore"] - _results["LScore"]


In [59]:
# Tag every game with the competition it belongs to. Secondary tournaments keep
# the label stored in their own "SecondaryTourney" column.
for _games, _label in (
    (postseason, "NCAA"),
    (secondary, secondary["SecondaryTourney"]),
    (regularseason, "Regular"),
):
    _games["Tournament"] = _label

# One frame holding every game of every type.
allgames = pd.concat([postseason, secondary, regularseason])

# Display only: preview the games ordered by their tournament multiplier.
# `allgames` itself keeps the concat order.
_weight_order = allgames["Tournament"].map(tournamentchange).argsort()
allgames.iloc[_weight_order]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,scorediff,Tournament,SecondaryTourney
91540,2006,103,1449,70,1417,67,H,0,3,Regular,
123451,2012,120,1461,71,1395,59,H,0,12,Regular,
123452,2012,120,1464,77,1453,60,H,0,17,Regular,
123453,2012,121,1104,55,1120,49,H,0,6,Regular,
123454,2012,121,1110,57,1119,40,H,0,17,Regular,
...,...,...,...,...,...,...,...,...,...,...,...
1633,2010,143,1139,63,1393,59,N,0,4,NCAA,
1632,2010,139,1462,71,1338,68,N,0,3,NCAA,
1631,2010,139,1452,68,1281,59,N,0,9,NCAA,
1638,2010,144,1181,70,1345,57,N,0,13,NCAA,


In [60]:
def expected_score(eloa, elob, isHome, home_advantage):
    """
    Gets the expected probability of winning for team A against team B
    eloa: the ELO of team A
    elob: the ELO of team B
    isHome: number which describes whether A is home(1), neutral(0), or away(-1);
            any value other than 1 or 0 is treated as away
    home_advantage: home advantage quantified in added winning odds; it is added
            to A's probability at home and subtracted when A is away

    Returns the probability from team A's point of view. For home and away games
    the adjusted value is limited to the range [0, 1] on both sides, so a large or
    negative home_advantage can never produce an invalid probability.
    """
    # Standard Elo logistic expectation, from team A's point of view.
    expecta = 1 / (1 + 10 ** ((elob - eloa) / 400))

    if isHome == 0:
        return expecta

    adjusted = expecta + home_advantage if isHome == 1 else expecta - home_advantage

    # Clamp with explicit comparisons (not min/max) so a NaN input still
    # propagates as NaN instead of being turned into 0.
    if adjusted > 1:
        return 1
    if adjusted < 0:
        return 0
    return adjusted

In [61]:
def elo_change(eloa, elob, scalar, mov, win, expected_score, mov_factor, tournament):
    """
    Compute the Elo change for team A after one result.

    eloa, elob: ratings of team A and team B before the game
    scalar: overall multiplier on the rating change
    mov: margin of victory; must be positive because its base-10 log is taken
    win: 1 if team A won; any other value means team B won
    expected_score: team A's pre-game win probability
    mov_factor: multiplier on the margin-of-victory term
    tournament: key into the module-level `tournamentchange` table

    Returns scalar * margin multiplier * (win - expected_score) * tournament weight.
    """
    # Rating gap from the winner's point of view: a favourite that wins gets a
    # larger denominator, hence a smaller margin multiplier.
    rating_gap = eloa - elob if win == 1 else elob - eloa
    margin_multiplier = (math.log(mov, 10) + 1) * 2.2 * mov_factor / (rating_gap * .001 + 2.2)
    return scalar * margin_multiplier * (win - expected_score) * tournamentchange[tournament]

In [62]:
def clean_Home(WLoc):
    """
    Data cleaning helper: encode the winner's location flag as an integer.

    "H" maps to 1, "N" maps to 0, and every other value maps to -1.
    """
    return 1 if WLoc == "H" else 0 if WLoc == "N" else -1

In [63]:
# Numeric winner-location column (1 home, 0 neutral, -1 otherwise) on each table.
for _games in (regularseason, postseason, allgames):
    _games["WLoc_Num"] = _games["WLoc"].apply(clean_Home)

In [64]:
def set_zero(df, cols):
    """
    Sets new columns to 0

    df: DataFrame to modify in place
    cols: iterable of column names to create (or overwrite)

    The columns are created as floating-point zeros (0.0). They are later filled
    with ratings, probabilities and rating changes; starting from an integer
    column would force pandas to change the dtype on the first float write.
    """
    for col in cols:
        df[col] = 0.0
colchange = ["WinnerElo", "LoserElo", "WinningOdds", "WChange", "LChange"]

In [65]:
# Create the Elo bookkeeping columns listed in `colchange` on every games table.
for _games in (regularseason, postseason, allgames):
    set_zero(_games, colchange)

In [66]:
def simulate_game(elos, df, home_advantage, scalar=None, mov_factor=None):
    """
    Simulates games and changes ELOs based on imports

    elos: mapping of team ID -> current rating; updated in place. Every team in
          df must be resolvable (e.g. a defaultdict).
    df: dataframe containing the games to replay. Reads the columns WTeamID,
        LTeamID, WLoc_Num, scorediff and Tournament, and (over)writes WinnerElo,
        LoserElo, WinningOdds, WChange and LChange in place.
    home_advantage: probability bonus passed on to expected_score
    scalar, mov_factor: parameters passed on to elo_change. When left as None
        the module-level variables of the same names are used.

    Games are replayed in the row order of df, top to bottom, whatever its index
    labels are, so a frame that was sorted without resetting its index is
    replayed in its sorted order.
    """
    # Fall back to the module-level settings when the caller does not pass them.
    if scalar is None:
        scalar = globals()["scalar"]
    if mov_factor is None:
        mov_factor = globals()["mov_factor"]

    n_games = df.shape[0]
    # Pull the inputs out once as arrays: positional access, no index labels.
    winners = df["WTeamID"].to_numpy()
    losers = df["LTeamID"].to_numpy()
    locations = df["WLoc_Num"].to_numpy()
    margins = df["scorediff"].to_numpy()
    tournaments = df["Tournament"].to_numpy()

    winner_elos = np.empty(n_games, dtype=float)
    loser_elos = np.empty(n_games, dtype=float)
    winning_odds = np.empty(n_games, dtype=float)
    winner_changes = np.empty(n_games, dtype=float)

    for pos in range(n_games):
        winner = winners[pos]
        loser = losers[pos]
        winner_elo = elos[winner]
        loser_elo = elos[loser]
        game_expected_score = expected_score(winner_elo, loser_elo, locations[pos], home_advantage)
        change = elo_change(winner_elo, loser_elo, scalar, margins[pos], 1, game_expected_score, mov_factor, tournaments[pos])

        # Zero-sum update: the winner gains exactly what the loser gives up.
        elos[winner] = elos[winner] + change
        elos[loser] = elos[loser] - change

        winner_elos[pos] = float(winner_elo)
        loser_elos[pos] = float(loser_elo)
        winning_odds[pos] = game_expected_score
        winner_changes[pos] = change

    # Whole-column assignment instead of df[col][row] = value: chained
    # assignment is not guaranteed to write through to df.
    df["WinnerElo"] = winner_elos
    df["LoserElo"] = loser_elos
    df["WChange"] = winner_changes
    df["LChange"] = -1 * winner_changes
    df["WinningOdds"] = winning_odds

In [67]:
def get_conference(year, team):
    """
    Look up a team's conference abbreviation for a season.

    Searches the module-level `conferences` table for rows whose Season equals
    `year` and whose TeamID equals `team`, and returns the ConfAbbrev of the
    first such row. Returns the string "none" when no row matches.
    """
    is_match = (conferences["Season"] == year) & (conferences["TeamID"] == team)
    abbreviations = conferences.loc[is_match, "ConfAbbrev"]
    if abbreviations.empty:
        return "none"
    return abbreviations.iloc[0]

In [68]:
import time
def get_variance(iqrs, vars, means, teams, df):
    """
    Summarise the spread of each team's per-game Elo changes.

    For every team in `teams`, collects WChange from the rows of df the team won
    and LChange from the rows it lost, then stores (in place):
      iqrs[team]  - interquartile range (75th minus 25th percentile)
      vars[team]  - standard deviation (pandas Series.std)
      means[team] - median (despite the argument name)
    """
    for team in teams:
        won = df.loc[df["WTeamID"] == team, "WChange"]
        lost = df.loc[df["LTeamID"] == team, "LChange"]
        changes = pd.concat([won, lost], axis=0)
        lower_quartile = changes.quantile(0.25)
        upper_quartile = changes.quantile(0.75)
        iqrs[team] = upper_quartile - lower_quartile
        vars[team] = changes.std()
        means[team] = changes.median()
def apply_variance(iqrs, vars, means, df):
    """
    Assigns variance and iqrs to appropriate df

    iqrs, vars, means: mappings of team ID -> statistic (as filled by get_variance)
    df: games frame with WTeamID and LTeamID columns; modified in place

    Writes WIQR/LIQR, WSD/LSD and WMedian/LMedian with the winner's and loser's
    values for every row. Raises KeyError if a team in df is missing from one of
    the mappings.

    Whole columns are assigned in row order, so the result does not depend on
    df's index labels and does not rely on chained assignment.
    """
    winners = df["WTeamID"].tolist()
    losers = df["LTeamID"].tolist()

    df["WIQR"] = [iqrs[team] for team in winners]
    df["LIQR"] = [iqrs[team] for team in losers]
    df["WSD"] = [vars[team] for team in winners]
    df["LSD"] = [vars[team] for team in losers]
    df["WMedian"] = [means[team] for team in winners]
    df["LMedian"] = [means[team] for team in losers]

In [69]:
# Module-level Elo settings. simulate_game refers to the module-level names
# `scalar` and `mov_factor` when it computes rating changes.
scalar = 10
mov_factor = 1
# NOTE(review): `difference` is not read by any code visible in this notebook -
# confirm it is unused before removing it.
difference = 0.1
from copy import copy
def _regress_to_conference_means(elos, season_year):
    """Move every rating in `elos` a quarter of the way toward its conference's mean rating (in place)."""
    # Teams without a conference row for this season regress toward 1500.
    conferencemeans = {"none": 1500}
    # Iterate over a snapshot of the keys so the mapping is never resized mid-loop.
    for key in list(elos):
        keyconf = get_conference(season_year, key)
        if keyconf not in conferencemeans:
            # Computed the first time any member is met, i.e. before any member
            # of this conference has been regressed.
            in_conference = (conferences["ConfAbbrev"] == keyconf) & (conferences["Season"] == season_year)
            members = conferences.loc[in_conference, "TeamID"].unique()
            # .get() keeps unrated members at the default without inserting keys.
            conferencemeans[keyconf] = np.mean([elos.get(team, def_elo()) for team in members])
        conferencemean = conferencemeans[keyconf]
        elos[key] = elos[key] - (elos[key] - conferencemean) / 4


def _run_elo_seasons(regular, post, all_games):
    """Replay every season found in `regular` and build the eight values test_elo returns."""
    no_tournyelos = defaultdict(def_elo)
    # Defined up front so the return statement works even with zero seasons.
    pretournament = copy(no_tournyelos)
    tournament_frames = []
    wholeseason_frames = []
    season_frames = []
    global_iqrs = {}
    global_vars = {}
    global_means = {}

    for season_year in pd.unique(regular["Season"]):
        season = regular[regular["Season"] == season_year].reset_index()
        tournament = post[post["Season"] == season_year].reset_index()
        wholeseason = all_games[all_games["Season"] == season_year].reset_index()

        # Order games by tournament weight (regular season first, NCAA last).
        # The sort is stable, so games keep their original relative order within
        # a weight, and the index is renumbered so that row labels agree with
        # the sorted order.
        weights = wholeseason["Tournament"].map(tournamentchange).to_numpy()
        wholeseason = wholeseason.iloc[np.argsort(weights, kind="stable")].reset_index(drop=True)

        # Home advantage = share of non-neutral games won by the home side, minus 0.5.
        home_wins = int((season["WLoc_Num"] == 1).sum())
        away_wins = int((season["WLoc_Num"] == -1).sum())
        decided = home_wins + away_wins
        home_advantage = home_wins / decided - 0.5 if decided else 0.0

        # Every row is recorded from the winner's side, so the outcome is always 1.
        tournament["True"] = 1
        tournyelos = copy(no_tournyelos)
        seasonteams = pd.concat([season["WTeamID"], season["LTeamID"]], axis=0)

        # Two parallel tracks: regular season only, and every game of the season.
        simulate_game(no_tournyelos, season, home_advantage)
        simulate_game(tournyelos, wholeseason, home_advantage)
        get_variance(global_iqrs, global_vars, global_means, seasonteams.unique(), season)
        pretournament = copy(no_tournyelos)

        set_zero(tournament, ["WSD", "WIQR", "LSD", "LIQR", "WMedian", "LMedian"])
        if tournament.shape[0] > 0:
            iqrs = {}
            spreads = {}
            medians = {}
            teams = pd.concat([tournament["WTeamID"], tournament["LTeamID"]], axis=0)
            get_variance(iqrs, spreads, medians, teams.unique(), season)
            apply_variance(iqrs, spreads, medians, tournament)
            simulate_game(no_tournyelos, tournament, home_advantage)
            tournament_frames.append(tournament)
        wholeseason_frames.append(wholeseason)
        season_frames.append(season)

        # Next season starts from the all-games ratings, regressed toward conference means.
        no_tournyelos = copy(tournyelos)
        _regress_to_conference_means(no_tournyelos, season_year)

    # Concatenate once at the end instead of growing a frame inside the loop.
    elodf = pd.concat(tournament_frames, ignore_index=True) if tournament_frames else pd.DataFrame()
    alltourneyelodf = pd.concat(wholeseason_frames, ignore_index=True) if wholeseason_frames else pd.DataFrame()
    notourneyelodf = pd.concat(season_frames) if season_frames else pd.DataFrame()
    return elodf, alltourneyelodf, notourneyelodf, no_tournyelos, pretournament, global_iqrs, global_vars, global_means


def test_elo(scalar, mov_factor, regular, post, all):
    """
    Replay the given seasons with the given Elo parameters.

    scalar, mov_factor: parameters of the rating update to test
    regular: regular-season games
    post: NCAA tournament games
    all: every game (regular season, NCAA and secondary tournaments)

    Returns an 8-tuple:
      elodf           - NCAA tournament games of all seasons with pre-game ratings,
                        win probabilities (WinningOdds) and per-team spread columns.
                        The tournament Brier score is mean((1 - WinningOdds) ** 2),
                        because the "True" outcome column is always 1.
      alltourneyelodf - every game of every season, in the order it was replayed
      notourneyelodf  - regular-season games with their rating columns
      no_tournyelos   - ratings after the last season's regression to conference means
      pretournament   - ratings after the last season's regular season
      global_iqrs, global_vars, global_means - per-team IQR, standard deviation and
                        median of regular-season rating changes (latest season seen)
    """
    # simulate_game, called with three arguments, takes `scalar` and `mov_factor`
    # from module scope. Bind this call's arguments there for the duration of the
    # run so they are actually used, then restore the previous values.
    module_scope = globals()
    unset = object()
    previous = {name: module_scope.get(name, unset) for name in ("scalar", "mov_factor")}
    module_scope["scalar"] = scalar
    module_scope["mov_factor"] = mov_factor
    try:
        return _run_elo_seasons(regular, post, all)
    finally:
        for name, value in previous.items():
            if value is unset:
                module_scope.pop(name, None)
            else:
                module_scope[name] = value


In [70]:
# Replay all seasons once, passing 6.5 and 4.0 as test_elo's first two arguments,
# and keep every returned frame and rating table.
(
    elodf,
    alltourneyelodf,
    notourneyelodf,
    tournyelos,
    pretournament,
    global_iqrs,
    global_vars,
    global_means,
) = test_elo(6.5, 4.0, regularseason, postseason, allgames)

#output.to_csv("ELO.csv", index = False)

KeyboardInterrupt: 

In [None]:
from heapq import nlargest
# Display the ten team IDs with the highest ratings in `tournyelos`, best first.
nlargest(10, tournyelos.keys(), key=lambda team: tournyelos[team])
#max(tournyelos, key=tournyelos.get)

In [None]:
difference = 1
smallest_brier = 1
smallest_coord = []
coordinates = []
#testing variety of scalars and mov_factors to optimize
# The loop variables are deliberately the module-level names `scalar` and
# `mov_factor`: simulate_game refers to those module-level names, so rebinding
# them here guarantees each candidate pair is the one being simulated.
# Each test_elo call replays every season, so this cell is slow.
for scalar in np.arange(6.5, 8, .5):
    for mov_factor in np.arange(4, 4.3, .05):
        # First element of test_elo's result: NCAA tournament games with WinningOdds.
        tourney_games = test_elo(scalar, mov_factor, regularseason, postseason, allgames)[0]
        # Rows are recorded from the winner's side (outcome 1), so the Brier
        # score is the mean of (1 - p)^2: first per season, then averaged
        # across seasons.
        squared_error = (1 - tourney_games["WinningOdds"]) ** 2
        avg_brier = squared_error.groupby(tourney_games["Season"]).mean().mean()
        coord = [scalar, mov_factor, avg_brier]
        if avg_brier < smallest_brier:
            smallest_brier = avg_brier
            smallest_coord = coord
        coordinates.append(coord)
        print(avg_brier, coord)
print(smallest_brier)
print(smallest_coord)
print(coordinates) #scalar = 7, mov_factor = 4


In [None]:
def convert_to_df(elo, var, iqr, means):
    """
    Combine per-team dictionaries into one DataFrame.

    elo:   team ID -> rating            (column "ELO")
    var:   team ID -> spread statistic  (column "VARS")
    iqr:   team ID -> interquartile range (column "IQR")
    means: team ID -> central statistic (column "MEAN")

    Returns a frame with the columns Team, ELO, VARS, MEAN, IQR. The tables are
    inner-joined on Team, so only teams present in all four dictionaries appear.
    """
    def _as_frame(values, column):
        # One two-column frame (Team, <column>) per dictionary.
        frame = pd.DataFrame.from_dict(values, orient='index', columns=[column])
        frame.index.name = "Team"
        return frame.reset_index()

    elocombined = _as_frame(elo, 'ELO')
    # Merge the statistics in one by one, always onto the accumulated frame, so
    # no earlier column is lost.
    for values, column in ((var, 'VARS'), (means, 'MEAN'), (iqr, 'IQR')):
        elocombined = pd.merge(elocombined, _as_frame(values, column), on = "Team")
    #elocombined.to_csv("2024Elos.csv", index = False)
    return elocombined

In [71]:
def convert_all(season):
    """
    Build the per-team rating table as of the end of `season`.

    Replays every game up to and including `season` with test_elo(7.0, 4.0, ...),
    converts the resulting ratings and spread statistics with convert_to_df, and
    adds a "Season" column holding `season`.
    """
    seareg = regularseason[regularseason["Season"] <= season]
    seapost = postseason[postseason["Season"] <= season]
    seaall = allgames[allgames["Season"] <= season]
    results = test_elo(7.0, 4.0, seareg, seapost, seaall)
    # test_elo returns (elodf, alltourneyelodf, notourneyelodf, ratings,
    # pretournament, iqrs, vars, means); only the ratings and the three
    # statistics dictionaries are needed here.
    seatournyelos = results[3]
    sea_iqrs, sea_vars, sea_means = results[5], results[6], results[7]
    # Pass the statistics by keyword: convert_to_df's positional order is
    # (elo, var, iqr, means), which is easy to mix up with (iqrs, vars, means).
    seadf = convert_to_df(seatournyelos, var=sea_vars, iqr=sea_iqrs, means=sea_means)
    seadf["Season"] = season
    return seadf

#output.to_csv("ELO.csv", index = False)

In [72]:
# Build one rating table per season (1985 through 2024) and stack them.
# The frames are collected in a list and concatenated once, rather than
# re-concatenating a growing frame on every iteration.
# Every convert_all call replays all games up to that season, so this cell is slow.
season_frames = [convert_all(1985)]
for season in range(1986, 2025):
    seadf = convert_all(season)
    season_frames.append(seadf)
    print(season)
allseasons = pd.concat(season_frames)


1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024


In [73]:
# Write the stacked per-season rating table to allELOSMens.csv. `index=False` is
# not passed, so the frame's row index is written as the first CSV column.
allseasons.to_csv("allELOSMens.csv")

In [None]:
def get_variance(iqrs, vars, means, teams, df):
    """
    Assigns variance, iqrs, and means into appropriate dictionary

    For each team in `teams`, gathers its Elo changes from df (WChange where it
    won, LChange where it lost) and stores, in place, the interquartile range in
    `iqrs`, the standard deviation in `vars` and the median in `means`.

    NOTE(review): get_variance is also defined earlier in this notebook with the
    same behaviour; when cells run top to bottom this later definition is the
    one left bound to the name.
    """
    for team in teams:
        as_winner = df["WTeamID"] == team
        as_loser = df["LTeamID"] == team
        changes = pd.concat([df["WChange"][as_winner], df["LChange"][as_loser]], axis = 0)
        first_quartile = changes.quantile(0.25)
        third_quartile = changes.quantile(0.75)
        iqrs[team] = third_quartile - first_quartile
        vars[team] = changes.std()
        means[team] = changes.median()