In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import brier_score_loss
from sklearn.metrics import log_loss
from collections import defaultdict
import warnings
from pandas.errors import SettingWithCopyWarning
import re

# Silence pandas' SettingWithCopyWarning for the whole notebook.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Raw inputs.
# NOTE(review): the regular-season file is read from "2024MarchMadness/csvs/" while the other
# files in this cell are read from "csvs/" -- confirm both prefixes resolve from the working directory.
postseason = pd.read_csv("csvs/MNCAATourneyCompactResults.csv")
seeds = pd.read_csv("csvs/MNCAATourneySeeds.csv")
seeds2024 = pd.read_csv("csvs/actual_2024_tourney_seeds.csv")
regularseason = pd.read_csv("2024MarchMadness/csvs/MRegularSeasonCompactResults.csv") #importing csvs, switch with mens to get mens output
elos = pd.read_csv("csvs/2024Elos.csv")

# Margin of victory for every tournament game.
postseason["scorediff"] = postseason["WScore"] - postseason["LScore"]

# general cleaning: attach the winner's seed (WSeed) and the loser's seed (LSeed) to every game.
# Both merges are inner joins, so games whose team/season has no row in `seeds` are dropped.
postseasonseeds = (
    postseason
    .merge(seeds, left_on=["WTeamID", "Season"], right_on=["TeamID", "Season"])
    .rename(columns={"Season_x": "Season", "Seed": "WSeed"})
    .drop(columns=["TeamID"])
    .merge(seeds, left_on=["LTeamID", "Season"], right_on=["TeamID", "Season"])
    .rename(columns={"Season_x": "Season", "Seed": "LSeed"})
    .drop(columns=["TeamID"])
)

# Keep only the first run of digits of each seed string and store it as an int.
# The pattern is a raw string so "\d" is not parsed as a (invalid) string escape, and
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
for seed_col in ("WSeed", "LSeed"):
    postseasonseeds[seed_col] = postseasonseeds[seed_col].str.extract(r"(\d+)", expand=False).astype(int)

In [None]:
def get_winning_percentage(winseed, loseseed):
    """
    Gets the historical winning percentage of winseed against loseseed.

    Reads the module-level ``postseasonseeds`` frame (columns ``WSeed`` and ``LSeed``).

    Rules, in order:
      * Equal seeds return exactly 0.5. When same-seed games exist the win and loss
        filters select the same rows, so the ratio is 0.5 anyway; returning it up front
        also covers same-seed pairs with no games, which previously fell through to an
        unrelated pair or recursed without bound.
      * If the two seeds never met, the numerically smaller seed is incremented by one
        and the lookup is retried, so the pair moves one step toward the equal-seed
        case on every retry.
      * A perfect record (ratio of exactly 1 or 0) is pulled toward the middle by
        ``0.1 / games`` so the result is never exactly 0 or 1.

    get_winning_percentage(winseed, loseseed) = 1 - get_winning_percentage(loseseed, winseed)
    (up to floating-point rounding).
    """
    if winseed == loseseed:
        return 0.5

    won_mask = (postseasonseeds["WSeed"] == winseed) & (postseasonseeds["LSeed"] == loseseed)
    lost_mask = (postseasonseeds["WSeed"] == loseseed) & (postseasonseeds["LSeed"] == winseed)
    wins = postseasonseeds[won_mask].shape[0]
    loses = postseasonseeds[lost_mask].shape[0]
    games = wins + loses

    if games == 0:
        if winseed > loseseed:  # loseseed is the numerically smaller seed
            return get_winning_percentage(winseed, loseseed + 1)
        # winseed is the numerically smaller seed
        return get_winning_percentage(winseed + 1, loseseed)

    ratio = wins / games
    if ratio == 1:
        return ratio - (0.1 / games)
    if ratio == 0:
        return ratio + (0.1 / games)
    return ratio
def get_num(winseed, loseseed):
    """
    Counts the games in ``postseasonseeds`` played between the two seeds,
    whichever of them won.

    When both seeds are equal the two filters select the same rows, so every
    such game is counted twice.
    """
    winner_seed = postseasonseeds["WSeed"]
    loser_seed = postseasonseeds["LSeed"]
    as_given = (winner_seed == winseed) & (loser_seed == loseseed)
    reversed_roles = (winner_seed == loseseed) & (loser_seed == winseed)
    return len(postseasonseeds[as_given]) + len(postseasonseeds[reversed_roles])

In [None]:
#getting all seed combinations
# Column = winning seed, row = losing seed: data.loc[r, c] is get_winning_percentage(c, r).
# The frame is built in one step (float columns from the start) and without the
# per-pair debug prints, which emitted two lines for each of the 256 seed pairs.
seed_range = range(1, 17)
data = pd.DataFrame(
    {
        seed_1: [get_winning_percentage(seed_1, seed_2) for seed_2 in seed_range]
        for seed_1 in seed_range
    },
    index=seed_range,
)

In [None]:
import seaborn
import matplotlib.pyplot as plt
#heatmap of the seed matchup matrix; labels are set on the Axes that seaborn drew on
heatmap_ax = seaborn.heatmap(data, cmap=seaborn.cm.rocket_r)
heatmap_ax.set_xlabel('Winning Seed')
heatmap_ax.set_ylabel('Losing Seed')

In [None]:
#preparing submission csv
submission = pd.read_csv("2024MarchMadness/csvs/SampleSubmission2023.csv")
# Every ID is split on "_" once; the first three fields become integer columns.
id_fields = [matchup_id.split('_') for matchup_id in submission['ID']]
submission['Season'] = [int(fields[0]) for fields in id_fields]
submission["T1_TeamID"] = [int(fields[1]) for fields in id_fields]
submission["T2_TeamID"] = [int(fields[2]) for fields in id_fields]
submission.head()

In [None]:
#more cleaning: give every team in `elos` its numeric 2024 seed
# The guard keeps this cell re-runnable: after the first run `seeds2024` has been reduced to
# TeamID/OriginalSeed and no longer has the "Seed" column to parse.
if "OriginalSeed" not in seeds2024.columns:
    # Raw-string pattern ("\d" is not a valid string escape); expand=False returns a Series.
    seeds2024["OriginalSeed"] = seeds2024["Seed"].str.extract(r"(\d+)", expand=False).astype(int)
seeds2024 = seeds2024[["TeamID", "OriginalSeed"]]
# Inner join: only teams present in both frames survive. "Team" duplicates "TeamID", so it is dropped.
eloseed = pd.merge(elos, seeds2024, left_on = "Team", right_on = "TeamID").drop(columns = ["Team"])

In [None]:
#more cleaning!
# Join every matchup with the eloseed row of team 1, then of team 2, prefixing the joined
# columns with T1/T2. Both joins are inner joins.
t1_column_names = {"ELO":"T1ELO", "VARS":"T1VARS","OriginalSeed":"T1Seed", "MEAN":"T1MEAN", "IQR":"T1IQR"}
t2_column_names = {"ELO":"T2ELO", "VARS":"T2VARS","OriginalSeed":"T2Seed", "MEAN":"T2MEAN", "IQR":"T2IQR"}
submission_T1 = submission.merge(eloseed, left_on="T1_TeamID", right_on="TeamID").rename(columns=t1_column_names)
submissionboth = submission_T1.merge(eloseed, left_on="T2_TeamID", right_on="TeamID").rename(columns=t2_column_names)


def _seed_history_win_pct(matchup):
    """Seed-history win percentage of team 1's seed against team 2's seed."""
    return get_winning_percentage(matchup['T1Seed'], matchup['T2Seed'])


def _seed_history_games(matchup):
    """Number of past games between the two teams' seeds."""
    return get_num(matchup['T1Seed'], matchup['T2Seed'])


submissionboth["BasePercent"] = submissionboth.apply(_seed_history_win_pct, axis=1)
submissionboth["NumTimes"] = submissionboth.apply(_seed_history_games, axis=1)

In [None]:
def expected_score(eloa, elob, isHome, home_advantage):
    """
    Expected probability that team A beats team B, from team A's point of view.

    eloa: the ELO of team A
    elob: the ELO of team B
    isHome: 1 if A is at home, 0 for a neutral site; any other value is treated as away
    home_advantage: amount added to (home) or subtracted from (away) the neutral probability

    The home value is capped at 1 and the away value is floored at 0.
    """
    neutral = 1/(1 + 10**((elob - eloa)/400))
    # Both adjusted values are computed up front, then one is picked by venue.
    at_home = min(neutral + home_advantage, 1)
    on_the_road = max(neutral - home_advantage, 0)

    if isHome == 1:
        return at_home
    if isHome == 0:
        return neutral
    return on_the_road

In [None]:
#getting weights and values of model, making a prediction
def _min_max_plus_one(values):
    """Min-max scales a Series to [0, 1] and shifts it by 1, giving values in [1, 2]."""
    lowest = values.min()
    highest = values.max()
    return (values - lowest) / (highest - lowest) + 1


# Neutral-site ELO win probability of team 1 (isHome=0, home_advantage=0).
submissionboth["EloPercent"] = submissionboth.apply(lambda x: expected_score(x['T1ELO'], x['T2ELO'], 0, 0), axis=1)

# Seed-history estimate: p * (1 - p) as its variance, games / variance as its weight, capped at 2000.
submissionboth["BaseVariance"] = submissionboth["BasePercent"] * (1 - submissionboth["BasePercent"])
submissionboth["BaseWeight"] = (submissionboth["NumTimes"] / submissionboth["BaseVariance"]).clip(upper=2000)

# ELO estimate: the weight is derived from the mean of the two teams' VARS columns.
submissionboth["SDDiff"] = (submissionboth["T1VARS"] + submissionboth["T2VARS"]) / 2
# NOTE(review): this cell used to assign SDDiffNorm = 1 / SDDiff and then immediately overwrite it
# with the min-max scaling of SDDiff itself, so the inverse never had any effect. The dead
# assignment is removed and the effective behaviour is kept. As written, a larger SDDiff gives the
# ELO estimate MORE weight -- confirm that scaling the inverse was not what was intended.
submissionboth["SDDiffNorm"] = _min_max_plus_one(submissionboth["SDDiff"])
submissionboth["BaseWeightNorm"] = _min_max_plus_one(submissionboth["BaseWeight"])

# Final prediction: weighted average of the seed-history and ELO probabilities.
submissionboth["RTTMPrediction"] = (
    submissionboth["BaseWeightNorm"] * submissionboth["BasePercent"]
    + submissionboth["SDDiffNorm"] * submissionboth["EloPercent"]
) / (submissionboth["BaseWeightNorm"] + submissionboth["SDDiffNorm"])

In [None]:
#cleaning: the output file holds only the matchup ID and the prediction, named "Pred"
submissionbothfinal = pd.DataFrame({
    "ID": submissionboth["ID"],
    "Pred": submissionboth["RTTMPrediction"],
})
submissionbothfinal.to_csv("WSubmission_New.csv", index = False)

In [None]:
#getting ELO rankings by writeup
mens = pd.read_csv("2024MarchMadness/csvs/MTeams.csv")
womens = pd.read_csv("2024MarchMadness/csvs/WTeams.csv")
welos = pd.read_csv("2024MarchMadness/csvs/2024WElos.csv")
# Attach the team table to each ELO table (inner join on the team id).
mens_rank = pd.merge(elos, mens, left_on = ["Team"], right_on = ["TeamID"])
womens_rank = pd.merge(welos, womens, left_on = ["Team"], right_on = ["TeamID"])
# A cell only renders its last bare expression, so the men's ranking used to be sorted and
# then discarded. display() (provided by the notebook kernel) shows both, highest ELO first.
display(mens_rank.sort_values(by=['ELO'], ascending=False))
display(womens_rank.sort_values(by=['ELO'], ascending=False))

In [None]:
#analysis for writeup
# This cell rebinds `seeds`, `mens` and `womens`, which earlier cells bound to other frames.
bracket_columns = ["Bracket", "Slot", "Team", "Tournament", "TeamID"]

seeds = pd.read_csv("2024MarchMadness/csvs/actual_2024_tourney_seeds.csv")
# The "Team" column of the submission is matched against the "Seed" column of the seeds file.
monte = pd.read_csv("2024MarchMadness/csvs/submission.csv").merge(
    seeds, left_on=["Team", "Tournament"], right_on=["Seed", "Tournament"]
)

womensteams = pd.read_csv("2024MarchMadness/csvs/WTeams.csv")
mensteams = pd.read_csv("2024MarchMadness/csvs/MTeams.csv")

# Split by tournament, keep the bracket columns, then attach the team table.
mens = monte.loc[monte["Tournament"] == "M", bracket_columns].merge(mensteams, on="TeamID")
womens = monte.loc[monte["Tournament"] == "W", bracket_columns].merge(womensteams, on="TeamID")

# Distinct slots seen in the women's rows, in sorted order.
slots = np.sort(womens["Slot"].unique())