In [1]:
# Created by Ian Cox | 2/21/2023
# AHEAD | March Madness 2023 Bracket Challenge
# Regular Season | Logical Rule Classifier
# Pick winners based on historical regular season performance

#data: MRegularSeasonCompactResults.csv
#      MTeams.csv

In [1]:
# import libs
import os
import random

import pandas as pd

In [2]:
# Data directory holding the source CSVs. Set the MARCH_MADNESS_DATA_DIR
# environment variable to point at your own copy; when it is unset we fall
# back to the original author's local path so existing setups keep working.
DATA_DIR = os.environ.get(
    'MARCH_MADNESS_DATA_DIR',
    'C:\\Users\\IanCox\\OneDrive - AHEAD\\Documents\\python\\march_madness\\mens-march-mania-2022\\MDataFiles_Stage1',
)
os.chdir(DATA_DIR)

In [3]:
# load both source tables (paths are relative to the current working directory)
regseason, teams = (
    pd.read_csv(csv_name)
    for csv_name in ('MRegularSeasonCompactResults.csv', 'MTeams.csv')
)

In [4]:
# preview the regular season results that were just loaded
regseason

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0
...,...,...,...,...,...,...,...,...
174466,2022,98,1400,79,1242,76,H,0
174467,2022,98,1411,66,1126,63,A,0
174468,2022,98,1422,68,1441,49,A,0
174469,2022,98,1438,69,1181,68,A,0


In [5]:
# check the column dtypes of the regular season results
regseason.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
dtype: object

In [6]:
# preview the teams table (TeamID -> TeamName lookup used later)
teams

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2022
1,1102,Air Force,1985,2022
2,1103,Akron,1985,2022
3,1104,Alabama,1985,2022
4,1105,Alabama A&M,2000,2022
...,...,...,...,...
367,1468,Bellarmine,2021,2022
368,1469,Dixie St,2021,2022
369,1470,Tarleton St,2021,2022
370,1471,UC San Diego,2021,2022


In [7]:
# check the column dtypes of the teams table
teams.dtypes

TeamID            int64
TeamName         object
FirstD1Season     int64
LastD1Season      int64
dtype: object

In [8]:
# tally how many times each team appears as WTeamID:
# one row per team, columns WTeamID / win_total, most wins first
counts_df_w = (
    regseason['WTeamID']
    .value_counts()
    .rename_axis('WTeamID')
    .reset_index(name='win_total')
)

In [9]:
# inspect the per-team win totals
counts_df_w

Unnamed: 0,WTeamID,win_total
0,1181,958
1,1242,949
2,1246,898
3,1314,887
4,1112,878
...,...,...
367,1471,12
368,1289,6
369,1118,6
370,1472,5


In [10]:
# tally how many times each team appears as LTeamID:
# one row per team, columns LTeamID / loss_total, most losses first
counts_df_l = (
    regseason['LTeamID']
    .value_counts()
    .rename_axis('LTeamID')
    .reset_index(name='loss_total')
)

In [11]:
# inspect the per-team loss totals
counts_df_l

Unnamed: 0,LTeamID,loss_total
0,1152,795
1,1271,767
2,1363,736
3,1341,728
4,1224,714
...,...,...
367,1471,22
368,1118,21
369,1327,19
370,1468,18


In [12]:
# merge the total wins and total losses dfs
# Outer join: a left join would silently drop any team that never won a game
# and would leave NaN losses for any team that never lost one.
merge1 = pd.merge(counts_df_w, counts_df_l, left_on='WTeamID', right_on='LTeamID', how='outer')

# A team present on only one side has a NaN id on the other side; use
# whichever id exists so both id columns are complete (and integer again).
team_ids = merge1['WTeamID'].fillna(merge1['LTeamID']).astype('int64')

# A missing count means "zero games of that kind", not "unknown".
merge1 = merge1.assign(
    WTeamID=team_ids,
    LTeamID=team_ids,
    win_total=merge1['win_total'].fillna(0).astype('int64'),
    loss_total=merge1['loss_total'].fillna(0).astype('int64'),
)

# Keep the presentation order used throughout the notebook: most wins first.
merge1 = merge1.sort_values('win_total', ascending=False, kind='mergesort').reset_index(drop=True)

In [13]:
# view the merge: win totals and loss totals side by side, matched on team id
merge1

Unnamed: 0,WTeamID,win_total,LTeamID,loss_total
0,1181,958,1181,235
1,1242,949,1242,224
2,1246,898,1246,281
3,1314,887,1314,308
4,1112,878,1112,283
...,...,...,...,...
367,1471,12,1471,22
368,1289,6,1289,42
369,1118,6,1118,21
370,1472,5,1472,14


In [14]:
# LTeamID carries the same ids as WTeamID after the merge, so keep just one id column
merge2 = merge1.drop(columns=['LTeamID'])

In [15]:
# confirm the LTeamID column was dropped
merge2

Unnamed: 0,WTeamID,win_total,loss_total
0,1181,958,235
1,1242,949,224
2,1246,898,281
3,1314,887,308
4,1112,878,283
...,...,...,...
367,1471,12,22
368,1289,6,42
369,1118,6,21
370,1472,5,14


In [16]:
# give the three remaining columns their final names (assigned by position: id, wins, losses)
merge2 = merge2.set_axis(['TeamID', 'wins', 'losses'], axis='columns')

In [17]:
# confirm the new column names
merge2

Unnamed: 0,TeamID,wins,losses
0,1181,958,235
1,1242,949,224
2,1246,898,281
3,1314,887,308
4,1112,878,283
...,...,...,...
367,1471,12,22
368,1289,6,42
369,1118,6,21
370,1472,5,14


In [18]:
# look up each TeamID in the teams table to attach its TeamName
# (left join: every row of merge2 is kept)
merge3 = merge2.merge(teams[['TeamID', 'TeamName']], how='left', on='TeamID')

In [19]:
# confirm TeamName was joined on
merge3

Unnamed: 0,TeamID,wins,losses,TeamName
0,1181,958,235,Duke
1,1242,949,224,Kansas
2,1246,898,281,Kentucky
3,1314,887,308,North Carolina
4,1112,878,283,Arizona
...,...,...,...,...
367,1471,12,22,UC San Diego
368,1289,6,42,Morris Brown
369,1118,6,21,Armstrong St
370,1472,5,14,St Thomas MN


In [20]:
# win percentage = wins / games played, where games played = wins + losses
merge3['win_pct'] = merge3['wins'].div(merge3['wins'].add(merge3['losses']))

In [21]:
# confirm the win_pct column was added
merge3

Unnamed: 0,TeamID,wins,losses,TeamName,win_pct
0,1181,958,235,Duke,0.803018
1,1242,949,224,Kansas,0.809037
2,1246,898,281,Kentucky,0.761662
3,1314,887,308,North Carolina,0.742259
4,1112,878,283,Arizona,0.756245
...,...,...,...,...,...
367,1471,12,22,UC San Diego,0.352941
368,1289,6,42,Morris Brown,0.125000
369,1118,6,21,Armstrong St,0.222222
370,1472,5,14,St Thomas MN,0.263158


In [22]:
# dupe merge3 as this is a df to reference again, output it as well
# (.copy() makes it a real duplicate: plain assignment would only alias merge3,
# so any later change to merge3 would silently change regseason_win_pct too)
regseason_win_pct = merge3.copy()
regseason_win_pct.to_csv('regseason_win_pct.csv', index=False)

In [23]:
# now create a random selection of two different teams, and pick a winner of a game based on historical performance
# (`random` is imported with the other libraries in the import cell at the top)

# Select two random TeamID values
team1_id, team2_id = random.sample(regseason_win_pct['TeamID'].tolist(), 2)

# Create a smaller DataFrame with only the selected teams
selected_teams = regseason_win_pct[regseason_win_pct['TeamID'].isin([team1_id, team2_id])]

# Get the win_pct for each team
team1_win_pct = selected_teams.loc[selected_teams['TeamID'] == team1_id, 'win_pct'].values[0]
team2_win_pct = selected_teams.loc[selected_teams['TeamID'] == team2_id, 'win_pct'].values[0]

In [24]:
# show the two randomly selected teams
selected_teams

Unnamed: 0,TeamID,wins,losses,TeamName,win_pct
357,1465,47,45,Cal Baptist,0.51087
360,1466,32,64,North Alabama,0.333333


In [25]:
# Compare the win_pct and report the team with the higher value
team1_name = selected_teams.loc[selected_teams['TeamID'] == team1_id, 'TeamName'].values[0]
team2_name = selected_teams.loc[selected_teams['TeamID'] == team2_id, 'TeamName'].values[0]

# Bundle each team as (id, name, percentage on a 0-100 scale) in print order
team1 = (team1_id, team1_name, team1_win_pct * 100)
team2 = (team2_id, team2_name, team2_win_pct * 100)

if team1_win_pct > team2_win_pct:
    favorite, underdog = team1, team2
elif team2_win_pct > team1_win_pct:
    favorite, underdog = team2, team1
else:
    # Equal (or non-comparable, e.g. NaN) percentages: neither team is "higher",
    # so do not claim that one of them is.
    favorite = underdog = None

if favorite is None:
    print("Team %d (%s) (%.2f%%) and Team %d (%s) (%.2f%%) cannot be separated on win percentage\n\nToss-up - no winner picked!" %
          (team1 + team2))
else:
    print("Team %d (%s) has a higher win percentage (%.2f%%) than Team %d (%s) (%.2f%%)\n\n%s will win!" %
          (favorite + underdog + (favorite[1],)))

Team 1465 (Cal Baptist) has a higher win percentage (51.09%) than Team 1466 (North Alabama) (33.33%)

Cal Baptist will win!


In [26]:
# wrap all of the above into a function that prints the predicted winner of a random matchup and returns the two selected teams' rows
def faceoff(dataframe, rng=None):
    """Pick two random teams and predict the winner by historical win percentage.

    Prints a comparison of the two teams followed by the predicted winner.
    When neither team has a strictly higher win percentage (a tie, or a
    missing/NaN value), a toss-up message is printed instead of naming a winner.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'TeamID', 'TeamName' and 'win_pct' columns.
    rng : random.Random, optional
        Source of randomness. Pass ``random.Random(seed)`` for a reproducible
        matchup. Defaults to the module-level ``random`` generator.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataframe`` belonging to the two selected teams.

    Raises
    ------
    ValueError
        If ``dataframe`` holds fewer than two teams (raised by ``sample``).
    """
    # Fall back to the shared module-level generator when no rng is supplied
    picker = random if rng is None else rng

    # Select two random TeamID values
    team1_id, team2_id = picker.sample(dataframe['TeamID'].tolist(), 2)

    # Create a smaller DataFrame with only the selected teams
    selected_teams = dataframe[dataframe['TeamID'].isin([team1_id, team2_id])]

    def describe(team_id):
        """Return (TeamID, TeamName, win_pct) for one of the selected teams."""
        row = selected_teams.loc[selected_teams['TeamID'] == team_id]
        return team_id, row['TeamName'].values[0], row['win_pct'].values[0]

    def as_print_args(team):
        """Return (TeamID, TeamName, win percentage on a 0-100 scale)."""
        team_id, name, pct = team
        return team_id, name, pct * 100

    team1, team2 = describe(team1_id), describe(team2_id)

    # Compare the raw win_pct values; only a strictly higher value earns the pick
    if team1[2] > team2[2]:
        favorite, underdog = team1, team2
    elif team2[2] > team1[2]:
        favorite, underdog = team2, team1
    else:
        # Equal or non-comparable (NaN) percentages: do not claim one is "higher"
        print("Team %d (%s) (%.2f%%) and Team %d (%s) (%.2f%%) cannot be separated on historical win percentage\n\nToss-up - no winner picked!" %
              (as_print_args(team1) + as_print_args(team2)))
        return selected_teams

    print("Team %d (%s) has a higher historical win percentage (%.2f%%) than Team %d (%s) (%.2f%%)\n\n%s will win!" %
          (as_print_args(favorite) + as_print_args(underdog) + (favorite[1],)))

    return selected_teams

In [28]:
# run one random matchup: prints the predicted winner and displays the two selected teams' rows
faceoff(regseason_win_pct)

Team 1125 (Belmont) has a higher historical win percentage (67.55%) than Team 1308 (New Mexico St) (63.92%)

Belmont will win!


Unnamed: 0,TeamID,wins,losses,TeamName,win_pct
37,1308,698,394,New Mexico St,0.639194
212,1125,460,221,Belmont,0.675477


## Pros/Cons of Approach

### Pros

* Objective data. Historical performance is objective and quantifiable; this data is real

* Historical data can provide a baseline understanding of a team's strengths, weaknesses and trends over time

* Predictive power! While historical performance data may not be perfect in predicting game outcomes, it can still be a useful predictor when used in conjunction with other factors

* Patterns in historical performance <b>MAY</b> be useful in predicting future outcomes

### Cons

* Past performance is not always indicative of future performance

* We only incorporated regular season play

* Sample size. New teams or teams without much historical D1 play time will have misleading ratios

* Lack of context. Does not account for injuries, weather conditions, and other ongoing factors

* Player variation. Players graduate, change teams, become injured, are benched/subbed, etc

* Gameplay-specific factors (e.g., a team may be vulnerable to a certain playstyle or matchup)

* While we are using real data to inform our choices, we are not yet using a machine learning algorithm to arrive at our results...