In [30]:
## This Notebook creates the yearly_team_stats file.
# It takes the raw scraped game data from the MHSAA website and groups results by team,
# then calculates the yearly stats split into home and awy for each team.

# Dependencies

import pandas as pd
import numpy as np
import os

# File to Load

file_to_load = os.path.join("TEMP", "clean_tables", "game_level", "_2023_games.csv")

In [31]:
# load file as dataframe
df = pd.read_csv(file_to_load)

# ## Quick Check of the file

print(df.columns)

df.info()

# how many unique teams are there in the data set

print(df['teamName'].nunique())

df.sample(5)

df.head(20)



# show value count for all of these columns
keep = ['contestType', 'seasonType', 'postSeasonInfo','tournamentInfo', 'tournamentName', 'tournamentType', 'contestName',    'seasonTypeCode']

for col in df.columns:
    print(col)
    print(df[col].value_counts())
    print('-----------------------')

# create a new dataframe with only the columns we want to keep






Index(['teamName', 'teamId', 'leagueName', 'opponentName', 'opponentId',
       'gameDate', 'gameTime', 'homeOrAway', 'location', 'teamScore',
       'opponentScore', 'notes', 'contestType', 'seasonType', 'postSeasonInfo',
       'tournamentInfo', 'tournamentName', 'tournamentType', 'contestName',
       'seasonTypeCode'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22176 entries, 0 to 22175
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   teamName        22176 non-null  object 
 1   teamId          22176 non-null  int64  
 2   leagueName      20985 non-null  object 
 3   opponentName    21585 non-null  object 
 4   opponentId      21585 non-null  float64
 5   gameDate        22176 non-null  object 
 6   gameTime        22176 non-null  object 
 7   homeOrAway      22176 non-null  object 
 8   location        19581 non-null  object 
 9   teamScore       16343 non-null  float64
 10 

In [32]:
total_games = len(df)
games_with_score = len(df.dropna(subset=['teamScore', 'opponentScore']))
games_without_score = total_games - games_with_score

print(f'Total games: {total_games}\nGames with score: {games_with_score}\nGames without score: {games_without_score}\nProportion of games with score: {games_with_score / total_games}')


Total games: 22176
Games with score: 16343
Games without score: 5833
Proportion of games with score: 0.7369678932178932


In [33]:


# create an empty dataframe to hold the results
yearly_team_stats = pd.DataFrame()

# create a team_df with teamName, teanId and leagueName to use as a key later one
team_df = df[['teamName', 'teamId', 'leagueName']].drop_duplicates()




In [34]:
grouped = df.groupby('teamName')

summary_stats = grouped.agg({
    'teamScore': ['count', 'mean', 'median', 'sum'],
    'opponentScore': ['count', 'mean', 'median', 'sum']
}).reset_index()

## Show a sample of the summary stats
summary_stats.sample(5)

## Show the shape of the summary stats
summary_stats.shape

summary_stats.columns

#




MultiIndex([(     'teamName',       ''),
            (    'teamScore',  'count'),
            (    'teamScore',   'mean'),
            (    'teamScore', 'median'),
            (    'teamScore',    'sum'),
            ('opponentScore',  'count'),
            ('opponentScore',   'mean'),
            ('opponentScore', 'median'),
            ('opponentScore',    'sum')],
           )

In [35]:
## look at the columns in the summary stats dataframe
summary_stats.columns

# simplify the column names
summary_stats.columns = ['teamName', 
                        'games_played',
                        'runs_scored_mean',
                        'runs_scored_median',
                        'runs_scored_total',
                        'runs_allowed_count',
                        'runs_allowed_mean',
                        'runs_allowed_median',
                        'runs_allowed_total']


summary_stats.sample(5)

summary_stats.shape

summary_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646 entries, 0 to 645
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   teamName             646 non-null    object 
 1   games_played         646 non-null    int64  
 2   runs_scored_mean     624 non-null    float64
 3   runs_scored_median   624 non-null    float64
 4   runs_scored_total    646 non-null    float64
 5   runs_allowed_count   646 non-null    int64  
 6   runs_allowed_mean    624 non-null    float64
 7   runs_allowed_median  624 non-null    float64
 8   runs_allowed_total   646 non-null    float64
dtypes: float64(6), int64(2), object(1)
memory usage: 45.5+ KB


In [36]:
## Start to build the year stats dataframe
# Merge the team_df and summary_stats dataframes into the beginning of the yearly_team_stats dataframe

yearly_team_stats = pd.merge(team_df, summary_stats, on='teamName', how='left')



In [37]:

# # lookat the shape of the yearly_team_stats dataframe
yearly_team_stats.shape

# # look at the columns in the yearly_team_stats dataframe
yearly_team_stats.columns

# reorder the columns
# yearly_team_stats = yearly_team_stats[['teamName', 'leagueName', 'games_played', 'runs_scored_mean', 'runs_scored_median', 'runs_scored_total', 'runs_allowed_mean', 'runs_allowed_median', 'runs_allowed_total']]

# yearly_team_stats.sample(5)
# 

Index(['teamName', 'teamId', 'leagueName', 'games_played', 'runs_scored_mean',
       'runs_scored_median', 'runs_scored_total', 'runs_allowed_count',
       'runs_allowed_mean', 'runs_allowed_median', 'runs_allowed_total'],
      dtype='object')

In [38]:
def calculate_results(row):
    if row['teamScore'] > row['opponentScore']:
        result = 'win'
    elif row['teamScore'] < row['opponentScore']:
        result = 'loss'
    else:
        result = 'tie'

    if row['homeOrAway'] == 'H':
        return f'home_{result}'
    else:
        return f'away_{result}'


df['result'] = df.apply(calculate_results, axis=1)


# calculate total home and away games for each row
df['totalHomeGames'] = np.where(df['homeOrAway'] == 'H', 1, 0)
df['totalAwayGames'] = np.where(df['homeOrAway'] == 'A', 1, 0)

# calculate the total home and away wins for each row
df['home_wins'] = np.where(df['result'] == 'home_win', 1, 0)
df['away_wins'] = np.where(df['result'] == 'away_win', 1, 0)

# calculate the total home and away losses for each row
df['home_losses'] = np.where(df['result'] == 'home_loss', 1, 0)
df['away_losses'] = np.where(df['result'] == 'away_loss', 1, 0)

# calculate the total home and away ties for each row
df['home_ties'] = np.where(df['result'] == 'home_tie', 1, 0)
df['away_ties'] = np.where(df['result'] == 'away_tie', 1, 0)



## aggregate the results by team
grouped = df.groupby('teamName')

summary_stats = grouped.agg({
    'totalHomeGames': 'sum',
    'totalAwayGames': 'sum',
    'home_wins': 'sum',
    'away_wins': 'sum',
    'home_losses': 'sum',
    'away_losses': 'sum',
    'home_ties': 'sum',
    'away_ties': 'sum'
}).reset_index()


# df.sample(5)

# merge the summary stats into the yearly_team_stats dataframe
yearly_team_stats = pd.merge(yearly_team_stats, summary_stats, on='teamName', how='left')




In [39]:
# merge the results dataframe with the summary stats dataframe
# yearly_team_stats = pd.merge(yearly_team_stats, results, on='teamName', how='left')

yearly_team_stats.sample(5)


Unnamed: 0,teamName,teamId,leagueName,games_played,runs_scored_mean,runs_scored_median,runs_scored_total,runs_allowed_count,runs_allowed_mean,runs_allowed_median,runs_allowed_total,totalHomeGames,totalAwayGames,home_wins,away_wins,home_losses,away_losses,home_ties,away_ties
145,Battle Creek Lakeview,2713075,Southwestern Michigan Athletic Conference,42,7.785714,7.0,327.0,42,2.904762,2.0,122.0,24,22,20,15,1,6,3,1
560,Garden City,2713554,Western Wayne Athletic Conference,14,6.785714,4.5,95.0,14,5.357143,6.0,75.0,6,34,0,6,1,6,5,22
321,Brownstown Woodhaven,2713386,Downriver League,44,5.295455,4.0,233.0,44,2.659091,2.0,117.0,41,14,23,9,9,3,9,2
217,Jonesville,2713316,Big 8 Conference,24,5.958333,5.5,143.0,24,3.666667,3.0,88.0,19,16,9,9,5,1,5,6
285,Mount Pleasant Sacred Heart,2713020,Mid-State Activities Conference,39,9.641026,8.0,376.0,39,3.333333,2.0,130.0,26,19,17,14,5,3,4,2


In [40]:
######### PROBABLY UNNECESSARY #########

# Calculate the number of games played at home / away and the number of wins / losses / ties at home / away

# def calculate_home_away_results(row):
#     if row['homeOrAway'] == 'Home':
#         if row['result'] == 'win':
#             return 'win_home'
#         elif row['result'] == 'loss':
#             return 'loss_home'
#         else:
#             return 'tie_home'
#     else:
#         if row['result'] == 'win':
#             return 'win_away'
#         elif row['result'] == 'loss':
#             return 'loss_away'
#         else:
#             return 'tie_away'

            
# ## Show a sample of the summary stats
# # df.sample(5)
        

# df['home_away_result'] = df.apply(calculate_home_away_results, axis=1)


# home_away_results = df.groupby(['teamName', 'home_away_result']).size().unstack(fill_value=0).reset_index()

# home_away_results.sample(5)
# # df.sample(5)

# # location_summary = pd.crosstab(df['teamName'], df['homeOrAway'])

# # location_summary.sample(5)

# # # location_summary.shape




In [41]:
## LOOK AT THE SHAPE OF THE MASTER
yearly_team_stats.shape

## LOOK AT THE COLUMNS OF THE MASTER
yearly_team_stats.columns

## LOOK AT A SAMPLE OF THE MASTER
# yearly_team_stats.sample(5)

## check df
# df.sample(5)
# runs_sc


Index(['teamName', 'teamId', 'leagueName', 'games_played', 'runs_scored_mean',
       'runs_scored_median', 'runs_scored_total', 'runs_allowed_count',
       'runs_allowed_mean', 'runs_allowed_median', 'runs_allowed_total',
       'totalHomeGames', 'totalAwayGames', 'home_wins', 'away_wins',
       'home_losses', 'away_losses', 'home_ties', 'away_ties'],
      dtype='object')

In [42]:
### Calculate the runs socred and allowed home and away

# runs scored
runs_scored = df.groupby(['teamName', 'homeOrAway'])['teamScore'].sum().unstack(fill_value=0).reset_index()

# runs allowed
runs_allowed = df.groupby(['teamName', 'homeOrAway'])['opponentScore'].sum().unstack(fill_value=0).reset_index()





# merge the runs scored and runs allowed dataframes named runs_scored_home and runs_allowed_home, ect
runs_scored = pd.merge(runs_scored, runs_allowed, on='teamName', suffixes=('_runs_scored', '_runs_allowed'))

# get the count of records for each team
runs_scored['games'] = df.groupby('teamName')['teamScore'].count().values

# # calculate the run differential overall, home and away
runs_scored['run_differential_home'] = runs_scored['H_runs_scored'] - runs_scored['H_runs_allowed']
runs_scored['run_differential_away'] = runs_scored['A_runs_scored'] - runs_scored['A_runs_allowed']
runs_scored['run_differential'] = runs_scored['run_differential_home'] + runs_scored['run_differential_away']


# runs_scored.sample(5)
## Calculate the averages
runs_scored['runs_scored_avg'] = (runs_scored['H_runs_scored'] + runs_scored['A_runs_scored']) / runs_scored['games']
runs_scored['runs_allowed_avg'] = (runs_scored['H_runs_allowed'] + runs_scored['A_runs_allowed']) / runs_scored['games']

# # Runs scored home and away
runs_scored['runs_scored_avg_home'] = runs_scored['H_runs_scored'] / runs_scored['games']
runs_scored['runs_scored_avg_away'] = runs_scored['A_runs_scored'] / runs_scored['games']
# # allowed home and away
runs_scored['runs_allowed_avg_home'] = runs_scored['H_runs_allowed'] / runs_scored['games']
runs_scored['runs_allowed_avg_away'] = runs_scored['A_runs_allowed'] / runs_scored['games']


# games_with_score = df.groupby('teamName')['teamScore'].count().values

# # Calculate the averages
runs_scored['runs_scored_avg'] = (runs_scored['H_runs_scored'] + runs_scored['A_runs_scored']) / runs_scored['games']
runs_scored['runs_allowed_avg'] = (runs_scored['H_runs_allowed'] + runs_scored['A_runs_allowed']) / runs_scored['games']
runs_scored['run_differential_avg'] = runs_scored['run_differential'] / runs_scored['games']

runs_scored['home_runs_scored_avg'] = runs_scored['H_runs_scored'] / runs_scored['games']
runs_scored['home_runs_allowed_avg'] = runs_scored['H_runs_allowed'] / runs_scored['games']
runs_scored['home_run_differential_avg'] = runs_scored['run_differential_home'] / runs_scored['games']

runs_scored['away_runs_scored_avg'] = runs_scored['A_runs_scored'] / runs_scored['games']
runs_scored['away_runs_allowed_avg'] = runs_scored['A_runs_allowed'] / runs_scored['games']

print(runs_scored.columns)

# show a sample of the runs scored dataframe
# runs_scored.sample(5)

# runs_scored.shape

# runs_scored.info()





Index(['teamName', 'A_runs_scored', 'H_runs_scored', 'A_runs_allowed',
       'H_runs_allowed', 'games', 'run_differential_home',
       'run_differential_away', 'run_differential', 'runs_scored_avg',
       'runs_allowed_avg', 'runs_scored_avg_home', 'runs_scored_avg_away',
       'runs_allowed_avg_home', 'runs_allowed_avg_away',
       'run_differential_avg', 'home_runs_scored_avg', 'home_runs_allowed_avg',
       'home_run_differential_avg', 'away_runs_scored_avg',
       'away_runs_allowed_avg'],
      dtype='object', name='homeOrAway')


In [43]:
## Merge the runs scored and allowed to the master
yearly_team_stats = pd.merge(yearly_team_stats, runs_scored, on='teamName', how='outer')

# yearly_team_stats.sample(5)

# yearly_team_stats.shape

yearly_team_stats.info()

yearly_team_stats.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 665 entries, 0 to 664
Data columns (total 39 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   teamName                   665 non-null    object 
 1   teamId                     665 non-null    int64  
 2   leagueName                 616 non-null    object 
 3   games_played               665 non-null    int64  
 4   runs_scored_mean           641 non-null    float64
 5   runs_scored_median         641 non-null    float64
 6   runs_scored_total          665 non-null    float64
 7   runs_allowed_count         665 non-null    int64  
 8   runs_allowed_mean          641 non-null    float64
 9   runs_allowed_median        641 non-null    float64
 10  runs_allowed_total         665 non-null    float64
 11  totalHomeGames             665 non-null    int32  
 12  totalAwayGames             665 non-null    int32  
 13  home_wins                  665 non-null    int32  

Index(['teamName', 'teamId', 'leagueName', 'games_played', 'runs_scored_mean',
       'runs_scored_median', 'runs_scored_total', 'runs_allowed_count',
       'runs_allowed_mean', 'runs_allowed_median', 'runs_allowed_total',
       'totalHomeGames', 'totalAwayGames', 'home_wins', 'away_wins',
       'home_losses', 'away_losses', 'home_ties', 'away_ties', 'A_runs_scored',
       'H_runs_scored', 'A_runs_allowed', 'H_runs_allowed', 'games',
       'run_differential_home', 'run_differential_away', 'run_differential',
       'runs_scored_avg', 'runs_allowed_avg', 'runs_scored_avg_home',
       'runs_scored_avg_away', 'runs_allowed_avg_home',
       'runs_allowed_avg_away', 'run_differential_avg', 'home_runs_scored_avg',
       'home_runs_allowed_avg', 'home_run_differential_avg',
       'away_runs_scored_avg', 'away_runs_allowed_avg'],
      dtype='object')

In [44]:

# create filter for just mhsaa games tournamentName is not null
playoff_games = df[df['tournamentName'].notnull()]

# display a sample of the mhsaa_games dataframe
playoff_games.sample(5)

# create the same summary stats for the playoff_games dataframe
playoff_summary_stats = pd.crosstab(playoff_games['teamName'], playoff_games['result'])



## Calculate wins and losses
playoff_summary_stats['win'] = playoff_summary_stats['win']
playoff_summary_stats['losses'] = playoff_summary_stats['loss']
# game count
playoff_summary_stats['games'] = playoff_summary_stats['wins'] + playoff_summary_stats['losses']

## runs scored and allowed
playoff_summary_stats['runs_scored'] = playoff_games.groupby('teamName')['teamScore'].sum()
playoff_summary_stats['runs_allowed'] = playoff_games.groupby('teamName')['opponentScore'].sum()

# run differential
playoff_summary_stats['run_differential'] = playoff_summary_stats['runs_scored'] - playoff_summary_stats['runs_allowed']
# run differential average
playoff_summary_stats['run_differential'] = (playoff_summary_stats['runs_scored'] - playoff_summary_stats['runs_allowed']) / playoff_summary_stats['games']

# append the name of the column to the back of the column name with _playoff
playoff_summary_stats.columns = [col + '_playoff' for col in playoff_summary_stats.columns]

# display a sample of the playoff_summary_stats dataframe
playoff_summary_stats.sample(5)


# MERGE THE playoff_summary_stats RESULTS DATEFRAME BACK TO A MASTER DATAFRAME
yearly_team_stats = pd.merge(yearly_team_stats, playoff_summary_stats, on='teamName', how='left')






KeyError: 'win'

In [None]:
# # MERGE THE runscored RESULTS DATEFRAME BACK TO A MASTER DATAFRAME
# yearly_team_stats = pd.merge(summary_stats, runs_scored, on='teamName')


# yearly_team_stats = pd.merge(summary_stats, results, on='teamName')

# display a sample of the yearly_team_stats dataframe
yearly_team_stats.sample(5)

# shape
# yearly_team_stats.shape

# info
yearly_team_stats.info()

# columns
yearly_team_stats.columns

yearly_team_stats['wins'] = yearly_team_stats['home_wins'] + yearly_team_stats['away_wins']
yearly_team_stats['loss'] = yearly_team_stats['home_losses'] + yearly_team_stats['away_losses']
yearly_team_stats['tie'] = yearly_team_stats['home_ties'] + yearly_team_stats['away_ties']




In [None]:
# DISPLAY A SAMPLE OF THE YEARLY_TEAM_STATS DATAFRAME

# SHAPE

# COLUMNS

# INFO


# yearly_team_stats.columns

In [None]:
## create a list of the columns names
cols = list(yearly_team_stats.columns.values)

# put the list in alphabetical order
cols.sort()

# display the list
cols

In [None]:
## calculate winning percentages for regular season and playoffs, home and away
yearly_team_stats['winning_percentage'] = yearly_team_stats['wins'] / yearly_team_stats['games']
yearly_team_stats['winning_percentage_home'] = yearly_team_stats['wins_home'] / yearly_team_stats['games_home']
yearly_team_stats['winning_percentage_away'] = yearly_team_stats['wins_away'] / yearly_team_stats['games_away']


In [None]:
## Reorder the columns for readability and ease of use
yearly_team_stats = yearly_team_stats[['teamName','games_played','win','lose','tie'

In [None]:
print(cols)

# look at the fdataframe

yearly_team_stats.sample(5) 
yearly_team_stats.shape
## save thedataframe to a csv file
yearly_team_stats.to_csv('TEMP_V1_yearly_team_stats.csv', index=False)


In [None]:
### Create statistics for tournament games vs non tournament games

# create a new column called tournament and set it to True if the game is a tournament game and False if it is not
# df['tournament'] = df['tournament'].fillna(False)



In [None]:
tournament_games = df[df['contestType'].notnull()]

tournament_summary = tournament_games.groupby('teamName').agg({
    'teamScore': ['count', 'mean', 'median', 'sum'],
    'opponentScore': ['count', 'mean', 'median', 'sum']
}).reset_index()


tournament_summary.sample(5)

# tournament_summary.shape

# tournament_summary.columns



In [None]:
import plotly.express as px

## Number of wins by team filtered by season type

fig = px.bar(results, x='teamName', y='win', color='result')
fig.update_layout(title_text='Number of Wins by Team')
fig.show()


fig = px.bar(results, x='teamName', y='win')
fig.update_layout(title_text='Number of Wins by Team')
fig.show()


In [None]:
team = results.iloc[0]

fig = px.pie(values=[team['win'], team['loss'], team['tie']], names=['Wins', 'Losses', 'Ties'])
fig.update_layout(title_text=f'Win/Loss/Tie Proportions for Team: {team["teamName"]}')
fig.show()


In [None]:
fig = px.histogram(df, x='teamScore')
fig.update_layout(title_text='Distribution of Team Scores')
fig.show()


In [None]:
fig = px.scatter(df, x='teamScore', y='opponentScore', color='result')
fig.update_layout(title_text='Team Score vs. Opponent Score (colored by result)', xaxis_title='Team Score', yaxis_title='Opponent Score')
fig.show()


In [None]:
# Split the results based on home or away and calculate the win rate
home_games = df[df['homeOrAway'] == 'H']
away_games = df[df['homeOrAway'] == 'A']

home_results = home_games.groupby('teamName').apply(lambda x: (x['result'] == 'win').mean()).reset_index(name='home_win_rate')
away_results = away_games.groupby('teamName').apply(lambda x: (x['result'] == 'win').mean()).reset_index(name='away_win_rate')

# Merge the home and away results
home_away_results = pd.merge(home_results, away_results, on='teamName')

# Calculate the difference in win rate (home - away)
home_away_results['advantage'] = home_away_results['home_win_rate'] - home_away_results['away_win_rate']

# Display the teams with the biggest home field advantage
top_advantage_teams = home_away_results.sort_values(by='advantage', ascending=False).head(10)

print(top_advantage_teams)


In [None]:
# Save the data as jsons or a csv

# save the data as a json
yearly_team_stats.to_json('yearly_team_stats.json', orient='records')

# save the data as a csv
# using os.path.join to make sure the file is saved in the correct location
yearly_team_stats.to_csv(os.path.join('yearly_team_stats.csv'), index=False)




                         


