In [1]:
## This Notebook creates the yearly_team_stats file.
# It takes the raw scraped game data from the MHSAA website and groups results by team,
# then calculates the yearly stats split into home and awy for each team.

# Dependencies

import pandas as pd
import numpy as np
import os

# File to Load

file_to_load = os.path.join("TEMP", "clean_tables", "game_level", "_2023_games.csv")

In [2]:
# load file as dataframe
df = pd.read_csv(file_to_load)

# ## Quick Check of the file

print(df.columns)

df.info()

# how many unique teams are there in the data set

print(df['teamName'].nunique())

df.sample(5)

df.head(20)



# show value count for all of these columns
keep = ['contestType', 'seasonType', 'postSeasonInfo','tournamentInfo', 'tournamentName', 'tournamentType', 'contestName',    'seasonTypeCode']

for col in df.columns:
    print(col)
    print(df[col].value_counts())
    print('-----------------------')

# create a new dataframe with only the columns we want to keep






Index(['teamName', 'teamId', 'leagueName', 'opponentName', 'opponentId',
       'gameDate', 'gameTime', 'homeOrAway', 'location', 'teamScore',
       'opponentScore', 'notes', 'contestType', 'seasonType', 'postSeasonInfo',
       'tournamentInfo', 'tournamentName', 'tournamentType', 'contestName',
       'seasonTypeCode'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22176 entries, 0 to 22175
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   teamName        22176 non-null  object 
 1   teamId          22176 non-null  int64  
 2   leagueName      20985 non-null  object 
 3   opponentName    21585 non-null  object 
 4   opponentId      21585 non-null  float64
 5   gameDate        22176 non-null  object 
 6   gameTime        22176 non-null  object 
 7   homeOrAway      22176 non-null  object 
 8   location        19581 non-null  object 
 9   teamScore       16343 non-null  float64
 10 

In [3]:
total_games = len(df)
games_with_score = len(df.dropna(subset=['teamScore', 'opponentScore']))
games_without_score = total_games - games_with_score

print(f'Total games: {total_games}\nGames with score: {games_with_score}\nGames without score: {games_without_score}\nProportion of games with score: {games_with_score / total_games}')


Total games: 22176
Games with score: 16343
Games without score: 5833
Proportion of games with score: 0.7369678932178932


In [4]:
df.head()

df.columns

Index(['teamName', 'teamId', 'leagueName', 'opponentName', 'opponentId',
       'gameDate', 'gameTime', 'homeOrAway', 'location', 'teamScore',
       'opponentScore', 'notes', 'contestType', 'seasonType', 'postSeasonInfo',
       'tournamentInfo', 'tournamentName', 'tournamentType', 'contestName',
       'seasonTypeCode'],
      dtype='object')

In [5]:
## Create a column WIth the result of each game win, loss or tie

def calc_result(row):
    if row['teamScore'] > row['opponentScore']:
        return 'W'
    elif row['teamScore'] < row['opponentScore']:
        return 'L'
    else:
        return 'T'

df['result'] = df.apply(calc_result, axis=1)

df.head()




Unnamed: 0,teamName,teamId,leagueName,opponentName,opponentId,gameDate,gameTime,homeOrAway,location,teamScore,...,notes,contestType,seasonType,postSeasonInfo,tournamentInfo,tournamentName,tournamentType,contestName,seasonTypeCode,result
0,Brighton,2713163,Kensington Lakes Activities Association,Northville,2713449.0,2023-04-06T15:30:00,3:30 PM,H,http://maps.google.com/maps?q=Brighton+High+Sc...,1.0,...,,1,1,,,,0.0,,S,L
1,Brighton,2713163,Kensington Lakes Activities Association,Northville,2713449.0,2023-04-06T17:30:00,5:30 PM,H,http://maps.google.com/maps?q=Brighton+High+Sc...,5.0,...,,1,1,,,,0.0,,S,L
2,Brighton,2713163,Kensington Lakes Activities Association,Livonia Stevenson,2713479.0,2023-04-07T16:30:00,4:30 PM,H,,12.0,...,,1,1,,,,0.0,,S,W
3,Brighton,2713163,Kensington Lakes Activities Association,Ann Arbor Pioneer,2713329.0,2023-04-08T12:00:00,12:00 PM,H,http://maps.google.com/maps?q=BHS+Baseball+787...,16.0,...,,1,1,,,,0.0,,S,W
4,Brighton,2713163,Kensington Lakes Activities Association,Ann Arbor Pioneer,2713329.0,2023-04-08T14:00:00,2:00 PM,H,http://maps.google.com/maps?q=BHS+Baseball+787...,12.0,...,,1,1,,,,0.0,,S,W


In [6]:
## New Stragety - split the dataframe into playoff games, regular season games

## playoff dataframe
playoff_df = df[df['postSeasonInfo'].notnull()]


playoff_df['homeOrAway'] = 'N'


## regular season dataframe
regular_season_df = df[df['postSeasonInfo'].isnull()]

# Create home and away dataframes for the regular season dataframes
home_df = regular_season_df[regular_season_df['homeOrAway'] == 'H']   # home games
away_df = regular_season_df[regular_season_df['homeOrAway'] == 'A']   # away games


# # $ Look at the playoff dataframe
# playoff_df.info()

# playoff_df.head()

# # $ Look at the regular season dataframe
# regular_season_df.info()

# regular_season_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playoff_df['homeOrAway'] = 'N'


In [7]:
df['result'].value_counts()

L    8140
W    8062
T    5974
Name: result, dtype: int64

In [8]:

# Create a new dataframe with the summary stats for each team and each dataframe
grouped = playoff_df.groupby('teamName')

In [9]:
grouped.head()

grouped['result'].value_counts()

## Save value counts to a new dataframe with the team name as the index and columns for each result type
results_frame = pd.DataFrame(grouped['result'].value_counts())

# transpose results into seperate columns for each result type
results_frame = results_frame.unstack(level=-1)








results_frame.head()


# grouped.head()


Unnamed: 0_level_0,result,result
result,L,W
teamName,Unnamed: 1_level_2,Unnamed: 2_level_2
Ada Forest Hills Eastern,1.0,5.0
Addison,1.0,1.0
Adrian,1.0,4.0
Adrian Lenawee Christian,1.0,
Adrian Madison,1.0,


In [10]:

playoff_stat = grouped.agg({
    'teamScore': ['count', 'mean', 'median', 'sum'],
    'opponentScore': ['count', 'mean', 'median', 'sum']
}).reset_index()




# playoff_stat = df[['teamName', 'teamId', 'leagueName']].drop_duplicates()

# simplify the column names
playoff_stat.columns = ['teamName', 
                        'playoff_games_played',
                        'playoff_runs_scored_mean',
                        'playoff_runs_scored_median',
                        'playoff_runs_scored_total',
                        'playoff_runs_allowed_count',
                        'playoff_runs_allowed_mean',
                        'playoff_runs_allowed_median',
                        'playoff_runs_allowed_total']


## Add the result count columns to the playoff_stat dataframe
playoff_stat = playoff_stat.merge(results_frame, how='left', on='teamName')

## Rename the columns
playoff_stat.columns = ['teamName',
                        'playoff_games_played',
                        'playoff_runs_scored_mean',
                        'playoff_runs_scored_median',
                        'playoff_runs_scored_total',
                        'playoff_runs_allowed_count',
                        'playoff_runs_allowed_mean',
                        'playoff_runs_allowed_median',
                        'playoff_runs_allowed_total',
                        'playoff_losses',
                        'playoff_wins'
                        ]

# Calculate winning percentage and run differential (total and mean)
playoff_stat['playoff_win_pct'] = playoff_stat['playoff_wins'] / playoff_stat['playoff_games_played']
playoff_stat['playoff_run_diff_total'] = playoff_stat['playoff_runs_scored_total'] - playoff_stat['playoff_runs_allowed_total']
playoff_stat['playoff_run_diff_mean'] = playoff_stat['playoff_runs_scored_mean'] - playoff_stat['playoff_runs_allowed_mean']

## Reorder the columns so the result columns right after the games played column
playoff_stat = playoff_stat[['teamName',
                        'playoff_games_played',
                        'playoff_wins',
                        'playoff_losses',
                        'playoff_win_pct',
                        'playoff_runs_scored_mean', 
                        'playoff_runs_scored_median',
                        'playoff_runs_scored_total',
                        'playoff_runs_allowed_mean',
                        'playoff_runs_allowed_median',
                        'playoff_runs_allowed_total',
                        'playoff_run_diff_total',
                        'playoff_run_diff_mean'
                        ]]



## Show the summary stats dataframe
playoff_stat.head()



  playoff_stat = playoff_stat.merge(results_frame, how='left', on='teamName')


Unnamed: 0,teamName,playoff_games_played,playoff_wins,playoff_losses,playoff_win_pct,playoff_runs_scored_mean,playoff_runs_scored_median,playoff_runs_scored_total,playoff_runs_allowed_mean,playoff_runs_allowed_median,playoff_runs_allowed_total,playoff_run_diff_total,playoff_run_diff_mean
0,Ada Forest Hills Eastern,6,5.0,1.0,0.833333,6.333333,4.5,38.0,1.0,1.0,6.0,32.0,5.333333
1,Addison,2,1.0,1.0,0.5,2.5,2.5,5.0,4.0,4.0,8.0,-3.0,-1.5
2,Adrian,5,4.0,1.0,0.8,5.8,4.0,29.0,4.0,2.0,20.0,9.0,1.8
3,Adrian Lenawee Christian,1,,1.0,,7.0,7.0,7.0,17.0,17.0,17.0,-10.0,-10.0
4,Adrian Madison,1,,1.0,,4.0,4.0,4.0,9.0,9.0,9.0,-5.0,-5.0


## Playoff stats are transformed and stored

## use that framework to build the rest of the situational stats

In [11]:
### Regular Season Stats

## Create a new dataframe with the summary stats for each team and each dataframe
grouped = regular_season_df.groupby('teamName')

# Save the total games played for each team
regular_season_games_played = grouped['teamName'].count()

## Save value counts to a new dataframe with the team name as the index and columns for each result type
results_frame = pd.DataFrame(grouped['result'].value_counts())

# # transpose results into seperate columns for each result type W L T
results_frame = results_frame.unstack(level=-1)

# add the regular_season_games_played
results_frame['regular_season_games_played'] = regular_season_games_played

# Rename reults columns
results_frame.columns = ['regular_season_losses',
                        'regular_season_ties',
                        'regular_season_wins',
                        'regular_season_games_played']

                        

# # look at the results frame
results_frame.head(20)



Unnamed: 0_level_0,regular_season_losses,regular_season_ties,regular_season_wins,regular_season_games_played
teamName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ada Forest Hills Eastern,7.0,7.0,27.0,41
Addison,14.0,,15.0,29
Adrian,8.0,10.0,27.0,45
Adrian Lenawee Christian,10.0,1.0,1.0,12
Adrian Madison,12.0,3.0,21.0,36
Akron-Fairgrove,12.0,3.0,8.0,23
Alanson,15.0,9.0,2.0,26
Alba,,2.0,,2
Alcona,10.0,8.0,9.0,27
Algonac,3.0,9.0,31.0,43


In [12]:


## Save the summary stats to a new dataframe
regular_season_stat = grouped.agg({
    'teamScore': ['mean', 'median', 'sum'],
    'opponentScore': ['mean', 'median', 'sum']
}).reset_index()



# simplify the column names
regular_season_stat.columns = ['teamName',
                        
                        'regular_season_runs_scored_mean',
                        'regular_season_runs_scored_median',
                        'regular_season_runs_scored_total',
                        
                        'regular_season_runs_allowed_mean',
                        'regular_season_runs_allowed_median',
                        'regular_season_runs_allowed_total']

## Add the result count columns to the regular_season_stat dataframe
regular_season_stat = regular_season_stat.merge(results_frame, how='left', on='teamName')


# Calculate winning percentage and run differential (total and mean)
regular_season_stat['regular_season_win_pct'] = regular_season_stat['regular_season_wins'] / regular_season_stat['regular_season_games_played']
regular_season_stat['regular_season_run_diff_total'] = regular_season_stat['regular_season_runs_scored_total'] - regular_season_stat['regular_season_runs_allowed_total']
regular_season_stat['regular_season_run_diff_mean'] = regular_season_stat['regular_season_runs_scored_mean'] - regular_season_stat['regular_season_runs_allowed_mean']

## Reorder the columns so the result columns right after the games played column
regular_season_stat = regular_season_stat[['teamName',
                        'regular_season_games_played',
                        'regular_season_wins',
                        'regular_season_losses',
                        'regular_season_ties',
                        'regular_season_win_pct',
                        'regular_season_runs_scored_mean',
                        'regular_season_runs_scored_median',
                        'regular_season_runs_scored_total',
                        'regular_season_runs_allowed_mean',
                        'regular_season_runs_allowed_median',
                        'regular_season_runs_allowed_total',
                        'regular_season_run_diff_total',
                        'regular_season_run_diff_mean'
                        ]]

## Show the summary stats dataframe
regular_season_stat.head()


Unnamed: 0,teamName,regular_season_games_played,regular_season_wins,regular_season_losses,regular_season_ties,regular_season_win_pct,regular_season_runs_scored_mean,regular_season_runs_scored_median,regular_season_runs_scored_total,regular_season_runs_allowed_mean,regular_season_runs_allowed_median,regular_season_runs_allowed_total,regular_season_run_diff_total,regular_season_run_diff_mean
0,Ada Forest Hills Eastern,41,27.0,7.0,7.0,0.658537,7.117647,7.0,242.0,3.264706,2.0,111.0,131.0,3.852941
1,Addison,29,15.0,14.0,,0.517241,6.068966,4.0,176.0,5.517241,5.0,160.0,16.0,0.551724
2,Adrian,45,27.0,8.0,10.0,0.6,8.6,7.0,301.0,2.942857,2.0,103.0,198.0,5.657143
3,Adrian Lenawee Christian,12,1.0,10.0,1.0,0.083333,4.818182,4.0,53.0,13.0,13.0,143.0,-90.0,-8.181818
4,Adrian Madison,36,21.0,12.0,3.0,0.583333,8.969697,9.0,296.0,4.69697,4.0,155.0,141.0,4.272727


In [13]:
### Combine the playoff and regular season stats into one dataframe

## Merge the playoff and regular season stats into one dataframe
combined_stat = playoff_stat.merge(regular_season_stat, how='left', on='teamName')

combined_stat.head()



Unnamed: 0,teamName,playoff_games_played,playoff_wins,playoff_losses,playoff_win_pct,playoff_runs_scored_mean,playoff_runs_scored_median,playoff_runs_scored_total,playoff_runs_allowed_mean,playoff_runs_allowed_median,...,regular_season_ties,regular_season_win_pct,regular_season_runs_scored_mean,regular_season_runs_scored_median,regular_season_runs_scored_total,regular_season_runs_allowed_mean,regular_season_runs_allowed_median,regular_season_runs_allowed_total,regular_season_run_diff_total,regular_season_run_diff_mean
0,Ada Forest Hills Eastern,6,5.0,1.0,0.833333,6.333333,4.5,38.0,1.0,1.0,...,7.0,0.658537,7.117647,7.0,242.0,3.264706,2.0,111.0,131.0,3.852941
1,Addison,2,1.0,1.0,0.5,2.5,2.5,5.0,4.0,4.0,...,,0.517241,6.068966,4.0,176.0,5.517241,5.0,160.0,16.0,0.551724
2,Adrian,5,4.0,1.0,0.8,5.8,4.0,29.0,4.0,2.0,...,10.0,0.6,8.6,7.0,301.0,2.942857,2.0,103.0,198.0,5.657143
3,Adrian Lenawee Christian,1,,1.0,,7.0,7.0,7.0,17.0,17.0,...,1.0,0.083333,4.818182,4.0,53.0,13.0,13.0,143.0,-90.0,-8.181818
4,Adrian Madison,1,,1.0,,4.0,4.0,4.0,9.0,9.0,...,3.0,0.583333,8.969697,9.0,296.0,4.69697,4.0,155.0,141.0,4.272727


In [14]:
## Create a new dataframe with the summary stats for home and away games
# home dataframe

grouped = home_df.groupby('teamName')

# head
grouped.head()

## Save the summary stats to a new dataframe
home_stat = grouped.agg({
    'teamScore': ['mean', 'median', 'sum'],
    'opponentScore': ['mean', 'median', 'sum']
}).reset_index()

# simplify the column names
home_stat.columns = ['teamName',
                    'home_runs_scored_mean',
                    'home_runs_scored_median',
                    'home_runs_scored_total',

                    'home_runs_allowed_mean',
                    'home_runs_allowed_median',
                    'home_runs_allowed_total']

## Add the result count columns to the home_stat dataframe
home_stat = home_stat.merge(results_frame, how='left', on='teamName')


# Save the total games played for each team
home_games_played = grouped['teamName'].count()

## Save value counts to a new dataframe with the team name as the index and columns for each result type
results_frame = pd.DataFrame(grouped['result'].value_counts())

# # # transpose results into seperate columns for each result type W L T
results_frame = results_frame.unstack(level=-1)

# simplify the column names
results_frame.columns = ['home_losses',
                        'home_ties',
                        'home_wins']

# # add the home_games_played
results_frame['home_games_played'] = home_games_played

# # Reorder reults columns
results_frame = results_frame[['home_games_played',
                            'home_wins',
                            'home_losses',
                            'home_ties']]





# Calculate winning percentage
results_frame['home_win_pct'] = results_frame['home_wins'] / results_frame['home_games_played']

## add the results columns to the home_stat dataframe
home_stat = home_stat.merge(results_frame, how='left', on='teamName')

# Calculate run differential (total and mean)
home_stat['home_run_diff_total'] = home_stat['home_runs_scored_total'] - home_stat['home_runs_allowed_total']
home_stat['home_run_diff_mean'] = home_stat['home_runs_scored_mean'] - home_stat['home_runs_allowed_mean']



# drop the rehular season games played column
# home_stat.drop(columns=['regular_season_losses','regular_season_ties','regular_season_wins','regular_season_games_played'], inplace=True)


# reorder the columns
home_stat = home_stat[['teamName',
                        'home_games_played',
                        'home_wins',
                        'home_losses',
                        'home_ties',
                        'home_win_pct',

                        'home_runs_scored_mean',
                        'home_runs_scored_median',
                        'home_runs_scored_total',

                        'home_runs_allowed_mean',
                        'home_runs_allowed_median',
                        'home_runs_allowed_total',

                        'home_run_diff_total',
                        'home_run_diff_mean'
                        ]]


# display the home_stat dataframe
home_stat.head()

# # look at the results frame
# results_frame.head(20)



Unnamed: 0,teamName,home_games_played,home_wins,home_losses,home_ties,home_win_pct,home_runs_scored_mean,home_runs_scored_median,home_runs_scored_total,home_runs_allowed_mean,home_runs_allowed_median,home_runs_allowed_total,home_run_diff_total,home_run_diff_mean
0,Ada Forest Hills Eastern,23,13.0,5.0,5.0,0.565217,6.666667,7.0,120.0,3.166667,2.0,57.0,63.0,3.5
1,Addison,14,7.0,7.0,,0.5,6.642857,3.5,93.0,4.285714,5.0,60.0,33.0,2.357143
2,Adrian,25,12.0,6.0,7.0,0.48,6.666667,6.0,120.0,2.944444,2.0,53.0,67.0,3.722222
3,Adrian Lenawee Christian,4,,3.0,1.0,,6.666667,8.0,20.0,19.333333,17.0,58.0,-38.0,-12.666667
4,Adrian Madison,15,8.0,7.0,,0.533333,7.133333,7.0,107.0,5.866667,4.0,88.0,19.0,1.266667


In [15]:
### Merge the home stats into the combined dataframe
combined_stat = combined_stat.merge(home_stat, how='left', on='teamName')

# combined_stat.head()

# combined_stat.columns

In [16]:
### Calculate the away stats

## Create a new dataframe with the summary stats for away games
# away dataframe

grouped = away_df.groupby('teamName')

# head
grouped.head()


## Save the summary stats to a new dataframe
away_stat = grouped.agg({
    'teamScore': ['mean', 'median', 'sum'],
    'opponentScore': ['mean', 'median', 'sum']
}).reset_index()

# simplify the column names
away_stat.columns = ['teamName',
                    'away_runs_scored_mean',
                    'away_runs_scored_median',
                    'away_runs_scored_total',

                    'away_runs_allowed_mean',
                    'away_runs_allowed_median',
                    'away_runs_allowed_total']

## Add the result count columns to the home_stat dataframe
away_stat = away_stat.merge(results_frame, how='left', on='teamName')


# Save the total games played for each team
away_games_played = grouped['teamName'].count()

## Save value counts to a new dataframe with the team name as the index and columns for each result type
results_frame = pd.DataFrame(grouped['result'].value_counts())

# # # transpose results into seperate columns for each result type W L T
results_frame = results_frame.unstack(level=-1)

# simplify the column names
results_frame.columns = ['away_losses',
                        'away_ties',
                        'away_wins']

# # add the home_games_played
results_frame['away_games_played'] = away_games_played

# # Reorder reults columns
results_frame = results_frame[['away_games_played',
                            'away_wins',
                            'away_losses',
                            'away_ties']]





# Calculate winning percentage
results_frame['away_win_pct'] = results_frame['away_wins'] / results_frame['away_games_played']

## add the results columns to the home_stat dataframe
away_stat = away_stat.merge(results_frame, how='left', on='teamName')

# Calculate run differential (total and mean)
away_stat['away_run_diff_total'] = away_stat['away_runs_scored_total'] - away_stat['away_runs_allowed_total']
away_stat['away_run_diff_mean'] = away_stat['away_runs_scored_mean'] - away_stat['away_runs_allowed_mean']



# drop the rehular season games played column
# away_stat.drop(columns=['regular_season_losses','regular_season_ties','regular_season_wins','regular_season_games_played'], inplace=True)


# reorder the columns
away_stat = away_stat[['teamName',
                        'away_games_played',
                        'away_wins',
                        'away_losses',
                        'away_ties',
                        'away_win_pct',

                        'away_runs_scored_mean',
                        'away_runs_scored_median',
                        'away_runs_scored_total',

                        'away_runs_allowed_mean',
                        'away_runs_allowed_median',
                        'away_runs_allowed_total',

                        'away_run_diff_total',
                        'away_run_diff_mean'
                        ]]


# display the away_stat dataframe
away_stat.head()

Unnamed: 0,teamName,away_games_played,away_wins,away_losses,away_ties,away_win_pct,away_runs_scored_mean,away_runs_scored_median,away_runs_scored_total,away_runs_allowed_mean,away_runs_allowed_median,away_runs_allowed_total,away_run_diff_total,away_run_diff_mean
0,Ada Forest Hills Eastern,18,14.0,2.0,2.0,0.777778,7.625,6.5,122.0,3.375,2.5,54.0,68.0,4.25
1,Addison,15,8.0,7.0,,0.533333,5.533333,5.0,83.0,6.666667,6.0,100.0,-17.0,-1.133333
2,Adrian,20,15.0,2.0,3.0,0.75,10.647059,10.0,181.0,2.941176,2.0,50.0,131.0,7.705882
3,Adrian Lenawee Christian,8,1.0,7.0,,0.125,4.125,3.0,33.0,10.625,11.5,85.0,-52.0,-6.5
4,Adrian Madison,21,13.0,5.0,3.0,0.619048,10.5,10.0,189.0,3.722222,4.0,67.0,122.0,6.777778


In [17]:
## Merge the away stats into the combined dataframe
combined_stat = combined_stat.merge(away_stat, how='left', on='teamName')

# combined_stat.head()

combined_stat.columns

Index(['teamName', 'playoff_games_played', 'playoff_wins', 'playoff_losses',
       'playoff_win_pct', 'playoff_runs_scored_mean',
       'playoff_runs_scored_median', 'playoff_runs_scored_total',
       'playoff_runs_allowed_mean', 'playoff_runs_allowed_median',
       'playoff_runs_allowed_total', 'playoff_run_diff_total',
       'playoff_run_diff_mean', 'regular_season_games_played',
       'regular_season_wins', 'regular_season_losses', 'regular_season_ties',
       'regular_season_win_pct', 'regular_season_runs_scored_mean',
       'regular_season_runs_scored_median', 'regular_season_runs_scored_total',
       'regular_season_runs_allowed_mean',
       'regular_season_runs_allowed_median',
       'regular_season_runs_allowed_total', 'regular_season_run_diff_total',
       'regular_season_run_diff_mean', 'home_games_played', 'home_wins',
       'home_losses', 'home_ties', 'home_win_pct', 'home_runs_scored_mean',
       'home_runs_scored_median', 'home_runs_scored_total',
       'ho

In [18]:
## SAVE THE COMBINED STAT DATAFRAME TO A CSV FILE
combined_stat.to_csv('TEMP/yearly_stats_TEST.csv', index=False)

In [None]:
grouped = df.groupby('teamName')

summary_stats = grouped.agg({
    'teamScore': ['count', 'mean', 'median', 'sum'],
    'opponentScore': ['count', 'mean', 'median', 'sum']
}).reset_index()

## Show a sample of the summary stats
summary_stats.sample(5)

## Show the shape of the summary stats
summary_stats.shape

summary_stats.columns

#




In [None]:
## look at the columns in the summary stats dataframe
summary_stats.columns

# simplify the column names
summary_stats.columns = ['teamName', 
                        'games_played',
                        'runs_scored_mean',
                        'runs_scored_median',
                        'runs_scored_total',
                        'runs_allowed_count',
                        'runs_allowed_mean',
                        'runs_allowed_median',
                        'runs_allowed_total']


summary_stats.sample(5)

summary_stats.shape

summary_stats.info()

In [None]:
## Start to build the year stats dataframe
# Merge the team_df and summary_stats dataframes into the beginning of the yearly_team_stats dataframe

yearly_team_stats = pd.merge(team_df, summary_stats, on='teamName', how='left')



In [None]:

# # lookat the shape of the yearly_team_stats dataframe
yearly_team_stats.shape

# # look at the columns in the yearly_team_stats dataframe
yearly_team_stats.columns

# reorder the columns
# yearly_team_stats = yearly_team_stats[['teamName', 'leagueName', 'games_played', 'runs_scored_mean', 'runs_scored_median', 'runs_scored_total', 'runs_allowed_mean', 'runs_allowed_median', 'runs_allowed_total']]

# yearly_team_stats.sample(5)
# 

In [None]:



# calculate total home and away games for each row
df['totalHomeGames'] = np.where(df['homeOrAway'] == 'H', 1, 0)
df['totalAwayGames'] = np.where(df['homeOrAway'] == 'A', 1, 0)

# calculate the total home and away wins for each row
df['home_wins'] = np.where(df['result'] == 'home_win', 1, 0)
df['away_wins'] = np.where(df['result'] == 'away_win', 1, 0)

# calculate the total home and away losses for each row
df['home_losses'] = np.where(df['result'] == 'home_loss', 1, 0)
df['away_losses'] = np.where(df['result'] == 'away_loss', 1, 0)

# calculate the total home and away ties for each row
df['home_ties'] = np.where(df['result'] == 'home_tie', 1, 0)
df['away_ties'] = np.where(df['result'] == 'away_tie', 1, 0)



## aggregate the results by team
grouped = df.groupby('teamName')

summary_stats = grouped.agg({
    'totalHomeGames': 'sum',
    'totalAwayGames': 'sum',
    'home_wins': 'sum',
    'away_wins': 'sum',
    'home_losses': 'sum',
    'away_losses': 'sum',
    'home_ties': 'sum',
    'away_ties': 'sum'
}).reset_index()


# df.sample(5)

# merge the summary stats into the yearly_team_stats dataframe
yearly_team_stats = pd.merge(yearly_team_stats, summary_stats, on='teamName', how='left')




In [None]:
# merge the results dataframe with the summary stats dataframe
# yearly_team_stats = pd.merge(yearly_team_stats, results, on='teamName', how='left')

yearly_team_stats.sample(5)


In [None]:
######### PROBABLY UNNECESSARY #########

# Calculate the number of games played at home / away and the number of wins / losses / ties at home / away

# def calculate_home_away_results(row):
#     if row['homeOrAway'] == 'Home':
#         if row['result'] == 'win':
#             return 'win_home'
#         elif row['result'] == 'loss':
#             return 'loss_home'
#         else:
#             return 'tie_home'
#     else:
#         if row['result'] == 'win':
#             return 'win_away'
#         elif row['result'] == 'loss':
#             return 'loss_away'
#         else:
#             return 'tie_away'

            
# ## Show a sample of the summary stats
# # df.sample(5)
        

# df['home_away_result'] = df.apply(calculate_home_away_results, axis=1)


# home_away_results = df.groupby(['teamName', 'home_away_result']).size().unstack(fill_value=0).reset_index()

# home_away_results.sample(5)
# # df.sample(5)

# # location_summary = pd.crosstab(df['teamName'], df['homeOrAway'])

# # location_summary.sample(5)

# # # location_summary.shape




In [None]:
## LOOK AT THE SHAPE OF THE MASTER
yearly_team_stats.shape

## LOOK AT THE COLUMNS OF THE MASTER
yearly_team_stats.columns

## LOOK AT A SAMPLE OF THE MASTER
# yearly_team_stats.sample(5)

## check df
# df.sample(5)
# runs_sc


In [None]:
### Calculate the runs socred and allowed home and away

# runs scored
runs_scored = df.groupby(['teamName', 'homeOrAway'])['teamScore'].sum().unstack(fill_value=0).reset_index()

# runs allowed
runs_allowed = df.groupby(['teamName', 'homeOrAway'])['opponentScore'].sum().unstack(fill_value=0).reset_index()





# merge the runs scored and runs allowed dataframes named runs_scored_home and runs_allowed_home, ect
runs_scored = pd.merge(runs_scored, runs_allowed, on='teamName', suffixes=('_runs_scored', '_runs_allowed'))

# get the count of records for each team
runs_scored['games'] = df.groupby('teamName')['teamScore'].count().values

# # calculate the run differential overall, home and away
runs_scored['run_differential_home'] = runs_scored['H_runs_scored'] - runs_scored['H_runs_allowed']
runs_scored['run_differential_away'] = runs_scored['A_runs_scored'] - runs_scored['A_runs_allowed']
runs_scored['run_differential'] = runs_scored['run_differential_home'] + runs_scored['run_differential_away']


# runs_scored.sample(5)
## Calculate the averages
runs_scored['runs_scored_avg'] = (runs_scored['H_runs_scored'] + runs_scored['A_runs_scored']) / runs_scored['games']
runs_scored['runs_allowed_avg'] = (runs_scored['H_runs_allowed'] + runs_scored['A_runs_allowed']) / runs_scored['games']

# # Runs scored home and away
runs_scored['runs_scored_avg_home'] = runs_scored['H_runs_scored'] / runs_scored['games']
runs_scored['runs_scored_avg_away'] = runs_scored['A_runs_scored'] / runs_scored['games']
# # allowed home and away
runs_scored['runs_allowed_avg_home'] = runs_scored['H_runs_allowed'] / runs_scored['games']
runs_scored['runs_allowed_avg_away'] = runs_scored['A_runs_allowed'] / runs_scored['games']


# games_with_score = df.groupby('teamName')['teamScore'].count().values

# # Calculate the averages
runs_scored['runs_scored_avg'] = (runs_scored['H_runs_scored'] + runs_scored['A_runs_scored']) / runs_scored['games']
runs_scored['runs_allowed_avg'] = (runs_scored['H_runs_allowed'] + runs_scored['A_runs_allowed']) / runs_scored['games']
runs_scored['run_differential_avg'] = runs_scored['run_differential'] / runs_scored['games']

runs_scored['home_runs_scored_avg'] = runs_scored['H_runs_scored'] / runs_scored['games']
runs_scored['home_runs_allowed_avg'] = runs_scored['H_runs_allowed'] / runs_scored['games']
runs_scored['home_run_differential_avg'] = runs_scored['run_differential_home'] / runs_scored['games']

runs_scored['away_runs_scored_avg'] = runs_scored['A_runs_scored'] / runs_scored['games']
runs_scored['away_runs_allowed_avg'] = runs_scored['A_runs_allowed'] / runs_scored['games']

print(runs_scored.columns)

# show a sample of the runs scored dataframe
# runs_scored.sample(5)

# runs_scored.shape

# runs_scored.info()





In [None]:
## Merge the runs scored and allowed to the master
yearly_team_stats = pd.merge(yearly_team_stats, runs_scored, on='teamName', how='outer')

# yearly_team_stats.sample(5)

# yearly_team_stats.shape

yearly_team_stats.info()

yearly_team_stats.columns

In [None]:

# create filter for just mhsaa games tournamentName is not null
playoff_games = df[df['tournamentName'].notnull()]

# display a sample of the mhsaa_games dataframe
playoff_games.sample(5)

# create the same summary stats for the playoff_games dataframe
playoff_summary_stats = pd.crosstab(playoff_games['teamName'], playoff_games['result'])



## Calculate wins and losses
playoff_summary_stats['win'] = playoff_summary_stats['win']
playoff_summary_stats['losses'] = playoff_summary_stats['loss']
# game count
playoff_summary_stats['games'] = playoff_summary_stats['wins'] + playoff_summary_stats['losses']

## runs scored and allowed
playoff_summary_stats['runs_scored'] = playoff_games.groupby('teamName')['teamScore'].sum()
playoff_summary_stats['runs_allowed'] = playoff_games.groupby('teamName')['opponentScore'].sum()

# run differential
playoff_summary_stats['run_differential'] = playoff_summary_stats['runs_scored'] - playoff_summary_stats['runs_allowed']
# run differential average
playoff_summary_stats['run_differential'] = (playoff_summary_stats['runs_scored'] - playoff_summary_stats['runs_allowed']) / playoff_summary_stats['games']

# append the name of the column to the back of the column name with _playoff
playoff_summary_stats.columns = [col + '_playoff' for col in playoff_summary_stats.columns]

# display a sample of the playoff_summary_stats dataframe
playoff_summary_stats.sample(5)


# MERGE THE playoff_summary_stats RESULTS DATEFRAME BACK TO A MASTER DATAFRAME
yearly_team_stats = pd.merge(yearly_team_stats, playoff_summary_stats, on='teamName', how='left')






In [None]:
# # MERGE THE runscored RESULTS DATEFRAME BACK TO A MASTER DATAFRAME
# yearly_team_stats = pd.merge(summary_stats, runs_scored, on='teamName')


# yearly_team_stats = pd.merge(summary_stats, results, on='teamName')

# display a sample of the yearly_team_stats dataframe
yearly_team_stats.sample(5)

# shape
# yearly_team_stats.shape

# info
yearly_team_stats.info()

# columns
yearly_team_stats.columns

yearly_team_stats['wins'] = yearly_team_stats['home_wins'] + yearly_team_stats['away_wins']
yearly_team_stats['loss'] = yearly_team_stats['home_losses'] + yearly_team_stats['away_losses']
yearly_team_stats['tie'] = yearly_team_stats['home_ties'] + yearly_team_stats['away_ties']




In [None]:
# DISPLAY A SAMPLE OF THE YEARLY_TEAM_STATS DATAFRAME

# SHAPE

# COLUMNS

# INFO


# yearly_team_stats.columns

In [None]:
## create a list of the columns names
cols = list(yearly_team_stats.columns.values)

# put the list in alphabetical order
cols.sort()

# display the list
cols

In [None]:
## calculate winning percentages for regular season and playoffs, home and away
yearly_team_stats['winning_percentage'] = yearly_team_stats['wins'] / yearly_team_stats['games']
yearly_team_stats['winning_percentage_home'] = yearly_team_stats['wins_home'] / yearly_team_stats['games_home']
yearly_team_stats['winning_percentage_away'] = yearly_team_stats['wins_away'] / yearly_team_stats['games_away']


In [None]:
## Reorder the columns for readability and ease of use
yearly_team_stats = yearly_team_stats[['teamName','games_played','win','lose','tie'

In [None]:
print(cols)

# look at the fdataframe

yearly_team_stats.sample(5) 
yearly_team_stats.shape
## save thedataframe to a csv file
yearly_team_stats.to_csv('TEMP_V1_yearly_team_stats.csv', index=False)


In [None]:
### Create statistics for tournament games vs non tournament games

# create a new column called tournament and set it to True if the game is a tournament game and False if it is not
# df['tournament'] = df['tournament'].fillna(False)



In [None]:
tournament_games = df[df['contestType'].notnull()]

tournament_summary = tournament_games.groupby('teamName').agg({
    'teamScore': ['count', 'mean', 'median', 'sum'],
    'opponentScore': ['count', 'mean', 'median', 'sum']
}).reset_index()


tournament_summary.sample(5)

# tournament_summary.shape

# tournament_summary.columns



In [None]:
import plotly.express as px

## Number of wins by team filtered by season type

fig = px.bar(results, x='teamName', y='win', color='result')
fig.update_layout(title_text='Number of Wins by Team')
fig.show()


fig = px.bar(results, x='teamName', y='win')
fig.update_layout(title_text='Number of Wins by Team')
fig.show()


In [None]:
team = results.iloc[0]

fig = px.pie(values=[team['win'], team['loss'], team['tie']], names=['Wins', 'Losses', 'Ties'])
fig.update_layout(title_text=f'Win/Loss/Tie Proportions for Team: {team["teamName"]}')
fig.show()


In [None]:
fig = px.histogram(df, x='teamScore')
fig.update_layout(title_text='Distribution of Team Scores')
fig.show()


In [None]:
fig = px.scatter(df, x='teamScore', y='opponentScore', color='result')
fig.update_layout(title_text='Team Score vs. Opponent Score (colored by result)', xaxis_title='Team Score', yaxis_title='Opponent Score')
fig.show()


In [None]:
# Split the results based on home or away and calculate the win rate
home_games = df[df['homeOrAway'] == 'H']
away_games = df[df['homeOrAway'] == 'A']

home_results = home_games.groupby('teamName').apply(lambda x: (x['result'] == 'win').mean()).reset_index(name='home_win_rate')
away_results = away_games.groupby('teamName').apply(lambda x: (x['result'] == 'win').mean()).reset_index(name='away_win_rate')

# Merge the home and away results
home_away_results = pd.merge(home_results, away_results, on='teamName')

# Calculate the difference in win rate (home - away)
home_away_results['advantage'] = home_away_results['home_win_rate'] - home_away_results['away_win_rate']

# Display the teams with the biggest home field advantage
top_advantage_teams = home_away_results.sort_values(by='advantage', ascending=False).head(10)

print(top_advantage_teams)


In [None]:
# Save the data as jsons or a csv

# save the data as a json
yearly_team_stats.to_json('yearly_team_stats.json', orient='records')

# save the data as a csv
# using os.path.join to make sure the file is saved in the correct location
yearly_team_stats.to_csv(os.path.join('yearly_team_stats.csv'), index=False)




                         


