In [3]:
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go
import numpy as np



# Assuming the CSV file is named 'baseball_data.csv'
df = pd.read_csv('data/MHSAA/game_data/2023_team_stats_thurs_6_16.csv')

# Show the first few rows of the DataFrame to confirm it's loaded correctly
# print(df.head())


In [4]:
## Simple Line Chart


# Create a line chart
line_chart = go.Figure(data=go.Scatter(x=df['home_games'], y=df['home_avg_runs_scored'], mode='lines'))

# Set the title and labels
line_chart.update_layout(title='Average Runs Scored at Home vs. Number of Home Games',
                         xaxis_title='Number of Home Games',
                         yaxis_title='Average Runs Scored at Home')

# Display the chart
line_chart.show()

In [5]:
# BARCHART OF JUST 10 TEAMS WITH SOME STYLE APPLIED 



# import plotly.graph_objects as go

# Select the first 10 teams for clarity
subset_df = df.head(10)

# Create a bar chart
fig = go.Figure()

# Add a bar for home average runs scored
fig.add_trace(go.Bar(
    x=subset_df['primary_team'],
    y=subset_df['home_avg_runs_scored'],
    name='Home',
    marker_color='indigo'
))

# Add a bar for away average runs scored
fig.add_trace(go.Bar(
    x=subset_df['primary_team'],
    y=subset_df['away_avg_runs_scored'],
    name='Away',
    marker_color='green'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels. 
fig.update_layout(barmode='group', xaxis_tickangle=-45)

# Set the title and labels
fig.update_layout(title_text='Average Runs Scored at Home vs Away (First 10 Teams)',
                  xaxis_title='Team',
                  yaxis_title='Average Runs Scored')

# Show the figure
fig.show()


In [6]:
# find the top 10 teams that get the greatest boost from playing at home and the top 10 teams that underperform at home:

# Calculate home field advantage as home run differential - away run differential
df['home_field_advantage'] = df['home_run_diff'] - df['away_run_diff']

# Get the top 10 teams that get the greatest boost from playing at home
top_boost_teams = df.nlargest(10, 'home_field_advantage')

# Get the top 10 teams that underperform at home
top_underperform_teams = df.nsmallest(10, 'home_field_advantage')


In [7]:
## Create a scatter plot

# import plotly.graph_objects as go

# Create a scatter plot for top 10 teams that get the greatest boost from playing at home
fig1 = go.Figure(data=go.Scatter(
    x=top_boost_teams['primary_team'],
    y=top_boost_teams['home_field_advantage'],
    mode='markers',
    marker=dict(
        color=top_boost_teams['home_field_advantage'], # set color equal to a variable
        colorbar=dict(title='Home Field Advantage'), # add a color bar
        colorscale='Viridis', # specify the color scale
        size=15, # adjust the size of the markers
        line=dict(
            color='DarkSlateGrey', # color of the marker borders
            width=2 # width of the marker borders
        ),
        opacity=0.8 # adjust the opacity
    )
))

# Set the title and labels
fig1.update_layout(
    title='Top 10 Teams with Greatest Home Field Advantage',
    xaxis_title='Team',
    yaxis_title='Home Field Advantage',
    font=dict(
        family="Courier New, monospace", # specify the font family
        size=18, # specify the font size
        color="#7f7f7f" # specify the font color
    )
)

# Show the figure
fig1.show()

# Create a scatter plot for top 10 teams that underperform at home
fig2 = go.Figure(data=go.Scatter(
    x=top_underperform_teams['primary_team'],
    y=top_underperform_teams['home_field_advantage'],
    mode='markers',
    marker=dict(
        color=top_underperform_teams['home_field_advantage'], # set color equal to a variable
        colorbar=dict(title='Home Field Disadvantage'), # add a color bar
        colorscale='Inferno', # specify the color scale
        size=15, # adjust the size of the markers
        line=dict(
            color='MediumPurple', # color of the marker borders
            width=2 # width of the marker borders
        ),
        opacity=0.8 # adjust the opacity
    )
))

# Set the title and labels
fig2.update_layout(
    title='Top 10 Teams with Greatest Home Field Disadvantage',
    xaxis_title='Team',
    yaxis_title='Home Field Disadvantage',
    font=dict(
        family="Courier New, monospace", # specify the font family
        size=18, # specify the font size
        color="#7f7f7f" # specify the font color
   )
)

# Show the figure
fig2.show()


In [8]:
# Filter the data to only include teams that have played at least 20 games
df_filtered = df[df['games'] >= 20]

# Recalculate home field advantage and top boost/underperform teams
df_filtered['home_field_advantage'] = df_filtered['home_run_diff'] - df_filtered['away_run_diff']
top_boost_teams_filtered = df_filtered.nlargest(10, 'home_field_advantage')
top_underperform_teams_filtered = df_filtered.nsmallest(10, 'home_field_advantage')

# Combine the two dataframes into one for easier plotting
combined_df = pd.concat([top_boost_teams_filtered, top_underperform_teams_filtered])

import plotly.express as px

fig = px.scatter(combined_df,
                 x='primary_team',
                 y='home_field_advantage',
                 size='games', # size of bubbles represents number of games played
                 color='home_field_advantage', # color represents home field advantage/disadvantage
                 color_continuous_scale=px.colors.diverging.RdYlBu, # color blind friendly color scale
                 labels={
                     'primary_team': 'Team',
                     'home_field_advantage': 'Home Field Advantage',
                     'games': 'Games Played'},
                 title='Home Field Advantage vs Games Played',
                 template='plotly_dark') # use dark theme

# Adjust size range for clarity
fig.update_traces(marker=dict(sizemode='diameter', sizemin=8, sizeref=0.2))

# Rotate x-axis labels for readability
fig.update_xaxes(tickangle=45)

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
# create a scatter plot of home win percentages against away win percentages for the teams with at least 20 games:

# Filter the data to only include teams that have played at least 20 games
df_filtered = df[df['games'] >= 20]

# Plotly Express Scatter Plot
fig = px.scatter(df_filtered,
                 x='home_win_pct',
                 y='away_win_pct',
                 color='primary_team',
                 labels={
                     'home_win_pct': 'Home Win Percentage',
                     'away_win_pct': 'Away Win Percentage'},
                 title='Home vs Away Win Percentages',
                 template='plotly_dark', # use dark theme
                 color_continuous_scale=px.colors.diverging.RdYlBu) # color blind friendly color scale

# Add a diagonal line to represent equal home and away win percentages
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.show()



In [11]:
## Look at the data sets as they are created 

path = 'TEMP/2022_20230617000213.csv'

df = pd.read_csv(path)

df.head()


Unnamed: 0,teamName,teamId,leagueName,opponentName,opponentId,gameDate,gameTime,homeOrAway,location,teamScore,opponentScore,notes,contestType,seasonType,postSeasonInfo,tournamentInfo,tournamentName,tournamentType,contestName,seasonTypeCode
0,Brighton,2601545,Kensington Lakes Activities Association,Novi,2601596.0,2022-03-30T16:00:00,4:00 PM,H,http://maps.google.com/maps?q=Brighton+High+Sc...,,,,1,1,,,,0.0,,S
1,Brighton,2601545,Kensington Lakes Activities Association,Novi,2601596.0,2022-03-30T18:00:00,6:00 PM,H,http://maps.google.com/maps?q=Brighton+High+Sc...,,,,1,1,,,,0.0,,S
2,Brighton,2601545,Kensington Lakes Activities Association,Flushing,2601631.0,2022-04-08T16:00:00,4:00 PM,H,http://maps.google.com/maps?q=Brighton+High+Sc...,10.0,8.0,,1,1,,,,0.0,,S
3,Brighton,2601545,Kensington Lakes Activities Association,Flushing,2601631.0,2022-04-08T18:00:00,6:00 PM,H,http://maps.google.com/maps?q=Brighton+High+Sc...,,,,1,1,,,,0.0,,S
4,Brighton,2601545,Kensington Lakes Activities Association,Livonia Stevenson,2601861.0,2022-04-11T16:30:00,4:30 PM,A,,8.0,1.0,,1,1,,,,0.0,,S


In [21]:
## Number to teams represented in the data set

# Value Counts of primary_team
print(df['teamName'].value_counts())

# unique teams
# print the number
print(len(df['teamName'].unique()))
df['teamName'].unique()

Ferndale                            66
Ann Arbor Huron                     59
Ann Arbor Skyline                   55
White Lake Lakeland                 52
Clinton Township Chippewa Valley    52
                                    ..
Iron Mountain                       13
Harper Woods Chandler Park          10
Saginaw                              9
Big Rapids Crossroads Academy        4
Burton St Thomas More Academy        1
Name: teamName, Length: 127, dtype: int64
127


array(['Brighton', 'Roseville', 'Shepherd', 'Cass City',
       'Bloomfield Hills Roeper', 'Traverse City St Francis',
       'East Jordan', 'Corunna', 'Coleman', 'Richmond',
       'Vermontville Maple Valley', 'Warren Cousino',
       'Romulus Summit Academy North', 'Walkerville',
       'Madison Heights Madison', 'Crystal Falls Forest Park',
       'Harper Woods Chandler Park',
       'Grosse Pointe Woods University Liggett', 'Saginaw',
       'Bay City John Glenn', 'Paw Paw', 'Holland Black River',
       'Bridgeport', 'Boyne Falls', 'Coldwater', 'Detroit Southeastern',
       'Kingsley', 'White Lake Lakeland', 'Homer',
       'Big Rapids Crossroads Academy', 'Laingsburg', 'Berrien Springs',
       'DeWitt', 'Bad Axe', 'Warren Fitzgerald', 'Lansing Everett',
       'Ubly', 'Wayne Memorial', 'Hamilton', 'Portage Central',
       'Muskegon Catholic Central', 'Brooklyn Columbia Central',
       'Whittemore-Prescott', 'Niles', 'Detroit Mumford',
       'Mason County Central', 'Madison H