In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Total home runs per year

In [None]:
# Total home runs per season, with dashed vertical markers at hand-picked years.
df = pd.read_csv('core/Batting.csv')

# Only rows with a positive HR count contribute to the yearly totals.
batting_df = df.loc[df['HR'] > 0]
home_runs_per_year = batting_df.groupby('yearID', as_index=False)['HR'].sum()

plt.figure(figsize=(10, 6))
sns.lineplot(data=home_runs_per_year, x='yearID', y='HR')

# Each entry is a pair of marker years sharing one colour; the order of the
# entries (and of the years inside each pair) is the drawing order.
era_marks = [
    ((1919, 1901), 'red'),
    ((1920, 1941), 'blue'),
    ((1942, 1960), 'yellow'),
    ((1961, 1976), 'green'),
    ((1977, 1993), 'purple'),
    ((1994, 2005), 'orange'),
]
for mark_years, mark_colour in era_marks:
    for mark_year in mark_years:
        plt.axvline(x=mark_year, color=mark_colour, linestyle='--')

plt.title('Yearly Trends in Home Runs')
plt.xlabel('Year')
plt.ylabel('Total Home Runs')
plt.show()


### Average attendance per year

In [None]:
# Mean per-game home attendance across teams, by season.
# The file name is capitalised as 'Teams.csv' to match the other cell that
# loads the same table; the lowercase spelling fails on case-sensitive
# filesystems.
df = pd.read_csv('core/Teams.csv')

# Season attendance divided by Ghome (presumably the number of home games —
# confirm against the data dictionary) gives attendance per home game.
df['attendance'] = df['attendance'] / df['Ghome']
attendance_per_year = df.groupby('yearID')['attendance'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.lineplot(x='yearID', y='attendance', data=attendance_per_year)

# Dashed vertical markers at hand-picked years, drawn in this order.
year_markers = [
    (1920, 'red'),
    (1900, 'red'),
    (1973, 'green'),
    (1994, 'purple'),
    (2005, 'purple'),
    (2014, 'black'),
]
for marker_year, marker_colour in year_markers:
    plt.axvline(x=marker_year, color=marker_colour, linestyle='--')

plt.title('Yearly Trends in Average Attendance')
plt.xlabel('Year')
plt.ylabel('Attendance')  # was misspelled 'Attednance'
plt.show()

## Winning percentage vs attendance

In [None]:
# Load the team-season table and derive the two plotted quantities.
teams = pd.read_csv('core/Teams.csv')

# Overwrite the season attendance with attendance divided by Ghome.
teams['attendance'] = teams['attendance'] / teams['Ghome']

# WP is computed as W over G for each team-season.
teams['WP'] = teams['W'] / teams['G']
def classify_era(year):
    """Return the era label for a season year, or None if it is outside every era.

    Labels have the form "<Name> era (<start> - <end>)" and are generated from
    the bounds table so the text can never drift from the range it describes
    (the hand-written labels had a missing closing parenthesis and
    inconsistent spacing).

    Parameters
    ----------
    year : int or float
        Season year. Bounds are inclusive on both ends.

    Returns
    -------
    str or None
        The era label, or None when the year is before 1901, after 2014,
        falls between two ranges (non-integer input), or is NaN.
    """
    era_bounds = (
        ("Dead Ball", 1901, 1920),
        ("Live Ball", 1921, 1942),
        ("Integration", 1943, 1961),
        ("Expansion", 1962, 1977),
        ("Free Agent", 1978, 1994),
        ("Steroid", 1995, 2004),
        ("Contemporary", 2005, 2014),
    )
    for era_name, first_year, last_year in era_bounds:
        # NaN fails both comparisons, so it falls through to None.
        if first_year <= year <= last_year:
            return f"{era_name} era ({first_year} - {last_year})"
    return None
# Label every team-season with its era; years outside every range get None.
teams['Era'] = teams.yearID.apply(classify_era)

# Scatter of WP against the per-Ghome attendance, coloured by era.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=teams, x='WP', y='attendance', hue='Era')
plt.title('Winning Percentage vs Average Home Attendance by Era')
plt.ylabel('Average Home Attendance in a year')
plt.xlabel('Winning percentage of team in that year')
# Render explicitly, as the other plotting cells do, so the cell does not
# echo the repr of the last label object.
plt.show()

### Regular vs post season batting averages

In [None]:
# Compare the yearly H/AB ratio of the regular-season and postseason tables.
df = pd.read_csv('core/Batting.csv')
post_df = pd.read_csv('core/BattingPost.csv')

# Keep only rows with a positive AB so they contribute to the ratio.
batting_df = df.loc[df['AB'] > 0]
batting_post_df = post_df.loc[post_df['AB'] > 0]


def _batting_average_by_year(rows):
    """Sum H and AB per yearID and add BA = H / AB as a third column."""
    totals = rows.groupby('yearID')[['H', 'AB']].sum()
    totals['BA'] = totals['H'] / totals['AB']
    return totals


batting_avg = _batting_average_by_year(batting_df)
batting_post_avg = _batting_average_by_year(batting_post_df)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(batting_avg.index, batting_avg['BA'], color='blue', label='Regular Season BA')
ax.plot(batting_post_avg.index, batting_post_avg['BA'], color='red', label='Postseason BA')
# Dashed reference line at 1960 (unlabelled, so it does not appear in the legend).
ax.axvline(x=1960, color='red', linestyle='--')
ax.set_xlabel('Year')
ax.set_ylabel('Batting Average')
ax.set_title('Regular Season vs Postseason Batting Averages by Year')
ax.legend()
plt.show()

## Salary over Time
### Requires inflation data

In [None]:
# Mean salary per season next to the first season's mean salary compounded
# forward by the yearly inflation rate.
salary = pd.read_csv('core/Salaries.csv')
inflation = pd.read_csv('core/inflation_us.csv')
# Assumes the inflation file has exactly two columns, year then rate in
# percent — TODO confirm against the file.
inflation.columns = ['yearID', 'rate']

salary_year = salary.groupby('yearID').agg({'salary': 'mean'}).reset_index()
# Inner merge: seasons without an inflation row are dropped.
salary_year = salary_year.merge(inflation, on='yearID')
salary_year['rate'] = salary_year['rate'] / 100

# adjusted[0] = salary[0]; adjusted[i] = adjusted[i-1] * (1 + rate[i-1]).
# Equivalently: base salary times the running product of (1 + rate), shifted
# down one row so the first row keeps a growth factor of 1.
# skipna=False keeps the loop semantics: a missing rate makes every later
# value NaN instead of being silently skipped.
# NOTE(review): row i is grown with the *previous* row's rate, and rows are
# assumed to be consecutive years — confirm both against how the inflation
# file defines its rate.
growth = (1 + salary_year['rate']).cumprod(skipna=False).shift(1, fill_value=1.0)
base_salary = salary_year['salary'].iloc[0] if len(salary_year) else float('nan')
salary_year['inflation_adjusted_salary'] = base_salary * growth

# String years put the bar plot and the line plot on the same categorical axis.
salary_year['yearID'] = salary_year['yearID'].astype(str)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=salary_year, x='yearID', y='salary', ax=ax, color='skyblue', label='Actual Salary')
sns.lineplot(data=salary_year, x='yearID', y='inflation_adjusted_salary', ax=ax, color='red', marker='o', label='Inflation-Adjusted from Base Salary')
plt.xticks(rotation=90)
ax.set_xlabel('Year')
ax.set_ylabel('Salary ($)')
ax.set_title('Actual Salary and inflation-adjusted Base Salary (1985) over time')
ax.legend()
# Lay out last so the rotated ticks, axis labels and title are all accounted for.
plt.tight_layout()
plt.show()

