In [None]:
import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Folder containing the Baseball Databank CSV files.
# NOTE: this is an absolute, machine-specific location; change it to wherever
# the CSVs live on your machine.
file_path = r'C:\Users\justi\Documents\Projects\Data\Baseball_databank'

# Build each CSV path with pathlib rather than concatenating '\Name.csv':
# '\T', '\S' and '\B' are invalid string escapes that Python warns about.
# The results are converted back to str so anything expecting a plain string
# path keeps working.
data_dir = Path(file_path)

teams_path = str(data_dir / 'Teams.csv')
salary_path = str(data_dir / 'Salaries.csv')
batting_path = str(data_dir / 'Batting.csv')

In [None]:
# Load the teams table, keeping only rows whose yearID is 1985 or later,
# then print a column/dtype summary.
teams_df = pd.read_csv(teams_path).loc[lambda frame: frame['yearID'] >= 1985]
teams_df.info()

In [None]:
# Load the salaries table, keeping only rows whose yearID is 1985 or later,
# then print a column/dtype summary.
salary_df = pd.read_csv(salary_path).loc[lambda frame: frame['yearID'] >= 1985]
salary_df.info()

In [None]:
# Motivating question: how have salaries evolved over the years?
# Start from total salary per season, team and league, then dig in by team.
#
# The grouped sum comes back as a Series indexed by the three grouping keys;
# reset_index() turns those keys back into ordinary columns so the result is
# a regular DataFrame with one row per (yearID, teamID, lgID).
team_salary_by_year_df = (
    salary_df
    .groupby(['yearID', 'teamID', 'lgID'])['salary']
    .sum()
    .reset_index()
)
team_salary_by_year_df.head()

In [None]:
# Reference for the chart being recreated here:
# https://medium.com/dataexplorations/reproducing-the-baseball-salary-visualization-from-fivethirtyeight-in-matplotlib-1af449beb409


def standardize(values):
    """Return `values` centred on its mean and divided by its standard deviation."""
    return (values - values.mean()) / values.std()


# Standardize each team's total salary against the other teams in the same
# season, so payrolls can be compared across years.
salary_by_season = team_salary_by_year_df.groupby(['yearID'])['salary']
team_salary_by_year_df['salary_zscore'] = salary_by_season.transform(standardize)

team_salary_by_year_df.head()

In [None]:
# Preview the first five rows of the teams table before trimming it down.
teams_df.head(5)

In [None]:
# Next: win rates. Keep only the identifying columns plus games (G), wins (W)
# and losses (L) for seasons from 1985 onward, and renumber the rows from 0.
teams_df = teams_df.loc[
    teams_df['yearID'] >= 1985,
    ['yearID', 'lgID', 'teamID', 'G', 'W', 'L'],
].reset_index(drop=True)
teams_df.head()

In [None]:
# Win rate is wins (W) divided by total games (G) for each team-season.
teams_df['win_rate'] = teams_df['W'].div(teams_df['G'])
teams_df.head()

In [None]:
# Join the salary totals / z-scores to the win rates. Rows are matched on
# season, league and team; only combinations present in both tables survive
# (merge defaults to an inner join).
merged_df = team_salary_by_year_df.merge(
    teams_df,
    on=['yearID', 'lgID', 'teamID'],
).reset_index(drop=True)
merged_df.head()

In [None]:
# Start with a single scatterplot for the Phillies (go Matt). Their team code
# is probably 'PHI'; list every distinct teamID alphabetically to confirm.
a = pd.unique(merged_df['teamID'])
print(sorted(a))

In [None]:
# Scatter of standardized salary vs. win rate for the Phillies, with
# reference lines and a linear trend line.

# Select the Phillies' rows first. Nothing earlier in the notebook defines
# `phillies`, so without this line the cell fails with NameError on a fresh
# kernel (Restart & Run All).
phillies = merged_df[merged_df['teamID'] == 'PHI']

plt.scatter(x=phillies['salary_zscore'], y=phillies['win_rate'], alpha=0.5)
plt.title("PHI")
plt.xlabel("Standardized Salaries")
plt.ylabel("Win Rate")

# Reference "axes": a horizontal line at win rate 0.5 and a vertical line at
# salary z-score 0. The lines span x in [-2, 2] and y in [0.4, 0.6], which
# also appears to keep the plotted area roughly centred on (0, 0.5).
plt.hlines(0.5, -2, 2)
plt.vlines(0, 0.4, 0.6)

# Line of best fit: a degree-1 least-squares fit of win rate on salary z-score.
# TODO: np.polyfit / np.poly1d are NumPy's legacy polynomial API; consider
# switching to np.polynomial.Polynomial.fit.
z = np.polyfit(phillies['salary_zscore'], phillies['win_rate'], 1)
p = np.poly1d(z)

plt.plot(phillies['salary_zscore'], p(phillies['salary_zscore']), alpha=0.8, c='#1f77b4', linewidth=3)

plt.show()

In [None]:
# Same Phillies chart again, restyled: no border around the plot and a faint
# background grid.

# plt.subplots() returns the Figure and its Axes object. Spines (the border
# lines) and the grid are configured on the Axes, which is why `ax` is needed.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(x=phillies['salary_zscore'], y=phillies['win_rate'], alpha=0.5)
ax.set_title('PHI')
ax.set_ylabel('Win Rate')
ax.set_xlabel('Standardized Salaries')

# Reference lines at win rate 0.5 and salary z-score 0.
ax.hlines(0.5, -2, 2)
ax.vlines(0, 0.3, 0.7)

# Hide all four border lines, then draw a light grey grid.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.5)

# Degree-1 least-squares trend line of win rate against salary z-score.
z = np.polyfit(phillies['salary_zscore'], phillies['win_rate'], 1)
p = np.poly1d(z)

ax.plot(phillies['salary_zscore'], p(phillies['salary_zscore']), alpha=0.8, c='#1f77b4', linewidth=3)

plt.show()