# In [None]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from matplotlib.lines import Line2D

# Analysis: which team makes the most money?
#     Will a better team make more money?
#     Win/Loss ratio vs. Attendance

df = pd.read_csv('baseball/core/Teams.csv')

# Keep only the columns used below. The explicit copy makes `df` an
# independent frame, so adding `win_ratio` does not write into a slice of the
# raw table (which would raise pandas' SettingWithCopyWarning).
df = df[['yearID', 'teamID', 'attendance', 'W', 'L']].copy()

# Share of decided games that were won; W and L are not needed afterwards.
df['win_ratio'] = df['W'] / (df['W'] + df['L'])
df = df.drop(columns=['W', 'L'])

# NOTE: an earlier `df.groupby('teamID').mean()` call discarded its result and
# therefore never affected the data. It has been removed; every row is still
# one (yearID, teamID) record, not a per-team average.
df = df.sort_values(by='attendance', ascending=False)

# Drop rows with NaN in any remaining column (e.g. missing attendance, or a
# win ratio that came out as 0/0).
df = df.dropna()

# Standardize attendance by z-score (Series.std() is the sample standard
# deviation, ddof=1).
df['attendance'] = (df['attendance'] - df['attendance'].mean()) / df['attendance'].std()

# Do the two variables correlate?
# Series.corr() defaults to the Pearson correlation coefficient.
correlation = df['attendance'].corr(df['win_ratio'])
print('Correlation between attendance and win ratio: ', correlation)

# Adjacent string literals are joined as-is, so each continuation line starts
# with the separating space that was previously missing ("weakrelationship",
# "whichuses").
print('The low positive correlation suggests that there is a weak'
      ' relationship between the two variables.')
print('The correlation is calculated using df.corr(), which'
      ' uses Pearson correlation coefficient.')

# Regress win ratio on standardized attendance and print the regression
# summary (p-value, t-statistic, etc.).
#
# statsmodels' OLS does not add an intercept on its own. Without one the line
# is forced through the origin, which badly misfits a response centred near
# 0.5 against a zero-mean regressor and distorts the slope's standard error
# and p-value. add_constant() prepends a 'const' column so the intercept is
# estimated as well.
X = sm.add_constant(df['attendance'])
y = df['win_ratio']
model = sm.OLS(y, X).fit()
print(model.summary())

# Report the slope's p-value from the fitted model instead of a hard-coded
# number, so the printed conclusion always matches the data that was loaded.
attendance_p_value = model.pvalues['attendance']
if attendance_p_value < 0.05:
    print(f'\nThe p-value is {attendance_p_value:.3g}, which is less than 0.05, '
          'so we can reject the null hypothesis.')
    print('This means that there is a statistically significant relationship between the two variables.')
else:
    print(f'\nThe p-value is {attendance_p_value:.3g}, which is not less than 0.05, '
          'so we cannot reject the null hypothesis.')
    print('This means that there is no statistically significant relationship between the two variables.')


# Scatter plot of win ratio against standardized attendance, one point per
# row of df.
sns.set(style='darkgrid')
sns.relplot(data=df, x='attendance', y='win_ratio', size=1,
            height=10, aspect=2, legend=False)

# relplot's own legend is switched off above; a single hand-made entry is
# shown instead, anchored just outside the top-right corner of the axes.
legend_elements = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',
        label='Team',
        markerfacecolor='blue',
        markersize=10,
    ),
]

# Title, axis labels and legend all apply to the current axes created by
# relplot.
plt.title('Attendance vs. Win Ratio')
plt.xlabel('Attendance (z-score)')
plt.ylabel('Win Ratio')
plt.legend(
    handles=legend_elements,
    title='Mean team win-ratio \n each year',
    loc='upper left',
    bbox_to_anchor=(1, 1),
    ncol=5,
)


print('\nConclusion: There is a weak positive correlation between attendance and win ratio.')


