## Visualization #3

In [1]:
#Research Question: How does average salary change over year by League?

### Data Importing and Calculating

In [2]:
#Imports
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from bokeh.plotting import *
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models.glyphs import Line as Line_glyph, Oval
from bokeh.models import Range1d, LinearAxis, FactorRange, LabelSet, Legend
from bokeh.charts import Bar, Scatter, BoxPlot
from bokeh.charts.attributes import cat
from bokeh.layouts import gridplot
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
%matplotlib inline
pd.options.display.max_columns = 999

In [3]:
#Grabbing Data
hof_df = pd.read_csv("baseballdatabank-master/core/HallOfFame.csv", low_memory = False)
pitching_df = pd.read_csv("baseballdatabank-master/core/Pitching.csv", low_memory = False)
batting_df = pd.read_csv("baseballdatabank-master/core/Batting.csv", low_memory = False)
salaries_df = pd.read_csv("baseballdatabank-master/core/Salaries.csv", low_memory = False)

In [4]:
#Boolean Indexing to get specific data
nl_df = salaries_df[salaries_df['lgID'] == 'NL']
al_df = salaries_df[salaries_df['lgID'] == 'AL']

In [5]:
#Finding Average Salary per Year
NLsalAvg_df = nl_df.groupby(['yearID'], as_index = False).mean()
ALsalAvg_df = al_df.groupby(['yearID'], as_index = False).mean()

In [6]:
NLsalAvg_df.describe()
ALsalAvg_df.describe()

Unnamed: 0,yearID,salary
count,32.0,32.0
mean,2000.5,2136574.0
std,9.380832,1349938.0
min,1985.0,402337.9
25%,1992.75,1025844.0
50%,2000.5,2168793.0
75%,2008.25,3393465.0
max,2016.0,4788923.0


### Creating Graph

In [7]:
#Graph in Bokeh
output_notebook()

#Interactivity
hover = HoverTool(tooltips=[
    ("Year", "@x"),
    ("Average Salary", "@y{int}"),
])

#Creating High-Level Figure
s1 = figure( plot_width = 1000, plot_height = 600, tools = [hover])
s1.left[0].formatter.use_scientific = False

#Adding Data
a = s1.scatter(x = ALsalAvg_df['yearID'], y = ALsalAvg_df['salary'])
b = s1.scatter(x = ALsalAvg_df['yearID'], y = NLsalAvg_df['salary'], color = 'red')

#Creating Labels
s1.xaxis.axis_label = 'Year'
s1.yaxis.axis_label = 'Average Salary'

#Legend
legend = Legend(items=[("American League" , [a]), ("National League", [b])])
s1.add_layout(legend)

#Regression Lines Calculations
poly = PolynomialFeatures(degree = 1)

#American League
X = np.array(ALsalAvg_df['yearID']).reshape((len(ALsalAvg_df['yearID']), 1))
x = poly.fit_transform(X)

lg = linear_model.LinearRegression()

lg.fit(x, ALsalAvg_df['salary'])

y = lg.predict(x)

#National League
X2 = np.array(NLsalAvg_df['yearID']).reshape((len(NLsalAvg_df['yearID']), 1))
x2 = poly.fit_transform(X2)

lg2 = linear_model.LinearRegression()

lg2.fit(x2, NLsalAvg_df['salary'])

y2 = lg2.predict(x2)

#Adding Regression Lines to Graphs
s1.line(ALsalAvg_df['yearID'], y)
s1.line(ALsalAvg_df['yearID'], y2, color = 'red')

show(s1)

In [122]:
poly = PolynomialFeatures(degree = 1)
X = np.array(ALsalAvg_df['yearID']).reshape((len(ALsalAvg_df['yearID']), 1))
x = poly.fit_transform(X)

lg = linear_model.LinearRegression()

lg.fit(x, ALsalAvg_df['salary'])

y = lg.predict(x)

list(y)

[-58758.990185797215,
 82875.408373296261,
 224509.80693238974,
 366144.20549154282,
 507778.60405063629,
 649413.00260972977,
 791047.40116888285,
 932681.79972797632,
 1074316.1982870698,
 1215950.5968462229,
 1357584.9954053164,
 1499219.3939644098,
 1640853.7925235629,
 1782488.1910826564,
 1924122.5896417499,
 2065756.9882009029,
 2207391.3867599964,
 2349025.7853190899,
 2490660.1838781834,
 2632294.5824373364,
 2773928.9809964299,
 2915563.3795555234,
 3057197.7781146765,
 3198832.17667377,
 3340466.5752328634,
 3482100.9737920165,
 3623735.37235111,
 3765369.7709102035,
 3907004.1694693565,
 4048638.56802845,
 4190272.9665875435,
 4331907.3651466966]

In [128]:
#National League
X2 = np.array(NLsalAvg_df['yearID']).reshape((len(NLsalAvg_df['yearID']), 1))
x2 = poly.fit_transform(X2)

lg2 = linear_model.LinearRegression()

lg2.fit(x2, NLsalAvg_df['salary'])

y2 = lg2.predict(x2)

list(y2)

[7937.4930013716221,
 135257.63015595078,
 262577.76731052995,
 389897.90446507931,
 517218.04161965847,
 644538.17877423763,
 771858.3159288168,
 899178.45308339596,
 1026498.5902379751,
 1153818.7273925245,
 1281138.8645471036,
 1408459.0017016828,
 1535779.138856262,
 1663099.2760108411,
 1790419.4131654203,
 1917739.5503199697,
 2045059.6874745488,
 2172379.824629128,
 2299699.9617837071,
 2427020.0989382863,
 2554340.2360928655,
 2681660.3732474148,
 2808980.510401994,
 2936300.6475565732,
 3063620.7847111523,
 3190940.9218657315,
 3318261.0590202808,
 3445581.19617486,
 3572901.3333294392,
 3700221.4704840183,
 3827541.6076385975,
 3954861.7447931767]