## Is The Average Hall of Fame Baseball Player Better Than the Average Baseball Player Year Over Year?

In [1]:
import pandas as pd
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import Range1d, HoverTool, LabelSet, Label
from bokeh.io import output_file, show
from bokeh.layouts import gridplot


In [4]:
# Load the four source tables, one DataFrame per CSV file.
# All paths are relative and point one directory above the working directory.
df_pitching, df_batting, df_hof, df_playerInfo = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Pitching.csv",
        "../Batting.csv",
        "../HallOfFame.csv",
        "../People.csv",
    )
)



In [5]:
# Attach the people-table columns to every batting and pitching row.
# (merge defaults to an inner join, so rows without a matching playerID drop out.)
batting = df_batting.merge(df_playerInfo, on="playerID")
pitching = df_pitching.merge(df_playerInfo, on="playerID")

# Pair every Hall of Fame table row with the same player's batting / pitching rows.
hof_batting = df_hof.merge(df_batting, on="playerID")
hof_pitching = df_hof.merge(df_pitching, on="playerID")

# The same Hall of Fame joins, now with the people-table columns added.
hof_pitching_names = hof_pitching.merge(df_playerInfo, on="playerID")
hof_batting_names = hof_batting.merge(df_playerInfo, on="playerID")


In [6]:
# Add two helper columns to each all-player table, then round to 3 decimals.
#   season: how many rows the table holds for that playerID
#   name:   nameFirst immediately followed by nameLast (no separator)
pitching = pitching.assign(
    season=pitching.groupby('playerID')['playerID'].transform('count'),
    name=pitching['nameFirst'] + pitching['nameLast'],
).round(decimals=3)

batting = batting.assign(
    season=batting.groupby('playerID')['playerID'].transform('count'),
    name=batting['nameFirst'] + batting['nameLast'],
).round(decimals=3)

In [7]:
# Batting average per row: BA = H / AB.
# Division by zero does not raise here; a 0/0 row ends up as NaN.
batting['BA'] = batting['H'].div(batting['AB'])

In [8]:
# Hall of Fame pitching rows: keep only rows in the 'Player' category whose
# `inducted` value is not 'N'. `.copy()` gives an independent frame so the
# column assignments below never write into a slice of the merged table.
hof_pitching_names = hof_pitching_names[
    (hof_pitching_names['category'] == 'Player')
    & (hof_pitching_names['inducted'] != 'N')
].copy()

# season: number of remaining rows for each playerID.
hof_pitching_names['season'] = hof_pitching_names.groupby('playerID')['playerID'].transform('count')

# Both name parts must come from the pitching frame itself. Taking nameLast from
# hof_batting_names would index-align against a different table and pair first
# names with unrelated (or missing) last names.
hof_pitching_names['name'] = hof_pitching_names['nameFirst'] + hof_pitching_names['nameLast']

# Round the frame this cell is preparing (the batting frame is rounded in its own cell).
hof_pitching_names = hof_pitching_names.round(decimals=3)

In [9]:
# Hall of Fame batting rows: keep only rows in the 'Player' category whose
# `inducted` value is not 'N'.
hof_batting_names = hof_batting_names[
    (hof_batting_names['category'] == 'Player')
    & (hof_batting_names['inducted'] != 'N')
]

# Derive the helper columns in one step, then round everything to 3 decimals.
#   season: number of remaining rows for each playerID
#   name:   nameFirst immediately followed by nameLast
#   BA:     H / AB
hof_batting_names = hof_batting_names.assign(
    season=hof_batting_names.groupby('playerID')['playerID'].transform('count'),
    name=hof_batting_names['nameFirst'] + hof_batting_names['nameLast'],
    BA=hof_batting_names['H'] / hof_batting_names['AB'],
).round(decimals=3)

In [10]:
# Columns removed from both Hall of Fame tables before averaging:
# the Hall of Fame table's own fields plus identifier / biographical fields.
columns = [
    'yearid', 'votedBy', 'ballots', 'needed', 'votes', 'inducted', 'category', 'needed_note',
    'bats', 'throws', 'stint', 'teamID', 'lgID',
    'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity',
    'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity',
    'nameGiven', 'debut', 'finalGame', 'retroID', 'bbrefID',
    'nameFirst', 'nameLast', 'playerID', 'name',
]

# Rebind each name to a frame without those columns (a missing label raises KeyError).
hof_batting_names = hof_batting_names.drop(columns, axis='columns')
hof_pitching_names = hof_pitching_names.drop(columns, axis='columns')




In [11]:
# Columns removed from the all-player tables before averaging:
# identifier / biographical fields (these tables have no Hall of Fame fields).
columns = [
    'bats', 'throws', 'stint', 'teamID', 'lgID',
    'birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity',
    'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity',
    'nameGiven', 'debut', 'finalGame', 'retroID', 'bbrefID',
    'nameFirst', 'nameLast', 'playerID', 'name',
]

# Rebind each name to a frame without those columns (a missing label raises KeyError).
pitching = pitching.drop(columns, axis='columns')
batting = batting.drop(columns, axis='columns')

In [12]:
def _fill_missing_counts(frame):
    """Return a copy of `frame` with NaN replaced by 0 in every column except 'BA'.

    'BA' is computed as H / AB, so it is NaN for rows with zero at-bats.
    Turning that NaN into 0 would count those rows as .000 hitters and pull the
    per-year mean batting average down; leaving it as NaN lets the later
    groupby-mean skip them instead. Frames without a 'BA' column are simply
    zero-filled everywhere, exactly as before.
    """
    zero_fill = {col: 0 for col in frame.columns if col != 'BA'}
    return frame.fillna(value=zero_fill)


# Fill missing values in all four tables.
hof_batting_names = _fill_missing_counts(hof_batting_names)
hof_pitching_names = _fill_missing_counts(hof_pitching_names)

pitching = _fill_missing_counts(pitching)
batting = _fill_missing_counts(batting)

In [13]:
# Collapse each Hall of Fame table to one row per yearID holding the mean of
# every remaining column; as_index=False keeps yearID as an ordinary column.
hof_batting_names = hof_batting_names.groupby('yearID', as_index=False).mean()
hof_pitching_names = hof_pitching_names.groupby('yearID', as_index=False).mean()

In [14]:
# Collapse each all-player table to one row per yearID holding the mean of
# every remaining column; as_index=False keeps yearID as an ordinary column.
batting = batting.groupby('yearID', as_index=False).mean()
pitching = pitching.groupby('yearID', as_index=False).mean()

In [15]:
# Round the per-year means in all four tables to 3 decimal places.
hof_pitching_names = hof_pitching_names.round(3)
hof_batting_names = hof_batting_names.round(3)

pitching = pitching.round(3)
batting = batting.round(3)

In [16]:
# Trim the per-year tables to the year windows used for graphing so that the
# x and y series have matching lengths.
#   batting tables:  yearID below 2011
#   pitching tables: yearID above 1877 and below 2011
batting = batting[batting['yearID'] < 2011]
hof_batting_names = hof_batting_names[hof_batting_names['yearID'] < 2011]

pitching = pitching[(pitching['yearID'] > 1877) & (pitching['yearID'] < 2011)]
hof_pitching_names = hof_pitching_names[
    (hof_pitching_names['yearID'] > 1877) & (hof_pitching_names['yearID'] < 2011)
]

In [17]:
# Initializes x and y coordinates for the batting plots.

# Years shown on the x axis of every batting plot.
x_batting = list(range(1871, 2011))

# Align both per-year tables to x_batting by yearID instead of relying on row
# position. If a table has no row for some year, that year becomes NaN rather
# than shifting every later value onto the wrong year (or producing a series
# whose length no longer matches x_batting).
_hof_batting_by_year = hof_batting_names.set_index('yearID').reindex(x_batting)
_all_batting_by_year = batting.set_index('yearID').reindex(x_batting)

# One Hall of Fame series and one all-player series per plotted statistic.
y_hof_ba = _hof_batting_by_year['BA']
y_all_ba = _all_batting_by_year['BA']

y_hof_hr = _hof_batting_by_year['HR']
y_all_hr = _all_batting_by_year['HR']

y_hof_so = _hof_batting_by_year['SO']
y_all_so = _all_batting_by_year['SO']

y_hof_rbi = _hof_batting_by_year['RBI']
y_all_rbi = _all_batting_by_year['RBI']

y_hof_2B = _hof_batting_by_year['2B']
y_all_2B = _all_batting_by_year['2B']

y_hof_3B = _hof_batting_by_year['3B']
y_all_3B = _all_batting_by_year['3B']

In [18]:
# Initializes x and y coordinates for the pitching plots.

# Years shown on the x axis of every pitching plot.
x_pitching = list(range(1878, 2011))

# Align both per-year tables to x_pitching by yearID instead of relying on row
# position. If a table has no row for some year, that year becomes NaN rather
# than shifting every later value onto the wrong year (or producing a series
# whose length no longer matches x_pitching).
_hof_pitching_by_year = hof_pitching_names.set_index('yearID').reindex(x_pitching)
_all_pitching_by_year = pitching.set_index('yearID').reindex(x_pitching)

# One Hall of Fame series and one all-player series per statistic.
y_hof_wins = _hof_pitching_by_year['W']
y_all_wins = _all_pitching_by_year['W']

y_hof_loss = _hof_pitching_by_year['L']
y_all_loss = _all_pitching_by_year['L']

y_hof_games = _hof_pitching_by_year['G']
y_all_games = _all_pitching_by_year['G']

y_hof_shutout = _hof_pitching_by_year['SHO']
y_all_shutout = _all_pitching_by_year['SHO']

y_hof_saves = _hof_pitching_by_year['SV']
y_all_saves = _all_pitching_by_year['SV']

y_hof_era = _hof_pitching_by_year['ERA']
y_all_era = _all_pitching_by_year['ERA']

y_hof_hbp = _hof_pitching_by_year['HBP']
y_all_hbp = _all_pitching_by_year['HBP']

y_hof_gidp = _hof_pitching_by_year['GIDP']
y_all_gidp = _all_pitching_by_year['GIDP']

In [19]:
# Direct bokeh's output to this standalone HTML file.
output_file(filename="AverageHOF_v_AverageAll.html")

In [20]:

# Builds the 12 comparison plots (6 batting, 6 pitching) and shows them in a grid.
# Every plot uses the same recipe, so the recipe lives in two small helpers.

# (Hall of Fame colour, all-players colour) for each family of plots.
BATTING_COLORS = ('blue', 'gold')
PITCHING_COLORS = ('brown', 'orange')


def _year_hover(value_label):
    """Return a HoverTool whose tooltip shows `value_label` and 'Year'.

    Both entries use '@' fields, i.e. the hovered glyph's own y and x data
    values (a '$y' field would show the mouse cursor's coordinate instead).
    """
    return HoverTool(tooltips=[(value_label, "@y"), ('Year', '@x')])


def _hof_vs_all_figure(title, y_label, hover, x, y_hof, y_all, y_max, colors, show_legend=False):
    """Return one 400x400 figure comparing a Hall of Fame series with an all-player series.

    title       -- figure title (centred)
    y_label     -- y axis label; the x axis is always labelled 'Year'
    hover       -- HoverTool, installed as the figure's only tool
    x           -- shared x values for both series
    y_hof       -- Hall of Fame y values
    y_all       -- all-player y values
    y_max       -- upper bound of the y range (the lower bound is 0)
    colors      -- (Hall of Fame colour, all-players colour)
    show_legend -- when True, label the two scatter renderers in a legend
    """
    hof_color, all_color = colors
    fig = figure(title=title, plot_width=400, plot_height=400,
                 background_fill_color='green', background_fill_alpha=0.1,
                 tools=[hover])
    fig.title.align = 'center'

    # Only the scatter renderers carry legend entries, and only when requested.
    hof_legend = {'legend': 'Hall Of Fame'} if show_legend else {}
    all_legend = {'legend': 'All Players'} if show_legend else {}
    fig.scatter(x, y_hof, size=2, color=hof_color, alpha=0.4, **hof_legend)
    fig.scatter(x, y_all, size=2, color=all_color, alpha=0.4, **all_legend)
    fig.line(x, y_hof, color=hof_color, line_width=1.7)
    fig.line(x, y_all, color=all_color, line_width=1.7)

    fig.y_range = Range1d(0, y_max)
    fig.xaxis.axis_label = 'Year'
    fig.yaxis.axis_label = y_label
    return fig


# ---- Batting plots ----
hover_ba = _year_hover("Average Batting Average")
b_ba = _hof_vs_all_figure('Batting: Batting Average', 'Batting Average', hover_ba,
                          x_batting, y_hof_ba, y_all_ba, 0.5, BATTING_COLORS, show_legend=True)

hover_hr = _year_hover("Average Number of Home Runs")
b_hr = _hof_vs_all_figure('Batting: Homeruns', 'Home Runs', hover_hr,
                          x_batting, y_hof_hr, y_all_hr, 20, BATTING_COLORS)

hover_so = _year_hover("Average Number of Strikeouts")
b_so = _hof_vs_all_figure('Batting: Strikeouts', 'Strike Outs', hover_so,
                          x_batting, y_hof_so, y_all_so, 100, BATTING_COLORS)

hover_rbi = _year_hover("Average Number of Runs Batted In")
b_rbi = _hof_vs_all_figure('Batting: RBI', 'Runs Batted In', hover_rbi,
                           x_batting, y_hof_rbi, y_all_rbi, 100, BATTING_COLORS)

hover_2B = _year_hover("Average Number of Doubles")
b_2B = _hof_vs_all_figure('Batting: Doubles', 'Doubles', hover_2B,
                          x_batting, y_hof_2B, y_all_2B, 50, BATTING_COLORS)

hover_3B = _year_hover("Average Number of Triples")
b_3B = _hof_vs_all_figure('Batting: Triples', 'Triples', hover_3B,
                          x_batting, y_hof_3B, y_all_3B, 50, BATTING_COLORS)

# ---- Pitching plots ----
hover_wins = _year_hover("Average Number of Wins")
b_wins = _hof_vs_all_figure('Pitching: Wins', 'Wins', hover_wins,
                            x_pitching, y_hof_wins, y_all_wins, 50, PITCHING_COLORS, show_legend=True)

hover_loss = _year_hover("Average Number of Losses")
b_loss = _hof_vs_all_figure('Pitching: Losses', 'Losses', hover_loss,
                            x_pitching, y_hof_loss, y_all_loss, 50, PITCHING_COLORS)

# The saves plot must draw the saves series (not the losses series again).
hover_saves = _year_hover("Average Number of Saves")
b_saves = _hof_vs_all_figure('Pitching: Saves', 'Saves', hover_saves,
                             x_pitching, y_hof_saves, y_all_saves, 50, PITCHING_COLORS)

hover_era = _year_hover("Average Earned Runs Against")
b_era = _hof_vs_all_figure('Pitching: ERA', 'ERA', hover_era,
                           x_pitching, y_hof_era, y_all_era, 20, PITCHING_COLORS)

hover_games = _year_hover('Average Number of Individual Games Pitched')
b_games = _hof_vs_all_figure('Pitching: Games Played', 'Games Played', hover_games,
                             x_pitching, y_hof_games, y_all_games, 50, PITCHING_COLORS)

hover_shutout = _year_hover("Average Number of Shutout Games Pitched")
b_shutout = _hof_vs_all_figure('Pitching: Shutouts', 'Shutouts', hover_shutout,
                               x_pitching, y_hof_shutout, y_all_shutout, 20, PITCHING_COLORS)

# Lay the plots out as 4 rows of 3: two rows of batting, two rows of pitching.
# NOTE(review): the background_fill_* keywords are passed to gridplot unchanged;
# confirm the installed bokeh version accepts them.
b = gridplot([[b_ba, b_rbi, b_so],[b_2B, b_3B,b_hr],[b_wins,b_loss, b_era],[b_saves,b_games,b_shutout]],  background_fill_color = 'green', background_fill_alpha = 0.1)

show(b)

