# Database: HOF inductees/votes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df_hof = pd.read_csv("data/HallOfFame.csv")
df_hof.head()

Unnamed: 0,playerID,yearID,votedBy,ballots,needed,votes,inducted,category,needed_note
0,cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
1,ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
2,wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
3,mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
4,johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,


In [2]:
df_hof.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4191 entries, 0 to 4190
Data columns (total 9 columns):
playerID       4191 non-null object
yearID         4191 non-null int64
votedBy        4191 non-null object
ballots        3994 non-null float64
needed         3837 non-null float64
votes          3994 non-null float64
inducted       4191 non-null object
category       4191 non-null object
needed_note    157 non-null object
dtypes: float64(3), int64(1), object(5)
memory usage: 294.8+ KB


In [3]:
see_bonds = df_hof[df_hof['playerID'].str.contains('walkela01')]
see_bonds.head(50)

Unnamed: 0,playerID,yearID,votedBy,ballots,needed,votes,inducted,category,needed_note
3922,walkela01,2011,BBWAA,581.0,436.0,118.0,N,Player,
3955,walkela01,2012,BBWAA,573.0,430.0,131.0,N,Player,
3986,walkela01,2013,BBWAA,569.0,427.0,123.0,N,Player,
4036,walkela01,2014,BBWAA,571.0,429.0,58.0,N,Player,
4070,walkela01,2015,BBWAA,549.0,412.0,65.0,N,Player,
4102,walkela01,2016,BBWAA,440.0,330.0,68.0,N,Player,
4132,walkela01,2017,BBWAA,442.0,332.0,97.0,N,Player,
4166,walkela01,2018,BBWAA,422.0,317.0,144.0,N,Player,


In [4]:
# groupby HOFers only (exclude those eligible, but not voted in) just for reference
df_hof[df_hof.inducted=='Y'].category.value_counts()

Player               256
Pioneer/Executive     34
Manager               23
Umpire                10
Name: category, dtype: int64

In [5]:
# Drop managers, executives, umpires so we are only looking at players
df_hof = df_hof[df_hof['category']=='Player']
df_hof.inducted.value_counts()

N    3810
Y     256
Name: inducted, dtype: int64

In [6]:
# Drop columns
df_hof.drop(['needed_note','category','votedBy'], axis = 1, inplace = True)
df_hof.head()

Unnamed: 0,playerID,yearID,ballots,needed,votes,inducted
0,cobbty01,1936,226.0,170.0,222.0,Y
1,ruthba01,1936,226.0,170.0,215.0,Y
2,wagneho01,1936,226.0,170.0,215.0,Y
3,mathech01,1936,226.0,170.0,205.0,Y
4,johnswa01,1936,226.0,170.0,189.0,Y


In [7]:
# jeter and larry walker are HOFers (2020 HOF class)


df2 = {'playerID': 'jeterde01', 'yearID': 2020, 'ballots': 222, 'needed': 221, 
       'votes': 221, 'inducted': 'Y'}
df_hof = df_hof.append(df2, ignore_index=True)
df3 = {'playerID': 'walkela01', 'yearID': 2020, 'ballots': 222, 'needed': 221, 
       'votes': 221, 'inducted': 'Y'}
df_hof = df_hof.append(df3, ignore_index=True)
# df_hof.at['walkela01', 'inducted'] = 'Y'

In [8]:
df_hof.tail()

Unnamed: 0,playerID,yearID,ballots,needed,votes,inducted
4063,zambrca01,2018,422.0,317.0,0.0,N
4064,morrija02,2018,,,,Y
4065,trammal01,2018,,,,Y
4066,jeterde01,2020,222.0,221.0,221.0,Y
4067,walkela01,2020,222.0,221.0,221.0,Y


In [9]:
df_hof.inducted.value_counts()

N    3810
Y     258
Name: inducted, dtype: int64

In [10]:
df_hof.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4068 entries, 0 to 4067
Data columns (total 6 columns):
playerID    4068 non-null object
yearID      4068 non-null int64
ballots     3938 non-null float64
needed      3781 non-null float64
votes       3938 non-null float64
inducted    4068 non-null object
dtypes: float64(3), int64(1), object(2)
memory usage: 190.8+ KB


In [11]:
see_bonds = df_hof[df_hof['playerID'].str.contains('walkela01')]
see_bonds.head(50)

Unnamed: 0,playerID,yearID,ballots,needed,votes,inducted
3805,walkela01,2011,581.0,436.0,118.0,N
3837,walkela01,2012,573.0,430.0,131.0,N
3868,walkela01,2013,569.0,427.0,123.0,N
3913,walkela01,2014,571.0,429.0,58.0,N
3947,walkela01,2015,549.0,412.0,65.0,N
3979,walkela01,2016,440.0,330.0,68.0,N
4009,walkela01,2017,442.0,332.0,97.0,N
4041,walkela01,2018,422.0,317.0,144.0,N
4067,walkela01,2020,222.0,221.0,221.0,Y


In [12]:
df_hof.loc[df_hof['inducted']=='Y']

Unnamed: 0,playerID,yearID,ballots,needed,votes,inducted
0,cobbty01,1936,226.0,170.0,222.0,Y
1,ruthba01,1936,226.0,170.0,215.0,Y
2,wagneho01,1936,226.0,170.0,215.0,Y
3,mathech01,1936,226.0,170.0,205.0,Y
4,johnswa01,1936,226.0,170.0,189.0,Y
...,...,...,...,...,...,...
4034,hoffmtr01,2018,422.0,317.0,337.0,Y
4064,morrija02,2018,,,,Y
4065,trammal01,2018,,,,Y
4066,jeterde01,2020,222.0,221.0,221.0,Y


In [13]:
# Replace missing values with median for multiple columns
cols = ['ballots','needed','votes']
df_hof[cols] = df_hof[cols].fillna(df_hof.median().iloc[0])

In [14]:
# Create column for % of votes received
df_hof['percent_voted_yes'] = (df_hof['votes']/df_hof['ballots'])*100
df_hof.percent_voted_yes.round(decimals=2)
df_hof.head()

Unnamed: 0,playerID,yearID,ballots,needed,votes,inducted,percent_voted_yes
0,cobbty01,1936,226.0,170.0,222.0,Y,98.230088
1,ruthba01,1936,226.0,170.0,215.0,Y,95.132743
2,wagneho01,1936,226.0,170.0,215.0,Y,95.132743
3,mathech01,1936,226.0,170.0,205.0,Y,90.707965
4,johnswa01,1936,226.0,170.0,189.0,Y,83.628319


In [15]:
df_hof.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4068 entries, 0 to 4067
Data columns (total 7 columns):
playerID             4068 non-null object
yearID               4068 non-null int64
ballots              4068 non-null float64
needed               4068 non-null float64
votes                4068 non-null float64
inducted             4068 non-null object
percent_voted_yes    4068 non-null float64
dtypes: float64(4), int64(1), object(2)
memory usage: 222.6+ KB


In [16]:
df_hof.duplicated().sum()

0

In [17]:
df_hof_only = df_hof.loc[df_hof['inducted']=='Y']
df_hof_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258 entries, 0 to 4067
Data columns (total 7 columns):
playerID             258 non-null object
yearID               258 non-null int64
ballots              258 non-null float64
needed               258 non-null float64
votes                258 non-null float64
inducted             258 non-null object
percent_voted_yes    258 non-null float64
dtypes: float64(4), int64(1), object(2)
memory usage: 16.1+ KB


In [18]:
df_hof_only = df_hof_only[['playerID','inducted']]
df_hof_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258 entries, 0 to 4067
Data columns (total 2 columns):
playerID    258 non-null object
inducted    258 non-null object
dtypes: object(2)
memory usage: 6.0+ KB


# Database: People

In [19]:
df_names = pd.read_csv('data/People.csv')
df_names.head()
print(df_names.shape)

(20093, 24)


In [20]:
# Drop columns
df_names.drop(['birthYear','birthMonth','birthDay','birthCity','deathYear','deathMonth','deathDay','weight',
               'deathCountry','deathState','deathCity','debut','finalGame'], axis = 1, inplace = True)
df_names.head()

Unnamed: 0,playerID,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,retroID,bbrefID
0,aardsda01,USA,CO,David,Aardsma,David Allan,75.0,R,R,aardd001,aardsda01
1,aaronha01,USA,AL,Hank,Aaron,Henry Louis,72.0,R,R,aaroh101,aaronha01
2,aaronto01,USA,AL,Tommie,Aaron,Tommie Lee,75.0,R,R,aarot101,aaronto01
3,aasedo01,USA,CA,Don,Aase,Donald William,75.0,R,R,aased001,aasedo01
4,abadan01,USA,FL,Andy,Abad,Fausto Andres,73.0,L,L,abada001,abadan01


In [21]:
df_names.isna().sum()

playerID           0
birthCountry      61
birthState       532
nameFirst         37
nameLast           0
nameGiven         37
height           736
bats            1180
throws           976
retroID           56
bbrefID            2
dtype: int64

In [22]:
# Drop rows where nulls in 'nameFirst' aka keep the rows that are notna
df_names = df_names[df_names['nameFirst'].notna()]

In [23]:
# Replace missing values with 'unknown' and median
df_names['birthCountry'] = df_names['birthCountry'].fillna('unknown')
df_names['birthState'] = df_names['birthState'].fillna('unknown')
df_names['retroID'] = df_names['retroID'].fillna('unknown')
df_names['bbrefID'] = df_names['bbrefID'].fillna('unknown')
df_names['height'] = df_names['height'].fillna(value=df_names['height'].median)

In [24]:
df_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20056 entries, 0 to 20092
Data columns (total 11 columns):
playerID        20056 non-null object
birthCountry    20056 non-null object
birthState      20056 non-null object
nameFirst       20056 non-null object
nameLast        20056 non-null object
nameGiven       20056 non-null object
height          20056 non-null object
bats            18913 non-null object
throws          19117 non-null object
retroID         20056 non-null object
bbrefID         20056 non-null object
dtypes: object(11)
memory usage: 1.8+ MB


Below I am going to replace the missing values in 'bats' and 'throws' with the percentage of that of the respective column for the entire dataset. 

In [25]:
df_names.bats.value_counts(normalize=True)

R    0.657590
L    0.277428
B    0.064982
Name: bats, dtype: float64

In [26]:
df_names.throws.value_counts(normalize=True)

R    0.798295
L    0.201653
S    0.000052
Name: throws, dtype: float64

In [27]:
# getting values and associated probabilites
options  = df_names.throws.value_counts(normalize=True).index.to_list()
percents = df_names.throws.value_counts(normalize=True).to_list()
#using np.random.choice to select
df_names['throws'] = df_names['throws'].apply(lambda x: np.random.choice(options,1, True,percents)[0] if (not isinstance(x, str)) else x)
df_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20056 entries, 0 to 20092
Data columns (total 11 columns):
playerID        20056 non-null object
birthCountry    20056 non-null object
birthState      20056 non-null object
nameFirst       20056 non-null object
nameLast        20056 non-null object
nameGiven       20056 non-null object
height          20056 non-null object
bats            18913 non-null object
throws          20056 non-null object
retroID         20056 non-null object
bbrefID         20056 non-null object
dtypes: object(11)
memory usage: 1.8+ MB


In [28]:
# getting values and associated probabilites
options  = df_names.bats.value_counts(normalize=True).index.to_list()
percents = df_names.bats.value_counts(normalize=True).to_list()
#using np.random.choice to select
df_names['bats'] = df_names['bats'].apply(lambda x: np.random.choice(options,1, True,percents)[0] if (not isinstance(x, str)) else x)
df_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20056 entries, 0 to 20092
Data columns (total 11 columns):
playerID        20056 non-null object
birthCountry    20056 non-null object
birthState      20056 non-null object
nameFirst       20056 non-null object
nameLast        20056 non-null object
nameGiven       20056 non-null object
height          20056 non-null object
bats            20056 non-null object
throws          20056 non-null object
retroID         20056 non-null object
bbrefID         20056 non-null object
dtypes: object(11)
memory usage: 1.8+ MB


In [29]:
df_names.bats.value_counts(normalize=True)

R    0.657260
L    0.278321
B    0.064420
Name: bats, dtype: float64

In [30]:
df_names.throws.value_counts(normalize=True)

R    0.797766
L    0.202184
S    0.000050
Name: throws, dtype: float64

In [31]:
df_names.head()

Unnamed: 0,playerID,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,retroID,bbrefID
0,aardsda01,USA,CO,David,Aardsma,David Allan,75,R,R,aardd001,aardsda01
1,aaronha01,USA,AL,Hank,Aaron,Henry Louis,72,R,R,aaroh101,aaronha01
2,aaronto01,USA,AL,Tommie,Aaron,Tommie Lee,75,R,R,aarot101,aaronto01
3,aasedo01,USA,CA,Don,Aase,Donald William,75,R,R,aased001,aasedo01
4,abadan01,USA,FL,Andy,Abad,Fausto Andres,73,L,L,abada001,abadan01


# Database: Batting

In [32]:
df_batting = pd.read_csv('data/Batting.csv')
df_batting.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0


In [33]:
# Modern Era of Baseball is 1900 forward and dropping 2020 as it is not a full year
df_batting = df_batting[df_batting['yearID']> 1899]
df_batting = df_batting[df_batting['yearID']!= 2020]

In [34]:
df_batting.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99515 entries, 7914 to 107428
Data columns (total 22 columns):
playerID    99515 non-null object
yearID      99515 non-null int64
stint       99515 non-null int64
teamID      99515 non-null object
lgID        99515 non-null object
G           99515 non-null int64
AB          99515 non-null int64
R           99515 non-null int64
H           99515 non-null int64
2B          99515 non-null int64
3B          99515 non-null int64
HR          99515 non-null int64
RBI         99515 non-null float64
SB          99515 non-null float64
CS          83151 non-null float64
BB          99515 non-null int64
SO          98896 non-null float64
IBB         70779 non-null float64
HBP         99514 non-null float64
SH          99515 non-null float64
SF          71326 non-null float64
GIDP        81251 non-null float64
dtypes: float64(9), int64(10), object(3)
memory usage: 17.5+ MB


In [35]:
# Drop columns
df_batting.drop(['CS','IBB','SF','GIDP','HBP','SH','stint'], axis = 1, inplace = True)
df_batting.head()

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
7914,allenbo01,1900,CIN,NL,5,15,0,2,1,0,0,1.0,0.0,0,4.0
7915,baileha01,1900,BSN,NL,4,9,2,2,0,1,0,1.0,0.0,0,2.0
7916,barreji01,1900,CIN,NL,137,545,114,172,11,7,5,42.0,44.0,72,63.0
7917,barrysh01,1900,BSN,NL,81,254,40,66,10,7,1,37.0,9.0,13,16.0
7918,beaumgi01,1900,PIT,NL,138,567,105,158,14,9,5,50.0,27.0,40,34.0


In [36]:
df_batting.isna().sum()

playerID      0
yearID        0
teamID        0
lgID          0
G             0
AB            0
R             0
H             0
2B            0
3B            0
HR            0
RBI           0
SB            0
BB            0
SO          619
dtype: int64

Below I'm identifying those players with an avg 'G' < 60 (avg 60 games played per year) to remove from df_batting.
- identified
- join on playerID and take only the 'left' which would just be whats left in df_batting, excluding those w/ avg < 60

In [37]:
# first remove players with < 6 seasons played
remove_min6seasons = df_batting.groupby(['playerID'])['yearID'].count()
remove_min6seasons.sort_values(ascending=False).head(10)

playerID
henderi01    29
newsobo01    29
johnto01     28
kaatji01     28
moyerja01    27
baineha01    27
ryanno01     27
carltst01    27
niekrph01    26
oroscje01    26
Name: yearID, dtype: int64

In [38]:
filter_min6seasons = pd.DataFrame(data=remove_min6seasons)
filter_min6seasons.head()

Unnamed: 0_level_0,yearID
playerID,Unnamed: 1_level_1
aardsda01,9
aaronha01,23
aaronto01,7
aasedo01,13
abadan01,3


In [39]:
# df now has only players with 6+ years played
filter_min6seasons = filter_min6seasons[filter_min6seasons['yearID'] > 5]

In [40]:
df_batting.set_index('playerID', inplace=True)

In [41]:
filter_min6seasons.head()

Unnamed: 0_level_0,yearID
playerID,Unnamed: 1_level_1
aardsda01,9
aaronha01,23
aaronto01,7
aasedo01,13
abadfe01,10


In [42]:
# using join as a filter to filter out those with < 6 seasons played
df_batting_min6s = df_batting.join(filter_min6seasons, how='inner', rsuffix=("DROP")).filter(regex="^(?!.*DROP)")
df_batting_min6s.head()

Unnamed: 0_level_0,yearID,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
aardsda01,2004,SFN,NL,11,0,0,0,0,0,0,0.0,0.0,0,0.0
aardsda01,2006,CHN,NL,45,2,0,0,0,0,0,0.0,0.0,0,0.0
aardsda01,2007,CHA,AL,25,0,0,0,0,0,0,0.0,0.0,0,0.0
aardsda01,2008,BOS,AL,47,1,0,0,0,0,0,0.0,0.0,0,1.0
aardsda01,2009,SEA,AL,73,0,0,0,0,0,0,0.0,0.0,0,0.0


In [43]:
df_batting_min6s.reset_index(inplace=True)

In [44]:
df_batting_min6s.describe()

Unnamed: 0,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
count,74477.0,74477.0,74477.0,74477.0,74477.0,74477.0,74477.0,74477.0,74477.0,74477.0,74477.0,74145.0
mean,1974.104448,60.35427,165.724237,21.700995,43.698726,7.612713,1.372692,3.752085,20.220511,3.123434,15.846516,24.958406
std,31.9939,48.575241,196.85422,29.663696,56.312242,10.561128,2.62728,7.280889,28.45781,7.418858,22.639487,30.997177
min,1900.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1951.0,21.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,1981.0,42.0,67.0,6.0,13.0,2.0,0.0,0.0,5.0,0.0,4.0,13.0
75%,2001.0,99.0,300.0,36.0,77.0,13.0,2.0,4.0,32.0,3.0,25.0,37.0
max,2019.0,165.0,716.0,177.0,262.0,67.0,36.0,73.0,191.0,130.0,232.0,223.0


In [45]:
see_trout = df_batting_min6s[df_batting_min6s['playerID'].str.contains('troutmi01')]
see_trout.head(30)

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
67881,troutmi01,2011,LAA,AL,40,123,20,27,6,0,5,16.0,4.0,9,30.0
67882,troutmi01,2012,LAA,AL,139,559,129,182,27,8,30,83.0,49.0,67,139.0
67883,troutmi01,2013,LAA,AL,157,589,109,190,39,9,27,97.0,33.0,110,136.0
67884,troutmi01,2014,LAA,AL,157,602,115,173,39,9,36,111.0,16.0,83,184.0
67885,troutmi01,2015,LAA,AL,159,575,104,172,32,6,41,90.0,11.0,92,158.0
67886,troutmi01,2016,LAA,AL,159,549,123,173,32,5,29,100.0,30.0,116,137.0
67887,troutmi01,2017,LAA,AL,114,402,92,123,25,3,33,72.0,22.0,94,90.0
67888,troutmi01,2018,LAA,AL,140,471,101,147,24,4,39,79.0,24.0,122,124.0
67889,troutmi01,2019,LAA,AL,134,470,110,137,27,2,45,104.0,11.0,110,120.0


In [46]:
# remove players with a mean of < 70 games played per season
remove_players = df_batting_min6s.groupby(['playerID'])['G'].mean()
remove_players.sort_values(ascending=False).head()

playerID
hosmeer01    151.666667
markani01    151.214286
abreujo02    150.166667
santoro01    149.533333
puckeki01    148.583333
Name: G, dtype: float64

In [47]:
see_mills = df_batting_min6s[df_batting_min6s['playerID'].str.contains('millsbr02')]
see_mills.head(30)

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
45433,millsbr02,2009,TOR,AL,2,1,0,0,0,0,0,0.0,0.0,0,0.0
45434,millsbr02,2010,TOR,AL,7,0,0,0,0,0,0,0.0,0.0,0,0.0
45435,millsbr02,2011,TOR,AL,5,0,0,0,0,0,0,0.0,0.0,0,0.0
45436,millsbr02,2012,LAA,AL,1,0,0,0,0,0,0,0.0,0.0,0,0.0
45437,millsbr02,2014,OAK,AL,3,3,0,0,0,0,0,0.0,0.0,0,1.0
45438,millsbr02,2014,TOR,AL,2,0,0,0,0,0,0,0.0,0.0,0,0.0
45439,millsbr02,2015,OAK,AL,1,0,0,0,0,0,0,0.0,0.0,0,0.0


In [48]:
filter_players = pd.DataFrame(data=remove_players)

In [49]:
filter_players.describe()

Unnamed: 0,G
count,6776.0
mean,56.253869
std,34.420888
min,3.0
25%,26.7
50%,45.923077
75%,83.787879
max,151.666667


In [50]:
filter_players.reset_index()

Unnamed: 0,playerID,G
0,aardsda01,36.777778
1,aaronha01,143.391304
2,aaronto01,62.428571
3,aasedo01,34.461538
4,abadfe01,38.400000
...,...,...
6771,zoldasa01,25.100000
6772,zuberbi01,18.666667
6773,zuninmi01,96.714286
6774,zuvelpa01,23.222222


In [51]:
df_filtered = filter_players[filter_players['G'] < 70]

In [52]:
df_filtered.reset_index(inplace=True)

In [53]:
df_filtered.head()

Unnamed: 0,playerID,G
0,aardsda01,36.777778
1,aaronto01,62.428571
2,aasedo01,34.461538
3,abadfe01,38.4
4,abbotgl01,20.666667


In [54]:
df_filtered.drop('G',axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [55]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4460 entries, 0 to 4459
Data columns (total 1 columns):
playerID    4460 non-null object
dtypes: object(1)
memory usage: 35.0+ KB


In [56]:
# put the 4k+ players in a list to then drop from the df_batting
drop_players = df_filtered['playerID'].tolist()

In [57]:
df_batting_min6s.head()

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
0,aardsda01,2004,SFN,NL,11,0,0,0,0,0,0,0.0,0.0,0,0.0
1,aardsda01,2006,CHN,NL,45,2,0,0,0,0,0,0.0,0.0,0,0.0
2,aardsda01,2007,CHA,AL,25,0,0,0,0,0,0,0.0,0.0,0,0.0
3,aardsda01,2008,BOS,AL,47,1,0,0,0,0,0,0.0,0.0,0,1.0
4,aardsda01,2009,SEA,AL,73,0,0,0,0,0,0,0.0,0.0,0,0.0


In [58]:
# the ~ means 'not in' , so keep playerIDs that are NOT IN the list drop_players
df_batting_final = df_batting_min6s[~df_batting_min6s['playerID'].isin(drop_players)]

In [59]:
df_batting_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29153 entries, 9 to 74457
Data columns (total 15 columns):
playerID    29153 non-null object
yearID      29153 non-null int64
teamID      29153 non-null object
lgID        29153 non-null object
G           29153 non-null int64
AB          29153 non-null int64
R           29153 non-null int64
H           29153 non-null int64
2B          29153 non-null int64
3B          29153 non-null int64
HR          29153 non-null int64
RBI         29153 non-null float64
SB          29153 non-null float64
BB          29153 non-null int64
SO          29004 non-null float64
dtypes: float64(3), int64(9), object(3)
memory usage: 3.6+ MB


In [60]:
# Replace nulls with median
df_batting_final.fillna(df_batting_final.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [61]:
df_batting_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29153 entries, 9 to 74457
Data columns (total 15 columns):
playerID    29153 non-null object
yearID      29153 non-null int64
teamID      29153 non-null object
lgID        29153 non-null object
G           29153 non-null int64
AB          29153 non-null int64
R           29153 non-null int64
H           29153 non-null int64
2B          29153 non-null int64
3B          29153 non-null int64
HR          29153 non-null int64
RBI         29153 non-null float64
SB          29153 non-null float64
BB          29153 non-null int64
SO          29153 non-null float64
dtypes: float64(3), int64(9), object(3)
memory usage: 3.6+ MB


In [62]:
df_batting_final.describe()

Unnamed: 0,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
count,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0
mean,1971.492162,99.841766,338.643776,46.674202,92.472164,16.303434,2.969403,8.376085,43.335986,7.0614,33.800535,47.18756
std,32.727267,47.621109,194.092168,32.096188,57.897749,11.52946,3.434569,9.583142,31.745084,10.305717,25.574952,35.368531
min,1900.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1947.0,63.0,165.0,19.0,41.0,6.0,0.0,1.0,17.0,1.0,13.0,19.0
50%,1978.0,112.0,364.0,44.0,95.0,15.0,2.0,5.0,40.0,3.0,30.0,40.0
75%,1999.0,142.0,512.0,71.0,141.0,25.0,4.0,12.0,64.0,9.0,49.0,68.0
max,2019.0,165.0,716.0,177.0,262.0,67.0,36.0,73.0,191.0,130.0,232.0,223.0


In [63]:
df_batting_final.duplicated().sum()

0

In [64]:
df_batting_final.G.sort_values(ascending=True).head(40)

41610    1
38705    1
57804    1
70263    1
59933    1
59932    1
9323     1
21169    1
40099    1
15622    1
24149    1
45210    1
62407    1
50916    1
19034    1
21060    1
62683    1
5780     1
20928    1
15466    1
28560    1
15465    1
63950    1
2172     1
2171     1
28843    1
2170     1
23758    1
60143    1
30796    1
9026     1
8372     1
40731    1
64243    1
13830    1
13847    1
11340    1
62189    1
13858    1
49821    1
Name: G, dtype: int64

# Database: Appearances

In [65]:
df_position_name = pd.read_csv("data/Fielding.csv")
df_position_name.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144768 entries, 0 to 144767
Data columns (total 18 columns):
playerID    144768 non-null object
yearID      144768 non-null int64
stint       144768 non-null int64
teamID      144768 non-null object
lgID        143256 non-null object
POS         144768 non-null object
G           144768 non-null int64
GS          98612 non-null float64
InnOuts     114839 non-null float64
PO          144768 non-null int64
A           144768 non-null int64
E           144767 non-null float64
DP          144768 non-null int64
PB          11709 non-null float64
WP          1169 non-null float64
SB          8922 non-null float64
CS          8922 non-null float64
ZR          1169 non-null float64
dtypes: float64(8), int64(6), object(4)
memory usage: 19.9+ MB


In [66]:
df_position_name.POS.value_counts()

P     48400
OF    29778
3B    14741
1B    14416
2B    13559
SS    12165
C     11709
Name: POS, dtype: int64

In [67]:
# filter out pitchers by identifying where rows = P
df_position_name.drop(df_position_name.loc[df_position_name['POS']=='P'].index, inplace=True)

In [68]:
df_position_name.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96368 entries, 0 to 144767
Data columns (total 18 columns):
playerID    96368 non-null object
yearID      96368 non-null int64
stint       96368 non-null int64
teamID      96368 non-null object
lgID        94988 non-null object
POS         96368 non-null object
G           96368 non-null int64
GS          63259 non-null float64
InnOuts     76351 non-null float64
PO          96368 non-null int64
A           96368 non-null int64
E           96368 non-null float64
DP          96368 non-null int64
PB          11709 non-null float64
WP          1169 non-null float64
SB          8922 non-null float64
CS          8922 non-null float64
ZR          1169 non-null float64
dtypes: float64(8), int64(6), object(4)
memory usage: 14.0+ MB


In [69]:
# player_pos = df_position_name.sort_values('G', ascending=False).drop_duplicates(['playerID','POS'])
# pp2 = player_pos.groupby(['playerID','POS'])['G'].max()
# pp2.head()

In [70]:
# df_position_name.groupby(['playerID','POS']).apply(lambda x: x[x['POS']==x['POS'].max()])

In [71]:
# new method
idx = df_position_name.groupby(['playerID','POS'])['G'].transform(max) == df_position_name['G']
pp_df = df_position_name[idx]

In [72]:
pp_df.groupby(['playerID','POS'])['G'].max()

playerID   POS
aaronha01  1B     109
           2B      27
           3B       5
           OF     161
aaronto01  1B     110
                 ... 
zuvelpa01  2B      42
           3B       5
           SS      49
zwilldu01  1B       3
           OF     154
Name: G, Length: 23953, dtype: int64

In [73]:
pp_df.drop(columns=['yearID','stint','teamID','lgID','GS','InnOuts','PO','A','E','DP','PB','WP','SB','CS','ZR'], axis=1, inplace=True)
pp_df.head()

Unnamed: 0,playerID,POS,G
0,abercda01,SS,1
2,addybo01,SS,3
3,allisar01,2B,2
11,armstbo01,OF,12
12,barkeal01,OF,1


In [74]:
pp_sorted = pp_df.sort_values(by='G',ascending=False)

In [75]:
pp_sorted.drop_duplicates(subset='playerID',keep='first',inplace=True)

In [76]:
pp_sorted.shape

(11115, 3)

In [77]:
pp_sorted.sort_values(by='playerID',ascending=True)

Unnamed: 0,playerID,POS,G
58728,aaronha01,OF,161
57774,aaronto01,1B,110
111671,abadan01,1B,7
1046,abadijo01,1B,11
17008,abbated01,SS,154
...,...,...,...
132948,zuninmi01,C,130
96954,zupcibo01,OF,122
54518,zupofr01,C,8
89689,zuvelpa01,SS,49


In [78]:
see_before = df_position_name.groupby(['playerID','POS'])['G'].max()
see_before.tail(50)

playerID   POS
zimmedo01  OF       4
           SS     114
zimmeed01  3B     122
zimmehe01  1B      22
           2B     108
           3B     149
           OF       8
           SS      26
zimmeje01  C      104
zimmero01  1B      25
           OF       1
zimmery01  1B     143
           3B     161
           OF      30
           SS       1
zinnfr01   C        2
zinngu01   OF     106
zinnji01   OF       1
zinteal01  1B       8
           C        1
zipfebu01  1B      44
           OF      23
ziskri01   OF     152
zitzmbi01  1B       5
           3B       3
           OF      89
           SS       8
zobribe01  1B      14
           2B     131
           3B       4
           OF     110
           SS      52
zoccope01  OF       7
zoskyed01  2B       2
           3B       4
           SS      18
zuberjo01  1B      22
           OF       5
zuletju01  1B      35
           OF       6
zuninmi01  C      130
zupcibo01  1B       1
           3B       2
           OF     122
zupofr01   C     

In [79]:
see_before1 = df_position_name[df_position_name['playerID']=='lindsch02']
see_before1.head(30)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
54942,lindsch02,1958,1,CHA,AL,C,1,0.0,15.0,2,0,0.0,0,1.0,0.0,0.0,0.0,0.0


In [80]:
see_after = pp_sorted[pp_sorted['playerID']=='zobribe01']
see_after.head(30)

Unnamed: 0,playerID,POS,G
127425,zobribe01,2B,131


In [81]:
pp_sorted.drop('G',axis=1,inplace=True)

In [82]:
pp_sorted.head()

Unnamed: 0,playerID,POS
58699,willsma01,SS
61431,santoro01,3B
58440,paganjo01,SS
61608,willibi01,OF
60589,wagnele01,OF


# Database: Awards

In [83]:
df_awards = pd.read_csv("data/AwardsPlayers.csv")
df_awards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6236 entries, 0 to 6235
Data columns (total 6 columns):
playerID    6236 non-null object
awardID     6236 non-null object
yearID      6236 non-null int64
lgID        6235 non-null object
tie         49 non-null object
notes       4768 non-null object
dtypes: int64(1), object(5)
memory usage: 292.4+ KB


In [84]:
df_awards.drop(columns=['tie','notes','lgID'], axis=1, inplace=True)

In [85]:
df_awards.duplicated().sum()

511

In [86]:
df_awards.drop_duplicates(keep='first',inplace=True)

In [87]:
# Modern Era of Baseball is 1900 forward
df_awards = df_awards[df_awards['yearID']> 1899]

In [88]:
df_awards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5716 entries, 9 to 6235
Data columns (total 3 columns):
playerID    5716 non-null object
awardID     5716 non-null object
yearID      5716 non-null int64
dtypes: int64(1), object(2)
memory usage: 178.6+ KB


In [89]:
df_awards.head()

Unnamed: 0,playerID,awardID,yearID
9,youngcy01,Pitching Triple Crown,1901
10,lajoina01,Triple Crown,1901
11,wadderu01,Pitching Triple Crown,1905
12,mathech01,Pitching Triple Crown,1905
13,chaseha01,Baseball Magazine All-Star,1908


In [90]:
df_awards.awardID.value_counts()

TSN All-Star                           1391
Gold Glove                             1091
Baseball Magazine All-Star             1009
Silver Slugger                          685
Most Valuable Player                    196
Rookie of the Year                      142
TSN Pitcher of the Year                 137
Cy Young Award                          114
TSN Player of the Year                   92
TSN Fireman of the Year                  88
TSN Major League Player of the Year      82
Rolaids Relief Man Award                 74
World Series MVP                         65
Babe Ruth Award                          64
Lou Gehrig Memorial Award                57
All-Star Game MVP                        55
Roberto Clemente Award                   48
Hutch Award                              48
NLCS MVP                                 43
Hank Aaron Award                         38
ALCS MVP                                 37
TSN Guide MVP                            33
Pitching Triple Crown           

In [91]:
# filter on awards I care about. filter based on value in row. filter on value in column.
keep_awards = ['Gold Glove','Silver Slugger','Most Valuable Player','Rookie of the Year','World Series MVP','Babe Ruth Award',
               'Lou Gehrig Memorial Award','All-Star Game MVP','Roberto Clemente Award','Hutch Award','NLCS MVP','Hank Aaron Award',
               'ALCS MVP','Comeback Player of the Year','Triple Crown','Baseball Magazine All-Star','TSN All-Star']
df_awards = df_awards.loc[df_awards['awardID'].isin(keep_awards)]

In [92]:
df_awards.awardID.value_counts()

TSN All-Star                   1391
Gold Glove                     1091
Baseball Magazine All-Star     1009
Silver Slugger                  685
Most Valuable Player            196
Rookie of the Year              142
World Series MVP                 65
Babe Ruth Award                  64
Lou Gehrig Memorial Award        57
All-Star Game MVP                55
Roberto Clemente Award           48
Hutch Award                      48
NLCS MVP                         43
Hank Aaron Award                 38
ALCS MVP                         37
Comeback Player of the Year      26
Triple Crown                     14
Name: awardID, dtype: int64

In [93]:
df_awards.head()

Unnamed: 0,playerID,awardID,yearID
10,lajoina01,Triple Crown,1901
13,chaseha01,Baseball Magazine All-Star,1908
14,lajoina01,Baseball Magazine All-Star,1908
15,lordha01,Baseball Magazine All-Star,1908
16,crigelo01,Baseball Magazine All-Star,1908


In [94]:
dummies = pd.get_dummies(df_awards['awardID'], prefix='award', drop_first=True)

In [95]:
df_awards.drop(['awardID'], axis=1, inplace=True)
df_awards_final = pd.concat([df_awards, dummies], axis=1)
df_awards_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5009 entries, 10 to 6235
Data columns (total 18 columns):
playerID                             5009 non-null object
yearID                               5009 non-null int64
award_All-Star Game MVP              5009 non-null uint8
award_Babe Ruth Award                5009 non-null uint8
award_Baseball Magazine All-Star     5009 non-null uint8
award_Comeback Player of the Year    5009 non-null uint8
award_Gold Glove                     5009 non-null uint8
award_Hank Aaron Award               5009 non-null uint8
award_Hutch Award                    5009 non-null uint8
award_Lou Gehrig Memorial Award      5009 non-null uint8
award_Most Valuable Player           5009 non-null uint8
award_NLCS MVP                       5009 non-null uint8
award_Roberto Clemente Award         5009 non-null uint8
award_Rookie of the Year             5009 non-null uint8
award_Silver Slugger                 5009 non-null uint8
award_TSN All-Star                  

In [96]:
df_awards_final.head()

Unnamed: 0,playerID,yearID,award_All-Star Game MVP,award_Babe Ruth Award,award_Baseball Magazine All-Star,award_Comeback Player of the Year,award_Gold Glove,award_Hank Aaron Award,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
10,lajoina01,1901,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
13,chaseha01,1908,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
14,lajoina01,1908,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
15,lordha01,1908,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
16,crigelo01,1908,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [97]:
df_awards_final.reset_index(inplace=True)

In [98]:
awards_grouped = df_awards_final.groupby(['playerID','yearID']).agg('sum')
df_a2 = pd.DataFrame(data=awards_grouped)

In [99]:
df_a2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,award_All-Star Game MVP,award_Babe Ruth Award,award_Baseball Magazine All-Star,award_Comeback Player of the Year,award_Gold Glove,award_Hank Aaron Award,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
playerID,yearID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
aaronha01,1956,2090,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
aaronha01,1957,2116,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
aaronha01,1958,4319,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
aaronha01,1959,4403,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
aaronha01,1960,2238,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [100]:
df_a2.reset_index(inplace=True)

In [101]:
see_trout = df_a2[df_a2['playerID'].str.contains('pujol')]
see_trout.head(30)

Unnamed: 0,playerID,yearID,index,award_All-Star Game MVP,award_Babe Ruth Award,award_Baseball Magazine All-Star,award_Comeback Player of the Year,award_Gold Glove,award_Hank Aaron Award,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
2498,pujolal01,2001,10158,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2499,pujolal01,2003,15752,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0
2500,pujolal01,2004,15990,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0
2501,pujolal01,2005,10824,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2502,pujolal01,2006,5461,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2503,pujolal01,2008,16941,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0
2504,pujolal01,2009,22822,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0
2505,pujolal01,2010,11532,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [102]:
df_a2.drop('index', axis=1, inplace=True)
df_a2.head()

Unnamed: 0,playerID,yearID,award_All-Star Game MVP,award_Babe Ruth Award,award_Baseball Magazine All-Star,award_Comeback Player of the Year,award_Gold Glove,award_Hank Aaron Award,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
0,aaronha01,1956,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,aaronha01,1957,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,aaronha01,1958,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,aaronha01,1959,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,aaronha01,1960,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


# Join DBs to create final DB:
- df_hof
- df_names
- df_batting_final
- pp_sorted
- df_awards_final

Do we still want pitching stats or am I only going to focus on hitting stats?


In [103]:
df_names.head()

Unnamed: 0,playerID,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,retroID,bbrefID
0,aardsda01,USA,CO,David,Aardsma,David Allan,75,R,R,aardd001,aardsda01
1,aaronha01,USA,AL,Hank,Aaron,Henry Louis,72,R,R,aaroh101,aaronha01
2,aaronto01,USA,AL,Tommie,Aaron,Tommie Lee,75,R,R,aarot101,aaronto01
3,aasedo01,USA,CA,Don,Aase,Donald William,75,R,R,aased001,aasedo01
4,abadan01,USA,FL,Andy,Abad,Fausto Andres,73,L,L,abada001,abadan01


In [104]:
df_hof_only.head()

Unnamed: 0,playerID,inducted
0,cobbty01,Y
1,ruthba01,Y
2,wagneho01,Y
3,mathech01,Y
4,johnswa01,Y


In [105]:
# jeter and larry walker are HOFers (2020 HOF class)
df_hof_only.at['jeterde01', 'inducted'] = 'Y'
df_hof_only.at['walkela01', 'inducted'] = 'Y'

In [106]:
df_hof_only.inducted.value_counts()

Y    260
Name: inducted, dtype: int64

In [107]:
df_hof_only.info()

<class 'pandas.core.frame.DataFrame'>
Index: 260 entries, 0 to walkela01
Data columns (total 2 columns):
playerID    258 non-null object
inducted    260 non-null object
dtypes: object(2)
memory usage: 16.1+ KB


In [108]:
df_hof_only.set_index('playerID', inplace=True)

In [109]:
df_names.set_index('playerID', inplace=True)

# Join 1:

In [110]:
hof_with_names = df_names.join(df_hof_only, how='left')
hof_with_names.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20056 entries, aardsda01 to zychto01
Data columns (total 11 columns):
birthCountry    20056 non-null object
birthState      20056 non-null object
nameFirst       20056 non-null object
nameLast        20056 non-null object
nameGiven       20056 non-null object
height          20056 non-null object
bats            20056 non-null object
throws          20056 non-null object
retroID         20056 non-null object
bbrefID         20056 non-null object
inducted        258 non-null object
dtypes: object(11)
memory usage: 1.8+ MB


In [111]:
hof_with_names.loc[hof_with_names['inducted']=='Y']

Unnamed: 0_level_0,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,retroID,bbrefID,inducted
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
aaronha01,USA,AL,Hank,Aaron,Henry Louis,72,R,R,aaroh101,aaronha01,Y
alexape01,USA,NE,Pete,Alexander,Grover Cleveland,73,R,R,alexg102,alexape01,Y
alomaro01,P.R.,unknown,Roberto,Alomar,Roberto,72,B,R,alomr001,alomaro01,Y
ansonca01,USA,IA,Cap,Anson,Adrian Constantine,72,R,R,ansoc101,ansonca01,Y
aparilu01,Venezuela,Zulia,Luis,Aparicio,Luis Ernesto,69,R,R,aparl101,aparilu01,Y
...,...,...,...,...,...,...,...,...,...,...,...
wynnea01,USA,AL,Early,Wynn,Early,72,B,R,wynne101,wynnea01,Y
yastrca01,USA,NY,Carl,Yastrzemski,Carl Michael,71,L,R,yastc101,yastrca01,Y
youngcy01,USA,OH,Cy,Young,Denton True,74,R,R,younc102,youngcy01,Y
youngro01,USA,TX,Ross,Youngs,Royce Middlebrook,68,L,R,younr101,youngro01,Y


In [112]:
hof_with_names.fillna(0, inplace=True)

In [113]:
hof_with_names.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20056 entries, aardsda01 to zychto01
Data columns (total 11 columns):
birthCountry    20056 non-null object
birthState      20056 non-null object
nameFirst       20056 non-null object
nameLast        20056 non-null object
nameGiven       20056 non-null object
height          20056 non-null object
bats            20056 non-null object
throws          20056 non-null object
retroID         20056 non-null object
bbrefID         20056 non-null object
inducted        20056 non-null object
dtypes: object(11)
memory usage: 2.5+ MB


In [114]:
hof_with_names.head()

Unnamed: 0_level_0,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,retroID,bbrefID,inducted
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
aardsda01,USA,CO,David,Aardsma,David Allan,75,R,R,aardd001,aardsda01,0
aaronha01,USA,AL,Hank,Aaron,Henry Louis,72,R,R,aaroh101,aaronha01,Y
aaronto01,USA,AL,Tommie,Aaron,Tommie Lee,75,R,R,aarot101,aaronto01,0
aasedo01,USA,CA,Don,Aase,Donald William,75,R,R,aased001,aasedo01,0
abadan01,USA,FL,Andy,Abad,Fausto Andres,73,L,L,abada001,abadan01,0


In [115]:
hof_with_names.inducted.value_counts()

0    19798
Y      258
Name: inducted, dtype: int64

In [116]:
hof_with_names.reset_index(inplace=True)

In [117]:
see_trout2 = hof_with_names[hof_with_names['playerID'].str.contains('bonds')]
see_trout2.head(30)

Unnamed: 0,playerID,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,retroID,bbrefID,inducted
1653,bondsba01,USA,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
1654,bondsbo01,USA,CA,Bobby,Bonds,Bobby Lee,73,R,R,bondb101,bondsbo01,0


# Join 2:

In [118]:
hof_with_names.set_index(['playerID'], inplace=True)

In [119]:
df_batting_final.set_index(['playerID'], inplace=True)

In [120]:
batting_with_hofnames = df_batting_final.join(hof_with_names, how='left', rsuffix=("DROP")).filter(regex="^(?!.*DROP)")
batting_with_hofnames.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29153 entries, aaronha01 to zuninmi01
Data columns (total 25 columns):
yearID          29153 non-null int64
teamID          29153 non-null object
lgID            29153 non-null object
G               29153 non-null int64
AB              29153 non-null int64
R               29153 non-null int64
H               29153 non-null int64
2B              29153 non-null int64
3B              29153 non-null int64
HR              29153 non-null int64
RBI             29153 non-null float64
SB              29153 non-null float64
BB              29153 non-null int64
SO              29153 non-null float64
birthCountry    29153 non-null object
birthState      29153 non-null object
nameFirst       29153 non-null object
nameLast        29153 non-null object
nameGiven       29153 non-null object
height          29153 non-null object
bats            29153 non-null object
throws          29153 non-null object
retroID         29153 non-null object
bbrefID         

In [121]:
batting_with_hofnames.describe()

Unnamed: 0,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
count,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0
mean,1971.492162,99.841766,338.643776,46.674202,92.472164,16.303434,2.969403,8.376085,43.335986,7.0614,33.800535,47.18756
std,32.727267,47.621109,194.092168,32.096188,57.897749,11.52946,3.434569,9.583142,31.745084,10.305717,25.574952,35.368531
min,1900.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1947.0,63.0,165.0,19.0,41.0,6.0,0.0,1.0,17.0,1.0,13.0,19.0
50%,1978.0,112.0,364.0,44.0,95.0,15.0,2.0,5.0,40.0,3.0,30.0,40.0
75%,1999.0,142.0,512.0,71.0,141.0,25.0,4.0,12.0,64.0,9.0,49.0,68.0
max,2019.0,165.0,716.0,177.0,262.0,67.0,36.0,73.0,191.0,130.0,232.0,223.0


In [122]:
batting_with_hofnames.reset_index(inplace=True)

In [123]:
see_trout2 = batting_with_hofnames[batting_with_hofnames['playerID'].str.contains('bondsba01')]
see_trout2.sort_values(by='yearID').head(30)

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,...,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,retroID,bbrefID,inducted
2363,bondsba01,1986,PIT,NL,113,413,72,92,26,3,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2364,bondsba01,1987,PIT,NL,150,551,99,144,34,9,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2365,bondsba01,1988,PIT,NL,144,538,97,152,30,5,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2366,bondsba01,1989,PIT,NL,159,580,96,144,34,6,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2367,bondsba01,1990,PIT,NL,151,519,104,156,32,3,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2368,bondsba01,1991,PIT,NL,153,510,95,149,28,5,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2369,bondsba01,1992,PIT,NL,140,473,109,147,36,5,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2370,bondsba01,1993,SFN,NL,159,539,129,181,38,4,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2371,bondsba01,1994,SFN,NL,112,391,89,122,18,1,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0
2372,bondsba01,1995,SFN,NL,144,506,109,149,30,7,...,CA,Barry,Bonds,Barry Lamar,73,L,L,bondb001,bondsba01,0


In [124]:
batting_with_hofnames.drop(columns=['retroID','bbrefID'], axis=1, inplace=True)

In [125]:
batting_with_hofnames.describe()

Unnamed: 0,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
count,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0,29153.0
mean,1971.492162,99.841766,338.643776,46.674202,92.472164,16.303434,2.969403,8.376085,43.335986,7.0614,33.800535,47.18756
std,32.727267,47.621109,194.092168,32.096188,57.897749,11.52946,3.434569,9.583142,31.745084,10.305717,25.574952,35.368531
min,1900.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1947.0,63.0,165.0,19.0,41.0,6.0,0.0,1.0,17.0,1.0,13.0,19.0
50%,1978.0,112.0,364.0,44.0,95.0,15.0,2.0,5.0,40.0,3.0,30.0,40.0
75%,1999.0,142.0,512.0,71.0,141.0,25.0,4.0,12.0,64.0,9.0,49.0,68.0
max,2019.0,165.0,716.0,177.0,262.0,67.0,36.0,73.0,191.0,130.0,232.0,223.0


In [126]:
batting_with_hofnames.inducted.value_counts()

0    26553
Y     2600
Name: inducted, dtype: int64

In [127]:
batting_with_hofnames.head()

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,...,SO,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,inducted
0,aaronha01,1954,ML1,NL,122,468,58,131,27,6,...,39.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
1,aaronha01,1955,ML1,NL,153,602,105,189,37,9,...,61.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
2,aaronha01,1956,ML1,NL,153,609,106,200,34,14,...,54.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
3,aaronha01,1957,ML1,NL,151,615,118,198,27,6,...,58.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
4,aaronha01,1958,ML1,NL,153,601,109,196,34,4,...,49.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y


In [128]:
batting_with_hofnames.reset_index(inplace=True)

In [129]:
# see unique number of HOF
# started with 256 HOFers, now only have 143
batting_with_hofnames.groupby(['inducted'])['playerID'].nunique()

inducted
0    2171
Y     145
Name: playerID, dtype: int64

# Join 3:

In [130]:
df_a2.set_index(['playerID','yearID'], inplace=True)

In [131]:
batting_with_hofnames.reset_index(inplace=True)

In [132]:
batting_with_hofnames.set_index(['playerID','yearID'], inplace=True)

In [133]:
batting_with_hofnames.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,level_0,index,teamID,lgID,G,AB,R,H,2B,3B,...,SO,birthCountry,birthState,nameFirst,nameLast,nameGiven,height,bats,throws,inducted
playerID,yearID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
aaronha01,1954,0,0,ML1,NL,122,468,58,131,27,6,...,39.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
aaronha01,1955,1,1,ML1,NL,153,602,105,189,37,9,...,61.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
aaronha01,1956,2,2,ML1,NL,153,609,106,200,34,14,...,54.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
aaronha01,1957,3,3,ML1,NL,151,615,118,198,27,6,...,58.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y
aaronha01,1958,4,4,ML1,NL,153,601,109,196,34,4,...,49.0,USA,AL,Hank,Aaron,Henry Louis,72,R,R,Y


In [134]:
df_a2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,award_All-Star Game MVP,award_Babe Ruth Award,award_Baseball Magazine All-Star,award_Comeback Player of the Year,award_Gold Glove,award_Hank Aaron Award,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
playerID,yearID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
aaronha01,1956,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
aaronha01,1957,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
aaronha01,1958,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
aaronha01,1959,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
aaronha01,1960,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [135]:
df_a2.describe()

Unnamed: 0,award_All-Star Game MVP,award_Babe Ruth Award,award_Baseball Magazine All-Star,award_Comeback Player of the Year,award_Gold Glove,award_Hank Aaron Award,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
count,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0,3552.0
mean,0.015484,0.018018,0.284065,0.00732,0.307151,0.010698,0.013514,0.016047,0.05518,0.012106,0.013514,0.039977,0.192849,0.39161,0.003941,0.0183
std,0.123486,0.133035,0.451032,0.085254,0.461377,0.102892,0.115476,0.125675,0.228364,0.109374,0.115476,0.195934,0.394591,0.488179,0.062666,0.134051
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [136]:
# join3_allbut_position = batting_with_hofnames.reset_index(level=0).join(df_a2,  rsuffix=("DROP")).set_index('playerID', append=True).swaplevel(1, 0).filter(regex="^(?!.*DROP)")
# join3_df = batting_with_hofnames.reset_index(level=0).join(df_a2).set_index('playerID', append=True).swaplevel(1, 0)


In [137]:
# join awards table to batting with hof 
join3_allbut_position = batting_with_hofnames.join(df_a2, how='left')
join3_allbut_position.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 29153 entries, (aaronha01, 1954) to (zuninmi01, 2019)
Data columns (total 40 columns):
level_0                              29153 non-null int64
index                                29153 non-null int64
teamID                               29153 non-null object
lgID                                 29153 non-null object
G                                    29153 non-null int64
AB                                   29153 non-null int64
R                                    29153 non-null int64
H                                    29153 non-null int64
2B                                   29153 non-null int64
3B                                   29153 non-null int64
HR                                   29153 non-null int64
RBI                                  29153 non-null float64
SB                                   29153 non-null float64
BB                                   29153 non-null int64
SO                                   29153 no

In [138]:
join3_allbut_position.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,level_0,index,teamID,lgID,G,AB,R,H,2B,3B,...,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
playerID,yearID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
aaronha01,1954,0,0,ML1,NL,122,468,58,131,27,6,...,,,,,,,,,,
aaronha01,1955,1,1,ML1,NL,153,602,105,189,37,9,...,,,,,,,,,,
aaronha01,1956,2,2,ML1,NL,153,609,106,200,34,14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
aaronha01,1957,3,3,ML1,NL,151,615,118,198,27,6,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaronha01,1958,4,4,ML1,NL,153,601,109,196,34,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [139]:
join3_allbut_position.reset_index(inplace=True)

In [140]:
join3_allbut_position.fillna(0, inplace=True)

In [141]:
see_trout2 = join3_allbut_position[join3_allbut_position['playerID'].str.contains('bondsba01')]
see_trout2.head(30)

Unnamed: 0,playerID,yearID,level_0,index,teamID,lgID,G,AB,R,H,...,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
2363,bondsba01,1986,2363,2363,PIT,NL,113,413,72,92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2364,bondsba01,1987,2364,2364,PIT,NL,150,551,99,144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2365,bondsba01,1988,2365,2365,PIT,NL,144,538,97,152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2366,bondsba01,1989,2366,2366,PIT,NL,159,580,96,144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2367,bondsba01,1990,2367,2367,PIT,NL,151,519,104,156,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2368,bondsba01,1991,2368,2368,PIT,NL,153,510,95,149,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2369,bondsba01,1992,2369,2369,PIT,NL,140,473,109,147,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2370,bondsba01,1993,2370,2370,SFN,NL,159,539,129,181,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2371,bondsba01,1994,2371,2371,SFN,NL,112,391,89,122,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2372,bondsba01,1995,2372,2372,SFN,NL,144,506,109,149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
join3_allbut_position.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29153 entries, 0 to 29152
Data columns (total 42 columns):
playerID                             29153 non-null object
yearID                               29153 non-null int64
level_0                              29153 non-null int64
index                                29153 non-null int64
teamID                               29153 non-null object
lgID                                 29153 non-null object
G                                    29153 non-null int64
AB                                   29153 non-null int64
R                                    29153 non-null int64
H                                    29153 non-null int64
2B                                   29153 non-null int64
3B                                   29153 non-null int64
HR                                   29153 non-null int64
RBI                                  29153 non-null float64
SB                                   29153 non-null float64
BB            

In [143]:
join3_allbut_position.drop_duplicates(subset=['playerID','yearID','teamID','G'], inplace=True)

In [144]:
join3_allbut_position.inducted.value_counts()

0    26552
Y     2600
Name: inducted, dtype: int64

In [145]:
# see unique number of HOF
join3_allbut_position.groupby(['inducted'])['playerID'].nunique()

inducted
0    2171
Y     145
Name: playerID, dtype: int64

In [146]:
join3_allbut_position.describe()

Unnamed: 0,yearID,level_0,index,G,AB,R,H,2B,3B,HR,...,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
count,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,...,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0
mean,1971.493757,14575.842481,14575.842481,99.845156,338.655358,46.675768,92.475302,16.303993,2.969505,8.376372,...,0.001063,0.001475,0.005797,0.00096,0.001269,0.003156,0.02216,0.037699,0.00048,0.001269
std,32.726695,8415.991908,8415.991908,47.618407,194.085422,32.095623,57.896263,11.529262,3.434584,9.58318,...,0.032593,0.038378,0.07592,0.030977,0.035604,0.056089,0.147205,0.19047,0.02191,0.035604
min,1900.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1947.0,7287.75,7287.75,63.0,165.0,19.0,41.0,6.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1978.0,14575.5,14575.5,112.0,364.0,44.0,95.0,15.0,2.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1999.0,21864.25,21864.25,142.0,512.0,71.0,141.0,25.0,4.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,29152.0,29152.0,165.0,716.0,177.0,262.0,67.0,36.0,73.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Join 4: join Position

In [147]:
pp_sorted.head()

Unnamed: 0,playerID,POS
58699,willsma01,SS
61431,santoro01,3B
58440,paganjo01,SS
61608,willibi01,OF
60589,wagnele01,OF


In [148]:
join3_allbut_position.set_index('playerID',inplace=True)

In [149]:
pp_sorted.set_index('playerID',inplace=True)

In [150]:
join4_position = join3_allbut_position.join(pp_sorted, how='inner')
join4_position.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29152 entries, aaronha01 to zuninmi01
Data columns (total 42 columns):
yearID                               29152 non-null int64
level_0                              29152 non-null int64
index                                29152 non-null int64
teamID                               29152 non-null object
lgID                                 29152 non-null object
G                                    29152 non-null int64
AB                                   29152 non-null int64
R                                    29152 non-null int64
H                                    29152 non-null int64
2B                                   29152 non-null int64
3B                                   29152 non-null int64
HR                                   29152 non-null int64
RBI                                  29152 non-null float64
SB                                   29152 non-null float64
BB                                   29152 non-null int64
SO      

In [151]:
join4_position.describe()

Unnamed: 0,yearID,level_0,index,G,AB,R,H,2B,3B,HR,...,award_Hutch Award,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP
count,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,...,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0
mean,1971.493757,14575.842481,14575.842481,99.845156,338.655358,46.675768,92.475302,16.303993,2.969505,8.376372,...,0.001063,0.001475,0.005797,0.00096,0.001269,0.003156,0.02216,0.037699,0.00048,0.001269
std,32.726695,8415.991908,8415.991908,47.618407,194.085422,32.095623,57.896263,11.529262,3.434584,9.58318,...,0.032593,0.038378,0.07592,0.030977,0.035604,0.056089,0.147205,0.19047,0.02191,0.035604
min,1900.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1947.0,7287.75,7287.75,63.0,165.0,19.0,41.0,6.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1978.0,14575.5,14575.5,112.0,364.0,44.0,95.0,15.0,2.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1999.0,21864.25,21864.25,142.0,512.0,71.0,141.0,25.0,4.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,29152.0,29152.0,165.0,716.0,177.0,262.0,67.0,36.0,73.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [152]:
join4_position.reset_index(inplace=True)

In [153]:
see_trout4 = join4_position[join4_position['playerID'].str.contains('bondsba01')]
see_trout4.head(40)

Unnamed: 0,playerID,yearID,level_0,index,teamID,lgID,G,AB,R,H,...,award_Lou Gehrig Memorial Award,award_Most Valuable Player,award_NLCS MVP,award_Roberto Clemente Award,award_Rookie of the Year,award_Silver Slugger,award_TSN All-Star,award_Triple Crown,award_World Series MVP,POS
2363,bondsba01,1986,2363,2363,PIT,NL,113,413,72,92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
2364,bondsba01,1987,2364,2364,PIT,NL,150,551,99,144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
2365,bondsba01,1988,2365,2365,PIT,NL,144,538,97,152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
2366,bondsba01,1989,2366,2366,PIT,NL,159,580,96,144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
2367,bondsba01,1990,2367,2367,PIT,NL,151,519,104,156,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,OF
2368,bondsba01,1991,2368,2368,PIT,NL,153,510,95,149,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,OF
2369,bondsba01,1992,2369,2369,PIT,NL,140,473,109,147,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,OF
2370,bondsba01,1993,2370,2370,SFN,NL,159,539,129,181,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,OF
2371,bondsba01,1994,2371,2371,SFN,NL,112,391,89,122,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,OF
2372,bondsba01,1995,2372,2372,SFN,NL,144,506,109,149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF


In [154]:
join4_position.drop_duplicates(subset=['playerID','yearID','teamID','G'], inplace=True)

In [155]:
join4_position.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29152 entries, 0 to 29151
Data columns (total 43 columns):
playerID                             29152 non-null object
yearID                               29152 non-null int64
level_0                              29152 non-null int64
index                                29152 non-null int64
teamID                               29152 non-null object
lgID                                 29152 non-null object
G                                    29152 non-null int64
AB                                   29152 non-null int64
R                                    29152 non-null int64
H                                    29152 non-null int64
2B                                   29152 non-null int64
3B                                   29152 non-null int64
HR                                   29152 non-null int64
RBI                                  29152 non-null float64
SB                                   29152 non-null float64
BB            

In [156]:
join4_position.inducted.value_counts()

0    26552
Y     2600
Name: inducted, dtype: int64

In [157]:
# see unique number of HOF
join4_position.groupby(['inducted'])['playerID'].nunique()

inducted
0    2171
Y     145
Name: playerID, dtype: int64

In [158]:
join4_position = join4_position.rename(columns={'award_All-Star Game MVP': 'asg_mvp',
                                   'award_Babe Ruth Award': 'baberuth_award',
                                   'award_Baseball Magazine All-Star': 'baseball_magazine_allstar',
                                   'award_Comeback Player of the Year': 'comeback_poy',
                                   'award_Gold Glove': 'gold_glove_award',
                                   'award_Hank Aaron Award': 'hankaaron_award',
                                   'award_Hutch Award': 'hutch_award',
                                   'award_Lou Gehrig Memorial Award': 'lougehrig_award',
                                   'award_Most Valuable Player': 'mvp',
                                   'award_NLCS MVP': 'nlcs_mvp',
                                   'award_Roberto Clemente Award': 'robertoclemente_award',
                                   'award_Rookie of the Year': 'roy',
                                   'award_Silver Slugger': 'silver_slugger',
                                   'award_TSN All-Star': 'tsn_allstar',
                                   'award_Triple Crown': 'triple_crown',
                                   'award_World Series MVP': 'ws_mvp',})

do groupby here on join4_position

In [159]:
join4_position.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29152 entries, 0 to 29151
Data columns (total 43 columns):
playerID                     29152 non-null object
yearID                       29152 non-null int64
level_0                      29152 non-null int64
index                        29152 non-null int64
teamID                       29152 non-null object
lgID                         29152 non-null object
G                            29152 non-null int64
AB                           29152 non-null int64
R                            29152 non-null int64
H                            29152 non-null int64
2B                           29152 non-null int64
3B                           29152 non-null int64
HR                           29152 non-null int64
RBI                          29152 non-null float64
SB                           29152 non-null float64
BB                           29152 non-null int64
SO                           29152 non-null float64
birthCountry                 291

In [160]:
join4_position.drop('index', axis=1, inplace=True)

In [161]:
join4_position.describe()

Unnamed: 0,yearID,level_0,G,AB,R,H,2B,3B,HR,RBI,...,hutch_award,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp
count,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,...,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0,29152.0
mean,1971.493757,14575.842481,99.845156,338.655358,46.675768,92.475302,16.303993,2.969505,8.376372,43.337473,...,0.001063,0.001475,0.005797,0.00096,0.001269,0.003156,0.02216,0.037699,0.00048,0.001269
std,32.726695,8415.991908,47.618407,194.085422,32.095623,57.896263,11.529262,3.434584,9.58318,31.744614,...,0.032593,0.038378,0.07592,0.030977,0.035604,0.056089,0.147205,0.19047,0.02191,0.035604
min,1900.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1947.0,7287.75,63.0,165.0,19.0,41.0,6.0,0.0,1.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1978.0,14575.5,112.0,364.0,44.0,95.0,15.0,2.0,5.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1999.0,21864.25,142.0,512.0,71.0,141.0,25.0,4.0,12.0,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,29152.0,165.0,716.0,177.0,262.0,67.0,36.0,73.0,191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [162]:
# save DB to keep stats by year. could join this later with more stats db
import pickle
join4_position.to_pickle('df_by_year.pkl')

In [163]:
see_trout3 = join4_position[join4_position['playerID'].str.contains('cabremi01')]
see_trout3.head(30)

Unnamed: 0,playerID,yearID,level_0,teamID,lgID,G,AB,R,H,2B,...,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp,POS
3683,cabremi01,2003,3683,FLO,NL,87,314,39,84,21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
3684,cabremi01,2004,3684,FLO,NL,160,603,101,177,31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
3685,cabremi01,2005,3685,FLO,NL,158,613,106,198,43,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,OF
3686,cabremi01,2006,3686,FLO,NL,158,576,112,195,50,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,OF
3687,cabremi01,2007,3687,FLO,NL,157,588,91,188,38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
3688,cabremi01,2008,3688,DET,AL,160,616,85,180,36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
3689,cabremi01,2009,3689,DET,AL,160,611,96,198,34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
3690,cabremi01,2010,3690,DET,AL,150,548,111,180,45,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,OF
3691,cabremi01,2011,3691,DET,AL,161,572,111,197,48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
3692,cabremi01,2012,3692,DET,AL,161,622,109,205,40,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,OF


In [164]:
join4_position.columns

Index(['playerID', 'yearID', 'level_0', 'teamID', 'lgID', 'G', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'BB', 'SO', 'birthCountry', 'birthState',
       'nameFirst', 'nameLast', 'nameGiven', 'height', 'bats', 'throws',
       'inducted', 'asg_mvp', 'baberuth_award', 'baseball_magazine_allstar',
       'comeback_poy', 'gold_glove_award', 'hankaaron_award', 'hutch_award',
       'lougehrig_award', 'mvp', 'nlcs_mvp', 'robertoclemente_award', 'roy',
       'silver_slugger', 'tsn_allstar', 'triple_crown', 'ws_mvp', 'POS'],
      dtype='object')

In [165]:
join4_position.loc[join4_position['inducted']=='Y']

Unnamed: 0,playerID,yearID,level_0,teamID,lgID,G,AB,R,H,2B,...,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp,POS
0,aaronha01,1954,0,ML1,NL,122,468,58,131,27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
1,aaronha01,1955,1,ML1,NL,153,602,105,189,37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
2,aaronha01,1956,2,ML1,NL,153,609,106,200,34,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,OF
3,aaronha01,1957,3,ML1,NL,151,615,118,198,27,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OF
4,aaronha01,1958,4,ML1,NL,153,601,109,196,34,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,OF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29013,yountro01,1989,29014,ML4,AL,160,614,101,195,38,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,SS
29014,yountro01,1990,29015,ML4,AL,158,587,98,145,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SS
29015,yountro01,1991,29016,ML4,AL,130,503,66,131,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SS
29016,yountro01,1992,29017,ML4,AL,150,557,71,147,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SS


In [166]:
# see unique number of HOF
join4_position.groupby(['inducted'])['playerID'].nunique()

inducted
0    2171
Y     145
Name: playerID, dtype: int64

In [167]:
# remove dupes due to players with the same name. The aggregation below was causing issues in the data before doing this removal
#join4_position[['playerID','yearID']].drop_duplicates(keep='first',inplace=True)


In [168]:
#join4_position.reset_index(inplace=True)

In [169]:
join4_position.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29152 entries, 0 to 29151
Data columns (total 42 columns):
playerID                     29152 non-null object
yearID                       29152 non-null int64
level_0                      29152 non-null int64
teamID                       29152 non-null object
lgID                         29152 non-null object
G                            29152 non-null int64
AB                           29152 non-null int64
R                            29152 non-null int64
H                            29152 non-null int64
2B                           29152 non-null int64
3B                           29152 non-null int64
HR                           29152 non-null int64
RBI                          29152 non-null float64
SB                           29152 non-null float64
BB                           29152 non-null int64
SO                           29152 non-null float64
birthCountry                 29152 non-null object
birthState                   29

In [170]:
join4_groupby_sum = join4_position.groupby(['playerID']).agg('sum')
join4_groupby_sum.drop('yearID',axis=1,inplace=True)

In [171]:
join4_groupby_sum.describe()

Unnamed: 0,level_0,G,AB,R,H,2B,3B,HR,RBI,SB,...,hutch_award,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp
count,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,...,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0
mean,183469.326425,1256.772884,4262.729275,587.518135,1164.006908,205.221934,37.377807,105.435233,545.498273,88.886442,...,0.013385,0.018566,0.072971,0.01209,0.015976,0.039724,0.278929,0.474525,0.006045,0.015976
std,127073.464966,534.11915,2072.247123,347.233691,623.185015,118.988868,31.602529,108.078556,352.668278,106.721411,...,0.114942,0.13818,0.353121,0.124115,0.125409,0.195351,1.015179,1.324874,0.087971,0.132118
min,212.0,420.0,789.0,95.0,182.0,26.0,0.0,0.0,56.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,81660.0,847.75,2664.75,326.0,688.0,114.0,16.0,29.0,285.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,163660.5,1177.0,3908.0,506.5,1046.0,180.0,28.0,72.0,454.5,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,260356.25,1568.0,5433.0,755.25,1494.0,266.0,49.0,140.0,706.0,118.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,661457.0,3562.0,14053.0,2295.0,4256.0,792.0,302.0,762.0,2297.0,1406.0,...,1.0,2.0,7.0,2.0,1.0,1.0,12.0,13.0,2.0,2.0


In [172]:
join4_groupby_sum.reset_index(inplace=True)

In [173]:
# see pujols
see_pujols = join4_groupby_sum[join4_groupby_sum['playerID'].str.contains('pujol')]
see_pujols.head(30)

Unnamed: 0,playerID,level_0,G,AB,R,H,2B,3B,HR,RBI,...,hutch_award,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp
1683,pujolal01,404149,2823,10687,1828,3202,661,16,656,2075.0,...,0.0,1.0,3.0,1.0,1.0,1.0,6.0,3.0,0.0,0.0


In [174]:
join4_groupby_sum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2316 entries, 0 to 2315
Data columns (total 29 columns):
playerID                     2316 non-null object
level_0                      2316 non-null int64
G                            2316 non-null int64
AB                           2316 non-null int64
R                            2316 non-null int64
H                            2316 non-null int64
2B                           2316 non-null int64
3B                           2316 non-null int64
HR                           2316 non-null int64
RBI                          2316 non-null float64
SB                           2316 non-null float64
BB                           2316 non-null int64
SO                           2316 non-null float64
asg_mvp                      2316 non-null float64
baberuth_award               2316 non-null float64
baseball_magazine_allstar    2316 non-null float64
comeback_poy                 2316 non-null float64
gold_glove_award             2316 non-null fl

In [175]:
df_hof_only.reset_index(inplace=True)

In [176]:
df_hof_only.head()

Unnamed: 0,playerID,inducted
0,cobbty01,Y
1,ruthba01,Y
2,wagneho01,Y
3,mathech01,Y
4,johnswa01,Y


In [177]:
# setup_dummy_df = join4_position[['playerID','inducted']]
# setup_dummy_df.drop_duplicates(inplace=True)

In [178]:
# setup_dummy_df.head()

In [179]:
# setup_dummy_df.inducted.value_counts()

In [180]:
dummies_y = pd.get_dummies(df_hof_only, columns=['inducted'])

In [181]:
dummies_y.head()

Unnamed: 0,playerID,inducted_Y
0,cobbty01,1
1,ruthba01,1
2,wagneho01,1
3,mathech01,1
4,johnswa01,1


In [182]:
dummies_y.inducted_Y.value_counts()

1    260
Name: inducted_Y, dtype: int64

In [183]:
join4_groupby_sum.describe()

Unnamed: 0,level_0,G,AB,R,H,2B,3B,HR,RBI,SB,...,hutch_award,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp
count,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,...,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0
mean,183469.326425,1256.772884,4262.729275,587.518135,1164.006908,205.221934,37.377807,105.435233,545.498273,88.886442,...,0.013385,0.018566,0.072971,0.01209,0.015976,0.039724,0.278929,0.474525,0.006045,0.015976
std,127073.464966,534.11915,2072.247123,347.233691,623.185015,118.988868,31.602529,108.078556,352.668278,106.721411,...,0.114942,0.13818,0.353121,0.124115,0.125409,0.195351,1.015179,1.324874,0.087971,0.132118
min,212.0,420.0,789.0,95.0,182.0,26.0,0.0,0.0,56.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,81660.0,847.75,2664.75,326.0,688.0,114.0,16.0,29.0,285.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,163660.5,1177.0,3908.0,506.5,1046.0,180.0,28.0,72.0,454.5,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,260356.25,1568.0,5433.0,755.25,1494.0,266.0,49.0,140.0,706.0,118.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,661457.0,3562.0,14053.0,2295.0,4256.0,792.0,302.0,762.0,2297.0,1406.0,...,1.0,2.0,7.0,2.0,1.0,1.0,12.0,13.0,2.0,2.0


In [184]:
dummies_y.set_index('playerID',inplace=True)
join4_groupby_sum.set_index('playerID',inplace=True)

In [185]:
dummies_y.head()

Unnamed: 0_level_0,inducted_Y
playerID,Unnamed: 1_level_1
cobbty01,1
ruthba01,1
wagneho01,1
mathech01,1
johnswa01,1


In [186]:
join4_groupby_sum.head()

Unnamed: 0_level_0,level_0,G,AB,R,H,2B,3B,HR,RBI,SB,...,hutch_award,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaronha01,253,3298,12364,2174,3771,624,98,755,2297.0,240.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0
abbated01,212,827,2942,346,748,95,43,11,310.0,138.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abbotku01,355,702,2044,273,523,109,23,62,242.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abreubo01,1010,2425,8480,1453,2470,574,59,288,1363.0,400.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
abreujo02,381,901,3547,483,1038,218,14,179,611.0,10.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [187]:
# dummies_y.drop('inducted_N', axis=1, inplace=True)
join4_all_features_and_dummy_inducted = join4_groupby_sum.join(dummies_y)
join4_all_features_and_dummy_inducted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2316 entries, aaronha01 to zuninmi01
Data columns (total 29 columns):
level_0                      2316 non-null int64
G                            2316 non-null int64
AB                           2316 non-null int64
R                            2316 non-null int64
H                            2316 non-null int64
2B                           2316 non-null int64
3B                           2316 non-null int64
HR                           2316 non-null int64
RBI                          2316 non-null float64
SB                           2316 non-null float64
BB                           2316 non-null int64
SO                           2316 non-null float64
asg_mvp                      2316 non-null float64
baberuth_award               2316 non-null float64
baseball_magazine_allstar    2316 non-null float64
comeback_poy                 2316 non-null float64
gold_glove_award             2316 non-null float64
hankaaron_award              2316 no

In [188]:
join4_all_features_and_dummy_inducted.head()

Unnamed: 0_level_0,level_0,G,AB,R,H,2B,3B,HR,RBI,SB,...,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp,inducted_Y
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaronha01,253,3298,12364,2174,3771,624,98,755,2297.0,240.0,...,1.0,1.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,1.0
abbated01,212,827,2942,346,748,95,43,11,310.0,138.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
abbotku01,355,702,2044,273,523,109,23,62,242.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
abreubo01,1010,2425,8480,1453,2470,574,59,288,1363.0,400.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,
abreujo02,381,901,3547,483,1038,218,14,179,611.0,10.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,


In [189]:
join4_all_features_and_dummy_inducted.inducted_Y.value_counts()

1.0    145
Name: inducted_Y, dtype: int64

In [190]:
join4_all_features_and_dummy_inducted.reset_index(inplace=True)

In [191]:
see_trout3 = join4_all_features_and_dummy_inducted[join4_all_features_and_dummy_inducted['playerID'].str.contains('trout')]
see_trout3.head(30)

Unnamed: 0,playerID,level_0,G,AB,R,H,2B,3B,HR,RBI,...,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp,inducted_Y
2107,troutmi01,238419,1199,4340,903,1324,251,46,285,752.0,...,0.0,2.0,0.0,0.0,1.0,5.0,2.0,0.0,0.0,


In [192]:
join4_all_features_and_dummy_inducted.describe()

Unnamed: 0,level_0,G,AB,R,H,2B,3B,HR,RBI,SB,...,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp,inducted_Y
count,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,...,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,145.0
mean,183469.326425,1256.772884,4262.729275,587.518135,1164.006908,205.221934,37.377807,105.435233,545.498273,88.886442,...,0.018566,0.072971,0.01209,0.015976,0.039724,0.278929,0.474525,0.006045,0.015976,1.0
std,127073.464966,534.11915,2072.247123,347.233691,623.185015,118.988868,31.602529,108.078556,352.668278,106.721411,...,0.13818,0.353121,0.124115,0.125409,0.195351,1.015179,1.324874,0.087971,0.132118,0.0
min,212.0,420.0,789.0,95.0,182.0,26.0,0.0,0.0,56.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,81660.0,847.75,2664.75,326.0,688.0,114.0,16.0,29.0,285.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,163660.5,1177.0,3908.0,506.5,1046.0,180.0,28.0,72.0,454.5,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,260356.25,1568.0,5433.0,755.25,1494.0,266.0,49.0,140.0,706.0,118.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,661457.0,3562.0,14053.0,2295.0,4256.0,792.0,302.0,762.0,2297.0,1406.0,...,2.0,7.0,2.0,1.0,1.0,12.0,13.0,2.0,2.0,1.0


# join4_all_features_and_dummy_inducted is good to join to a grouped stats_df once ready

# Dataset: Stats

In [193]:
stats_df = pd.read_csv("data/stats.csv")
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8919 entries, 0 to 8918
Data columns (total 15 columns):
last_name           8919 non-null object
 first_name         8919 non-null object
year                8919 non-null int64
b_k_percent         8919 non-null float64
b_bb_percent        8919 non-null float64
batting_avg         8919 non-null float64
slg_percent         8919 non-null float64
on_base_percent     8919 non-null float64
on_base_plus_slg    8919 non-null float64
isolated_power      8919 non-null float64
b_lob               3311 non-null float64
b_total_bases       8919 non-null int64
b_ab_scoring        3311 non-null float64
b_gnd_into_dp       8919 non-null int64
Unnamed: 14         0 non-null float64
dtypes: float64(10), int64(3), object(2)
memory usage: 1.0+ MB


In [194]:
# nothing before 1950. main df has min 1900. How to fill those 50yrs of data when joining to main?? 
# is there really 70 hofs before 1950??
stats_df.describe()

Unnamed: 0,year,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_lob,b_total_bases,b_ab_scoring,b_gnd_into_dp,Unnamed: 14
count,8919.0,8919.0,8919.0,8919.0,8919.0,8919.0,8919.0,8919.0,3311.0,8919.0,3311.0,8919.0,0.0
mean,1988.823074,14.081668,9.16947,0.276484,0.437725,0.344386,0.782111,0.161251,214.631833,231.844489,131.347629,11.874874,
std,19.444514,5.552608,3.475392,0.027832,0.074536,0.036997,0.102746,0.062624,44.455029,54.146816,29.609344,5.189354,
min,1950.0,1.6,1.5,0.168,0.233,0.216,0.455,0.019,44.0,45.0,25.0,0.0,
25%,1974.0,10.1,6.6,0.257,0.385,0.319,0.711,0.114,193.0,195.0,116.0,8.0,
50%,1990.0,13.5,8.7,0.276,0.433,0.342,0.776,0.158,218.0,230.0,132.0,11.0,
75%,2005.0,17.5,11.3,0.295,0.484,0.367,0.844,0.203,243.0,267.0,150.0,15.0,
max,2020.0,43.9,37.6,0.394,0.863,0.609,1.422,0.536,348.0,425.0,239.0,36.0,


In [195]:
# rename columns to match join4
stats_df = stats_df.rename(columns={' first_name': 'nameFirst', 'last_name': 'nameLast' })

In [196]:
# remove whitespace in all columns
stats_df = stats_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [197]:
stats_df.drop(['b_ab_scoring','b_lob','Unnamed: 14'], axis=1, inplace=True)

In [198]:
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8919 entries, 0 to 8918
Data columns (total 12 columns):
nameLast            8919 non-null object
nameFirst           8919 non-null object
year                8919 non-null int64
b_k_percent         8919 non-null float64
b_bb_percent        8919 non-null float64
batting_avg         8919 non-null float64
slg_percent         8919 non-null float64
on_base_percent     8919 non-null float64
on_base_plus_slg    8919 non-null float64
isolated_power      8919 non-null float64
b_total_bases       8919 non-null int64
b_gnd_into_dp       8919 non-null int64
dtypes: float64(7), int64(3), object(2)
memory usage: 836.3+ KB


In [199]:
# need to get playerID into stats DB, so that I can then join stats DB into join4. Once thats done, groupby playerID, drop year.
names_df = join4_position[['playerID','nameFirst','nameLast']]
names_df.drop_duplicates(inplace=True)
names_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,playerID,nameFirst,nameLast
0,aaronha01,Hank,Aaron
23,abbated01,Ed,Abbaticchio
31,abbotku01,Kurt,Abbott
41,abreubo01,Bobby,Abreu
61,abreujo02,Jose,Abreu


In [200]:
stats_df.head()

Unnamed: 0,nameLast,nameFirst,year,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_total_bases,b_gnd_into_dp
0,Abreu,Bobby,2011,19.3,13.3,0.253,0.365,0.352,0.717,0.112,183,8
1,Damon,Johnny,2011,14.2,7.9,0.261,0.418,0.325,0.742,0.156,243,4
2,Guerrero,Vladimir,2011,9.5,2.9,0.29,0.416,0.317,0.733,0.126,234,23
3,Hunter,Torii,2011,19.3,9.6,0.262,0.429,0.336,0.765,0.167,249,24
4,Ibanez,Raul,2011,18.4,5.7,0.245,0.419,0.289,0.707,0.174,224,13


In [201]:
names_df.head()

Unnamed: 0,playerID,nameFirst,nameLast
0,aaronha01,Hank,Aaron
23,abbated01,Ed,Abbaticchio
31,abbotku01,Kurt,Abbott
41,abreubo01,Bobby,Abreu
61,abreujo02,Jose,Abreu


In [234]:
see_trout3 = names_df[names_df['playerID'].str.contains('pinsova01')]
see_trout3.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,playerID
nameFirst,nameLast,Unnamed: 2_level_1
Vada,Pinson,pinsova01


In [203]:
names_df.set_index(['nameFirst','nameLast'], inplace=True)
stats_df.set_index(['nameFirst','nameLast'], inplace=True)

In [204]:
# lost 100+ people here because I likely don't have them in the stats_df 
# or because their names don't match
# is it okay to leave them as null????
stats_names = names_df.join(stats_df, how='left')
stats_names.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9196 entries, (A. J., Pierzynski) to (Zoilo, Versalles)
Data columns (total 11 columns):
playerID            9196 non-null object
year                8342 non-null float64
b_k_percent         8342 non-null float64
b_bb_percent        8342 non-null float64
batting_avg         8342 non-null float64
slg_percent         8342 non-null float64
on_base_percent     8342 non-null float64
on_base_plus_slg    8342 non-null float64
isolated_power      8342 non-null float64
b_total_bases       8342 non-null float64
b_gnd_into_dp       8342 non-null float64
dtypes: float64(10), object(1)
memory usage: 845.6+ KB


In [205]:
# rename columns to match join4
stats_names = stats_names.rename(columns={'year': 'yearID'})

In [206]:
stats_names.reset_index(inplace=True)

In [207]:
stats_names.head(10)

Unnamed: 0,nameFirst,nameLast,playerID,yearID,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_total_bases,b_gnd_into_dp
0,A. J.,Pierzynski,pierzaj01,,,,,,,,,,
1,AJ,Pollock,polloaj01,2020.0,21.4,5.7,0.276,0.566,0.314,0.881,0.291,111.0,6.0
2,AJ,Pollock,polloaj01,2015.0,13.2,7.9,0.315,0.498,0.367,0.865,0.182,303.0,19.0
3,Aaron,Boone,booneaa01,2003.0,15.9,7.0,0.267,0.453,0.323,0.776,0.186,268.0,13.0
4,Aaron,Boone,booneaa01,2005.0,16.3,6.2,0.243,0.378,0.297,0.675,0.135,193.0,16.0
5,Aaron,Boone,booneaa01,2002.0,16.2,8.2,0.241,0.439,0.309,0.748,0.198,266.0,9.0
6,Aaron,Boone,booneaa01,1999.0,15.2,5.8,0.28,0.445,0.326,0.771,0.165,210.0,6.0
7,Aaron,Hicks,hicksaa01,2020.0,18.0,19.4,0.225,0.414,0.379,0.793,0.189,70.0,4.0
8,Aaron,Hicks,hicksaa01,2018.0,19.1,15.5,0.248,0.467,0.365,0.832,0.219,224.0,1.0
9,Aaron,Hill,hillaa01,2011.0,12.6,6.1,0.246,0.356,0.298,0.653,0.11,185.0,10.0


In [208]:
join4_all_features_and_dummy_inducted.head()

Unnamed: 0,playerID,level_0,G,AB,R,H,2B,3B,HR,RBI,...,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp,inducted_Y
0,aaronha01,253,3298,12364,2174,3771,624,98,755,2297.0,...,1.0,1.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,1.0
1,abbated01,212,827,2942,346,748,95,43,11,310.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,abbotku01,355,702,2044,273,523,109,23,62,242.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,abreubo01,1010,2425,8480,1453,2470,574,59,288,1363.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,
4,abreujo02,381,901,3547,483,1038,218,14,179,611.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,


In [209]:
join4_all_features_and_dummy_inducted.inducted_Y.value_counts()

1.0    145
Name: inducted_Y, dtype: int64

group stats_df

In [210]:
statsnames_grouped_mean = stats_names.groupby(['playerID']).agg('mean')
statsnames_grouped_mean.drop('yearID',axis=1,inplace=True)

In [211]:
statsnames_grouped_mean.reset_index(inplace=True)

In [212]:
statsnames_grouped_mean.head()

Unnamed: 0,playerID,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_total_bases,b_gnd_into_dp
0,aaronha01,9.86,9.93,0.30645,0.5547,0.37415,0.9288,0.2483,316.85,15.35
1,abbated01,,,,,,,,,
2,abbotku01,24.95,6.0,0.252,0.423,0.302,0.726,0.1715,163.0,5.5
3,abreubo01,18.207143,14.771429,0.294571,0.483071,0.398071,0.881071,0.188429,273.571429,11.0
4,abreujo02,20.257143,6.4,0.295714,0.528,0.351714,0.879857,0.232286,281.285714,17.142857


In [213]:
print(statsnames_grouped_mean.shape)
print(join4_all_features_and_dummy_inducted.shape)

(2316, 10)
(2316, 30)


In [214]:
statsnames_grouped_mean.describe()

Unnamed: 0,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_total_bases,b_gnd_into_dp
count,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0
mean,14.203134,8.83736,0.27285,0.425709,0.338597,0.764313,0.152867,221.607401,11.573487
std,5.285373,2.924757,0.019778,0.05943,0.02934,0.080414,0.053346,38.074175,3.832808
min,2.093333,2.15,0.207,0.258,0.239,0.4975,0.031,80.0,1.0
25%,10.553125,6.66375,0.259893,0.385,0.318,0.7115,0.11275,197.112745,8.714286
50%,13.658333,8.6,0.272667,0.425292,0.338167,0.763583,0.152118,221.909091,11.6
75%,17.664286,10.785682,0.285843,0.467,0.358,0.8165,0.191692,247.428571,14.25
max,33.7,21.66,0.3448,0.631353,0.488,1.11,0.322647,335.0,25.0


In [215]:
join4_all_features_and_dummy_inducted.inducted_Y.value_counts()

1.0    145
Name: inducted_Y, dtype: int64

In [216]:
statsnames_grouped_mean.head()

Unnamed: 0,playerID,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_total_bases,b_gnd_into_dp
0,aaronha01,9.86,9.93,0.30645,0.5547,0.37415,0.9288,0.2483,316.85,15.35
1,abbated01,,,,,,,,,
2,abbotku01,24.95,6.0,0.252,0.423,0.302,0.726,0.1715,163.0,5.5
3,abreubo01,18.207143,14.771429,0.294571,0.483071,0.398071,0.881071,0.188429,273.571429,11.0
4,abreujo02,20.257143,6.4,0.295714,0.528,0.351714,0.879857,0.232286,281.285714,17.142857


In [217]:
see_trout3 = join4_all_features_and_dummy_inducted[join4_all_features_and_dummy_inducted['playerID'].str.contains('bond')]
see_trout3.head(30)

Unnamed: 0,playerID,level_0,G,AB,R,H,2B,3B,HR,RBI,...,lougehrig_award,mvp,nlcs_mvp,robertoclemente_award,roy,silver_slugger,tsn_allstar,triple_crown,ws_mvp,inducted_Y
190,bondsba01,52217,2986,9847,2227,2935,601,77,762,1996.0,...,0.0,7.0,0.0,0.0,0.0,12.0,12.0,0.0,0.0,
191,bondsbo01,35880,1849,7043,1258,1886,302,66,332,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,


In [218]:
join4_all_features_and_dummy_inducted.set_index('playerID', inplace=True)
statsnames_grouped_mean.set_index('playerID', inplace=True)

# final join: stats_names onto main DB

In [219]:
final_mlb_df = join4_all_features_and_dummy_inducted.join(statsnames_grouped_mean)
final_mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2316 entries, aaronha01 to zuninmi01
Data columns (total 38 columns):
level_0                      2316 non-null int64
G                            2316 non-null int64
AB                           2316 non-null int64
R                            2316 non-null int64
H                            2316 non-null int64
2B                           2316 non-null int64
3B                           2316 non-null int64
HR                           2316 non-null int64
RBI                          2316 non-null float64
SB                           2316 non-null float64
BB                           2316 non-null int64
SO                           2316 non-null float64
asg_mvp                      2316 non-null float64
baberuth_award               2316 non-null float64
baseball_magazine_allstar    2316 non-null float64
comeback_poy                 2316 non-null float64
gold_glove_award             2316 non-null float64
hankaaron_award              2316 no

In [220]:
# rename columns and lowercase all columns
final_mlb_df = final_mlb_df.rename(columns={'b_k_percent': 'k_percentage', 
                                            'b_bb_percent': 'bb_percentage',
                                            'batting_avg': 'BA',
                                            'on_base_percent': 'OBP',
                                            'on_base_plus_slg': 'OPS',
                                            'isolated_power': 'ISO',
                                            'b_total_bases': 'TB',
                                            'b_gnd_into_dp': 'GIDP',
                                            'inducted_Y': 'inducted_y'})
final_mlb_df.columns = map(str.lower, final_mlb_df.columns)

In [221]:
# move target column to the end of df
cols = list(final_mlb_df.columns.values)
cols.pop(cols.index('inducted_y'))
final_mlb_df = final_mlb_df[cols+['inducted_y']]

In [222]:
final_mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2316 entries, aaronha01 to zuninmi01
Data columns (total 38 columns):
level_0                      2316 non-null int64
g                            2316 non-null int64
ab                           2316 non-null int64
r                            2316 non-null int64
h                            2316 non-null int64
2b                           2316 non-null int64
3b                           2316 non-null int64
hr                           2316 non-null int64
rbi                          2316 non-null float64
sb                           2316 non-null float64
bb                           2316 non-null int64
so                           2316 non-null float64
asg_mvp                      2316 non-null float64
baberuth_award               2316 non-null float64
baseball_magazine_allstar    2316 non-null float64
comeback_poy                 2316 non-null float64
gold_glove_award             2316 non-null float64
hankaaron_award              2316 no

In [223]:
final_mlb_df.inducted_y.value_counts()

1.0    145
Name: inducted_y, dtype: int64

In [224]:
final_mlb_df.describe()

Unnamed: 0,level_0,g,ab,r,h,2b,3b,hr,rbi,sb,...,k_percentage,bb_percentage,ba,slg_percent,obp,ops,iso,tb,gidp,inducted_y
count,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,2316.0,...,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,145.0
mean,183469.326425,1256.772884,4262.729275,587.518135,1164.006908,205.221934,37.377807,105.435233,545.498273,88.886442,...,14.203134,8.83736,0.27285,0.425709,0.338597,0.764313,0.152867,221.607401,11.573487,1.0
std,127073.464966,534.11915,2072.247123,347.233691,623.185015,118.988868,31.602529,108.078556,352.668278,106.721411,...,5.285373,2.924757,0.019778,0.05943,0.02934,0.080414,0.053346,38.074175,3.832808,0.0
min,212.0,420.0,789.0,95.0,182.0,26.0,0.0,0.0,56.0,0.0,...,2.093333,2.15,0.207,0.258,0.239,0.4975,0.031,80.0,1.0,1.0
25%,81660.0,847.75,2664.75,326.0,688.0,114.0,16.0,29.0,285.0,22.0,...,10.553125,6.66375,0.259893,0.385,0.318,0.7115,0.11275,197.112745,8.714286,1.0
50%,163660.5,1177.0,3908.0,506.5,1046.0,180.0,28.0,72.0,454.5,51.0,...,13.658333,8.6,0.272667,0.425292,0.338167,0.763583,0.152118,221.909091,11.6,1.0
75%,260356.25,1568.0,5433.0,755.25,1494.0,266.0,49.0,140.0,706.0,118.0,...,17.664286,10.785682,0.285843,0.467,0.358,0.8165,0.191692,247.428571,14.25,1.0
max,661457.0,3562.0,14053.0,2295.0,4256.0,792.0,302.0,762.0,2297.0,1406.0,...,33.7,21.66,0.3448,0.631353,0.488,1.11,0.322647,335.0,25.0,1.0


In [225]:
final_mlb_df.reset_index(inplace=True)

In [226]:
final_mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2316 entries, 0 to 2315
Data columns (total 39 columns):
playerID                     2316 non-null object
level_0                      2316 non-null int64
g                            2316 non-null int64
ab                           2316 non-null int64
r                            2316 non-null int64
h                            2316 non-null int64
2b                           2316 non-null int64
3b                           2316 non-null int64
hr                           2316 non-null int64
rbi                          2316 non-null float64
sb                           2316 non-null float64
bb                           2316 non-null int64
so                           2316 non-null float64
asg_mvp                      2316 non-null float64
baberuth_award               2316 non-null float64
baseball_magazine_allstar    2316 non-null float64
comeback_poy                 2316 non-null float64
gold_glove_award             2316 non-null fl

In [227]:
see_trout3 = final_mlb_df[final_mlb_df['playerID'].str.contains('pujo')]
see_trout3.head(30)

Unnamed: 0,playerID,level_0,g,ab,r,h,2b,3b,hr,rbi,...,k_percentage,bb_percentage,ba,slg_percent,obp,ops,iso,tb,gidp,inducted_y
1683,pujolal01,404149,2823,10687,1828,3202,661,16,656,2075.0,...,10.294118,11.041176,0.304118,0.560353,0.383118,0.943412,0.256176,323.588235,21.470588,


In [228]:
final_mlb_df.inducted_y.fillna(0)

0       1.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
2311    0.0
2312    0.0
2313    0.0
2314    0.0
2315    0.0
Name: inducted_y, Length: 2316, dtype: float64

In [229]:
final_mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2316 entries, 0 to 2315
Data columns (total 39 columns):
playerID                     2316 non-null object
level_0                      2316 non-null int64
g                            2316 non-null int64
ab                           2316 non-null int64
r                            2316 non-null int64
h                            2316 non-null int64
2b                           2316 non-null int64
3b                           2316 non-null int64
hr                           2316 non-null int64
rbi                          2316 non-null float64
sb                           2316 non-null float64
bb                           2316 non-null int64
so                           2316 non-null float64
asg_mvp                      2316 non-null float64
baberuth_award               2316 non-null float64
baseball_magazine_allstar    2316 non-null float64
comeback_poy                 2316 non-null float64
gold_glove_award             2316 non-null fl

In [230]:
final_mlb_df.inducted_y.value_counts()

1.0    145
Name: inducted_y, dtype: int64

In [231]:
import pickle

In [232]:
final_mlb_df.to_pickle('final_df.pkl')