# Batter Salary Data Cleaning
---
This notebook cleans the historical batting stats and salary data acquired from SeanLahman.com. The salary data covers 1985-2016.

## Import Libraries
---

In [1]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column display limit).
pd.set_option('display.max_columns', None)

## Import Historical Salary Dataset
---

In [2]:
# Load the Lahman salary table and preview its first rows.
salary_csv = '../data/lahman_database/Salaries.csv'
salary = pd.read_csv(salary_csv)
salary.head()

Unnamed: 0,yearID,teamID,lgID,playerID,salary
0,1985,ATL,NL,barkele01,870000
1,1985,ATL,NL,bedrost01,550000
2,1985,ATL,NL,benedbr01,545000
3,1985,ATL,NL,campri01,633333
4,1985,ATL,NL,ceronri01,625000


## Import Historical Batting Stats
---

In [3]:
# Load the Lahman batting table and preview its first rows.
batting_csv = '../data/lahman_database/Batting.csv'
bat = pd.read_csv(batting_csv)
bat.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,0,0,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,0,0,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,5,0,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,2,2,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,3,0,16.0,6.0,2.0,2,1.0,,,,,0.0


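### Only keep stats after 1985
*This is because the salary dataset starts in 1985. Note that the filter below is a strict `> 1985`, so the first season kept is 1986; it also keeps only rows with more than 50 at-bats.*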

In [4]:
# Keep only rows from seasons after 1985 with more than 50 at-bats.
# NOTE(review): the strict `>` also excludes the 1985 season itself, even though
# the salary data starts at 1985 -- confirm that this is intended.
after_1985 = bat['yearID'] > 1985
over_50_ab = bat['AB'] > 50
bat = bat[after_1985 & over_50_ab]
bat.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
63258,aguaylu01,1986,1,PHI,NL,62,133,17,28,6,1,4,13.0,1.0,1.0,8,26.0,0.0,3.0,0.0,2.0,3.0
63259,aguilri01,1986,1,NYN,NL,32,51,4,8,0,0,2,6.0,0.0,0.0,3,12.0,0.0,0.0,3.0,0.0,0.0
63261,aldremi01,1986,1,SFN,NL,84,216,27,54,18,3,2,25.0,1.0,3.0,33,34.0,4.0,2.0,4.0,1.0,3.0
63264,allanan01,1986,1,CLE,AL,101,293,30,66,7,3,1,29.0,10.0,1.0,14,36.0,0.0,1.0,11.0,4.0,7.0
63266,almonbi01,1986,1,PIT,NL,102,196,29,43,7,2,7,27.0,11.0,4.0,30,38.0,2.0,0.0,1.0,3.0,5.0


### Create new column for batting average

In [5]:
# Batting average: hits per at-bat, rounded to three decimals.
hits_per_ab = bat['H'] / bat['AB']
bat['AVG'] = hits_per_ab.round(3)
bat.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,AVG
63258,aguaylu01,1986,1,PHI,NL,62,133,17,28,6,1,4,13.0,1.0,1.0,8,26.0,0.0,3.0,0.0,2.0,3.0,0.211
63259,aguilri01,1986,1,NYN,NL,32,51,4,8,0,0,2,6.0,0.0,0.0,3,12.0,0.0,0.0,3.0,0.0,0.0,0.157
63261,aldremi01,1986,1,SFN,NL,84,216,27,54,18,3,2,25.0,1.0,3.0,33,34.0,4.0,2.0,4.0,1.0,3.0,0.25
63264,allanan01,1986,1,CLE,AL,101,293,30,66,7,3,1,29.0,10.0,1.0,14,36.0,0.0,1.0,11.0,4.0,7.0,0.225
63266,almonbi01,1986,1,PIT,NL,102,196,29,43,7,2,7,27.0,11.0,4.0,30,38.0,2.0,0.0,1.0,3.0,5.0,0.219


### Create new column for on-base percentage

In [6]:
# On-base percentage: (H + BB + HBP) / (AB + BB + HBP + SF), rounded to three decimals.
walks_and_hbp = bat['BB'] + bat['HBP']
numerator = bat['H'] + walks_and_hbp
plate = bat['AB'] + walks_and_hbp + bat['SF']
bat['OBP'] = (numerator / plate).round(3)
bat.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,AVG,OBP
63258,aguaylu01,1986,1,PHI,NL,62,133,17,28,6,1,4,13.0,1.0,1.0,8,26.0,0.0,3.0,0.0,2.0,3.0,0.211,0.267
63259,aguilri01,1986,1,NYN,NL,32,51,4,8,0,0,2,6.0,0.0,0.0,3,12.0,0.0,0.0,3.0,0.0,0.0,0.157,0.204
63261,aldremi01,1986,1,SFN,NL,84,216,27,54,18,3,2,25.0,1.0,3.0,33,34.0,4.0,2.0,4.0,1.0,3.0,0.25,0.353
63264,allanan01,1986,1,CLE,AL,101,293,30,66,7,3,1,29.0,10.0,1.0,14,36.0,0.0,1.0,11.0,4.0,7.0,0.225,0.26
63266,almonbi01,1986,1,PIT,NL,102,196,29,43,7,2,7,27.0,11.0,4.0,30,38.0,2.0,0.0,1.0,3.0,5.0,0.219,0.319


### Create new column for slugging percentage

In [7]:
# Slugging percentage: total bases per at-bat, rounded to three decimals.
extra_base_hits = bat['2B'] + bat['3B'] + bat['HR']
first = bat['H'] - extra_base_hits  # singles = hits that were not doubles, triples or home runs

# Weight each hit type by the number of bases it is worth.
total_bases = first + 2 * bat['2B'] + 3 * bat['3B'] + 4 * bat['HR']
bat['SLG'] = (total_bases / bat['AB']).round(3)

bat.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,AVG,OBP,SLG
63258,aguaylu01,1986,1,PHI,NL,62,133,17,28,6,1,4,13.0,1.0,1.0,8,26.0,0.0,3.0,0.0,2.0,3.0,0.211,0.267,0.361
63259,aguilri01,1986,1,NYN,NL,32,51,4,8,0,0,2,6.0,0.0,0.0,3,12.0,0.0,0.0,3.0,0.0,0.0,0.157,0.204,0.275
63261,aldremi01,1986,1,SFN,NL,84,216,27,54,18,3,2,25.0,1.0,3.0,33,34.0,4.0,2.0,4.0,1.0,3.0,0.25,0.353,0.389
63264,allanan01,1986,1,CLE,AL,101,293,30,66,7,3,1,29.0,10.0,1.0,14,36.0,0.0,1.0,11.0,4.0,7.0,0.225,0.26,0.28
63266,almonbi01,1986,1,PIT,NL,102,196,29,43,7,2,7,27.0,11.0,4.0,30,38.0,2.0,0.0,1.0,3.0,5.0,0.219,0.319,0.383


### Create new column for on-base plus slugging percentage

In [8]:
# On-base plus slugging: the sum of the OBP and SLG columns.
# Both inputs are already rounded to three decimals, but adding two floats can
# leave binary floating-point residue (e.g. 0.1 + 0.2 -> 0.30000000000000004),
# which would end up in the saved CSV. Round the sum as well so OPS stays at
# three decimals like the other rate stats.
bat['OPS'] = (bat['OBP'] + bat['SLG']).round(3)

bat.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,AVG,OBP,SLG,OPS
63258,aguaylu01,1986,1,PHI,NL,62,133,17,28,6,1,4,13.0,1.0,1.0,8,26.0,0.0,3.0,0.0,2.0,3.0,0.211,0.267,0.361,0.628
63259,aguilri01,1986,1,NYN,NL,32,51,4,8,0,0,2,6.0,0.0,0.0,3,12.0,0.0,0.0,3.0,0.0,0.0,0.157,0.204,0.275,0.479
63261,aldremi01,1986,1,SFN,NL,84,216,27,54,18,3,2,25.0,1.0,3.0,33,34.0,4.0,2.0,4.0,1.0,3.0,0.25,0.353,0.389,0.742
63264,allanan01,1986,1,CLE,AL,101,293,30,66,7,3,1,29.0,10.0,1.0,14,36.0,0.0,1.0,11.0,4.0,7.0,0.225,0.26,0.28,0.54
63266,almonbi01,1986,1,PIT,NL,102,196,29,43,7,2,7,27.0,11.0,4.0,30,38.0,2.0,0.0,1.0,3.0,5.0,0.219,0.319,0.383,0.702


### Drop unwanted columns

In [9]:
# Remove the columns that are not wanted in the cleaned batting table.
unwanted_cols = ['GIDP', 'IBB', 'stint', 'SB', 'HBP', 'SH', 'SF', 'CS']
bat = bat.drop(columns=unwanted_cols)

bat.head()

Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
63258,aguaylu01,1986,PHI,NL,62,133,17,28,6,1,4,13.0,8,26.0,0.211,0.267,0.361,0.628
63259,aguilri01,1986,NYN,NL,32,51,4,8,0,0,2,6.0,3,12.0,0.157,0.204,0.275,0.479
63261,aldremi01,1986,SFN,NL,84,216,27,54,18,3,2,25.0,33,34.0,0.25,0.353,0.389,0.742
63264,allanan01,1986,CLE,AL,101,293,30,66,7,3,1,29.0,14,36.0,0.225,0.26,0.28,0.54
63266,almonbi01,1986,PIT,NL,102,196,29,43,7,2,7,27.0,30,38.0,0.219,0.319,0.383,0.702


### Merge salary and batting dataset

In [10]:
# Join batting rows to salary rows for the same player and season.
# The inner join keeps only player-seasons present in both tables; the shared
# non-key columns (teamID, lgID) come back with _x (batting) / _y (salary) suffixes.
# NOTE(review): if a player has more than one batting or salary row in a season
# (e.g. several stints), this join produces every pairing -- verify row counts.
join_keys = ['playerID', 'yearID']
df = pd.merge(bat, salary, how='inner', on=join_keys)
df.head()

Unnamed: 0,playerID,yearID,teamID_x,lgID_x,G,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,teamID_y,lgID_y,salary
0,aguaylu01,1986,PHI,NL,62,133,17,28,6,1,4,13.0,8,26.0,0.211,0.267,0.361,0.628,PHI,NL,275000
1,aguilri01,1986,NYN,NL,32,51,4,8,0,0,2,6.0,3,12.0,0.157,0.204,0.275,0.479,NYN,NL,130000
2,aldremi01,1986,SFN,NL,84,216,27,54,18,3,2,25.0,33,34.0,0.25,0.353,0.389,0.742,SFN,NL,60000
3,allanan01,1986,CLE,AL,101,293,30,66,7,3,1,29.0,14,36.0,0.225,0.26,0.28,0.54,CLE,AL,60000
4,almonbi01,1986,PIT,NL,102,196,29,43,7,2,7,27.0,30,38.0,0.219,0.319,0.383,0.702,PIT,NL,260000


### Drop repeated columns from merge

In [11]:
# Discard the salary-side team/league columns that the merge repeated;
# the batting-side teamID_x / lgID_x columns are kept.
repeated_cols = ['teamID_y', 'lgID_y']
df = df.drop(columns=repeated_cols)
df.head()

Unnamed: 0,playerID,yearID,teamID_x,lgID_x,G,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary
0,aguaylu01,1986,PHI,NL,62,133,17,28,6,1,4,13.0,8,26.0,0.211,0.267,0.361,0.628,275000
1,aguilri01,1986,NYN,NL,32,51,4,8,0,0,2,6.0,3,12.0,0.157,0.204,0.275,0.479,130000
2,aldremi01,1986,SFN,NL,84,216,27,54,18,3,2,25.0,33,34.0,0.25,0.353,0.389,0.742,60000
3,allanan01,1986,CLE,AL,101,293,30,66,7,3,1,29.0,14,36.0,0.225,0.26,0.28,0.54,60000
4,almonbi01,1986,PIT,NL,102,196,29,43,7,2,7,27.0,30,38.0,0.219,0.319,0.383,0.702,260000


In [12]:
# Count missing values per column in the merged data.
df.isna().sum()

playerID    0
yearID      0
teamID_x    0
lgID_x      0
G           0
AB          0
R           0
H           0
2B          0
3B          0
HR          0
RBI         0
BB          0
SO          0
AVG         0
OBP         0
SLG         0
OPS         0
salary      0
dtype: int64

### Save clean dataset

In [13]:
# Write the cleaned batting/salary data to disk (the DataFrame index is written as the first column).
clean_csv = '../data/past_salaries_bat.csv'
df.to_csv(clean_csv)

## Recap
---
Merged and cleaned the datasets for use in the regression model. Added new columns to match the stats scraped for batters.