# Active Players Cleaning
---
This notebook cleans the current batters and pitchers datasets (which include salaries) by removing unnecessary columns and checking for null values. The stats kept here must be the same stats gathered for the other datasets.

## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

## Batters
---

In [2]:
# Load the active batters file, then discard the 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv call -- confirm).
df = pd.read_csv('../data/mlb_players_bat.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,ACTIVE,Player,Team,Pos,Age,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,SH,SF,HBP,AVG,OBP,SLG,OPS,salary
0,547989,Jose,Abreu,Y,Jose Abreu,CWS,1B,34,152,566,86,148,30,2,30,117,1,0,61,143,0,10,22,0.261,0.351,0.481,0.832,"$17,666,666"
1,642715,Willy,Adames,Y,Willy Adames,TB,SS,26,41,132,16,26,6,1,5,15,1,2,10,51,0,0,0,0.197,0.254,0.371,0.625,"$590,000"
2,501303,Ehire,Adrianza,Y,Ehire Adrianza,ATL,SS,32,109,182,32,45,9,2,5,28,0,0,21,42,1,3,2,0.247,0.327,0.401,0.728,"$1,500,000"
3,542583,Jesus,Aguilar,Y,Jesus Aguilar,MIA,1B,31,131,449,49,117,23,0,22,93,0,0,46,93,0,7,3,0.261,0.329,0.459,0.788,"$4,500,000"
4,605113,Nick,Ahmed,Y,Nick Ahmed,ARI,SS,31,129,434,46,96,30,3,5,38,7,2,34,104,2,1,2,0.221,0.28,0.339,0.619,"$8,125,000"


### Remove unwanted columns

In [3]:
# Batting columns that are not kept in the cleaned dataset.
# errors='ignore' keeps this cell re-runnable: the save step at the end of the
# Batters section overwrites the CSV that was loaded above, so on a second
# Restart & Run All these columns are already gone and a strict drop would
# raise KeyError.
df = df.drop(
    columns=['ACTIVE', 'SB', 'HBP', 'SH', 'SF', 'CS'],
    errors='ignore',
)
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Pos,Age,G,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary
0,547989,Jose,Abreu,Jose Abreu,CWS,1B,34,152,566,86,148,30,2,30,117,61,143,0.261,0.351,0.481,0.832,"$17,666,666"
1,642715,Willy,Adames,Willy Adames,TB,SS,26,41,132,16,26,6,1,5,15,10,51,0.197,0.254,0.371,0.625,"$590,000"
2,501303,Ehire,Adrianza,Ehire Adrianza,ATL,SS,32,109,182,32,45,9,2,5,28,21,42,0.247,0.327,0.401,0.728,"$1,500,000"
3,542583,Jesus,Aguilar,Jesus Aguilar,MIA,1B,31,131,449,49,117,23,0,22,93,46,93,0.261,0.329,0.459,0.788,"$4,500,000"
4,605113,Nick,Ahmed,Nick Ahmed,ARI,SS,31,129,434,46,96,30,3,5,38,34,104,0.221,0.28,0.339,0.619,"$8,125,000"


### Check for nulls

In [4]:
# Count of missing values in each remaining batter column.
df.isna().sum()

MLBID        0
FIRSTNAME    0
LASTNAME     0
Player       0
Team         0
Pos          0
Age          0
G            0
AB           0
R            0
H            0
2B           0
3B           0
HR           0
RBI          0
BB           0
SO           0
AVG          0
OBP          0
SLG          0
OPS          0
salary       0
dtype: int64

### Save clean dataset

In [5]:
# Overwrite the batters file that was loaded at the top of this section.
# The row index is written as an unnamed first column, which read_csv labels
# 'Unnamed: 0' the next time the file is loaded.
df.to_csv('../data/mlb_players_bat.csv', index=True)

## Pitchers
---

In [6]:
# Load the active pitchers file and strip the 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv call -- confirm).
# Note: `df` is rebound here; the batters frame is no longer referenced.
pitchers_csv = pd.read_csv('../data/mlb_players_pitch.csv')
df = pitchers_csv.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,ACTIVE,Player,Team,Age,G,GS,CG,SHO,IP,H,ER,K,BB,HR,W,L,SV,BS,HLD,ERA,WHIP,salary
0,472551,Fernando,Abad,Y,Fernando Abad,BAL,35,16,0,0,0,17.2,23,11,10,7,1,0,0,0,0,2,5.6,1.7,"$570,500"
1,676265,Cory,Abbott,Y,Cory Abbott,CHC,26,7,1,0,0,17.1,20,13,12,11,7,0,0,0,0,0,6.75,1.79,"$570,500"
2,642758,Domingo,Acevedo,Y,Domingo Acevedo,OAK,27,10,0,0,0,11.0,9,4,9,4,3,0,0,0,0,0,3.27,1.18,"$570,500"
3,613534,Austin,Adams,Y,Austin Adams,SD,30,65,0,0,0,52.2,28,24,76,35,1,3,2,0,1,10,4.1,1.2,"$580,200"
4,669211,Keegan,Akin,Y,Keegan Akin,BAL,26,24,17,0,0,95.0,110,70,82,40,17,2,10,0,0,0,6.63,1.58,"$570,500"


### Remove unwanted columns

In [7]:
# Keep only the identifying columns, the selected pitching stats, and salary,
# in this order; every other column is discarded.
df = df.loc[:, [
    'MLBID', 'FIRSTNAME', 'LASTNAME', 'Player', 'Team', 'Age',
    'W', 'L', 'ERA', 'IP', 'H', 'ER', 'HR', 'BB', 'K', 'WHIP',
    'salary',
]]
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Age,W,L,ERA,IP,H,ER,HR,BB,K,WHIP,salary
0,472551,Fernando,Abad,Fernando Abad,BAL,35,0,0,5.6,17.2,23,11,1,7,10,1.7,"$570,500"
1,676265,Cory,Abbott,Cory Abbott,CHC,26,0,0,6.75,17.1,20,13,7,11,12,1.79,"$570,500"
2,642758,Domingo,Acevedo,Domingo Acevedo,OAK,27,0,0,3.27,11.0,9,4,3,4,9,1.18,"$570,500"
3,613534,Austin,Adams,Austin Adams,SD,30,3,2,4.1,52.2,28,24,1,35,76,1.2,"$580,200"
4,669211,Keegan,Akin,Keegan Akin,BAL,26,2,10,6.63,95.0,110,70,17,40,82,1.58,"$570,500"


### Check for nulls

In [8]:
# Count of missing values in each remaining pitcher column.
df.isna().sum()

MLBID        0
FIRSTNAME    0
LASTNAME     0
Player       0
Team         0
Age          0
W            0
L            0
ERA          0
IP           0
H            0
ER           0
HR           0
BB           0
K            0
WHIP         0
salary       0
dtype: int64

### Save clean dataset

In [9]:
# Overwrite the pitchers file that was loaded at the top of this section.
# The row index is written as an unnamed first column, which read_csv labels
# 'Unnamed: 0' the next time the file is loaded.
df.to_csv('../data/mlb_players_pitch.csv', index=True)

## Recap
---
The active players datasets for pitchers and batters were cleaned and will be used throughout the project to look up player names and IDs.