# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
def clean_data(mlbid, first, last):
    """
    Read one player's raw stats CSV, clean it, and save the cleaned copy.

    The raw file is read from ``./datasets/{first}-{last}-{mlbid}.csv`` and
    the cleaned file is written to ``./clean_data/{first}-{last}-{mlbid}.csv``.

    Cleaning steps:
      * drop the "Unnamed: 0" column if the file has one;
      * remove the rows whose ``date`` value is a month name (the monthly
        totals), keeping only the per-date rows;
      * renumber the index from 0.

    Parameters
    ----------
    mlbid : value used as the id part of the file name (converted to text)
    first : first-name part of the file name
    last : last-name part of the file name

    Returns
    -------
    None. The result is the file written to ``./clean_data/``.

    Notes
    -----
    * If the raw file does not exist, a message is printed and nothing is
      written.
    * If the raw file contains any null value, a message is printed and
      nothing is written.
    * Only a missing *input* file is handled here. Errors raised while
      writing the output (for example when ``./clean_data/`` does not exist)
      propagate to the caller instead of being reported as a missing player.
    """

    base_path = './datasets/'

    # This string identifies the player in both the input and output file names
    player_name = f'{first}-{last}-{mlbid}'

    # Full path to the raw file
    file_path = base_path + player_name + '.csv'

    # Keep the try block limited to the read, so that only a missing input
    # file triggers the "not found" message.
    try:
        raw_stats = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f'{first} {last} is a Minor League Player or Shohei Ohtani!')
        return

    # Drop the "Unnamed: 0" column; errors='ignore' keeps files that were
    # saved without it from raising KeyError.
    stats = raw_stats.drop(columns='Unnamed: 0', errors='ignore')

    # Files containing null values are reported and skipped, not saved
    if stats.isnull().sum().sum() != 0:
        print(f'{first} {last} has null values')
        return

    # Only want rows with dates, not the total of each month
    months = ['April', 'May', 'June', 'July', 'August', 'September', 'October']
    daily_stats = stats[~stats['date'].isin(months)].reset_index(drop=True)

    # Save the clean dataframe
    daily_stats.to_csv(f'./clean_data/{player_name}.csv', index=False)

In [3]:
# Load the player roster and discard the index column that was saved with the file
players = pd.read_csv('../mlb_players.csv')
players = players.drop(columns='Unnamed: 0')
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,POS,ACTIVE
0,682928,CJ,Abrams,SS,Y
1,547989,Jose,Abreu,1B,Y
2,554429,Dustin,Ackley,1B,Y
3,660670,Ronald,Acuna,OF,Y
4,542436,Cristhian,Adames,2B,Y


In [4]:
# Clean every player's file; clean_data prints a message for each player it skips.
# Row iteration pattern copied from
# https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
for _, player in players.iterrows():
    clean_data(player['MLBID'], player['FIRSTNAME'], player['LASTNAME'])

CJ Abrams is a Minor League Player or Shohei Ohtani!
J.J. Bleday is a Minor League Player or Shohei Ohtani!
Zack Cox is a Minor League Player or Shohei Ohtani!
Yusniel Diaz is a Minor League Player or Shohei Ohtani!
Jeter Downs is a Minor League Player or Shohei Ohtani!
Lucius Fox is a Minor League Player or Shohei Ohtani!
Tyler Freeman is a Minor League Player or Shohei Ohtani!
Anthony Garcia is a Minor League Player or Shohei Ohtani!
Riley Greene is a Minor League Player or Shohei Ohtani!
Reese Havens is a Minor League Player or Shohei Ohtani!
Kyle Holder is a Minor League Player or Shohei Ohtani!
Ryan Howard is a Minor League Player or Shohei Ohtani!
James Jones is a Minor League Player or Shohei Ohtani!
Nolan Jones is a Minor League Player or Shohei Ohtani!
Josh Jung is a Minor League Player or Shohei Ohtani!
Royce Lewis is a Minor League Player or Shohei Ohtani!
Kevin Maitan is a Minor League Player or Shohei Ohtani!
Austin Martin is a Minor League Player or Shohei Ohtani!
Ernesto

In [6]:
# Spot-check one cleaned file by loading and displaying it
sample_clean_path = './clean_data/Aaron-Judge-592450.csv'
df = pd.read_csv(sample_clean_path)
df

Unnamed: 0,date,PA,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
0,2021-04-01,5,5,0,1,0,0,0,0,0,2,0.200,0.200,0.200,0.400
1,2021-04-03,5,5,0,2,0,0,0,0,0,1,0.300,0.300,0.300,0.600
2,2021-04-04,4,4,0,0,0,0,0,0,0,0,0.214,0.214,0.214,0.429
3,2021-04-05,4,3,2,2,0,0,1,1,1,1,0.294,0.333,0.471,0.804
4,2021-04-06,5,5,1,3,0,0,1,4,0,0,0.364,0.391,0.636,1.028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,2021-09-29,4,3,0,0,0,0,0,1,0,1,0.285,0.371,0.537,0.908
144,2021-09-30,4,3,2,2,0,0,2,2,1,0,0.288,0.374,0.549,0.923
145,2021-10-01,4,4,0,1,0,0,0,0,0,1,0.287,0.373,0.547,0.920
146,2021-10-02,4,3,0,1,0,0,0,0,1,0,0.288,0.374,0.546,0.919
