# Data Cleaning

In [6]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [13]:
def merge_years (mlbid, first, last):
    """
    Function reads data and merges the data of player for all years
    Saves merged dataframe
    """
    base_path = '../data/og_players/'
    
    #This string will be used to specifiy the player
    player_name = first + '-' + last + '-' + str(mlbid) + '-2018'

    #Full path to file
    file_path = base_path + player_name + '.csv'

    df = pd.read_csv(file_path)
    
    years = ['2019', '2020', '2021']
    for year in years:

        #This string will be used to specifiy the player
        player_name = first + '-' + last + '-' + str(mlbid) + '-' + year

        #Full path to file
        file_path = base_path + player_name + '.csv'
        
        df_2 = pd.read_csv(file_path)
        
        df = df.append(df_2, ignore_index = True)
        
    df.to_csv(f'../data/clean_players/{first}-{last}-{mlbid}.csv', index = False)

In [8]:
def clean_data(mlbid, first, last):
    """
    Function to read in and clean data. 
    Since all the csv files are similar, this puts
    all files in similar formats.
    Deletes the "unnamed: 0" column and removes rows with the montly totals.
    Save new file with the clean data.
    """

    base_path = './datasets/'
    
    #This string will be used to specifiy the player
    player_name = first + '-' + last + '-' + str(mlbid)
    
    #Full path to file
    file_path = base_path + player_name + '.csv'
    
    try:
        #read in csv file of stats
        df = pd.read_csv(file_path)

        #drop unnamed: 0 column
        df.drop(columns = 'Unnamed: 0', inplace = True)
        
        #check for null values
        total_nulls = df.isnull().sum().sum()
        
        if total_nulls == 0:

            #Only want rows with dates not the total of each month
            months = ['March', 'April', 'May', 'June', 'July', 'August', 'September', 'October']

            for month in months:
                df = df[df['date'] != month]
                df.reset_index(drop=True, inplace = True)

            #Save Clean Dataframe
            df.to_csv(f'./clean_data/{first}-{last}-{mlbid}.csv', index = False)
        else:
            print(f'{first} {last} has null values')
    
    except FileNotFoundError:
        pass

In [9]:
players = pd.read_csv('../mlb_players.csv').drop('Unnamed: 0', axis = 1)
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,ACTIVE,Player,Team,Pos,Age,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,SH,SF,HBP,AVG,OBP,SLG,OPS
0,547989,Jose,Abreu,Y,Jose Abreu,CWS,1B,34,152,566,86,148,30,2,30,117,1,0,61,143,0,10,22,0.261,0.351,0.481,0.832
1,660670,Ronald,Acuna,Y,Ronald Acuna,ATL,OF,23,82,297,72,84,19,1,24,52,17,6,49,85,0,5,9,0.283,0.394,0.596,0.99
2,642715,Willy,Adames,Y,Willy Adames,MIL,SS,26,99,365,61,104,26,0,20,58,4,2,47,105,0,1,0,0.285,0.366,0.521,0.887
3,642715,Willy,Adames,Y,Willy Adames,TB,SS,26,41,132,16,26,6,1,5,15,1,2,10,51,0,0,0,0.197,0.254,0.371,0.625
4,571431,Matt,Adams,Y,Matt Adams,COL,DH,33,22,36,3,6,1,0,0,2,0,0,4,9,0,0,0,0.167,0.25,0.194,0.444


In [14]:
for index, row in players.iterrows():
    
    mlbid = row['MLBID']
    first = row['FIRSTNAME']
    last = row['LASTNAME']

    merge_years(mlbid, first, last)
    
# Copied from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

FileNotFoundError: [Errno 2] No such file or directory: '../data/og_players/Luis-Garcia-472610-2018.csv'

In [6]:
df = pd.read_csv('./clean_data/Aaron-Judge-592450.csv')
df

Unnamed: 0,date,PA,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
0,2021-04-01,5,5,0,1,0,0,0,0,0,2,0.200,0.200,0.200,0.400
1,2021-04-03,5,5,0,2,0,0,0,0,0,1,0.300,0.300,0.300,0.600
2,2021-04-04,4,4,0,0,0,0,0,0,0,0,0.214,0.214,0.214,0.429
3,2021-04-05,4,3,2,2,0,0,1,1,1,1,0.294,0.333,0.471,0.804
4,2021-04-06,5,5,1,3,0,0,1,4,0,0,0.364,0.391,0.636,1.028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,2021-09-29,4,3,0,0,0,0,0,1,0,1,0.285,0.371,0.537,0.908
144,2021-09-30,4,3,2,2,0,0,2,2,1,0,0.288,0.374,0.549,0.923
145,2021-10-01,4,4,0,1,0,0,0,0,0,1,0.287,0.373,0.547,0.920
146,2021-10-02,4,3,0,1,0,0,0,0,1,0,0.288,0.374,0.546,0.919
