# Data Cleaning
---
*By Ihza Gonzales*

This notebook cleans the data that was collected in the previous notebook. Cleaning includes merging the different seasons of each player, checking for null values, and setting the date as the index.

## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np

# Remove the column display limit so wide DataFrames are shown with every column
pd.set_option('display.max_columns', None)

## Functions Implemented
---

In [2]:
def merge_years(mlbid, first, last):
    """
    Merge a player's per-season CSV files into a single CSV.

    The 2021 file ``../data/og_players_pitch/{first}-{last}-{mlbid}-2021.csv``
    is read first, then the 2018, 2019 and 2020 files (in that order) are
    stacked underneath it. The combined rows are written, without the index,
    to ``../data/clean_players_pitch/{first}-{last}-{mlbid}.csv``.

    Parameters
    ----------
    mlbid : int or str
        Player id used in the file names.
    first : str
        Player first name used in the file names.
    last : str
        Player last name used in the file names.

    Returns
    -------
    None
        The result is only written to disk. If the 2021 file does not exist
        nothing is written. Earlier seasons that are missing or empty are
        skipped.
    """
    base_path = '../data/og_players_pitch/'

    # Every season file shares this prefix; only the year suffix differs
    player_prefix = f'{first}-{last}-{mlbid}'

    try:
        season_frames = [pd.read_csv(f'{base_path}{player_prefix}-2021.csv')]
    except FileNotFoundError:
        # No 2021 season for this player: nothing to merge
        return

    for year in ('2018', '2019', '2020'):
        try:
            season_frames.append(pd.read_csv(f'{base_path}{player_prefix}-{year}.csv'))
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Only an absent/empty season is skipped; any other error should
            # surface instead of silently dropping a season from the output.
            continue

    # DataFrame.append was removed in pandas 2.0; a single concat gives the
    # same stacked result and avoids copying the frame once per season.
    merged = pd.concat(season_frames, ignore_index=True)
    merged.to_csv(f'../data/clean_players_pitch/{player_prefix}.csv', index=False)

In [3]:
def clean_data(mlbid, first, last):
    """
    Tidy a player's merged CSV and overwrite it.

    Loads ``../data/clean_players_pitch/{first}-{last}-{mlbid}.csv`` and drops
    its "Unnamed: 0" column. When the remaining frame has no null values, the
    rows whose "date" is a month name (the monthly totals) are removed, "date"
    is parsed as datetime, the rows are sorted by it and the frame is saved
    back to the same path (index included). When nulls are present a message
    is printed and the file is left as it was. A missing file is ignored.
    """
    month_names = ['March', 'April', 'May', 'June', 'July', 'August', 'September', 'October']

    # Location of this player's merged file
    csv_path = '../data/clean_players_pitch/' + '-'.join((first, last, str(mlbid))) + '.csv'

    try:
        stats = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])

        # Files containing any null are reported and not rewritten
        if stats.isnull().values.any():
            print(f'{first} {last} has null values')
            return

        # Keep only the per-game rows, i.e. those whose date is not a month name
        per_game = stats[~stats['date'].isin(month_names)].reset_index(drop=True)

        # Parse the dates, order chronologically and save
        per_game = per_game.assign(date=pd.to_datetime(per_game['date']))
        per_game.sort_values(by='date').to_csv(csv_path)

    except FileNotFoundError:
        pass

In [4]:
def convert_objects(mlbid, first, last):
    """
    Cast a player's non-numeric stat columns to float and overwrite the CSV.

    Reads ``../data/clean_players_pitch/{first}-{last}-{mlbid}.csv``, replaces
    every '-.--' cell with 0, then converts to float each column (other than
    "date") that did not parse as fully numeric when the file was loaded.
    The frame is written back to the same path with ``index_label = False``.
    Nothing is returned. If the file is missing (or a KeyError occurs) the
    player's name is printed instead.
    """
    # Location of this player's cleaned file
    csv_path = '../data/clean_players_pitch/' + '-'.join((first, last, str(mlbid))) + '.csv'

    def parses_as_number(column):
        # True when every value in the column can be read as a number
        # Approach from https://stackoverflow.com/questions/54426845/how-to-check-if-a-pandas-dataframe-contains-only-numeric-column-wise
        return pd.to_numeric(column, errors='coerce').notnull().all()

    try:
        stats = pd.read_csv(csv_path)

        # Flags are taken BEFORE the placeholder is replaced, so a column whose
        # only non-numeric cells are '-.--' is still marked for conversion.
        numeric_flags = stats.apply(parses_as_number)

        # '-.--' is what makes those columns load as objects; treat it as 0
        stats = stats.replace('-.--', 0)

        for column_name, is_numeric in zip(stats.columns, numeric_flags):
            if column_name == 'date' or is_numeric:
                continue
            # NOTE: any remaining text in the column makes this raise
            # ValueError, which is not handled below.
            stats[column_name] = stats[column_name].astype(float)

        # Overwrite the file with the converted frame
        stats.to_csv(csv_path, index_label = False)

    except (FileNotFoundError, KeyError):
        print(f'{first} {last}')
In [5]:
def convert_date(mlbid, first, last):
    """
    Make the "date" column the datetime index of a player's CSV.

    Reads ``../data/clean_players_pitch/{first}-{last}-{mlbid}.csv``, drops the
    "Unnamed: 0" column, moves "date" out of the columns into a DatetimeIndex
    and writes the frame back to the same path with ``index_label = False``.
    Nothing is returned. If the file or one of those columns is missing, the
    player's name is printed instead.
    """
    # Location of this player's cleaned file
    csv_path = '../data/clean_players_pitch/' + '-'.join((first, last, str(mlbid))) + '.csv'

    try:
        stats = pd.read_csv(csv_path).drop(columns = ['Unnamed: 0'])

        # pop() removes "date" from the columns while handing it to the index
        date_index = pd.DatetimeIndex(stats.pop('date'))

        # Overwrite the file with the date-indexed frame
        stats.set_index(date_index).to_csv(csv_path, index_label = False)

    except (FileNotFoundError, KeyError):
        print(f'{first} {last}')

## Import the File with active pitching players
---

In [6]:
# Load the pitcher roster; the "Unnamed: 0" column carried by the file is discarded
players = (
    pd.read_csv('../data/mlb_players_pitch.csv')
    .drop(columns=['Unnamed: 0'])
)
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,ACTIVE,Player,Team,Age,G,GS,IP,H,ER,K,BB,HR,W,L,SV,ERA,WHIP,salary
0,472551,Fernando,Abad,Y,Fernando Abad,BAL,35,16,0,17.2,23,11,10,7,1,0,0,0,5.6,1.7,"$570,500"
1,676265,Cory,Abbott,Y,Cory Abbott,CHC,26,7,1,17.1,20,13,12,11,7,0,0,0,6.75,1.79,"$570,500"
2,642758,Domingo,Acevedo,Y,Domingo Acevedo,OAK,27,10,0,11.0,9,4,9,4,3,0,0,0,3.27,1.18,"$570,500"
3,613534,Austin,Adams,Y,Austin Adams,SD,30,65,0,52.2,28,24,76,35,1,3,2,0,4.1,1.2,"$580,200"
4,669211,Keegan,Akin,Y,Keegan Akin,BAL,26,24,17,95.0,110,70,82,40,17,2,10,0,6.63,1.58,"$570,500"


## Merging, Cleaning, and Converting All Player Files
---

In [7]:
# Row iteration adapted from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# Each player goes through the four steps in order: every step reads the file
# written by the previous one.
player_keys = players[['MLBID', 'FIRSTNAME', 'LASTNAME']]

for mlbid, first, last in player_keys.itertuples(index=False, name=None):
    for step in (merge_years, clean_data, convert_objects, convert_date):
        step(mlbid, first, last)

print('Finished')

Finished


## Recap
---
The pitcher datasets collected from web scraping have been merged and cleaned, and their index has been set to the date. These datasets will be used for the time series model.