# Batter Stats: Cleaning the Collected Data
---
This notebook cleans the data that was collected in a previous notebook. Cleaning includes merging each player's seasons into a single file, checking for null values, and setting the date as the index.

## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

## Functions Implemented
---

In [2]:
def merge_years (mlbid, first, last, base_path='../data/og_players_bat/', out_path='../data/clean_players_bat/'):
    """
    Merge a player's per-season CSV files into a single CSV file.

    Season files are expected at
    ``{base_path}{first}-{last}-{mlbid}-{year}.csv``. The 2021 file is
    required; 2018, 2019 and 2020 are optional and are stacked below it
    in that order when present. The merged frame is written to
    ``{out_path}{first}-{last}-{mlbid}.csv`` without the pandas index.

    Parameters
    ----------
    mlbid : player id used in the file names
    first : player's first name as used in the file names
    last : player's last name as used in the file names
    base_path : directory (with trailing separator) holding the season files
    out_path : directory (with trailing separator) receiving the merged file

    Returns
    -------
    None. If the 2021 file does not exist, nothing is written.
    """
    #This string is shared by every file belonging to the player
    player_name = f'{first}-{last}-{mlbid}'

    #The 2021 season is mandatory: without it the player is skipped
    try:
        frames = [pd.read_csv(f'{base_path}{player_name}-2021.csv')]
    except FileNotFoundError:
        return

    #Earlier seasons are optional; only a missing file is an expected
    #condition here, any other error is allowed to surface
    for year in ('2018', '2019', '2020'):
        try:
            frames.append(pd.read_csv(f'{base_path}{player_name}-{year}.csv'))
        except FileNotFoundError:
            continue

    #DataFrame.append was removed in pandas 2.0, so the seasons are
    #stacked with a single concat call instead
    merged = pd.concat(frames, ignore_index = True)

    merged.to_csv(f'{out_path}{player_name}.csv', index = False)

In [3]:
def clean_data(mlbid, first, last, base_path='../data/clean_players_bat/'):
    """
    Read a player's merged CSV file, clean it and overwrite it in place.

    The file is ``{base_path}{first}-{last}-{mlbid}.csv``. Cleaning drops
    the "Unnamed: 0" column (when present), removes the rows whose
    ``date`` value is a month name (the monthly totals), converts
    ``date`` to datetime and sorts the rows by date. The cleaned frame is
    saved together with its index, so the saved file again starts with an
    unnamed index column.

    If the file contains any null value, nothing is cleaned or saved and
    a message naming the player is printed instead.

    Parameters
    ----------
    mlbid : player id used in the file name
    first : player's first name as used in the file name
    last : player's last name as used in the file name
    base_path : directory (with trailing separator) holding the merged file

    Returns
    -------
    None. A missing file is silently skipped.
    """
    #Full path to file
    file_path = f'{base_path}{first}-{last}-{mlbid}.csv'

    try:
        #read in csv file of stats
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        return

    #drop unnamed: 0 column; tolerate files that do not carry it
    df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

    #check for null values
    if df.isnull().sum().sum() != 0:
        print(f'{first} {last} has null values')
        return

    #Only want rows with dates not the total of each month
    months = ['March', 'April', 'May', 'June', 'July', 'August', 'September', 'October']

    cleaned = (
        df[~df['date'].isin(months)]
        .reset_index(drop = True)
        .assign(date = lambda frame: pd.to_datetime(frame['date']))
        #stable sort: rows sharing a date keep their order from the file
        .sort_values(by = 'date', kind = 'mergesort')
    )

    #Save Clean Dataframe
    cleaned.to_csv(file_path)

In [4]:
def convert_date (mlbid, first, last):
    """
    Re-index a player's cleaned CSV file by date and overwrite the file.

    Reads the player's file from the clean batter directory, discards the
    "Unnamed: 0" column, uses the ``date`` column (as a DatetimeIndex) as
    the index, removes the ``date`` column and saves the result back to
    the same location with ``index_label = False``.

    Nothing is returned. When the file is missing, or one of the expected
    columns is absent, the player's name is printed instead.
    """
    #Location of the player's file inside the clean directory
    source = '../data/clean_players_bat/' + '-'.join([first, last, str(mlbid)]) + '.csv'

    try:
        #Load the stats without the leftover index column
        stats = pd.read_csv(source).drop(columns = ['Unnamed: 0'])

        #Dates become the index; the now redundant column is removed
        by_date = stats.set_index(pd.DatetimeIndex(stats['date'])).drop(columns = ['date'])

        #Overwrite the file with the date-indexed frame
        by_date.to_csv(f'../data/clean_players_bat/{first}-{last}-{mlbid}.csv', index_label = False)

    except (FileNotFoundError, KeyError):
        print(f'{first} {last}')

## Import the File with Active Batters
---

In [5]:
# Load the active batters and discard the index column saved with the file
players_file = pd.read_csv('../data/mlb_players_bat.csv')
players = players_file.drop(columns = ['Unnamed: 0'])
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Pos,Age,G,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary
0,547989,Jose,Abreu,Jose Abreu,CWS,1B,34,152,566,86,148,30,2,30,117,61,143,0.261,0.351,0.481,0.832,"$17,666,666"
1,642715,Willy,Adames,Willy Adames,TB,SS,26,41,132,16,26,6,1,5,15,10,51,0.197,0.254,0.371,0.625,"$590,000"
2,501303,Ehire,Adrianza,Ehire Adrianza,ATL,SS,32,109,182,32,45,9,2,5,28,21,42,0.247,0.327,0.401,0.728,"$1,500,000"
3,542583,Jesus,Aguilar,Jesus Aguilar,MIA,1B,31,131,449,49,117,23,0,22,93,46,93,0.261,0.329,0.459,0.788,"$4,500,000"
4,605113,Nick,Ahmed,Nick Ahmed,ARI,SS,31,129,434,46,96,30,3,5,38,34,104,0.221,0.28,0.339,0.619,"$8,125,000"


## Merging, Cleaning, and Converting All Player Files
---

In [6]:
# Row iteration adapted from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# Each player goes through the three steps in order: merge, clean, convert.
pipeline = (merge_years, clean_data, convert_date)

for _, player in players.iterrows():

    player_key = (player['MLBID'], player['FIRSTNAME'], player['LASTNAME'])

    for step in pipeline:
        step(*player_key)

print ('Finished')

Finished


## Recap
---
The batter datasets collected through web scraping have been merged and cleaned, and the date has been set as the index. These datasets will be used for the time series model.