# Batter Time Series Analysis
---
This notebook builds a time series model for each batter in order to forecast their stats. The data used here are the stats scraped from Baseball Savant.

## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller  
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

## Functions Implemented
---

In [2]:
def var_model(first, last, mlbid):

    """
    Fit a VAR model on one player's cleaned batting history and save the forecast.

    Reads '../data/clean_players_bat/{first}-{last}-{mlbid}.csv', holds out the
    last 15% of the rows (no shuffling), fits a VAR on the remaining rows and
    forecasts as many steps as there are held-out rows. The rounded forecast is
    written to '../data/predictions_bat/{first}-{last}-{mlbid}.csv'.

    Parameters
    ----------
    first : str
        Player first name, as used in the file name.
    last : str
        Player last name, as used in the file name.
    mlbid : int or str
        Player id, as used in the file name.

    Returns
    -------
    None
        The result is a csv of forecasted stats written to disk. Any ordinary
        failure (missing file, model fit error, too few columns, ...) is
        reported with print() and swallowed so a batch loop over many players
        keeps going.
    """

    # (output column, decimals to round to), in the positional order the
    # forecast columns are read.
    # NOTE(review): this assumes the cleaned csv stores its columns in exactly
    # this order -- confirm against the cleaning notebook.
    stat_spec = (
        ('PA', 0), ('AB', 0), ('R', 0), ('H', 0), ('2B', 0), ('3B', 0),
        ('HR', 0), ('RBI', 0), ('BB', 0), ('SO', 0),
        ('AVG', 3), ('OBP', 3), ('SLG', 3), ('OPS', 3),
    )

    #This string will be used to specify the player
    player_name = first + '-' + last + '-' + str(mlbid)

    #Full path to file
    file_path = '../data/clean_players_bat/' + player_name + '.csv'

    try:
        history = pd.read_csv(file_path)

        # shuffle = False keeps the rows in file order, so the held-out rows
        # are the last ones in the file
        train, test = train_test_split(history, test_size = 0.15, shuffle = False)

        ts_model = VAR(train).fit()

        # One forecast step per held-out row
        forecast = ts_model.forecast(train.values, len(test))

        # Index explicitly (rather than zip) so a forecast with fewer columns
        # than expected still fails loudly instead of being silently truncated
        player_stats = [
            {
                name: round(step[position], digits)
                for position, (name, digits) in enumerate(stat_spec)
            }
            for step in forecast
        ]

        #Creates data frame of all stats
        forecast_df = pd.DataFrame(player_stats)

        #Saves Dataframe to a file with player name
        forecast_df.to_csv(f'../data/predictions_bat/{player_name}.csv', index = False)

    except Exception as error:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops
        # a long batch run; the cause is printed so failures can be diagnosed
        print(f'{first} {last} something went wrong: {error!r}')

In [3]:
def average_forecast (first, last, mlbid):

    """
    Collapse one player's saved forecast into a single row of projected stats.

    Reads '../data/predictions_bat/{first}-{last}-{mlbid}.csv' and returns a
    dict with the player's name, the sum of each counting stat rounded to 0
    decimals, and the mean of each rate stat rounded to 3 decimals.
    'PA' is not included in the result.

    Raises whatever pd.read_csv raises if the forecast file cannot be read,
    and KeyError if an expected column is absent.
    """

    # Counting stats are summed over the forecast rows; rate stats are averaged
    summed_columns = ['AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO']
    averaged_columns = ['AVG', 'OBP', 'SLG', 'OPS']

    # File name is "<first>-<last>-<mlbid>.csv" inside the predictions folder
    player_key = '-'.join([first, last, str(mlbid)])
    predicted = pd.read_csv('../data/predictions_bat/' + player_key + '.csv')

    summary = {"name": f'{first} {last}'}

    for column in summed_columns:
        summary[column] = round(predicted[column].sum(), 0)

    for column in averaged_columns:
        summary[column] = round(predicted[column].mean(), 3)

    return summary

## Import Dataset of Active Batters
---

In [4]:
# Load the roster of active batters used to drive the per-player loops below.
# 'Unnamed: 0' is dropped -- presumably a leftover index column from when the
# csv was written (TODO confirm).
players = pd.read_csv('../data/mlb_players_bat.csv')
players = players.drop(columns = ['Unnamed: 0'])
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Pos,Age,G,AB,R,...,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary
0,547989,Jose,Abreu,Jose Abreu,CWS,1B,34,152,566,86,...,2,30,117,61,143,0.261,0.351,0.481,0.832,"$17,666,666"
1,642715,Willy,Adames,Willy Adames,TB,SS,26,41,132,16,...,1,5,15,10,51,0.197,0.254,0.371,0.625,"$590,000"
2,501303,Ehire,Adrianza,Ehire Adrianza,ATL,SS,32,109,182,32,...,2,5,28,21,42,0.247,0.327,0.401,0.728,"$1,500,000"
3,542583,Jesus,Aguilar,Jesus Aguilar,MIA,1B,31,131,449,49,...,0,22,93,46,93,0.261,0.329,0.459,0.788,"$4,500,000"
4,605113,Nick,Ahmed,Nick Ahmed,ARI,SS,31,129,434,46,...,3,5,38,34,104,0.221,0.28,0.339,0.619,"$8,125,000"


## Run VAR Model for All Batters
---

In [5]:
%%time
for index, row in players.iterrows():
    
    first = row['FIRSTNAME']
    last = row['LASTNAME']
    mlbid = row['MLBID']
    
    var_model(first, last, mlbid)
    
print('Models Finished')
    
# Copied from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

Models Finished
CPU times: user 10.9 s, sys: 436 ms, total: 11.4 s
Wall time: 6.67 s


## Take the Average and Count of Forecasted Stats to Use in Predicting Salaries
---

In [6]:
%%time
forecasts = []
for index, row in players.iterrows():
    
    first = row['FIRSTNAME']
    last = row['LASTNAME']
    mlbid = row['MLBID']
    
    forecasts.append(average_forecast(first, last, mlbid))
    
#Creates data frame of all stats
df = pd.DataFrame(forecasts)

#Saves Dataframe to a file with player name
df.to_csv(f'../data/player_forecasts_bat.csv')
    
#Save as pickle file
with open('../pickles/forecast_bat.pkl', 'wb') as pickle_out:
    pickle_out = pickle.dump(df, pickle_out)
    
print('Finished')

Finished
CPU times: user 1.72 s, sys: 53 ms, total: 1.77 s
Wall time: 1.69 s


## Recap
---
A VAR model was run for each batter and used to forecast about a third of the season. The forecasted results were then totaled (counting stats) and averaged (rate stats) to give estimated stats for the upcoming season. These stats will be used in the app and to predict each player's salary.