# Time Series Analysis
---

In [1]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller  
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
def var_model(first, last, mlbid,
              base_path='../data/clean_players_pitch/',
              output_path='../data/predictions_pitch/'):
    """
    Fit a VAR model to one player's history and save the forecast as a CSV.

    Reads ``{base_path}{first}-{last}-{mlbid}.csv``, holds out the last 15%
    of its rows (no shuffling), fits a VAR on the remaining rows and
    forecasts as many steps as there are held-out rows. The first ten
    forecast columns are labelled W, L, ERA, IP, H, ER, HR, BB, SO, WHIP,
    rounded, and written to ``{output_path}{first}-{last}-{mlbid}.csv``.

    Parameters
    ----------
    first, last : str
        Player's first and last name, as used in the file name.
    mlbid : int or str
        Player's MLB id, as used in the file name.
    base_path : str, optional
        Directory holding the cleaned per-player CSVs. It is concatenated
        directly with the file name, so it must end with a path separator.
    output_path : str, optional
        Directory the forecast CSV is written to (same separator rule).

    Returns
    -------
    None
        The forecast is written to disk. Any failure (missing file, too
        few rows to fit, unwritable output, ...) is reported with a printed
        message naming the actual cause instead of being raised, so a
        batch loop over many players keeps going.
    """
    # Decimal places for each output column. Forecast columns are labelled
    # by POSITION, so this order must match the column order of the input
    # CSV -- NOTE(review): that order is assumed, not checked; confirm
    # against the cleaned player files.
    decimals = {
        'W': 0, 'L': 0, 'ERA': 2, 'IP': 1, 'H': 0,
        'ER': 0, 'HR': 0, 'BB': 0, 'SO': 0, 'WHIP': 2,
    }

    # This string will be used to specify the player
    player_name = f'{first}-{last}-{mlbid}'

    # Full path to file
    file_path = f'{base_path}{player_name}.csv'

    try:
        history = pd.read_csv(file_path)

        # shuffle=False keeps chronological order: the tail is the hold-out
        train, test = train_test_split(history, test_size=0.15, shuffle=False)

        ts_model = VAR(train).fit()

        # One forecast row per held-out row
        forecast = ts_model.forecast(train.values, len(test))

        # Label the first ten columns and round each to its precision;
        # fewer than ten columns raises here and is reported below.
        predictions = pd.DataFrame(
            forecast[:, :len(decimals)], columns=list(decimals)
        ).round(decimals)

        # Saves Dataframe to a file with player name
        predictions.to_csv(f'{output_path}{player_name}.csv', index=False)

    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; the cause is appended so a missing file is not
        # mistaken for a modelling failure.
        print(f'{first} {last} not enough data ({type(err).__name__}: {err})')

In [3]:
def average_forecast(first, last, mlbid, base_path='../data/predictions_pitch/'):
    """
    Collapse one player's forecast CSV into a single summary record.

    Reads ``{base_path}{first}-{last}-{mlbid}.csv`` and aggregates its
    columns: W, L, IP, H, ER, HR, BB and SO are summed; ERA and WHIP are
    averaged over the rows.

    Parameters
    ----------
    first, last : str
        Player's first and last name, as used in the file name.
    mlbid : int or str
        Player's MLB id, as used in the file name.
    base_path : str, optional
        Directory holding the per-player forecast CSVs. It is concatenated
        directly with the file name, so it must end with a path separator.

    Returns
    -------
    dict or None
        Keys ``name, W, L, ERA, IP, H, ER, HR, BB, SO, WHIP``. None when
        the forecast file is missing, empty, unparsable, or lacks one of
        the expected columns.
    """
    # Full path to file
    file_path = f'{base_path}{first}-{last}-{mlbid}.csv'

    try:
        df = pd.read_csv(file_path)

        # NOTE(review): ERA and WHIP are plain means of the per-row values,
        # not recomputed from the summed ER / IP / H / BB totals.
        return {
            "name": f'{first} {last}',
            "W": round(df['W'].sum(), 0),
            "L": round(df['L'].sum(), 0),
            "ERA": round(df['ERA'].mean(), 3),
            "IP": round(df['IP'].sum(), 1),
            "H": round(df['H'].sum(), 0),
            "ER": round(df['ER'].sum(), 0),
            "HR": round(df['HR'].sum(), 0),
            "BB": round(df['BB'].sum(), 0),
            "SO": round(df['SO'].sum(), 0),
            "WHIP": round(df['WHIP'].mean(), 2),
        }

    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError, KeyError):
        # Only the expected "no usable forecast for this player" cases are
        # handled; anything else (including Ctrl-C) propagates.
        return None

In [4]:
# Load the pitcher roster and discard the saved index column
players = (
    pd.read_csv('../data/mlb_players_pitch.csv')
    .drop(columns=['Unnamed: 0'])
)
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Age,W,L,ERA,IP,H,ER,HR,BB,K,WHIP,salary
0,472551,Fernando,Abad,Fernando Abad,BAL,35,0,0,5.6,17.2,23,11,1,7,10,1.7,"$570,500"
1,676265,Cory,Abbott,Cory Abbott,CHC,26,0,0,6.75,17.1,20,13,7,11,12,1.79,"$570,500"
2,642758,Domingo,Acevedo,Domingo Acevedo,OAK,27,0,0,3.27,11.0,9,4,3,4,9,1.18,"$570,500"
3,613534,Austin,Adams,Austin Adams,SD,30,3,2,4.1,52.2,28,24,1,35,76,1.2,"$580,200"
4,669211,Keegan,Akin,Keegan Akin,BAL,26,2,10,6.63,95.0,110,70,17,40,82,1.58,"$570,500"


In [5]:
%%time
for index, row in players.iterrows():
    
    first = row['FIRSTNAME']
    last = row['LASTNAME']
    mlbid = row['MLBID']
    
    var_model(first, last, mlbid)
    
print('Models Finished')
    
# Copied from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

Shane Baz not enough data
Reid Detmers not enough data
Scott Kazmir not enough data
Models Finished
CPU times: user 9.21 s, sys: 474 ms, total: 9.68 s
Wall time: 7.06 s


In [7]:
%%time
forecasts = []
for index, row in players.iterrows():
    
    first = row['FIRSTNAME']
    last = row['LASTNAME']
    mlbid = row['MLBID']
    
    stats = average_forecast(first, last, mlbid)
    
    if stats != None:
        forecasts.append(stats)
    
#Creates data frame of all stats
df = pd.DataFrame(forecasts)

#Saves Dataframe to a file with player name
df.to_csv(f'../data/player_forecasts_pitch.csv')
    
#Save as pickle file
with open('../pickles/forecast_pitch.pkl', 'wb') as pickle_out:
    pickle_out = pickle.dump(df, pickle_out)
    
print('Finished')

Finished
CPU times: user 1.85 s, sys: 83.7 ms, total: 1.94 s
Wall time: 2.16 s


## Recap
---
A VAR model was run for each player and used to forecast about a third of the season. The forecasted results were then summed (counting stats) or averaged (rate stats) to give estimated stats for the upcoming season. These stats will be used in the app and to predict each player's salary.