# Time Series Analysis
---

In [1]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller  
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
def var_model(first, last, mlbid):
    
    """
    Function to create a VAR model for each individual player
    Returns a csv with forcasted stats
    """
    
    base_path = '../data/clean_players_pitch/'
    
    #This string will be used to specifiy the player
    player_name = first + '-' + last + '-' + str(mlbid)
    
    #Full path to file
    file_path = base_path + player_name + '.csv'

    #try:
    df = pd.read_csv(file_path)

    train, test = train_test_split(df, test_size = 0.15, shuffle = False)

    model = VAR(train)

    ts_model = model.fit()

    forecast = ts_model.forecast(train.values, len(test))

    player_stats = []

    for time in forecast:
        stats = {}

        stats['W'] = round(time[0], 0)
        stats['L'] = round(time[1], 0)
        stats['ERA'] = round(time[2], 0)
        stats['G'] = round(time[3], 0)
        stats['GS'] = round(time[4], 0)
        stats['SV'] = round(time[5], 0)
        stats['IP'] = round(time[6], 0)
        stats['H'] = round(time[7], 0)
        stats['R'] = round(time[8], 0)
        stats['ER'] = round(time[9], 0)
        stats['HP'] = round(time[10], 0)
        stats['BB'] = round(time[11], 0)
        stats['SO'] = round(time[12], 0)
        stats['WHIP'] = round(time[13], 3)

        #Appends the row of stats to the list
        player_stats.append(stats)

    #Creates data frame of all stats
    df = pd.DataFrame(player_stats)

    #Saves Dataframe to a file with player name
    df.to_csv(f'../data/predictions_pitch/{first}-{last}-{mlbid}.csv', index = False)
        
    #except:
        #print(f'{first} {last} something went wrong')

In [3]:
def average_forecast (first, last, mlbid):
    
    base_path = '../data/predictions_pitch/'
    
    #This string will be used to specifiy the player
    player_name = first + '-' + last + '-' + str(mlbid)
    
    #Full path to file
    file_path = base_path + player_name + '.csv'
    
    df = pd.read_csv(file_path)

    stats = {
        "name" : f'{first} {last}',
        "AB" : round(df['AB'].sum(), 0),
        "R" : round(df['R'].sum(), 0),
        "H" : round(df['H'].sum(), 0),
        "2B" : round(df['2B'].sum(), 0),
        "3B" : round(df['3B'].sum(), 0),
        "HR" : round(df['HR'].sum(), 0),
        "RBI" : round(df['RBI'].sum(), 0),
        "BB" : round(df['BB'].sum(), 0),
        "SO" : round(df['SO'].sum(), 0),
        "AVG" : round(df['AVG'].mean(), 3),
        "OBP" : round(df['OBP'].mean(), 3),
        "SLG" : round(df['SLG'].mean(), 3),
        "OPS" : round(df['OPS'].mean(), 3)
    }
    
    return stats

In [4]:
players = pd.read_csv('../data/mlb_players_pitch.csv').drop('Unnamed: 0', axis = 1)
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,ACTIVE,Player,Team,Age,G,GS,CG,...,BB,HR,W,L,SV,BS,HLD,ERA,WHIP,salary
0,472551,Fernando,Abad,Y,Fernando Abad,BAL,35,16,0,0,...,7,1,0,0,0,0,2,5.6,1.7,"$570,500"
1,676265,Cory,Abbott,Y,Cory Abbott,CHC,26,7,1,0,...,11,7,0,0,0,0,0,6.75,1.79,"$570,500"
2,642758,Domingo,Acevedo,Y,Domingo Acevedo,OAK,27,10,0,0,...,4,3,0,0,0,0,0,3.27,1.18,"$570,500"
3,613534,Austin,Adams,Y,Austin Adams,SD,30,65,0,0,...,35,1,3,2,0,1,10,4.1,1.2,"$580,200"
4,669211,Keegan,Akin,Y,Keegan Akin,BAL,26,24,17,0,...,40,17,2,10,0,0,0,6.63,1.58,"$570,500"


In [5]:
%%time
for index, row in players.iterrows():
    
    first = row['FIRSTNAME']
    last = row['LASTNAME']
    mlbid = row['MLBID']
    
    var_model(first, last, mlbid)
    
print('Models Finished')
    
# Copied from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

ValueError: x contains one or more constant columns. Column(s) 3 are constant. Adding a constant with trend='c' is not allowed.

In [6]:
%%time
forecasts = []
for index, row in players.iterrows():
    
    first = row['FIRSTNAME']
    last = row['LASTNAME']
    mlbid = row['MLBID']
    
    forecasts.append(average_forecast(first, last, mlbid))
    
#Creates data frame of all stats
df = pd.DataFrame(forecasts)

#Saves Dataframe to a file with player name
df.to_csv(f'../data/player_forecasts_pitch.csv')
    
#Save as pickle file
with open('../pickles/forecast_pitch.pkl', 'wb') as pickle_out:
    pickle_out = pickle.dump(df, pickle_out)
    
print('Finished')

FileNotFoundError: [Errno 2] No such file or directory: '../data/predictions_pitch/Fernando-Abad-472551.csv'

In [6]:
df = pd.read_csv('../data/clean_players_pitch/Gerrit-Cole-543037.csv')
df.head()

Unnamed: 0,W,L,ERA,G,GS,SV,IP,H,R,ER,HP,BB,SO,WHIP
2018-04-01,1,0,1.29,1,1,0,7.0,2,1,1,1,3,11,0.71
2018-04-07,0,0,0.64,1,1,0,7.0,5,0,0,0,0,11,0.71
2018-04-13,0,0,1.29,1,1,0,7.0,3,2,2,2,1,14,0.67
2018-04-18,1,0,0.96,1,1,0,7.0,5,1,0,0,2,5,0.75
2018-04-23,0,1,1.29,1,1,0,7.0,4,2,2,0,2,8,0.77
