# Time Series Analysis

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller  
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action="ignore")

import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
                        FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
                        FutureWarning)

In [12]:
def model(first, last, mlbid, target):
    base_path = '../data/clean_players/'
    
    #This string will be used to specifiy the player
    player_name = first + '-' + last + '-' + str(mlbid)
    
    #Full path to file
    file_path = base_path + player_name + '.csv'

    try:
        df = pd.read_csv(file_path)

        y_train, y_test = train_test_split(df, test_size = 0.35, shuffle = False)

        p = d = q = range(0, 3)

        # Generate all different combinations of p, q and q triplets
        pdq = list(itertools.product(p, d, q))

        # Generate all different combinations of seasonal p, q and q triplets
        # Note: here we have 12 in the 's' position as we have monthly data
        # You'll want to change this according to your time series' frequency
        pdqs = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]


    
        print()
        print()
        print('MODEL FINISHED!')
        print(f'Our model that minimizes AIC on the training data is the SARIMAX({best_p},{best_d},{best_q}).')
        print(f'This model has an AIC of {best_aic} for player {first} {last}.')
        
    except:
        print()
        print(f'{first} {last} is a Minor League Player, Non-Active Player or Shohei Ohtani!')

In [None]:
### Run Grid Search ###

# Note: this code will take a while to run

# Define function
def sarimax_gridsearch(ts, pdq, pdqs, maxiter=50, freq='D'):
    '''
    Input: 
        ts : your time series data
        pdq : ARIMA combinations from above
        pdqs : seasonal ARIMA combinations from above
        maxiter : number of iterations, increase if your model isn't converging
        frequency : default='M' for month. Change to suit your time series frequency
            e.g. 'D' for day, 'H' for hour, 'Y' for year. 

    Return:
        Prints out top 5 parameter combinations
        Returns dataframe of parameter combinations ranked by BIC
    '''

    # Run a grid search with pdq and seasonal pdq parameters and get the best BIC value
    ans = []
    for comb in pdq:
        for combs in pdqs:
            try:
                mod = SARIMAX(ts, # this is your time series you will input
                                                order=comb,
                                                seasonal_order=combs,
                                                enforce_stationarity=False,
                                                enforce_invertibility=False,
                                                freq=freq)

                output = mod.fit(maxiter=maxiter) 
                ans.append([comb, combs, output.aic])
            except:
                continue

    # Find the parameters with minimal BIC value

    # Convert into dataframe
    ans_df = pd.DataFrame(ans, columns=['order', 'seasonal_order', 'aic'])

    # Sort and return top 5 combinations
    ans_df = ans_df.sort_values(by=['bic'],ascending=True)[0]

    return ans_df

In [4]:
players = pd.read_csv('../mlb_players.csv').drop('Unnamed: 0', axis = 1)
players.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,ACTIVE,Player,Team,Pos,Age,G,AB,...,CS,BB,SO,SH,SF,HBP,AVG,OBP,SLG,OPS
0,547989,Jose,Abreu,Y,Jose Abreu,CWS,1B,34,152,566,...,0,61,143,0,10,22,0.261,0.351,0.481,0.832
1,660670,Ronald,Acuna,Y,Ronald Acuna,ATL,OF,23,82,297,...,6,49,85,0,5,9,0.283,0.394,0.596,0.99
2,642715,Willy,Adames,Y,Willy Adames,TB,SS,26,41,132,...,2,10,51,0,0,0,0.197,0.254,0.371,0.625
3,666176,Jo,Adell,Y,Jo Adell,LAA,OF,22,35,130,...,1,8,32,1,0,1,0.246,0.295,0.408,0.703
4,501303,Ehire,Adrianza,Y,Ehire Adrianza,ATL,SS,32,109,182,...,0,21,42,1,3,2,0.247,0.327,0.401,0.728


In [None]:
%%time
for index, row in players.iterrows():
    
    mlbid = row['MLBID']
    first = row['FIRSTNAME']
    last = row['LASTNAME']

    model(first, last, mlbid, 'AVG')
    
# Copied from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas



MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(5,0,2).
This model has an AIC of -1497.5794019942202 for player Jose Abreu.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(5,1,5).
This model has an AIC of -1091.7660902870166 for player Ronald Acuna.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(1,0,1).
This model has an AIC of -1077.224413932652 for player Willy Adames.

Jo Adell is a Minor League Player, Non-Active Player or Shohei Ohtani!


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(4,0,3).
This model has an AIC of -1035.7125618707755 for player Ehire Adrianza.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(5,0,4).
This model has an AIC of -975.1954454348839 for player Jesus Aguilar.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(2,0,1).
This model has an AIC of -1485.4233781327625 