# Time Series Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller  
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action="ignore")

In [2]:
# Per-date batting lines for Enrique Hernandez; the path is relative to the
# working directory. Show the first five rows as a sanity check.
hernandez_csv = 'Enrique-Hernandez-571771.csv'
df = pd.read_csv(hernandez_csv)
df.head(5)

Unnamed: 0,date,PA,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
0,2021-04-02,4,4,0,1,0,0,0,0,0,1,0.25,0.25,0.25,0.5
1,2021-04-03,5,4,0,0,0,0,0,0,1,0,0.125,0.222,0.125,0.347
2,2021-04-04,4,2,1,0,0,0,0,1,1,0,0.1,0.231,0.1,0.331
3,2021-04-05,5,5,0,1,1,0,0,1,0,3,0.133,0.222,0.2,0.422
4,2021-04-06,3,3,0,0,0,0,0,0,0,2,0.111,0.19,0.167,0.357


In [3]:
# Promote the 'date' column to a DatetimeIndex and remove the now-redundant column.
date_index = pd.DatetimeIndex(df['date'])
df = df.set_index(date_index).drop(columns=['date'])

In [4]:
# Re-display the first rows now that the frame is indexed by date.
df.head()

Unnamed: 0_level_0,PA,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-04-02,4,4,0,1,0,0,0,0,0,1,0.25,0.25,0.25,0.5
2021-04-03,5,4,0,0,0,0,0,0,1,0,0.125,0.222,0.125,0.347
2021-04-04,4,2,1,0,0,0,0,1,1,0,0.1,0.231,0.1,0.331
2021-04-05,5,5,0,1,1,0,0,1,0,3,0.133,0.222,0.2,0.422
2021-04-06,3,3,0,0,0,0,0,0,0,2,0.111,0.19,0.167,0.357


In [1]:
def model(first, last, mlbid, target, base_path='./clean_data/'):
    """Grid-search ARIMA(p, d, q) orders for one player's stat and print the best one.

    The player's CSV is read from ``base_path + '<first>-<last>-<mlbid>.csv'``,
    indexed by its ``date`` column, and the first 65% of the ``target`` series
    (chronological, unshuffled) is used as training data. Every order with
    p in 0..5, q in 0..5 and d in 0..4 is fitted, and the order with the lowest
    AIC on the training data is reported on stdout.

    Parameters
    ----------
    first, last : str
        Player first / last name as used in the CSV file name.
    mlbid : int or str
        Player id as used in the CSV file name.
    target : str
        Name of the column to model (e.g. ``'AVG'``).
    base_path : str, optional
        Directory prefix (including trailing separator) holding the player CSVs.

    Returns
    -------
    None
        Results and problems are reported via ``print`` so that a batch loop
        over many players keeps running.
    """
    # File names follow the '<first>-<last>-<mlbid>.csv' pattern.
    player_name = first + '-' + last + '-' + str(mlbid)
    file_path = base_path + player_name + '.csv'

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # No file on disk for this player: keep the original message so the
        # batch output is unchanged for this (expected) case.
        print()
        print(f'{first} {last} is a Minor League Player, Non-Active Player or Shohei Ohtani!')
        return

    try:
        df['date'] = pd.to_datetime(df['date'])
        df = df.set_index('date')

        # Chronological split; only the training part is used for order selection.
        y_train, _ = train_test_split(df[target], test_size = 0.35, shuffle = False)

        # Loop-invariant, so prepare the training series once instead of per fit.
        endog = y_train.astype(float).dropna()
    except Exception as err:
        # Any other data problem (missing column, bad dtype, too few rows, ...)
        # is reported as what it is instead of being mislabeled as a missing player.
        print()
        print(f'Could not prepare {target} data for {first} {last}: {err!r}')
        return

    best_aic = float('inf')
    best_order = None
    last_error = None

    # NOTE(review): AIC is being compared across different d (differencing
    # orders); confirm this is intended, since such models are fitted to
    # differently-differenced series.
    for p in range(6):
        for q in range(6):
            for d in range(5):
                try:
                    fitted = ARIMA(endog = endog, order = (p, d, q)).fit()
                    aic = fitted.aic
                except Exception as err:
                    # Some orders cannot be fitted; skip them, but remember why.
                    # ``Exception`` (not a bare except) lets Ctrl-C / kernel
                    # interrupt stop the search.
                    last_error = err
                    continue

                # Strict '<' keeps the first order encountered on ties.
                if aic < best_aic:
                    best_aic = aic
                    best_order = (p, d, q)

    print()
    print()
    if best_order is None:
        # Nothing converged: say so rather than printing a placeholder order and AIC.
        print(f'No ARIMA model could be fitted for player {first} {last} '
              f'(last error: {last_error!r}).')
        return

    best_p, best_d, best_q = best_order
    print('MODEL FINISHED!')
    print(f'Our model that minimizes AIC on the training data is the ARIMA({best_p},{best_d},{best_q}).')
    print(f'This model has an AIC of {best_aic} for player {first} {last}.')

In [6]:
import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
                        FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
                        FutureWarning)

In [7]:
# Player roster; the 'Unnamed: 0' column (presumably a saved index — verify
# against the CSV) is dropped right after loading.
roster_csv = '../mlb_players.csv'
players = pd.read_csv(roster_csv).drop(columns=['Unnamed: 0'])
players.head(5)

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,POS,ACTIVE
0,682928,CJ,Abrams,SS,Y
1,547989,Jose,Abreu,1B,Y
2,554429,Dustin,Ackley,1B,Y
3,660670,Ronald,Acuna,OF,Y
4,542436,Cristhian,Adames,2B,Y


In [None]:
%%time
for index, row in players.iterrows():
    
    mlbid = row['MLBID']
    first = row['FIRSTNAME']
    last = row['LASTNAME']

    model(first, last, mlbid, 'AVG')
    
# Copied from https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas


CJ Abrams is a Minor League Player or Shohei Ohtani!


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(5,0,4).
This model has an AIC of -515.4293603928554 for player Jose Abreu.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(1,0,0).
This model has an AIC of -66.04495462022115 for player Dustin Ackley.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(0,0,3).
This model has an AIC of -191.13496994699037 for player Ronald Acuna.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(0,0,0).
This model has an AIC of -0.48064707945619567 for player Cristhian Adames.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(5,0,4).
This model has an AIC of -513.2635948807034 for player Willy Adames.


MODEL FINISHED!
Our model that minimizes AIC on the training data is the ARIMA(4,0,1).
This model has an AIC of -121.99639718448884 for player David

In [49]:
# Load Jose Abreu's CSV and move its 'date' column into a DatetimeIndex.
abreu_raw = pd.read_csv('./clean_data/Jose-Abreu-547989.csv')
df = abreu_raw.set_index(pd.DatetimeIndex(abreu_raw['date'])).drop(columns=['date'])

In [50]:
# Preview the first rows of the date-indexed Jose Abreu frame.
df.head()

Unnamed: 0_level_0,PA,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-04-01,4,4,0,1,0,0,0,0,0,2,0.25,0.25,0.25,0.5
2021-04-02,5,4,2,2,0,0,1,4,1,1,0.375,0.444,0.75,1.194
2021-04-03,5,4,0,0,0,0,0,0,1,1,0.25,0.357,0.5,0.857
2021-04-04,5,3,1,0,0,0,0,0,2,3,0.2,0.368,0.4,0.768
2021-04-05,5,5,0,1,0,0,0,0,0,1,0.2,0.333,0.35,0.683


In [55]:
# Same load as a single chain: parse 'date' with pd.to_datetime, then use it
# as the index (this mirrors the steps performed inside model()).
df = (
    pd.read_csv('./clean_data/Jose-Abreu-547989.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
df.head(5)

Unnamed: 0_level_0,PA,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-04-01,4,4,0,1,0,0,0,0,0,2,0.25,0.25,0.25,0.5
2021-04-02,5,4,2,2,0,0,1,4,1,1,0.375,0.444,0.75,1.194
2021-04-03,5,4,0,0,0,0,0,0,1,1,0.25,0.357,0.5,0.857
2021-04-04,5,3,1,0,0,0,0,0,2,3,0.2,0.368,0.4,0.768
2021-04-05,5,5,0,1,0,0,0,0,0,1,0.2,0.333,0.35,0.683
