In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import statsmodels.tsa as ts
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv(r"all_stocks_2006-01-01_to_2018-01-01.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name
0,2006-01-03,77.76,79.35,77.24,79.11,3117200,MMM
1,2006-01-04,79.49,79.49,78.25,78.71,2558000,MMM
2,2006-01-05,78.41,78.65,77.56,77.99,2529500,MMM
3,2006-01-06,78.64,78.9,77.64,78.63,2479500,MMM
4,2006-01-09,78.5,79.83,78.46,79.02,1845600,MMM


In [3]:
df.columns = [column.lower() for column in df.columns]
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'name'], dtype='object')

In [4]:
from sklearn.metrics import mean_absolute_error

In [5]:
def evaluate_sarimax_model(X, order, sorder, trend, train_size = 0.7, n = 5):
    print(order, sorder, trend)
    step = (1 - train_size) / n
    mae = list()
    
    for i in range(n):
        # Подготовим данные для кросс-валидации. Разделим выборку на тренировочный набор данных и тестовый
        
        size = int(len(X) * (train_size + step * i)) # 70% наблюдений будет в тренировочном наборе, 30% в тестовом
        train, test = X[0:size].copy(), X[size:].copy() # Тренировочный и тестовый наборы данных
        

        # Тренируем модель на тренировочном наборе
        model = ts.statespace.sarimax.SARIMAX(train, order = order, seasonal_order=sorder, trend = trend,
                                    enforce_stationarity=False, enforce_invertibility=False)
        model_fit = model.fit()
        
        # Предсказываем вне нашего тренировочного набора 
        predictions = model_fit.forecast(len(test))

        mae_score = mean_absolute_error(test, predictions)
        mae.append(mae_score)
        
        return (np.array(mae).mean())

In [None]:
def evaluate_models(dataset, p_values, d_values, q_values, P_values, D_values, Q_values, m_values, t_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                for P in P_values:
                    for D in D_values:
                        for Q in Q_values:
                            for m in m_values:
                                for t in t_values:
                                    order, sorder, trend = (p, d, q), (P, D, Q, m), t
                                    cfg = (order, sorder, trend)
                                    try:
                                        mae = evaluate_sarimax_model(dataset, order, sorder, trend)
                                        if mae < best_score:
                                            best_score, best_cfg = mae, cfg
                                        print('SARIMAX%s MAE=%.3f' % (cfg, mae))
                                    except:
                                        continue
    print('Best ARIMA%s MAE=%.3f' % (best_cfg, best_score))

In [7]:
high_IBM = df["high"][df["name"] == "IBM"].copy()
high_IBM.index = df["date"][df["name"] == "IBM"]
high_IBM.head()

date
2006-01-03    82.55
2006-01-04    82.50
2006-01-05    82.90
2006-01-06    85.03
2006-01-09    84.25
Name: high, dtype: float64

In [8]:
transformed_IBM, lambda_const = stats.boxcox(high_IBM)
transformed_IBM = pd.DataFrame(transformed_IBM, index = df["date"][df["name"] == "IBM"], columns = ["high"])
transformed_IBM.head()

Unnamed: 0_level_0,high
date,Unnamed: 1_level_1
2006-01-03,116.869066
2006-01-04,116.790409
2006-01-05,117.419806
2006-01-06,120.776567
2006-01-09,119.546317


In [9]:
# %%time
# order = (1, 0, 1)
# sorder = (0, 0, 0, 0)
# trend = "t"
# evaluate_sarimax_model(transformed_IBM, order, sorder, trend)

In [None]:
%%time
p_values = range(0, 3)
d_values = range(0, 2)
q_values = range(0, 3)
P_values = range(0, 3)
D_values = range(0, 2)
Q_values = range(0, 3)
m_values = range(0, 12)
t_values = ['n','c','t','ct']
evaluate_models(transformed_IBM, p_values, d_values, q_values, P_values, D_values, Q_values, m_values, t_values)

(0, 0, 0) (0, 0, 0, 0) n
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'n') MAE=242.965
(0, 0, 0) (0, 0, 0, 0) c
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=32.406
(0, 0, 0) (0, 0, 0, 0) t
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=214.009
(0, 0, 0) (0, 0, 0, 0) ct
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=132.613
(0, 0, 0) (0, 0, 0, 1) n
(0, 0, 0) (0, 0, 0, 1) c
(0, 0, 0) (0, 0, 0, 1) t
(0, 0, 0) (0, 0, 0, 1) ct
(0, 0, 0) (0, 0, 0, 2) n
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=242.965
(0, 0, 0) (0, 0, 0, 2) c
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=32.406
(0, 0, 0) (0, 0, 0, 2) t
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=214.009
(0, 0, 0) (0, 0, 0, 2) ct
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=132.613
(0, 0, 0) (0, 0, 0, 3) n
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=242.965
(0, 0, 0) (0, 0, 0, 3) c
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=32.406
(0, 0, 0) (0, 0, 0, 3) t
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=214.009
(0, 0, 0) (0, 0, 0, 3) ct
SARIMAX((0, 0, 0), (0, 0, 0, 0), 'c') MAE=132.613