In [15]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from datetime import datetime
import optuna
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA 

from data_utils import *

In [16]:
train_df = pd.read_csv('../__input__/1_DataPreprocessing/train_df.csv')
test_df = pd.read_csv('../__input__/1_DataPreprocessing/valid_df.csv')
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df.head()

Unnamed: 0,Date,SecuritiesCode,ExpectedDividend,AdjustmentFactor,SupervisionFlag,Volume,Open,High,Low,Close,Target,CloseT1,CloseT2
0,2021-05-28,1301,-1.0,1.0,False,96000,2905.0,2909.0,2851.0,2869.0,0.026934,2896.0,2974.0
1,2021-05-28,1332,-1.0,1.0,False,1578600,509.0,517.0,508.0,514.0,0.013807,507.0,514.0
2,2021-05-28,1333,-1.0,1.0,False,148900,2405.0,2420.0,2400.0,2415.0,0.016701,2395.0,2435.0
3,2021-05-28,1375,-1.0,1.0,False,156900,1781.0,1797.0,1764.0,1770.0,-0.006384,1723.0,1712.0
4,2021-05-28,1376,-1.0,1.0,False,8300,1540.0,1540.0,1506.0,1524.0,0.006098,1476.0,1485.0


In [17]:
train_arima = train_df[train_df['Date'] > datetime(2021,1,1)][['SecuritiesCode','Date','Close','Target']].reset_index(drop = True)
test_arima = test_df[test_df['Date'] < datetime(2021,7,1)][['SecuritiesCode','Date','Close','Target']].reset_index(drop = True)

In [18]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
def predict(traindf,testdf,order):
    train_data = traindf.copy()
    test_data = testdf.copy()
    dates = test_data['Date'].unique()
    mse = 0
    codes = test_data['SecuritiesCode'].unique()
    # print(len(codes))
    for j,code in enumerate(codes):
        X = train_data[(train_data['SecuritiesCode'] == code)]['Target'].values
        model = ARIMA(X, order=order)
        try:
            prediction = model.fit().forecast(len(dates))
        except:
            print(X)
            print(order)
            mse += 999999
            break  
        y_true = test_data[(test_data['SecuritiesCode'] == code)]['Target'].values
        y_pred = []
        i =0 
        for date in dates:
            if len(test_data[(test_data['SecuritiesCode'] == code) & (test_data['Date'] == date)]):
                y_pred.append(prediction[i])
            i+=1
        
        mse += mean_squared_error(y_true,y_pred)
    return mse

In [6]:
# predict(train_arima,test_arima,(10,0,5))

In [21]:
err = np.array([-0.00633333, -0.00321081 , 0.    ,      0.     ,     0.00164785 , 0.,
  0.        , -0.00648377, -0.00808455  ,0.0095388 ,  0.00944867 ,-0.00483092,
 -0.00627358 ,-0.00949352, -0.02875353 ,-0.00330587,  0.      ,    0.,
 -0.02060094])
model = ARIMA(err, order=(5,0,5))
model.fit().forecast(1)

LinAlgError: Schur decomposition solver error.

In [22]:
def objective(trial):
    order  = (trial.suggest_int("order", 1, 5),0,trial.suggest_int("order", 1, 5))
    return predict(train_arima,test_arima,order)

In [23]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

[32m[I 2022-07-02 09:23:15,651][0m A new study created in memory with name: no-name-43b25219-9f72-4626-a5e3-7e9f9d8342cd[0m
[32m[I 2022-07-02 09:31:52,749][0m Trial 0 finished with value: 0.7262089745158895 and parameters: {'order': 2}. Best is trial 0 with value: 0.7262089745158895.[0m
[32m[I 2022-07-02 09:47:41,269][0m Trial 1 finished with value: 0.7442749470716109 and parameters: {'order': 5}. Best is trial 0 with value: 0.7262089745158895.[0m
[32m[I 2022-07-02 09:56:00,734][0m Trial 2 finished with value: 0.7262089745158895 and parameters: {'order': 2}. Best is trial 0 with value: 0.7262089745158895.[0m
[32m[I 2022-07-02 10:00:55,036][0m Trial 3 finished with value: 0.716906571509873 and parameters: {'order': 1}. Best is trial 3 with value: 0.716906571509873.[0m
[32m[I 2022-07-02 10:05:57,532][0m Trial 4 finished with value: 0.716906571509873 and parameters: {'order': 1}. Best is trial 3 with value: 0.716906571509873.[0m
[32m[I 2022-07-02 10:22:48,654][0m Trial

[ 0.00494234  0.00327869  0.00653595  0.          0.0130719   0.
  0.          0.00816993 -0.01944895  0.00813008 -0.00967742  0.01134522
 -0.0188383  -0.008      -0.01612903  0.        ]
(4, 0, 4)


[32m[I 2022-07-02 10:41:28,788][0m Trial 7 finished with value: 0.716906571509873 and parameters: {'order': 1}. Best is trial 3 with value: 0.716906571509873.[0m
[32m[I 2022-07-02 10:46:51,956][0m Trial 8 finished with value: 0.716906571509873 and parameters: {'order': 1}. Best is trial 3 with value: 0.716906571509873.[0m
[32m[I 2022-07-02 11:02:17,515][0m Trial 9 finished with value: 0.7442749470716109 and parameters: {'order': 5}. Best is trial 3 with value: 0.716906571509873.[0m
