In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from datetime import datetime
import optuna
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA 

from data_utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_df = pd.read_csv('../__input__/1_DataPreprocessing/train_df.csv')
valid_df = pd.read_csv('../__input__/1_DataPreprocessing/valid_df.csv')
test_df = pd.read_csv('../__input__/1_DataPreprocessing/test_df.csv')
train_df['Date'] = pd.to_datetime(train_df['Date'])
valid_df['Date'] = pd.to_datetime(valid_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df.head()

Unnamed: 0,Date,SecuritiesCode,ExpectedDividend,AdjustmentFactor,SupervisionFlag,Volume,Open,High,Low,Close,Target,CloseT1,CloseT2
0,2021-12-06,1301,-1.0,1.0,False,8900,2982.0,2982.0,2965.0,2971.0,-0.003263,3065.0,3055.0
1,2021-12-06,1332,-1.0,1.0,False,1360800,592.0,599.0,588.0,589.0,-0.008993,556.0,551.0
2,2021-12-06,1333,-1.0,1.0,False,125900,2368.0,2388.0,2360.0,2377.0,-0.009963,2409.0,2385.0
3,2021-12-06,1375,-1.0,1.0,False,81100,1230.0,1239.0,1224.0,1224.0,-0.015032,1264.0,1245.0
4,2021-12-06,1376,-1.0,1.0,False,6200,1339.0,1372.0,1339.0,1351.0,0.002867,1395.0,1399.0


In [4]:
train_arima = train_df[['SecuritiesCode','Date','Close','Target']].reset_index(drop = True)
valid_arima = valid_df[['SecuritiesCode','Date','Close','Target']].reset_index(drop = True)
test_arima = test_df[['SecuritiesCode','Date','Close','Target']].reset_index(drop = True)
train_arima = train_arima[train_arima['Date'] >= '2019-05-27'].reset_index(drop= True)

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
print('Train:' ,len(train_arima['Date'].unique()))
print('Valid:' ,len(valid_arima['Date'].unique()))
print('Test:' ,len(test_arima['Date'].unique()))

Train: 437
Valid: 117
Test: 100


In [8]:
from tqdm import tqdm
import time
def predict(traindf,testdf,p,t,q):
    train_data = traindf.copy()
    test_data = testdf.copy()
    test_dates = test_data['Date'].unique()
    loss = 0
    codes = test_data['SecuritiesCode'].unique()
    test_pred =  {}
    run_time= 0 
    for code in tqdm(codes):
        X = train_data[(train_data['SecuritiesCode'] == code)]['Target'].values
        model = ARIMA(X, order=(p,t,q))
        try:
            modelfit = model.fit()
            time_start =time.time()
            prediction = modelfit.forecast(len(test_dates))
            time_end = time.time()
            run_time+= time_end-time_start
        except:
            break  
        y_test_true = test_data[(test_data['SecuritiesCode'] == code)]['Target'].values
        y_test_pred = []
        i=0
        # print(prediction)
        for i,date in enumerate(test_dates):
            if len(test_data[(test_data['SecuritiesCode'] == code) & (test_data['Date'] == date)]) > 0 :
                y_test_pred.append(prediction[i])
        # models.append(model)
        loss += mean_squared_error(y_test_pred,y_test_true)  
        test_pred[code] = y_test_pred
    return loss,test_pred,run_time

In [7]:
# loss,test_pred,run_time = predict(pd.concat([train_arima,test_arima]),test_arima,1,0,4)

  0%|          | 5/2000 [00:05<35:58,  1.08s/it]


In [8]:
def objective(
    trial,
    random_state=22,
    n_jobs=-1,
): 
    params = {
        'p':trial.suggest_int("p", 1, 5),
        't':0,
        'q':trial.suggest_int("q", 1, 5)
    }
    return predict(pd.concat([train_arima,valid_arima]),test_arima,**params)[0]

In [9]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

[32m[I 2022-07-20 01:38:59,960][0m A new study created in memory with name: no-name-ab3c6362-aafb-4af5-9a69-8886f6faf815[0m
100%|██████████| 2000/2000 [16:38<00:00,  2.00it/s]
[32m[I 2022-07-20 01:55:38,843][0m Trial 0 finished with value: 0.8974052280388131 and parameters: {'p': 1, 'q': 1}. Best is trial 0 with value: 0.8974052280388131.[0m
  3%|▎         | 55/2000 [01:06<39:15,  1.21s/it]
[32m[I 2022-07-20 01:56:45,504][0m Trial 1 finished with value: 0.017692056449561396 and parameters: {'p': 3, 'q': 3}. Best is trial 1 with value: 0.017692056449561396.[0m
  2%|▏         | 47/2000 [01:27<1:00:19,  1.85s/it]
[32m[I 2022-07-20 01:58:12,655][0m Trial 2 finished with value: 0.015789861893399336 and parameters: {'p': 5, 'q': 5}. Best is trial 2 with value: 0.015789861893399336.[0m
  0%|          | 9/2000 [00:11<40:47,  1.23s/it]

In [None]:
study.best_params

{'order': 2}

In [9]:
# best_params = study.best_trial.params
loss,test_pred,run_time = predict(pd.concat([train_arima,valid_arima]),test_arima,4,0,3)

100%|██████████| 2000/2000 [35:12<00:00,  1.06s/it]


In [10]:
test_pred_df = test_arima.copy()
for code in test_arima['SecuritiesCode'].unique():
    test_pred_df.loc[test_pred_df['SecuritiesCode'] == code, 'Target'] = test_pred[code]
    

In [11]:
run_time

12.51045298576355

In [12]:
y_pred = pd.DataFrame(test_pred_df['Target'],columns=['Target'])
yTest_true = test_df["Target"]
print(calc_score(test_arima, y_pred, yTest_true, render_info= False))

-0.08757440167017977


In [13]:
np.sqrt(mean_squared_error(y_pred, yTest_true))

0.024106826994299394