In [36]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from datetime import datetime
import optuna
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from data_utils import *

In [74]:
train_df = pd.read_csv('../__input__/1_DataPreprocessing/train_df.csv')
test_df = pd.read_csv('../__input__/1_DataPreprocessing/valid_df.csv')
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df.head()

Unnamed: 0,Date,SecuritiesCode,ExpectedDividend,AdjustmentFactor,SupervisionFlag,Volume,Open,High,Low,Close,Target,CloseT1,CloseT2
0,2021-05-28,1301,-1.0,1.0,False,96000,2905.0,2909.0,2851.0,2869.0,0.026934,2896.0,2974.0
1,2021-05-28,1332,-1.0,1.0,False,1578600,509.0,517.0,508.0,514.0,0.013807,507.0,514.0
2,2021-05-28,1333,-1.0,1.0,False,148900,2405.0,2420.0,2400.0,2415.0,0.016701,2395.0,2435.0
3,2021-05-28,1375,-1.0,1.0,False,156900,1781.0,1797.0,1764.0,1770.0,-0.006384,1723.0,1712.0
4,2021-05-28,1376,-1.0,1.0,False,8300,1540.0,1540.0,1506.0,1524.0,0.006098,1476.0,1485.0


In [143]:
train_linear = train_df[train_df['Date'] > datetime(2021,1,1)][['SecuritiesCode','Date','Close','Target']].reset_index(drop = True)
test_linear = test_df[test_df['Date'] < datetime(2021,7,1)][['SecuritiesCode','Date','Close','Target']].reset_index(drop = True)

In [144]:
train_linear.tail()

Unnamed: 0,SecuritiesCode,Date,Close,Target
175227,9990,2021-05-27,575.0,0.026756
175228,9991,2021-05-27,874.0,-0.023438
175229,9993,2021-05-27,1690.0,-0.005266
175230,9994,2021-05-27,2280.0,-0.004806
175231,9997,2021-05-27,1082.0,-0.014493


In [120]:
test_linear.tail()

Unnamed: 0,SecuritiesCode,Date,Close,Target
47751,9990,2021-06-30,623.0,0.019262
47752,9991,2021-06-30,878.0,0.006787
47753,9993,2021-06-30,1722.0,-0.000582
47754,9994,2021-06-30,2274.0,0.00833
47755,9997,2021-06-30,977.0,0.00924


In [129]:
codes = test_linear[test_linear['Date'] == datetime(2021,5,28)]['SecuritiesCode'].unique()
for code in codes:
    print(train_linear[train_linear['SecuritiesCode'] == code]['Close'].values[-5:])


[2984. 2947. 2924. 2946. 2890.]
[516. 519. 513. 511. 504.]
[2459. 2473. 2455. 2430. 2383.]
[1852. 1848. 1847. 1796. 1749.]
[1550. 1564. 1550. 1545. 1540.]
[3800. 3770. 3725. 3720. 3630.]
[1965. 1980. 1965. 1954. 1936.]
[3390. 3415. 3415. 3400. 3420.]
[3395. 3395. 3470. 3380. 3495.]
[2475. 2502. 2511. 2500. 2490.]
[4670. 4640. 4600. 4610. 4575.]
[1896. 1985. 1990. 1971. 1955.]
[2357. 2387. 2337. 2306. 2266.]
[543. 540. 532. 517. 515.]
[896. 904. 912. 915. 904.]
[249. 272. 276. 274. 272.]
[6280. 6060. 6060. 5900. 6020.]
[940. 948. 940. 933. 925.]
[748. 762. 765. 761. 742.]
[2036. 2071. 2059. 2017. 1996.]
[1363. 1363. 1351. 1342. 1377.]
[943. 934. 956. 963. 971.]
[1372. 1381. 1360. 1345. 1342.]
[862. 858. 857. 873. 839.]
[699. 698. 717. 714. 703.]
[3190. 3170. 3170. 3200. 3170.]
[3805. 3835. 3795. 3800. 3800.]
[534. 530. 518. 518. 508.]
[2129. 2113. 2074. 2073. 2047.]
[9700. 9740. 9680. 9610. 9570.]
[2785. 2835. 2930. 2960. 2999.]
[899. 904. 895. 918. 905.]
[5960. 5500. 5600. 5990. 6200.]

In [152]:
def predict(train_df,test_df,nums_lag):
    train  = train_df.copy()
    test = test_df.copy()
    dates = test['Date'].unique()
    mse = 0
    for date in dates:
        prediction = []
        true = []
        target = []
        codes = test[test['Date'] == date]['SecuritiesCode'].unique()
        X = np.arange(1,nums_lag+1).reshape(nums_lag,1)
        for code in codes:
            y = train[train['SecuritiesCode'] == code]['Close'].values[-nums_lag:]
            y_true = test[(test['Date'] == date)  &(test['SecuritiesCode'] == code)]['Close'].values
            reg = linear_model.LinearRegression().fit(X, y)
            y_pred = reg.predict([[nums_lag+1]])[0]
            close_t1 = reg.predict([[nums_lag+2]])[0]
            close_t2 = reg.predict([[nums_lag+3]])[0]
            tar = (close_t2-close_t1)/close_t1
            prediction.append(y_pred)
            true.append(y_true)
            target.append(tar)
            # print(y,y_pred)
        mse += mean_squared_error(true,prediction)
        data = {'Date':[date for _ in range(len(codes))],'SecuritiesCode':codes,'Close':prediction,'Target':target}
        train = pd.concat([train,pd.DataFrame(data)])
    return mse 

In [131]:
def objective(trial):
    nums_lag = trial.suggest_int("num_lags", 1, 10)
    return predict(train_linear,test_linear,nums_lag)

In [151]:
predict(train_linear,test_linear,1)

[2890.] 2890.0
[504.] 504.0
[2383.] 2383.0
[1749.] 1749.0
[1540.] 1540.0
[3630.] 3630.0
[1936.] 1936.0
[3420.] 3420.0
[3495.] 3495.0
[2490.] 2490.0
[4575.] 4575.0
[1955.] 1955.0
[2266.] 2266.0
[515.] 515.0
[904.] 904.0
[272.] 272.0
[6020.] 6020.0
[925.] 925.0
[742.] 742.0
[1996.] 1996.0
[1377.] 1377.0
[971.] 971.0
[1342.] 1342.0
[839.] 839.0
[703.] 703.0
[3170.] 3170.0
[3800.] 3800.0
[508.] 508.0
[2047.] 2047.0
[9570.] 9570.0
[2999.] 2999.0
[905.] 905.0
[6200.] 6200.0
[5340.] 5340.0
[1960.] 1960.0
[3875.] 3875.0
[931.] 931.0
[887.] 887.0
[1053.] 1053.0
[1523.] 1523.0
[726.] 726.0
[4370.] 4370.0
[1515.] 1515.0
[1833.] 1833.0
[968.] 968.0
[1839.] 1839.0
[3400.] 3400.0
[470.] 470.0
[4195.] 4195.0
[2873.] 2873.0
[2444.] 2444.0
[4505.] 4505.0
[797.] 797.0
[2958.] 2958.0
[715.] 715.0
[651.] 651.0
[334.] 334.0
[11650.] 11650.0
[847.] 847.0
[4520.] 4520.0
[7310.] 7310.0
[2472.] 2472.0
[591.] 591.0
[1347.] 1347.0
[551.] 551.0
[793.] 793.0
[854.] 854.0
[5050.] 5050.0
[2177.] 2177.0
[495.] 495.0


KeyboardInterrupt: 

In [153]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

[32m[I 2022-07-01 16:06:08,209][0m A new study created in memory with name: no-name-4c809fe2-f4e4-4a70-ab85-11c8b85fc281[0m
[32m[I 2022-07-01 16:07:34,696][0m Trial 0 finished with value: 11605487.976426443 and parameters: {'num_lags': 5}. Best is trial 0 with value: 11605487.976426443.[0m
[32m[I 2022-07-01 16:08:59,052][0m Trial 1 finished with value: 12699323.976158762 and parameters: {'num_lags': 4}. Best is trial 0 with value: 11605487.976426443.[0m
[32m[I 2022-07-01 16:10:31,381][0m Trial 2 finished with value: 10851390.434145551 and parameters: {'num_lags': 6}. Best is trial 2 with value: 10851390.434145551.[0m
[32m[I 2022-07-01 16:12:03,187][0m Trial 3 finished with value: 8423093.189976055 and parameters: {'num_lags': 10}. Best is trial 3 with value: 8423093.189976055.[0m
[32m[I 2022-07-01 16:13:31,281][0m Trial 4 finished with value: 12699323.976158762 and parameters: {'num_lags': 4}. Best is trial 3 with value: 8423093.189976055.[0m
[32m[I 2022-07-01 16:15: