In [36]:
import time
import pandas as pd, numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder

from pycaret.regression import * 
#from pycaret.time_series import TSForecastingExperiment
from pycaret.utils import enable_colab

enable_colab()

Colab mode enabled.


In [37]:
prices = pd.read_pickle('./data/prices.pickle')
X_train = pd.read_pickle('./data/X_train.pickle')
X_val = pd.read_pickle('./data/X_val.pickle')
y_train = pd.read_pickle('./data/y_train.pickle')
y_val = pd.read_pickle('./data/y_val.pickle')

In [38]:
df = pd.read_csv('./data/stock_prices.csv')

df = df.drop("ExpectedDividend", axis=1)
df = df.dropna()

In [39]:
prices.sort_values(['Date', 'RowId']).head(15)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,fold,Avg
937963,20170104_1301,20170104,0.0,2734.0,2755.0,2730.0,2742.0,31400,1.0,False,0.00073,2,0.000152
45860,20170104_1332,20170104,1.0,568.0,576.0,563.0,571.0,2798500,1.0,False,0.012324,0,0.000134
975193,20170104_1333,20170104,2.0,3150.0,3210.0,3140.0,3210.0,270800,1.0,False,0.006154,2,-0.000137
712338,20170104_1376,20170104,4.0,1510.0,1550.0,1510.0,1550.0,11300,1.0,False,0.011053,1,4.9e-05
2044199,20170104_1377,20170104,5.0,3270.0,3350.0,3270.0,3330.0,150800,1.0,False,0.003026,4,0.000134
587777,20170104_1379,20170104,6.0,2105.0,2147.0,2101.0,2143.0,77300,1.0,False,0.005169,1,-3.4e-05
1641075,20170104_1381,20170104,7.0,1950.0,1960.0,1949.0,1960.0,1300,1.0,False,-0.009326,3,0.00064
985428,20170104_1407,20170104,8.0,857.0,877.0,851.0,866.0,147000,1.0,False,-0.003437,2,0.002518
2204348,20170104_1414,20170104,10.0,4940.0,5060.0,4935.0,5050.0,119600,1.0,False,0.0,4,0.000711
1715773,20170104_1417,20170104,11.0,1051.0,1063.0,1048.0,1053.0,347500,1.0,False,-0.007463,3,0.00063


In [40]:
prices[prices.duplicated('RowId')]

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,fold,Avg


In [41]:
df.head(15)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,False,0.003026
5,20170104_1379,2017-01-04,1379,2105.0,2147.0,2101.0,2143.0,77300,1.0,False,0.005169
6,20170104_1381,2017-01-04,1381,1950.0,1960.0,1949.0,1960.0,1300,1.0,False,-0.009326
7,20170104_1407,2017-01-04,1407,857.0,877.0,851.0,866.0,147000,1.0,False,-0.003437
8,20170104_1414,2017-01-04,1414,4940.0,5060.0,4935.0,5050.0,119600,1.0,False,0.0
9,20170104_1417,2017-01-04,1417,1051.0,1063.0,1048.0,1053.0,347500,1.0,False,-0.007463


In [42]:
prices = pd.merge(prices.drop('Date', axis=1), df[['RowId','Date']], on=['RowId'], how='left')
prices.sort_values(['Date', 'RowId']).head()

Unnamed: 0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,fold,Avg,Date
937963,20170104_1301,0.0,2734.0,2755.0,2730.0,2742.0,31400,1.0,False,0.00073,2,0.000152,2017-01-04
45860,20170104_1332,1.0,568.0,576.0,563.0,571.0,2798500,1.0,False,0.012324,0,0.000134,2017-01-04
975193,20170104_1333,2.0,3150.0,3210.0,3140.0,3210.0,270800,1.0,False,0.006154,2,-0.000137,2017-01-04
712338,20170104_1376,4.0,1510.0,1550.0,1510.0,1550.0,11300,1.0,False,0.011053,1,4.9e-05,2017-01-04
2044199,20170104_1377,5.0,3270.0,3350.0,3270.0,3330.0,150800,1.0,False,0.003026,4,0.000134,2017-01-04


In [43]:
prices.sort_values(['Date', 'RowId'], inplace=True)
prices.tail()

Unnamed: 0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,fold,Avg,Date
1271123,20211203_9990,1995.0,514.0,528.0,513.0,528.0,44200,1.0,False,0.034816,2,-0.000549,2021-12-03
1215569,20211203_9991,1996.0,782.0,794.0,782.0,794.0,35900,1.0,False,0.025478,2,-0.000167,2021-12-03
331374,20211203_9993,1997.0,1690.0,1690.0,1645.0,1645.0,7200,1.0,False,-0.004302,0,-4.5e-05,2021-12-03
1562101,20211203_9994,1998.0,2388.0,2396.0,2380.0,2389.0,6500,1.0,False,0.009098,3,0.000409,2021-12-03
74265,20211203_9997,1999.0,690.0,711.0,686.0,696.0,381100,1.0,False,0.018414,0,0.000201,2021-12-03


In [44]:
prices['Date'] = pd.to_datetime(prices['Date'])

In [45]:
type(prices['Date'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [46]:
prices[(prices['Date'] > dt.datetime(2021,10,1)) & (prices['Date'] < dt.datetime(2021,10,29))]

Unnamed: 0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,fold,Avg,Date
824436,20211004_1301,0.0,3010.0,3030.0,2991.0,2998.0,10000,1.0,False,0.008811,1,0.000152,2021-10-04
912796,20211004_1332,1.0,634.0,638.0,630.0,634.0,1147000,1.0,False,0.011327,1,0.000134,2021-10-04
1916128,20211004_1333,2.0,2634.0,2644.0,2616.0,2644.0,131800,1.0,False,0.015751,4,-0.000137,2021-10-04
728130,20211004_1375,3.0,1520.0,1526.0,1481.0,1496.0,244100,1.0,False,-0.005373,1,-0.001571,2021-10-04
302291,20211004_1376,4.0,1529.0,1529.0,1504.0,1517.0,3500,1.0,False,0.003497,0,0.000049,2021-10-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
181983,20211028_9990,1995.0,540.0,560.0,538.0,551.0,234400,1.0,False,0.001815,0,-0.000549,2021-10-28
192132,20211028_9991,1996.0,873.0,880.0,867.0,880.0,16700,1.0,False,0.017442,0,-0.000167,2021-10-28
2132137,20211028_9993,1997.0,1701.0,1725.0,1701.0,1725.0,4100,1.0,False,-0.003488,4,-0.000045,2021-10-28
1982166,20211028_9994,1998.0,2402.0,2415.0,2376.0,2409.0,6500,1.0,False,0.009148,4,0.000409,2021-10-28


In [47]:
prices['Day'] = [i.day for i in prices['Date']]
prices['Month'] = [i.month for i in prices['Date']]
prices['Year'] = [i.year for i in prices['Date']]

prices.head()

Unnamed: 0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,fold,Avg,Date,Day,Month,Year
937963,20170104_1301,0.0,2734.0,2755.0,2730.0,2742.0,31400,1.0,False,0.00073,2,0.000152,2017-01-04,4,1,2017
45860,20170104_1332,1.0,568.0,576.0,563.0,571.0,2798500,1.0,False,0.012324,0,0.000134,2017-01-04,4,1,2017
975193,20170104_1333,2.0,3150.0,3210.0,3140.0,3210.0,270800,1.0,False,0.006154,2,-0.000137,2017-01-04,4,1,2017
712338,20170104_1376,4.0,1510.0,1550.0,1510.0,1550.0,11300,1.0,False,0.011053,1,4.9e-05,2017-01-04,4,1,2017
2044199,20170104_1377,5.0,3270.0,3350.0,3270.0,3330.0,150800,1.0,False,0.003026,4,0.000134,2017-01-04,4,1,2017


In [48]:
prices['Series'] = np.arange(1,len(prices)+1)

prices = prices[['Series', 'Day', 'Year', 'Month', 'Target']]

prices.head()

Unnamed: 0,Series,Day,Year,Month,Target
937963,1,4,2017,1,0.00073
45860,2,4,2017,1,0.012324
975193,3,4,2017,1,0.006154
712338,4,4,2017,1,0.011053
2044199,5,4,2017,1,0.003026


In [50]:
prices.tail()

Unnamed: 0,Series,Day,Year,Month,Target
1271123,2324919,3,2021,12,0.034816
1215569,2324920,3,2021,12,0.025478
331374,2324921,3,2021,12,-0.004302
1562101,2324922,3,2021,12,0.009098
74265,2324923,3,2021,12,0.018414


In [54]:
train, val = train_test_split(prices, shuffle=False,test_size=0.2)
train.shape, val.shape

((1859938, 5), (464985, 5))

In [57]:
reg = setup(
    data = prices,
    #train_size = 0.7, #学習データと検証データを7:3に分割
    target = 'Target',
    fold_strategy = 'timeseries',
    numeric_features = ['Series', 'Day', 'Year', 'Month'], # 欠損値を平均値で補完する
    transform_target = True,
    fold = 5,
    #log_data=True, # 学習データと検証データのCSVへの出力
    #silent = True, # 型推定の確認をスキップ
    session_id = 123
    ) 

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Target
2,Target type,regression
3,Data shape,"(2324923, 5)"
4,Train data shape,"(1627446, 5)"
5,Test data shape,"(697477, 5)"
6,Numeric features,4
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [None]:
""" # Sharpe Ratio 関数
import math

def shaperatio(y_true, y_pred):
    df = .copy()
    df["daily_ret"] = DF["close"].pct_change() #株価終値の前日との変化率を計算する。
    ret_ave=np.mean(df["daily_ret"])
    vol_sp = df["daily_ret"].std() 
    return  math.sqrt(256)*ret_ave/vol_sp """

In [None]:
""" from pycaret.classification import add_metric
add_metric('sharperatio', 'Sharpe Ratio', shaperatio, greater_is_better = True) """

In [58]:
""" best = compare_models(sort = 'MAE') """

In [None]:
""" best """