In [1]:
import os
import pickle
import sys
import warnings
from glob import glob
import pandas as pd, numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

#import shap

from tqdm.notebook import tqdm
# Make pandas treat +/-inf like NaN in its NA handling (isna, dropna, ...).
# NOTE(review): newer pandas releases deprecate this option — confirm it is
# still supported by the installed pandas version.
pd.options.mode.use_inf_as_na = True

## Load dataset

In [2]:
# Read the raw stock-price table from the local data directory.
prices = pd.read_csv(filepath_or_buffer="./data/stock_prices.csv")

In [3]:
# Discard the ExpectedDividend column, then every row that still has a missing value.
prices = prices.drop(columns=["ExpectedDividend"]).dropna()
# Per-column count of remaining missing values (displayed as the cell output).
prices.isna().sum()

RowId               0
Date                0
SecuritiesCode      0
Open                0
High                0
Low                 0
Close               0
Volume              0
AdjustmentFactor    0
SupervisionFlag     0
Target              0
dtype: int64

In [4]:
# Preview the first five rows.
prices.head(n=5)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,False,0.003026


In [5]:
# Distinct values of the Date column, in order of first appearance.
prices["Date"].unique()

array(['2017-01-04', '2017-01-05', '2017-01-06', ..., '2021-12-01',
       '2021-12-02', '2021-12-03'], dtype=object)

## Cross-validation split

In [6]:
def setup_cv(df, splits=5, target_col="Target", random_state=42):
    """Shuffle ``df`` and label every row with a stratified CV fold id.

    The continuous target is discretised into equal-width bins (bin count from
    Sturges' rule, ``1 + log2(n)``) so that ``StratifiedKFold`` can balance the
    target distribution across folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain ``target_col``. The frame passed in is NOT
        modified.
    splits : int, default 5
        Number of folds.
    target_col : str, default "Target"
        Name of the continuous column to stratify on.
    random_state : int, default 42
        Seed for the row shuffle.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with a fresh ``0..n-1`` index and an integer
        ``fold`` column holding values in ``0..splits-1``.
    """
    # Shuffle into a *new* frame first so nothing is ever written to the
    # caller's object; the reset index makes row labels equal row positions.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Bin the target; the bins live in a local array rather than a temporary
    # column, so a caller column that happens to be named "bins" is left alone.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    bins = pd.cut(shuffled[target_col], bins=num_bins, labels=False).to_numpy()

    # -1 marks "not assigned"; every row is overwritten because each row is in
    # the validation part of exactly one split.
    fold = np.full(len(shuffled), -1, dtype=np.int64)
    kf = StratifiedKFold(n_splits=splits)
    for fold_id, (_, val_idx) in enumerate(kf.split(X=shuffled, y=bins)):
        fold[val_idx] = fold_id

    shuffled["fold"] = fold
    return shuffled

In [7]:
# Shuffle the rows and attach a `fold` column (5 stratified folds).
prices = setup_cv(prices, splits=5)

## Ordinal Encode Securities Code

In [8]:
# Learn the set of security codes, then overwrite each code with its ordinal
# position (stored as a float, e.g. 34.0). `enc` keeps the fitted mapping.
enc = OrdinalEncoder()
enc.fit(prices[["SecuritiesCode"]])
prices["SecuritiesCode"] = enc.transform(prices[["SecuritiesCode"]])

In [9]:
# Inspect the last five rows after shuffling and encoding.
prices.tail(n=5)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,fold
2324918,20180801_1799,2018-08-01,34.0,1779.0,1792.0,1760.0,1792.0,5500,1.0,False,0.020822,4
2324919,20170330_3951,2017-03-30,514.0,2659.0,2700.0,2659.0,2700.0,8500,1.0,False,-0.020045,4
2324920,20200818_6965,2020-08-18,1258.0,4940.0,4990.0,4920.0,4975.0,257700,1.0,False,-0.020305,4
2324921,20210924_9450,2021-09-24,1863.0,1530.0,1550.0,1517.0,1549.0,156600,1.0,False,-0.024224,4
2324922,20210915_9467,2021-09-15,1864.0,3535.0,3540.0,3485.0,3505.0,29900,1.0,False,0.001435,4


In [10]:
# Mean Target per security, as a one-column frame indexed by SecuritiesCode.
# NOTE(review): the mean is taken over every row of `prices`, including rows
# that later end up in the validation split, so `Avg` carries target
# information from those rows — confirm this is acceptable.
average = pd.DataFrame(prices.groupby("SecuritiesCode").Target.mean())
def get_avg(_id_):
    """Return the row of ``average`` (a one-element Series) for security ``_id_``."""
    return average.loc[_id_]
# Vectorised lookup of each row's per-security mean. This yields the same
# values as `prices["SecuritiesCode"].apply(get_avg)` without a Python-level
# `.loc` call per row.
prices["Avg"] = prices["SecuritiesCode"].map(average["Target"])

In [11]:
# Encode Date as a yyyymmdd integer (e.g. 2017-01-04 -> 20170104).
# Guarded so the cell can be re-run safely: feeding the already-converted
# integers back into pd.to_datetime would read them as nanosecond offsets from
# the epoch and silently turn every date into 19700101.
if not pd.api.types.is_integer_dtype(prices["Date"]):
    prices["Date"] = (
        pd.to_datetime(prices["Date"]).dt.strftime("%Y%m%d").astype(int)
    )

# Model inputs (three feature columns) and the target, both kept as DataFrames.
X = prices[["Date", "SecuritiesCode", "Avg"]]
y = prices[["Target"]]
# Distinct security codes present in the feature frame.
codes = X["SecuritiesCode"].unique()

In [12]:
# Hold out the last 20% of rows, in their current order, for validation.
# NOTE(review): `prices` was row-shuffled inside setup_cv, so with
# shuffle=False this tail slice is a random sample rather than the most recent
# dates — confirm that is the intended validation scheme.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

In [13]:
# Persist the four split frames next to the raw data.
X_train.to_pickle('./data/X_train.pickle')
X_val.to_pickle('./data/X_val.pickle')
y_train.to_pickle('./data/y_train.pickle')
y_val.to_pickle('./data/y_val.pickle')

In [14]:
# Persist the fully processed price table (fold, encoded code, Avg, integer Date).
prices.to_pickle('./data/prices.pickle')