In [1]:
import pandas as pd
import numpy as np
import datetime
import json

In [2]:
# The calendar deliberately starts on 2022-12-31 rather than Jan 1st: closePrice,
# log_ret, trend, closePriceNorm and log_ret_normalised_by_day are labels and
# thus should be t+1, not t.
start_day = datetime.date(year=2022, month=12, day=31)
date = [start_day + datetime.timedelta(days=offset) for offset in range(500)]

In [3]:
basic_data = pd.read_csv('../data/basic_data.csv')


def _zscore(values):
    """Standardise a Series with its own mean and sample standard deviation."""
    return (values - values.mean()) / values.std()


# get ret: per-stock log return, log(1 + pct_change) of the close price.
# The first row of every stock has no previous price and is therefore NaN.
basic_data['log_ret'] = np.log(basic_data.groupby('stock')['closePrice'].pct_change() + 1)

# get dates: the calendar is repeated once per stock.
# NOTE(review): this assumes the CSV is laid out stock by stock with
# len(date) consecutive rows each -- confirm against the data file.
basic_data['date'] = date * basic_data['stock'].nunique()

# get trend: +1 for a non-negative return, -1 for a negative one, NaN where
# the return itself is missing.
log_ret = basic_data['log_ret']
basic_data['trend'] = np.where(log_ret.isna(), np.nan, np.where(log_ret >= 0, 1.0, -1.0))

# get closed price normalised by stock.
# `transform` (rather than `apply`) keeps the result aligned to the original
# row index on every pandas version; `apply` prepends the group key to the
# index on pandas >= 2.0, which makes the column assignment fail.
basic_data['closePriceNorm'] = basic_data.groupby('stock')['closePrice'].transform(_zscore)

# log return standardised across all stocks within the same date
basic_data['log_ret_normalised_by_day'] = basic_data.groupby('date')['log_ret'].transform(_zscore)

In [4]:
# Display the frame to inspect the derived return, trend and normalised columns.
basic_data

Unnamed: 0,date,stock,closePrice,log_ret,trend,closePriceNorm,log_ret_normalised_by_day
0,2022-12-31,0,60.17,,,-1.274644,
1,2023-01-01,0,59.99,-0.002996,-1.0,-1.332277,-0.571724
2,2023-01-02,0,59.66,-0.005516,-1.0,-1.437935,-0.808744
3,2023-01-03,0,59.38,-0.004704,-1.0,-1.527586,-0.676304
4,2023-01-04,0,59.21,-0.002867,-1.0,-1.582016,-0.299596
...,...,...,...,...,...,...,...
24995,2024-05-09,49,43.77,0.003892,1.0,0.298860,0.398095
24996,2024-05-10,49,43.93,0.003649,1.0,0.406563,0.593729
24997,2024-05-11,49,44.41,0.010867,1.0,0.729669,1.972686
24998,2024-05-12,49,44.30,-0.002480,-1.0,0.655624,-0.177065


In [5]:
import pandas as pd
import numpy as np
import copy
import datetime

import scipy.stats as ss

def slope(x):
    """Relative change from the first to the last element of ``x``.

    Returns 0 when the first element is falsy (e.g. zero) so the division
    is never attempted with a zero denominator.
    """
    first = x[0]
    if not first:
        return 0
    return (x[-1] - first) / first
def abs_diff_mean(x):
    """Mean absolute difference between consecutive elements of ``x``.

    Returns 0 when ``x`` holds fewer than two elements.
    """
    if len(x) <= 1:
        return 0
    steps = x[1:] - x[:-1]
    return np.mean(np.abs(steps))
def diff_std(x):
    """Standard deviation (``np.std``) of the consecutive differences of ``x``.

    Returns 0 when ``x`` holds fewer than two elements.
    """
    if len(x) <= 1:
        return 0
    steps = x[1:] - x[:-1]
    return np.std(steps)



In [6]:
# Summary statistics applied to every trailing window. The keys become part of
# the generated column names: ``<feature>_<statistic>_<window>``.
function_map = {'min':np.min, 'max':np.max, 'std':np.std, 'mean':np.mean, 'slope':slope, 'skew': ss.skew, 'abs_diff_mean': abs_diff_mean, 'diff_std': diff_std, 'sum':sum}

import warnings
# NOTE(review): blanket suppression kept from the original cell so later cells
# behave the same; consider narrowing it to the specific warning categories.
warnings.filterwarnings('ignore')


def _trailing_window_stat(values, window, func):
    """Apply ``func`` to a trailing window of ``values`` for every position.

    Position ``i`` receives ``func(values[i - window - 1:i - 1])``, i.e. the
    ``window`` observations ending two rows before ``i``. The first
    ``window + 1`` positions have no complete window and stay NaN.

    NOTE(review): the window skips row ``i - 1`` as well as row ``i``;
    confirm that the extra one-row gap is intended.
    """
    out = np.full(len(values), np.nan)
    for i in range(window + 1, len(values)):
        out[i] = func(values[i - window - 1:i - 1])
    return out


df_list = []
# create the features, one frame per stock
for stock in basic_data['stock'].unique():
    print(stock)

    # Rows with any missing value are dropped before windowing.
    sample_data = (
        basic_data[basic_data['stock'] == stock]
        .drop(['stock'], axis=1)
        .dropna()
    )

    feature_names = [c for c in sample_data.columns if c != 'trend' and c != 'date']

    # Collect every column first and build the frame once; inserting hundreds
    # of columns one by one into an empty DataFrame fragments it.
    feature_columns = {}
    for window in [3, 5, 10, 20]:
        for feature in feature_names:
            original_data = np.array(sample_data[feature])
            for function in function_map:
                feature_columns[feature + '_' + function + '_' + str(window)] = (
                    _trailing_window_stat(original_data, window, function_map[function])
                )

    df_feats = pd.DataFrame(feature_columns, index=pd.RangeIndex(len(sample_data)))

    df_feats['stock'] = stock
    # Use the dates of the rows that survived dropna() so features stay aligned
    # with their own row regardless of which rows were dropped.
    df_feats['date'] = sample_data['date'].to_numpy()

    df_list.append(df_feats)

signal_features = pd.concat(df_list)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [7]:
# Left-join the window features onto the base rows by (stock, date); base rows
# without a matching feature row keep NaN in the feature columns.
basic_data = basic_data.merge(signal_features, how='left', on=['stock', 'date'])
basic_data

Unnamed: 0,date,stock,closePrice,log_ret,trend,closePriceNorm,log_ret_normalised_by_day,closePrice_min_3,closePrice_max_3,closePrice_std_3,...,closePriceNorm_sum_20,log_ret_normalised_by_day_min_20,log_ret_normalised_by_day_max_20,log_ret_normalised_by_day_std_20,log_ret_normalised_by_day_mean_20,log_ret_normalised_by_day_slope_20,log_ret_normalised_by_day_skew_20,log_ret_normalised_by_day_abs_diff_mean_20,log_ret_normalised_by_day_diff_std_20,log_ret_normalised_by_day_sum_20
0,2022-12-31,0,60.17,,,-1.274644,,,,,...,,,,,,,,,,
1,2023-01-01,0,59.99,-0.002996,-1.0,-1.332277,-0.571724,,,,...,,,,,,,,,,
2,2023-01-02,0,59.66,-0.005516,-1.0,-1.437935,-0.808744,,,,...,,,,,,,,,,
3,2023-01-03,0,59.38,-0.004704,-1.0,-1.527586,-0.676304,,,,...,,,,,,,,,,
4,2023-01-04,0,59.21,-0.002867,-1.0,-1.582016,-0.299596,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,2024-05-09,49,43.77,0.003892,1.0,0.298860,0.398095,43.30,43.69,0.159652,...,7.808144,-2.292613,1.484512,1.123729,-0.165319,-1.309619,-0.310634,1.371284,1.675919,-3.306371
24996,2024-05-10,49,43.93,0.003649,1.0,0.406563,0.593729,43.52,43.69,0.069442,...,7.478306,-1.859972,1.484512,1.016405,-0.074494,-2.223994,-0.231604,1.292566,1.589703,-1.489875
24997,2024-05-11,49,44.41,0.010867,1.0,0.729669,1.972686,43.60,43.77,0.069442,...,7.215781,-1.859972,1.484512,1.016615,-0.074038,-1.990236,-0.232520,1.296945,1.592089,-1.480766
24998,2024-05-12,49,44.30,-0.002480,-1.0,0.655624,-0.177065,43.60,43.93,0.134743,...,7.114811,-1.859972,1.484512,1.023691,-0.024251,-1.373470,-0.358229,1.244729,1.565593,-0.485016


In [8]:
# get lag
for i in range(1, 11):

    basic_data['log_ret_lag_{}'.format(i)] = basic_data.groupby('stock')['log_ret'].shift(i)

    basic_data['closePrice_lag_{}'.format(i)] = basic_data.groupby('stock')['closePrice'].shift(i)

    basic_data['closePriceNorm_lag_{}'.format(i)] = basic_data.groupby('stock')['closePriceNorm'].shift(i)

    basic_data['log_ret_normalised_by_day_lag_{}'.format(i)] = basic_data.groupby('stock')['log_ret_normalised_by_day'].shift(i)

In [9]:
# One wide frame per lag-1 series: a column per stock, rows in the order the
# stock's observations appear in basic_data.
log_ret_lag_1, closePrice_lag_1, closePriceNorm_lag_1, log_ret_normalised_by_day_lag_1 = (
    pd.DataFrame() for _ in range(4)
)

wide_frame_by_column = (
    ('log_ret_lag_1', log_ret_lag_1),
    ('closePrice_lag_1', closePrice_lag_1),
    ('closePriceNorm_lag_1', closePriceNorm_lag_1),
    ('log_ret_normalised_by_day_lag_1', log_ret_normalised_by_day_lag_1),
)

for stock, data in basic_data.groupby('stock'):
    for source_column, wide_frame in wide_frame_by_column:
        wide_frame[stock] = list(data[source_column])

In [10]:
# Read the correlation table from disk; it is indexed by stock key below.
with open('../data/train_logret_corr.json', 'r') as corr_file:
    train_logret_corr = json.load(corr_file)

In [11]:
# Split every stock's entries by the value at position 1 of each entry:
# above 0.1 goes to high_corrs, below -0.1 goes to low_corrs, anything in
# between is discarded.
# NOTE(review): entry[1] is presumably the correlation coefficient and
# entry[0] the other stock's id -- confirm against the JSON schema.
high_corrs = {
    stock_key: [entry for entry in entries if entry[1] > 0.1]
    for stock_key, entries in train_logret_corr.items()
}
low_corrs = {
    stock_key: [entry for entry in entries if entry[1] < -0.1]
    for stock_key, entries in train_logret_corr.items()
}

In [14]:
# Wide (row = day, column = stock) lag-1 frames, keyed by the series they hold.
lag1_frames = {
    'log_ret': log_ret_lag_1,
    'closePrice': closePrice_lag_1,
    'closePriceNorm': closePriceNorm_lag_1,
    'log_ret_normalised_by_day': log_ret_normalised_by_day_lag_1,
}

# Sizes come from the frames themselves instead of hard-coded 500 days / 50 stocks.
n_days = len(log_ret_lag_1)

# Mean over all stocks for each day. It does not depend on the stock being
# processed, so it is computed once and reused for every stock.
daily_means = {
    name: [np.mean(frame.iloc[day]) for day in range(n_days)]
    for name, frame in lag1_frames.items()
}

all_stock_means = {name: [] for name in lag1_frames}
peer_means = {name: [] for name in lag1_frames}

# Results are laid out stock by stock, day by day, which is the order in which
# they are later attached to basic_data.
for stock in log_ret_lag_1.columns:
    # Ids (position 0 of each entry) of the stocks listed in high_corrs for this stock.
    peers = [entry[0] for entry in high_corrs[f'Stock {stock}']]

    for name, frame in lag1_frames.items():
        all_stock_means[name].extend(daily_means[name])

        if not peers:
            # No entry above the threshold: the peer mean is undefined, so
            # record NaN instead of dividing by zero.
            peer_means[name].extend([np.nan] * n_days)
            continue

        # Add the peers one at a time, in list order, for all days at once; a
        # NaN in any peer makes that day's mean NaN.
        total = np.zeros(n_days)
        for peer in peers:
            total = total + frame[peer].to_numpy()
        peer_means[name].extend((total / len(peers)).tolist())

mean_log_ret = all_stock_means['log_ret']
mean_closePrice = all_stock_means['closePrice']
mean_closePriceNorm = all_stock_means['closePriceNorm']
mean_log_ret_normalised_by_day = all_stock_means['log_ret_normalised_by_day']

pos_mean_log_ret = peer_means['log_ret']
pos_mean_closePrice = peer_means['closePrice']
pos_mean_closePriceNorm = peer_means['closePriceNorm']
pos_mean_log_ret_normalised_by_day = peer_means['log_ret_normalised_by_day']

In [15]:
# Attach the computed lists as new columns, in this order: the four all-stock
# means first, then the four means over the high_corrs stocks.
new_columns = {
    'mean_log_ret_lag_1': mean_log_ret,
    'mean_closePrice_lag_1': mean_closePrice,
    'mean_closePriceNorm_lag_1': mean_closePriceNorm,
    'mean_log_ret_normalised_by_day_lag_1': mean_log_ret_normalised_by_day,
    'pos_log_ret_lag_1': pos_mean_log_ret,
    'pos_closePrice_lag_1': pos_mean_closePrice,
    'pos_closePriceNorm_lag_1': pos_mean_closePriceNorm,
    'pos_log_ret_normalised_by_day_lag_1': pos_mean_log_ret_normalised_by_day,
}

for column_name, column_values in new_columns.items():
    basic_data[column_name] = column_values

In [16]:
# Write the final frame to disk; index=False leaves the row index out of the CSV.
basic_data.to_csv('../data/curated_data.csv', index=False)