In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [17]:
def resample_data(file, resample_interval='5min'):
    """Load a bar CSV and aggregate it into OHLCV bars of a fixed interval.

    Parameters
    ----------
    file : str or path-like
        CSV with at least the columns ``datetime``, ``open``, ``high``,
        ``low``, ``close`` and ``volume``.
    resample_interval : str, default '5min'
        Pandas offset alias passed to ``DataFrame.resample``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the resampled ``datetime``. Bins with any missing
        aggregate (i.e. intervals with no source rows) are dropped. In
        addition to the OHLCV columns the frame carries ``range``
        (high - low), ``range_up`` (high - open), ``range_dn`` (open - low)
        and ``delta_close`` (change in close versus the previous kept bar).
    """
    raw = pd.read_csv(file)
    raw = raw.assign(datetime=pd.to_datetime(raw['datetime'])).set_index('datetime')

    # How each source column collapses into a single value per interval.
    ohlcv_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }
    bars = raw.resample(resample_interval).agg(ohlcv_rules)
    bars = bars.dropna()

    high, low, opening = bars['high'], bars['low'], bars['open']
    bars['range'] = high - low
    bars['range_up'] = high - opening
    bars['range_dn'] = opening - low
    # Differenced after the empty bins were removed, so it spans any gaps.
    bars['delta_close'] = bars['close'].diff()

    return bars

In [22]:
def add_features_states(df, half_life=10, M=3, N=3, K=3):
    """Add exponentially weighted features and quantile-binned states to ``df``.

    The frame is modified in place (new columns are assigned on ``df``) and
    the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``volume``, ``range`` and ``delta_close``.
    half_life : float, default 10
        Half-life, in rows, of the exponential weights
        (decay factor ``2 ** (-1 / half_life)``).
    M, N, K : int, default 3
        Number of quantile bins requested for ``volume_state``,
        ``vol_state`` and ``trend_state`` respectively. Fewer bins may result
        because duplicate quantile edges are dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns ``volume_ewma``, ``volume_ewmv``,
        ``range_ewma``, ``range_ewmv`` (float) and ``volume_state``,
        ``vol_state``, ``trend_state`` (nullable ``Int64``; ``<NA>`` where the
        underlying feature is missing).
    """
    def compute_ewma_ewmv(series, half_life):
        """Return lagged exponentially weighted mean and dispersion of ``series``.

        The value at row ``j`` only uses observations up to row ``j - 1``, so
        row 0 is NaN. Note that the second returned series holds the square
        root of the weighted variance (a standard deviation), despite the
        "ewmv" name. Each squared deviation is measured against the mean
        *after* it has been updated with the current observation.
        """
        lam = 2 ** (-1 / half_life)

        # Pull the data out once; per-element Series.iloc access in a Python
        # loop is far slower than indexing a NumPy array.
        values = series.to_numpy(dtype=float)
        n = len(values)

        ewma = np.full(n, np.nan)
        ewmv = np.full(n, np.nan)

        sumW = 0.0    # running sum of weights
        sumWX = 0.0   # running weighted sum of observations
        sumWSS = 0.0  # running weighted sum of squared deviations

        # Starting from zeroed accumulators, the first update reduces to
        # sumW = 1, sumWX = x, sumWSS = 0, so no special case is needed.
        for j in range(1, n):
            x = values[j - 1]
            sumW = lam * sumW + 1
            sumWX = lam * sumWX + x
            mean = sumWX / sumW
            sumWSS = lam * sumWSS + (x - mean) ** 2
            ewma[j] = mean
            ewmv[j] = np.sqrt(sumWSS / sumW)

        return pd.Series(ewma, index=series.index), pd.Series(ewmv, index=series.index)

    def bin_feature(series, num_bins):
        """Quantile-bin the non-missing values of ``series`` into integer codes.

        Missing inputs stay missing (``<NA>``) in the nullable ``Int64`` result.
        """
        binned = pd.Series(np.nan, index=series.index, dtype='float')
        valid = series.dropna()
        # With nothing to bin (e.g. a frame of one row or fewer) skip qcut
        # entirely and return an all-NA column.
        if not valid.empty:
            binned.loc[valid.index] = pd.qcut(
                valid, q=num_bins, labels=False, duplicates='drop'
            )
        return binned.astype('Int64')

    df['volume_ewma'], df['volume_ewmv'] = compute_ewma_ewmv(df['volume'], half_life)
    df['range_ewma'], df['range_ewmv'] = compute_ewma_ewmv(df['range'], half_life)

    df['volume_state'] = bin_feature(df['volume_ewma'], M)
    df['vol_state'] = bin_feature(df['range_ewmv'], N)
    df['trend_state'] = bin_feature(df['delta_close'], K)

    return df

In [23]:
import os

folder = './clean-data/'
# os.listdir returns names in arbitrary order; sort so tickers are processed
# (and stored in all_data) in a reproducible order.
csv_files = sorted(f for f in os.listdir(folder) if f.endswith('.csv'))

# Maps ticker -> resampled frame with EWMA features and discrete states.
all_data = {}

for file in csv_files:
    # Strip only the trailing extension; str.replace would also remove any
    # '.csv' occurring in the middle of a file name.
    ticker = os.path.splitext(file)[0]
    path = os.path.join(folder, file)

    df = resample_data(path)
    df = add_features_states(df)

    all_data[ticker] = df


In [25]:
# Display the processed frames: a dict mapping each ticker to its feature DataFrame.
all_data

{'CL':                       open   high    low  close  volume  range  range_up  \
 datetime                                                                   
 2020-03-05 00:00:00  47.45  47.47  47.42  47.44      93   0.05      0.02   
 2020-03-05 00:05:00  47.45  47.45  47.43  47.45      12   0.02      0.00   
 2020-03-05 00:10:00  47.46  47.53  47.46  47.53      12   0.07      0.07   
 2020-03-05 00:15:00  47.52  47.55  47.52  47.54      63   0.03      0.03   
 2020-03-05 00:20:00  47.54  47.57  47.52  47.57      22   0.05      0.03   
 ...                    ...    ...    ...    ...     ...    ...       ...   
 2020-05-14 23:35:00  27.75  27.85  27.75  27.77      65   0.10      0.10   
 2020-05-14 23:40:00  27.77  27.80  27.75  27.75       8   0.05      0.03   
 2020-05-14 23:45:00  27.75  27.81  27.75  27.79      25   0.06      0.06   
 2020-05-14 23:50:00  27.78  27.78  27.77  27.77       3   0.01      0.00   
 2020-05-14 23:55:00  27.80  27.80  27.80  27.80       1   0.00      0

In [27]:
# Write one CSV per ticker. DataFrame.to_csv does not create missing parent
# directories, so make sure the output folder exists first.
output_dir = "./Bayesian-data/"
os.makedirs(output_dir, exist_ok=True)

for ticker, df in all_data.items():
    df.to_csv(f"{output_dir}{ticker}.csv")