In [51]:
import pandas as pd
from pandas.core.arrays.period import timedelta

import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import os

In [52]:
def missing_val_count_by_column (df) :
    """Print the number of missing values for every column that has at least one.

    Columns without missing values are left out of the printed Series.
    Nothing is returned.
    """
    per_column = df.isnull().sum()
    has_missing = per_column > 0
    print(per_column[has_missing])
    
def check_nans (df):
    """Return the rows of `df` that contain at least one missing value."""
    # Boolean mask, one entry per row: True when any cell in the row is null.
    incomplete_rows = df.isnull().any(axis=1)
    return df[incomplete_rows]

In [53]:
# Load the raw training and validation price tables from the local data folder.
train = pd.read_excel('./data/train_data/train.xlsx')
val = pd.read_excel('./data/val_data/validate.xlsx')
# Alternative locations under /content (presumably a hosted notebook environment):
# train = pd.read_excel('/content/train.xlsx')
# val = pd.read_excel('/content/validate.xlsx')

In [54]:
# Sanity check: show both shapes, then list any columns that contain missing values.
print(train.shape, val.shape)
missing_val_count_by_column(train)
missing_val_count_by_column(val)

(1096, 25) (730, 25)
Series([], dtype: int64)
Series([], dtype: int64)


In [55]:
def create_avg_day(df):
    """Add an 'avg_day' column with the row-wise mean of every column after the first.

    The frame is modified in place. If 'avg_day' already exists, nothing is
    changed and a notice is printed instead.
    """
    if 'avg_day' in df.columns:
        print("There's already a column avg_day! No need to create another one.")
        return

    # Column 0 is skipped; all remaining columns take part in the mean.
    value_columns = df.iloc[:, 1:]
    df['avg_day'] = value_columns.mean(axis=1)
        
        
def rename_prices(df):
    """Rename the 'PRICES' column to 'datetime' in place.

    If the frame has no 'PRICES' column, nothing is changed and a notice is
    printed instead.
    """
    if 'PRICES' not in df.columns:
        print("There's no column PRICES.")
        return

    df.rename(columns={"PRICES": "datetime"}, inplace=True)

In [56]:
# Both helpers modify the frame they are given in place: first add the daily
# average column, then rename 'PRICES' to 'datetime'.
create_avg_day(train)
rename_prices(train)

# Same preparation for the validation frame.
create_avg_day(val)
rename_prices(val)

Date formatter: https://github.com/d3/d3-time-format/tree/v2.2.3#locale_format

# Data Formatting

In [57]:
def dataformatting(df):
    """Reshape a wide daily price table into one row per hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a 'datetime' column (the day) followed by the 24
        hourly price columns in positions 1-24. Each hourly column name must
        end in its hour number (1-24, zero padded or not). Columns after
        position 24 are ignored. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Long table sorted chronologically with the columns
        ['date', 'time', 'price', 'month', 'day', 'hour'], where 'date' is the
        original day, 'time' is that day plus (hour - 1) hours, and 'hour'
        runs from 1 to 24.
    """
    # Wide to long: one row per (day, hourly column).
    long_df = df.melt(id_vars=['datetime'], value_vars=df.columns[1:25])

    # Hour number parsed from the tail of the column name. Sorting on the
    # number (not on the name string) keeps the order right even when the
    # names are not zero padded.
    long_df['variable'] = long_df['variable'].map(lambda name: int(str(name)[-2:]))
    long_df = long_df.sort_values(['datetime', 'variable']).reset_index(drop=True)

    # Master timestamp: hour 1 maps to midnight, so hour 24 is 23:00 of the
    # same day and the series never spills into the following day.
    long_df['time'] = pd.to_datetime(long_df['datetime']) + pd.to_timedelta(
        long_df['variable'] - 1, unit='h'
    )

    long_df = long_df.rename(
        columns={"datetime": "date", "variable": "hour", "value": "price"}
    )

    long_df['month'] = long_df['time'].dt.month
    long_df['day'] = long_df['time'].dt.day
    long_df['hour'] = long_df['time'].dt.hour + 1

    # Explicit column order by name instead of positional index juggling.
    return long_df[['date', 'time', 'price', 'month', 'day', 'hour']]
    

TO DO:
- make pipeline for big and small dfs
- remove 2 first columns at the end!

# Features

Long Term: 
- (hourly) bollinger bands (5d, 14d, 30d)
- (hourly) EMA (5d, 14d, 30d)
- (daily) ATR (5d, 14d, 30d)
- (hourly) stochastic oscillator (5d + smoothed, 14d + smoothed, 30d + smoothed)

Short Term:
- (hourly) bollinger bands (8h, 24h)
- (hourly) EMA (8h, 24h)
- (hourly) stochastic oscillator (24h + smoothed)

## Bollinger Bands
A Bollinger Band is a technical analysis tool defined by a set of lines plotted two standard deviations (positively and negatively) away from a simple moving average (SMA) of the stock's price. Bollinger Bands allow traders to monitor and take advantage of shifts in price volatility.

Main Components of a Bollinger Bands:
 - Upper Band: The upper band is simply two standard deviations above the moving average of a stock’s price.
 - Middle Band: The middle band is simply the moving average of the stock’s price.
 - Lower Band: Two standard deviations below the moving average is the lower band.
 
Bollinger Bands allow traders to monitor and take advantage of shifts in price volatilities.

When calculating the SMA for Bollinger bands, traders typically use a 20 day SMA. Here is how you would calculate the SMA of a stock.

In [58]:
def bollinger_bands(df, hours):
    """Return a copy of `df` with lower, middle and upper Bollinger band columns.

    The middle band is the `hours`-row rolling mean of 'price'; the outer
    bands sit two rolling standard deviations below and above it. The first
    `hours - 1` rows, where the rolling window is not yet full, are filled
    with estimates computed from the start of the series.
    """
    out = df.copy()
    prices = out['price']
    warmup_rows = hours - 1

    # Middle band: rolling mean, with the warm-up rows replaced by the mean of
    # all prices seen so far.
    middle = prices.rolling(hours).mean().reset_index(drop=True)
    for pos in range(warmup_rows):
        middle.loc[pos] = prices.iloc[:pos + 1].mean()

    # Band width: rolling standard deviation. The warm-up estimate looks five
    # rows further ahead than the mean does, so the first values are not
    # zero/undefined; rough at the very start but it evens out quickly.
    spread = prices.rolling(hours).std().reset_index(drop=True)
    for pos in range(warmup_rows):
        spread.loc[pos] = prices.iloc[:pos + 6].std()

    upper = middle + spread * 2
    lower = middle - spread * 2

    label = f'{hours}h / {int(hours/24)}day bollinger'
    out[label + ' down'] = lower
    out[label + ' middle'] = middle
    out[label + ' up'] = upper

    return out

## Exponential moving average (EMA)
An exponential moving average (EMA) is a type of moving average (MA) that places a greater weight and significance on the most recent data points. The exponential moving average is also referred to as the exponentially weighted moving average. An exponentially weighted moving average reacts more significantly to recent price changes than a simple moving average (SMA), which applies an equal weight to all observations in the period.

The most popular EMA spans are 8, 12 and 20 (for short-term trading) and 50 and 200 (for long-term trading).

Some traders use Fibonacci numbers (5, 8, 13, 21 ...) to select moving averages.

In general, the 50- and 200-day EMAs are used as indicators for long-term trends. When a stock price crosses its 200-day moving average, it is a technical signal that a reversal has occurred.

In [59]:
def get_ema(df, period_length:int):
    """Return a copy of `df` with an exponential moving average of 'price' added.

    `period_length` is used as the EMA span (in rows) and also appears in the
    name of the new column.
    """
    column_name = f'{period_length}h / {int(period_length/24)}day EMA'
    smoothed = df['price'].ewm(span=period_length).mean()

    x = df.copy()
    x[column_name] = smoothed
    return x

## Average true range (ATR)
The average true range (ATR) is a technical analysis indicator that measures market volatility by decomposing the entire range of an asset price for that period. ATR measures market volatility. It is typically derived from the 14-day moving average of a series of true range indicators.

In [60]:
def get_atr(df, period:int):
    """Return a copy of `df` with a daily average true range (ATR) column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly table with 'date', 'hour' and 'price' columns, ordered by date.
        Each date is expected to contribute exactly 24 rows, one of them with
        hour == 24 (used as the daily close), and the frame is expected to
        have a default 0..n-1 index so the result lines up row by row.
    period : int
        Smoothing length in days.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with a column named '<period*24>h / <period>day ATR';
        every hour of a day carries that day's ATR value.

    Notes
    -----
    The true range of a day is the largest of: high - low,
    |high - previous close| and |low - previous close|. The first day has no
    previous close, so its own close is used. During the first `period` days
    the ATR is the plain mean of all true ranges seen so far (including the
    first day); afterwards it follows the recursive smoothing
    (previous * (period - 1) + true_range) / period.
    """
    x = df.copy()

    daily_price = x.groupby('date')['price']
    highs = daily_price.max().reset_index(drop=True)
    lows = daily_price.min().reset_index(drop=True)
    closes = x.loc[x['hour'] == 24, 'price'].reset_index(drop=True)

    # Previous day's close; the first day falls back to its own close.
    previous_close = closes.shift(1)
    if len(previous_close) > 0:
        previous_close.iloc[0] = closes.iloc[0]

    true_range = pd.concat(
        [highs - lows, (highs - previous_close).abs(), (lows - previous_close).abs()],
        axis=1,
    ).max(axis=1)

    # Collect the daily values in a list; growing a Series one element at a
    # time reallocates it on every step.
    atr_by_day = []
    running = 0.0
    for i, tr in enumerate(true_range.to_numpy(dtype=np.float64)):
        if i < period:
            # Mean of the i + 1 true ranges seen so far (first day included).
            running = (running * i + tr) / (i + 1)
        else:
            running = (running * (period - 1) + tr) / period
        atr_by_day.append(running)

    # Stretch the daily values so each of the 24 hourly rows of a day gets one.
    daily_atr = pd.Series(atr_by_day, dtype=np.float64)
    hourly_atr = daily_atr.loc[daily_atr.index.repeat(24)].reset_index(drop=True)
    x[f'{period*24}h / {int(period)}day ATR'] = hourly_atr
    return x

# Stochastic Oscillator
Ranges from 0 to 100; a measure of whether the asset is overbought or oversold.

In [61]:
def stochastic_osc(df, hours, smoothing):
  """Return a copy of `df` with a stochastic oscillator column added.

  The oscillator is the position of 'price' inside the rolling `hours`-row
  min/max range, scaled by 100. When `smoothing` is truthy, an 8-row rolling
  mean of the oscillator is added as a second column.
  """
  out = df.copy()
  price = out['price']
  warmup_rows = hours - 1
  head = price[0:warmup_rows]

  # Rolling extremes; rows before the first full window all share the
  # extremes of the first `hours - 1` prices.
  lowest = price.rolling(hours).min()
  lowest[0:warmup_rows] = min(head)
  highest = price.rolling(hours).max()
  highest[0:warmup_rows] = max(head)

  oscillator = (price - lowest) / (highest - lowest) * 100

  label = f'{hours}h / {int(hours/24)}'
  out[label + 'stochastic oscillator'] = oscillator

  # The raw oscillator is usually read together with a moving average of it;
  # the averaging window is fixed at 8 rows.
  if smoothing:
    osc_smoothed = oscillator.rolling(8).mean()
    # Back-fill the rows before the first full averaging window.
    osc_smoothed[0:7] = osc_smoothed[7]
    out[label + 'smoothed oscillator'] = osc_smoothed
  return out

In [62]:
def normalize_train(df):
    """Z-score every column except the last one and return the statistics used.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with unique column labels. The last column is left
        untouched. The input frame is not modified.

    Returns
    -------
    (pandas.DataFrame, dict)
        The normalised copy, and a dict mapping str(column name) to
        [mean, scale] where scale is the column's standard deviation.

    Notes
    -----
    The statistics are computed from the frame passed in. To avoid data
    leakage, pass only training data here and reuse the returned dict for
    any other split.

    A column whose standard deviation is zero or undefined (constant column,
    single row) is only centred: its scale is recorded as 1.0 instead of
    dividing by zero.
    """
    x = df.copy()
    u = {}
    for name in x.columns[:-1]:
        col = x[name]
        mean = col.mean()
        std = col.std()
        # `std > 0` is False for both 0 and NaN, so both fall back to 1.0.
        scale = std if std > 0 else 1.0
        # Replace the column by name: this yields a float column even when the
        # source is integer, without relying on in-place upcasting via iloc.
        x[name] = (col - mean) / scale
        u[str(name)] = [mean, scale]

    return x, u

In [63]:
def normalize_validation(df, u):
    """Z-score every column except the last one using precomputed statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with unique column labels. The last column is left
        untouched. The input frame is not modified.
    u : dict
        Maps str(column name) to [mean, std], as returned by
        `normalize_train`. Using the training statistics here keeps
        information from this frame out of the scaling.

    Returns
    -------
    pandas.DataFrame
        The normalised copy.

    Raises
    ------
    KeyError
        If `u` has no entry for one of the columns to be normalised.
    """
    x = df.copy()
    for name in x.columns[:-1]:
        key = str(name)
        if key not in u:
            raise KeyError(f"no normalisation statistics for column {key!r}")
        mean, std = u[key][0], u[key][1]
        # A zero or undefined std would turn the column into inf/NaN; centre only.
        scale = std if std > 0 else 1.0
        # Replace the column by name so integer columns become float without
        # relying on in-place upcasting via iloc.
        x[name] = (x[name] - mean) / scale

    return x


In [64]:
def build_big(data_f):
    """Build the full feature table from a wide daily price frame.

    The input is copied, reshaped to one row per hour, and extended with
    Bollinger bands, EMAs, ATRs and (smoothed) stochastic oscillators over
    several windows. The 'date' and 'time' columns are dropped and a copy of
    'price' is appended as the last column, 'price_unnormalised'.
    """
    features = dataformatting(data_f.copy())

    # Windows are in hours (rows) unless noted otherwise.
    for window in (8, 24, 120, 336, 720):
        features = bollinger_bands(features, window)

    for window in (8, 24, 120, 336, 720):
        features = get_ema(features, period_length=window)

    # ATR windows are expressed in days, not hours.
    for days in (5, 14, 30):
        features = get_atr(features, days)

    for window in (24, 120, 336, 720):
        features = stochastic_osc(features, window, True)

    features = features.drop(columns=['date', 'time'])

    # Kept as the final column so the raw price stays available next to the features.
    features['price_unnormalised'] = features['price']

    return features

In [65]:
def build_small(data_f):
    """Build the compact feature table from a wide daily price frame.

    The input is copied, reshaped to one row per hour, and extended with the
    24-row Bollinger bands, the 24-row EMA, the 5-day ATR and the unsmoothed
    24-row stochastic oscillator. The 'date' and 'time' columns are dropped.
    """
    features = dataformatting(data_f.copy())

    features = bollinger_bands(features, 24)
    features = get_ema(features, period_length=24)
    features = get_atr(features, 5)  # ATR window is in days, not hours
    features = stochastic_osc(features, 24, False)

    return features.drop(columns=['date', 'time'])

In [66]:
# Compact feature table for the training data; values are not normalised.
categorical_train = build_small(train)
categorical_train.tail(5)

Unnamed: 0,price,month,day,hour,24h / 1day bollinger down,24h / 1day bollinger middle,24h / 1day bollinger up,24h / 1day EMA,120h / 5day ATR,24h / 1stochastic oscillator
26299,36.0,12,31,20,4.305981,27.137083,49.968186,31.306249,39.162491,83.177851
26300,29.0,12,31,21,4.26855,26.928333,49.588117,31.121749,39.162491,62.112549
26301,24.82,12,31,22,4.119311,26.788333,49.457356,30.617609,39.162491,49.533554
26302,31.28,12,31,23,4.118847,26.716667,49.314486,30.6706,39.162491,68.973819
26303,31.0,12,31,24,4.181444,26.842083,49.502723,30.696952,39.162491,68.131207


In [67]:
# Compact feature table for the validation data; values are not normalised.
categorical_val = build_small(val)
categorical_val.head(5)

Unnamed: 0,price,month,day,hour,24h / 1day bollinger down,24h / 1day bollinger middle,24h / 1day bollinger up,24h / 1day EMA,120h / 5day ATR,24h / 1stochastic oscillator
0,13.2,1,1,1,8.144952,13.2,18.255048,13.2,32.49,37.550015
1,13.14,1,1,2,6.424676,13.17,19.915324,13.16875,32.49,37.365343
2,11.83,1,1,3,3.987663,12.723333,21.459004,12.684818,32.49,33.333333
3,9.49,1,1,4,3.742843,11.915,20.087157,11.783622,32.49,26.131117
4,8.63,1,1,5,3.544821,11.258,18.971179,11.043593,32.49,23.484149


Creating DF for tabular learning

In [68]:
# Output files for the compact feature tables, resolved against the current
# working directory.
train_discrete_path = os.path.join(os.getcwd(),'data/train_data/train_discrete.npy')
val_discrete_path = os.path.join(os.getcwd(),'data/val_data/val_discrete.npy')

# Store the tables as plain NumPy arrays; column names are not saved.
with open(train_discrete_path,'wb') as f: 
    np.save(f,categorical_train.to_numpy())

with open(val_discrete_path,'wb') as f:
    np.save(f,categorical_val.to_numpy())

In [69]:
# Full feature table for training, then z-scored column by column.
# `dictionary` holds the per-column statistics returned by normalize_train so
# the same scaling can be applied to the validation data.
train_preprocessed = build_big(train)
train_preprocessed, dictionary = normalize_train(train_preprocessed)
train_preprocessed.tail(5)


Unnamed: 0,price,month,day,hour,8h / 0day bollinger down,8h / 0day bollinger middle,8h / 0day bollinger up,24h / 1day bollinger down,24h / 1day bollinger middle,24h / 1day bollinger up,...,720h / 30day ATR,24h / 1stochastic oscillator,24h / 1smoothed oscillator,120h / 5stochastic oscillator,120h / 5smoothed oscillator,336h / 14stochastic oscillator,336h / 14smoothed oscillator,720h / 30stochastic oscillator,720h / 30smoothed oscillator,price_unnormalised
26299,-0.364244,1.588292,1.734621,1.083452,0.036487,-0.458624,-0.418633,-0.23015,-0.926965,-0.534187,...,-0.50663,1.092969,1.064988,0.792683,0.838053,0.344565,-0.062018,-0.464728,-0.51021,36.0
26300,-0.538851,1.588292,1.734621,1.227912,-0.010356,-0.509256,-0.437911,-0.231116,-0.935214,-0.539529,...,-0.50663,0.400533,0.945372,0.304471,0.733837,-0.061936,-0.068727,-0.693938,-0.568215,29.0
26301,-0.643116,1.588292,1.734621,1.372373,-0.094414,-0.568696,-0.44515,-0.234971,-0.940747,-0.541367,...,-0.50663,-0.01295,0.784122,0.012939,0.60547,-0.304676,-0.098582,-0.830809,-0.636309,24.82
26302,-0.481979,1.588292,1.734621,1.516833,-0.101688,-0.575613,-0.447321,-0.234983,-0.943579,-0.543375,...,-0.50663,0.62607,0.821655,0.463489,0.609066,0.070467,-0.03442,-0.619281,-0.644233,31.28
26303,-0.488963,1.588292,1.734621,1.661293,-0.093957,-0.570867,-0.447283,-0.233366,-0.938623,-0.540729,...,-0.50663,0.598372,0.898734,0.44396,0.640845,0.054207,0.046533,-0.62845,-0.638797,31.0


In [70]:
# Inspect the distinct values of the normalised 'day' column.
train_preprocessed.day.unique()

array([-1.67387396, -1.56025744, -1.44664093, -1.33302441, -1.2194079 ,
       -1.10579139, -0.99217487, -0.87855836, -0.76494184, -0.65132533,
       -0.53770881, -0.4240923 , -0.31047579, -0.19685927, -0.08324276,
        0.03037376,  0.14399027,  0.25760679,  0.3712233 ,  0.48483982,
        0.59845633,  0.71207284,  0.82568936,  0.93930587,  1.05292239,
        1.1665389 ,  1.28015542,  1.39377193,  1.50738844,  1.62100496,
        1.73462147])

In [71]:
# Full feature table for validation, scaled with the statistics collected
# from the training data (`dictionary`) rather than its own.
val_preprocessed = build_big(val)
val_preprocessed = normalize_validation(val_preprocessed, dictionary)
val_preprocessed.head(5)

Unnamed: 0,price,month,day,hour,8h / 0day bollinger down,8h / 0day bollinger middle,8h / 0day bollinger up,24h / 1day bollinger down,24h / 1day bollinger middle,24h / 1day bollinger up,...,720h / 30day ATR,24h / 1stochastic oscillator,24h / 1smoothed oscillator,120h / 5stochastic oscillator,120h / 5smoothed oscillator,336h / 14stochastic oscillator,336h / 14smoothed oscillator,720h / 30stochastic oscillator,720h / 30smoothed oscillator,price_unnormalised
0,-0.932963,-1.60099,-1.673874,-1.661293,-0.457116,-1.203307,-0.805619,-0.13099,-1.4777,-0.979926,...,-0.866671,-0.406859,-1.174508,-1.102868,-1.599493,-1.449662,-1.798393,-1.32683,-1.618676,13.2
1,-0.93446,-1.60099,-1.673874,-1.516833,-0.502616,-1.204272,-0.782366,-0.175424,-1.478885,-0.95659,...,-0.866671,-0.41293,-1.174508,-1.10589,-1.599493,-1.451007,-1.798393,-1.32827,-1.618676,13.14
2,-0.967136,-1.60099,-1.673874,-1.372373,-0.567072,-1.218642,-0.760745,-0.238372,-1.496536,-0.934893,...,-0.866671,-0.545465,-1.174508,-1.171879,-1.599493,-1.480383,-1.798393,-1.35969,-1.618676,11.83
3,-1.025505,-1.60099,-1.673874,-1.227912,-0.573548,-1.244649,-0.779959,-0.244695,-1.528477,-0.954175,...,-0.866671,-0.782209,-1.174508,-1.289754,-1.599493,-1.532854,-1.798393,-1.415815,-1.618676,9.49
4,-1.046957,-1.60099,-1.673874,-1.083452,-0.578785,-1.265786,-0.795589,-0.24981,-1.554439,-0.96986,...,-0.866671,-0.869217,-1.174508,-1.333075,-1.599493,-1.552139,-1.798393,-1.436442,-1.618676,8.63


In [72]:
# Reduced column set: normalised price, month and hour plus the raw price.
columns_to_select = ['price','month','hour','price_unnormalised']
new_train_preprocessed = train_preprocessed[columns_to_select]
new_train_preprocessed.head(5)

Unnamed: 0,price,month,hour,price_unnormalised
0,-0.655837,-1.60099,-1.661293,24.31
1,-0.655837,-1.60099,-1.516833,24.31
2,-0.720691,-1.60099,-1.372373,21.71
3,-1.052195,-1.60099,-1.227912,8.42
4,-1.261973,-1.60099,-1.083452,0.01


In [73]:
# Same reduced column set for the validation data.
new_val_preprocessed = val_preprocessed[columns_to_select]
new_val_preprocessed.head(5)

Unnamed: 0,price,month,hour,price_unnormalised
0,-0.932963,-1.60099,-1.661293,13.2
1,-0.93446,-1.60099,-1.516833,13.14
2,-0.967136,-1.60099,-1.372373,11.83
3,-1.025505,-1.60099,-1.227912,9.49
4,-1.046957,-1.60099,-1.083452,8.63


In [74]:
# Save the reduced frames as NumPy arrays (.npy); column names are not stored.
# Alternative: save every engineered feature instead of the reduced set.
# np_train = train_preprocessed.to_numpy()
np_train = new_train_preprocessed.to_numpy()
with open('./data/train_data/train_big.npy', 'wb') as f:
    np.save(f, np_train)
    
# Alternative: save every engineered feature instead of the reduced set.
# np_val = val_preprocessed.to_numpy()
np_val = new_val_preprocessed.to_numpy()
with open('./data/val_data/val_big.npy', 'wb') as f:
    np.save(f, np_val)

In [75]:
#to open .npy file
# Read a saved array back from disk; this rebinds np_train.
# NOTE(review): this opens train.npy, while the save cells in this notebook
# write train_big.npy and train_discrete.npy — confirm which file is intended.
with open('./data/train_data/train.npy', 'rb') as f:
    np_train = np.load(f)