In [None]:
# default_exp core

# OHLCV Preprocessing

> This module holds the functions needed to preprocess OHLCV data stored in a DataFrame. The goal is to calculate the highest percentage change over the following n days, which then serves as the prediction target for ML purposes.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

In [None]:
def json_parse(filename, data_dir='data/'):
    """Load a JSON file and flatten its 'result' records into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the JSON file, relative to `data_dir`.
    data_dir : str, optional
        Directory that contains the file. Defaults to 'data/', which is the
        location this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the file's 'result' field, with the entries'
        keys turned into columns by `json_normalize`.
    """
    import os  # local import so the function does not depend on other cells

    raw = pd.read_json(os.path.join(data_dir, filename))

    # pandas >= 1.0 exposes json_normalize at the top level and deprecates the
    # pandas.io.json location, so only fall back to the old path when needed.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    return normalize(raw['result'])

In [None]:
# JSON file to load; json_parse looks for it under the data/ directory.
filename = 'btc-ltc-daily.json'

In [None]:
# Load the file's 'result' records into a DataFrame.
data = json_parse(filename)

In [None]:
# Peek at the first rows to check the parsed columns.
data.head()

Unnamed: 0,BV,C,H,L,O,T,V
0,1.704125,0.0254,0.0254,0.02525,0.02525,2014-03-07T00:00:00,67.445876
1,0.312342,0.02525,0.02525,0.02525,0.02525,2014-03-10T00:00:00,12.37
2,0.000339,0.02525,0.02525,0.02525,0.02525,2014-03-11T00:00:00,0.013434
3,5.041498,0.027,0.027,0.0254,0.0254,2014-03-12T00:00:00,194.099661
4,0.055185,0.01001,0.02525,1e-08,0.02525,2014-03-13T00:00:00,9.420337


In [None]:
# Keep only the date part (YYYY-MM-DD) of the T column. Taking the first 10
# characters, rather than dropping the last 9, leaves an already-truncated
# value unchanged, so this cell is safe to re-run without reloading the json.
data['T'] = data['T'].str[:10]

In [None]:
# Show the first 30 rows (positional slice) to confirm T now holds dates only.
data.iloc[:30]

Unnamed: 0,BV,C,H,L,O,T,V
0,1.704125,0.0254,0.0254,0.02525,0.02525,2014-03-07,67.445876
1,0.312342,0.02525,0.02525,0.02525,0.02525,2014-03-10,12.37
2,0.000339,0.02525,0.02525,0.02525,0.02525,2014-03-11,0.013434
3,5.041498,0.027,0.027,0.0254,0.0254,2014-03-12,194.099661
4,0.055185,0.01001,0.02525,1e-08,0.02525,2014-03-13,9.420337
5,0.181362,0.0267,0.0267,0.01001,0.0267,2014-03-14,6.926632
6,0.193406,0.0267,0.0267,5e-06,0.0267,2014-03-15,11.862332
7,3.074014,0.0279,0.27,1e-05,0.0267,2014-03-16,177.266215
8,1.744987,0.0258,0.0284,0.0256,0.0257,2014-03-17,64.057663
9,1.325303,0.0303,0.035,0.0256,0.0279,2014-03-18,47.424872


In [None]:
# Sanity checks: highest 'H' within the first 30 rows, then the row count.
# NOTE: only the last expression of a cell is displayed, so the max is
# evaluated here but not shown.
data['H'].iloc[:30].max()
len(data)

2122

In [None]:
# For each row, the highest 'H' among the 30 rows that follow it: a trailing
# 30-row rolling max, shifted back by 30 so it lines up with the row that
# precedes the window. The final 30 rows have nothing to shift in and are NaN.
highs = (
    data['H']
    .rolling(window=30, min_periods=0)
    .max()
    .shift(-30)
)
highs

0       0.27000
1       0.27000
2       0.27000
3       0.27000
4       0.27000
5       0.27000
6       0.27000
7       0.03500
8       0.03500
9       0.03150
10      0.03000
11      0.03000
12      0.03000
13      0.03000
14      0.03000
15      0.03000
16      0.02950
17      0.02930
18      0.02930
19      0.02930
20      0.02929
21      0.02929
22      0.02880
23      0.02880
24      0.02880
25      0.02880
26      0.02880
27      0.02880
28      0.02880
29      0.02880
         ...   
2092        NaN
2093        NaN
2094        NaN
2095        NaN
2096        NaN
2097        NaN
2098        NaN
2099        NaN
2100        NaN
2101        NaN
2102        NaN
2103        NaN
2104        NaN
2105        NaN
2106        NaN
2107        NaN
2108        NaN
2109        NaN
2110        NaN
2111        NaN
2112        NaN
2113        NaN
2114        NaN
2115        NaN
2116        NaN
2117        NaN
2118        NaN
2119        NaN
2120        NaN
2121        NaN
Name: H, Length: 2122, d

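> Calculates the highest percentage change of the high ('H') over the next n days relative to the current close ('C'). The last n rows have no full n-day look-ahead window and come out as NaN.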

In [None]:
def high_pct_change(data, n):
    """Return the best upside over the next `n` rows relative to each close.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a high column 'H' and a close column 'C'.
    n : int
        Number of following rows to look ahead.

    Returns
    -------
    pandas.Series
        For each row, (max 'H' over the following `n` rows) / 'C' - 1.
        The last `n` rows are NaN because the shifted window has no data.
    """
    # Trailing rolling max of the highs, then shifted back by n so each row
    # sees the window that starts on the row after it.
    future_high = data['H'].rolling(window=n, min_periods=0).max().shift(-n)
    ratio = future_high / data['C']
    return ratio - 1

In [None]:
# Highest 'H' over the following 30 rows relative to each row's close.
pcts = high_pct_change(data, 30)
pcts.head()

0     9.629921
1     9.693069
2     9.693069
3     9.000000
4    25.973135
dtype: float64

In [None]:
def round_pct(pct, interval=.25, cutoff=1):
    """Clamp one percentage to [0, cutoff] and floor it to a multiple of `interval`.

    Parameters
    ----------
    pct : float
        Percentage change expressed as a fraction (0.45 means 45%).
    interval : float, optional
        Bucket width that values are rounded down to.
    cutoff : float, optional
        Upper cap; anything at or above it is returned as `cutoff`.

    Returns
    -------
    number
        `cutoff` when pct >= cutoff, 0 when pct is negative, otherwise the
        largest multiple of `interval` that does not exceed `pct`.
    """
    # Guard clauses deal with the two clamped ranges first.
    if pct >= cutoff:
        return cutoff
    if pct < 0:
        return 0
    # Floor to the bucket boundary at or below pct.
    steps = pct // interval
    return interval * steps

In [None]:
# Spot check: 0.45 lies between 0.25 and 0.50, so it floors to 0.25.
pct = round_pct(.45)
pct

0.25

> Takes a pandas Series of percents and rounds each one down to a multiple of the interval. Anything at or above the cutoff is set to the cutoff, and negative values are set to 0. For example, if interval is 0.25 and cutoff is 1: 0.12 -> 0, 0.26 -> 0.25, 0.73 -> 0.50, 0.99 -> 0.75, 2.25 -> 1.00.

In [None]:
def round_pcts(pcts, interval=.25, cutoff=1):
    """Bucket a collection of percentages; vectorized counterpart of `round_pct`.

    Each value is floored to a multiple of `interval`; values at or above
    `cutoff` become `cutoff`, negative values become 0, and NaN stays NaN.

    Previously `interval` and `cutoff` were accepted but never forwarded, so
    any non-default value was silently ignored. Both are honoured now.

    Parameters
    ----------
    pcts : pandas.Series or list-like of float
        Percentage changes expressed as fractions.
    interval : float, optional
        Bucket width that values are rounded down to.
    cutoff : float, optional
        Upper cap applied to the result.

    Returns
    -------
    pandas.Series
        float64 Series of bucketed values, keeping the index of `pcts` when
        it is a Series.
    """
    values = pd.Series(pcts, dtype='float64')

    # Floor to the bucket boundary at or below each value.
    bucketed = interval * (values // interval)
    # Apply the negative clamp first and the cutoff last so that, as in
    # round_pct, the cutoff test takes precedence when both conditions hold.
    bucketed = bucketed.mask(values < 0, 0)
    bucketed = bucketed.mask(values >= cutoff, cutoff)
    return bucketed

In [None]:
# Bucket every target using the default interval and cutoff.
rounded_pcts = round_pcts(pcts)
rounded_pcts.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
dtype: float64