In [10]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import pyfinancialdata
from datetime import datetime, timedelta

In [28]:
candle_len = 5
# raw_data = pyfinancialdata.get(provider='oanda', instrument='EUR_USD', year=2010, time_group=f"{candle_len}min", price_calculation=False)
raw_data = pyfinancialdata.get_multi_year(provider='oanda', instrument='EUR_USD', years=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019], time_group=f"{candle_len}min", price_calculation=False)
raw_data.head()

Unnamed: 0_level_0,close,high,low,open,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-03 17:55:00,1.43172,1.43172,1.43172,1.43172,1
2010-01-03 18:00:00,1.43328,1.43328,1.43172,1.43172,10
2010-01-03 18:05:00,1.4315,1.43425,1.4315,1.43323,17
2010-01-03 18:10:00,1.4311,1.4315,1.43105,1.43105,9
2010-01-03 18:15:00,1.43155,1.43155,1.4315,1.4315,2


In [29]:
def add_date_column(data):
    dates = pd.DataFrame(data.index.values, columns=['date'])
    size = len(data)
    data.index = np.arange(0, size, 1)
    return dates.join(data)

data = add_date_column(raw_data)
data = data.drop(data[np.isnan(data['close'])].index)
data.head()

Unnamed: 0,date,close,high,low,open,volume
0,2010-01-03 17:55:00,1.43172,1.43172,1.43172,1.43172,1
1,2010-01-03 18:00:00,1.43328,1.43328,1.43172,1.43172,10
2,2010-01-03 18:05:00,1.4315,1.43425,1.4315,1.43323,17
3,2010-01-03 18:10:00,1.4311,1.4315,1.43105,1.43105,9
4,2010-01-03 18:15:00,1.43155,1.43155,1.4315,1.4315,2


In [30]:
def transform_data(data):
    values = []
    for i in range(data.shape[0]):
        cur = data.iloc[i]
        open = cur['open']
        high = cur['high']
        close = cur['close']
        low = cur['low']
        values.append([cur['date'], high / open, low / open, close / open, cur['volume'], close])
    return pd.DataFrame(values, columns=['date', 'high_p', 'low_p', 'close_p', 'volume', 'close'])

transformed_data = transform_data(data)

In [31]:
err = 0.0000001

def normalize_and_clear(data):
    high_mean = data['high_p'].mean()
    low_mean = data['low_p'].mean()
    close_mean = data['close_p'].mean()
    volume_mean = data['volume'].mean()
    high_std = data['high_p'].std()
    low_std = data['low_p'].std()
    close_std = data['close_p'].std()
    volume_std = data['volume'].std()
    data = data.drop(data[(abs(data['high_p'] - high_mean) > 3 * high_std)\
                | (abs(data['low_p'] - low_mean) > 3 * low_std)\
                | (abs(data['close_p'] - close_mean) > 3 * close_std)\
                | (abs(data['volume'] - volume_mean) > 3 * volume_std)].index)
    data['high_p'] = (data['high_p'] - data['high_p'].min()) / (data['high_p'].max() - data['high_p'].min())
    data['low_p'] = (data['low_p'] - data['low_p'].min()) / (data['low_p'].max() - data['low_p'].min())
    data['close_p'] = (data['close_p'] - data['close_p'].min()) / (data['close_p'].max() - data['close_p'].min())
    data['volume'] = (data['volume'] - data['volume'].min()) / (data['volume'].max() - data['volume'].min())
    return data

def create_train_data(data, interval_len, pred_interval_minutes):
    values = []
    labels = []
    pred_interval = timedelta(minutes=pred_interval_minutes)
    n = data.shape[0]
    for start in range(n - interval_len):
        if is_sequential(data[start : start + interval_len]['date']):
            last_ind = start + interval_len - 1
            last_candle = data.iloc[last_ind]
            try:
                pred_candle = data.loc[data['date'] == last_candle['date'] + pred_interval]
            except KeyError:
                continue
            if pred_candle.shape[0] != 1:
                continue
            pred_candle = pred_candle.iloc[0]
            if is_price_changed(last_candle, pred_candle):
                values.append(collect_data(data[start : start + interval_len]))
                if last_candle['close'] < pred_candle['close']:
                    labels.append(np.array([1, 0], dtype=float))
                else:
                    labels.append(np.array([0, 1], dtype=float))
    return np.array(values), np.array(labels)


def is_sequential(data):
    n = len(data)
    first = data.iloc[0]
    last = data.iloc[n - 1]
    return (first + timedelta(minutes = n * candle_len)) == last

def is_price_changed(candle1, candle2):
    return abs(candle1['close'] - candle2['close']) > err

def collect_data(data):
    values = []
    for i in range(data.shape[0]):
        cur = data.iloc[i]
        values.append(cur['high_p'])
        values.append(cur['low_p'])
        values.append(cur['close_p'])
        values.append(cur['volume'])
    return values

In [32]:
norm_data = normalize_and_clear(transformed_data)
values, labels = create_train_data(norm_data, interval_len=10, pred_interval_minutes=15)

In [33]:
all_data = np.hstack((values, labels))
df = pd.DataFrame(data=all_data, columns=np.hstack((np.arange(0, values.shape[1], 1), np.array(['up', 'down']))))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,up,down
0,0.0,0.064327,0.225133,0.132626,0.359389,0.943263,0.678927,0.094164,0.110546,0.659689,...,0.0,0.419183,0.263994,0.157825,0.055193,0.844248,0.418667,0.061008,1.0,0.0
1,0.359389,0.943263,0.678927,0.094164,0.110546,0.659689,0.415038,0.090186,0.117471,0.836911,...,0.055193,0.844248,0.418667,0.061008,0.407119,0.879626,0.65053,0.094164,0.0,1.0
2,0.110546,0.659689,0.415038,0.090186,0.117471,0.836911,0.415026,0.054377,0.635815,1.0,...,0.407119,0.879626,0.65053,0.094164,0.689811,0.971686,0.79799,0.155172,0.0,1.0
3,0.117471,0.836911,0.415026,0.054377,0.635815,1.0,0.819669,0.049072,0.04144,0.978738,...,0.689811,0.971686,0.79799,0.155172,0.082726,0.278432,0.137927,0.156499,1.0,0.0
4,0.635815,1.0,0.819669,0.049072,0.04144,0.978738,0.510012,0.015915,0.531796,1.0,...,0.082726,0.278432,0.137927,0.156499,0.068986,0.730997,0.411649,0.128647,1.0,0.0


In [34]:
df.to_csv('resources/EUR_USD_2010-2019_c5min_10_pred15min.csv')

In [51]:
# def plot_candles(candles):
#     fig = go.Figure(data=[go.Candlestick(x=candles['date'], open=candles['open'], high=candles['high'],
#                                          low=candles['low'], close=candles['close'])],
#                     layout=go.Layout(height=600, width=1000))
#     fig.show()
#
# plot_candles(data[9000:10000])
