In [46]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import pyfinancialdata
from datetime import datetime, timedelta

In [47]:
# Load the 2010 EUR_USD data set from the 'oanda' provider and preview its first rows.
raw_data = pyfinancialdata.get(
    provider='oanda',
    instrument='EUR_USD',
    year=2010,
)
raw_data.head()

Unnamed: 0_level_0,close,high,low,open,volume,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-03 17:56:00,1.43172,1.43172,1.43172,1.43172,1,1.43172
2010-01-03 18:02:00,1.43325,1.43325,1.43172,1.43172,4,1.43325
2010-01-03 18:03:00,1.43278,1.43288,1.43248,1.43288,5,1.43278
2010-01-03 18:04:00,1.43328,1.43328,1.43328,1.43328,1,1.43328
2010-01-03 18:05:00,1.43323,1.43323,1.43323,1.43323,1,1.43323


In [48]:
def add_date_column(data):
    """Turn the index of ``data`` into a leading ``date`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index holds the values that should become the ``date``
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed 0..len(data)-1 with ``date`` as the first column,
        followed by the original columns. ``data`` itself is not modified.
    """
    dates = pd.DataFrame({'date': data.index.values})
    # reset_index returns a new frame, so the caller's index is left intact.
    return dates.join(data.reset_index(drop=True))

# `price` is dropped before the index is converted to a `date` column.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a positional argument.
data = add_date_column(raw_data.drop(columns='price'))
data.head()

Unnamed: 0,date,close,high,low,open,volume
0,2010-01-03 17:56:00,1.43172,1.43172,1.43172,1.43172,1
1,2010-01-03 18:02:00,1.43325,1.43325,1.43172,1.43172,4
2,2010-01-03 18:03:00,1.43278,1.43288,1.43248,1.43288,5
3,2010-01-03 18:04:00,1.43328,1.43328,1.43328,1.43328,1
4,2010-01-03 18:05:00,1.43323,1.43323,1.43323,1.43323,1


In [49]:
# Smallest absolute difference between two close prices that is_price_changed() counts as a change.
err = 1e-7

def normalize_data(data):
    """Min-max scale the candle columns to the range [0, 1].

    ``close``, ``high``, ``low``, ``open`` and ``volume`` are each scaled
    independently with their own minimum and maximum. Other columns are
    carried over unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the scaled columns; ``data`` is left untouched.
        A column whose values are all identical becomes 0.0 instead of NaN.
    """
    scaled = data.copy()
    for column in ('close', 'high', 'low', 'open', 'volume'):
        lowest = data[column].min()
        span = data[column].max() - lowest
        shifted = data[column] - lowest
        # A zero span would divide 0 by 0; the shifted values are already all zero then.
        scaled[column] = shifted / span if span != 0 else shifted.astype(float)
    return scaled

def create_train_data(data, interval_len, pred_interval_minutes):
    """Build feature vectors and up/down labels from sliding candle windows.

    For every window of ``interval_len`` consecutive rows whose dates pass
    ``is_sequential``, the candle dated ``pred_interval_minutes`` after the
    window's last candle is looked up. If it exists and its close differs from
    the last candle's close (per ``is_price_changed``), the window is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with a ``date`` column plus the columns read by ``collect_data``.
    interval_len : int
        Number of rows per window.
    pred_interval_minutes : int
        Offset, in minutes, from the last window candle to the target candle.

    Returns
    -------
    tuple of numpy.ndarray
        ``(values, labels)``: the flattened windows, and one label per window,
        ``[1, 0]`` when the target close is above the last close, otherwise
        ``[0, 1]``.
    """
    horizon = timedelta(minutes=pred_interval_minutes)
    features, targets = [], []

    for start in range(data.shape[0] - interval_len):
        stop = start + interval_len
        window = data[start:stop]
        if not is_sequential(window['date']):
            continue

        anchor_pos = stop - 1
        anchor = data.iloc[anchor_pos]
        target = find_candle_to_predict(data, anchor_pos, anchor['date'] + horizon)
        if target is None or not is_price_changed(anchor, target):
            continue

        features.append(collect_data(window))
        went_up = anchor['close'] < target['close']
        targets.append(np.array([1, 0] if went_up else [0, 1], dtype=float))

    return np.array(features), np.array(targets)


def is_sequential(data):
    """Check that consecutive timestamps are exactly one minute apart.

    Parameters
    ----------
    data : pandas.Series
        Timestamps in the order they should be checked.

    Returns
    -------
    bool
        True when every element equals its predecessor plus one minute.
        A series with zero or one element is trivially sequential.
    """
    one_min = timedelta(minutes=1)
    stamps = list(data)
    # all() stops at the first gap and is True for an empty pairing.
    return all(prev + one_min == cur for prev, cur in zip(stamps, stamps[1:]))

def find_candle_to_predict(data, from_ind, time):
    """Scan forward from ``from_ind`` for the row whose ``date`` equals ``time``.

    Parameters
    ----------
    data : pandas.DataFrame
        Candles with a ``date`` column.
    from_ind : int
        Positional row index at which the scan starts.
    time
        Date value to look for.

    Returns
    -------
    pandas.Series or None
        The first row at or after ``from_ind`` whose ``date`` equals ``time``.
        None if a row with a later ``date`` is reached first or the rows run out.
    """
    for pos in range(from_ind, data.shape[0]):
        row = data.iloc[pos]
        stamp = row['date']
        if stamp == time:
            return row
        if stamp > time:
            # Already past the requested time; stop scanning.
            return None
    return None

def is_price_changed(candle1, candle2):
    """Return True when the two candles' ``close`` values differ by more than ``err``."""
    close_gap = candle1['close'] - candle2['close']
    return abs(close_gap) > err

def collect_data(data):
    """Flatten the candle columns of ``data`` into a one-dimensional array.

    The ``close``, ``high``, ``low``, ``open`` and ``volume`` columns are taken
    in that order and laid out row by row.
    """
    feature_columns = ['close', 'high', 'low', 'open', 'volume']
    matrix = data.loc[:, feature_columns].values
    return matrix.reshape(-1)

# values, labels = create_train_data(data, interval_len=10, pred_interval_minutes=15)

In [50]:
# Candles per training window, and how many minutes after the window the label candle lies.
INTERVAL_LEN = 10
PRED_INTERVAL_MINUTES = 15

# Normalize a copy so that running this cell does not overwrite the raw `data` frame.
norm_data = normalize_data(data.copy())
values, labels = create_train_data(
    norm_data,
    interval_len=INTERVAL_LEN,
    pred_interval_minutes=PRED_INTERVAL_MINUTES,
)

In [63]:
# One row per sample: the flattened window features followed by the two label columns.
all_data = np.hstack((values, labels))

# Feature columns are labelled by position, as strings; the count follows the actual
# feature width instead of a fixed 50, so a different window length still works.
feature_columns = [str(i) for i in range(values.shape[1])]
df = pd.DataFrame(data=all_data, columns=feature_columns + ['up', 'down'])

df.to_csv('resources/EUR_USD_2010.csv')
# Preview goes last so the notebook actually displays it.
df.head()

In [51]:
# def plot_candles(candles):
#     fig = go.Figure(data=[go.Candlestick(x=candles['date'], open=candles['open'], high=candles['high'],
#                                          low=candles['low'], close=candles['close'])],
#                     layout=go.Layout(height=600, width=1000))
#     fig.show()
#
# plot_candles(data[9000:10000])
