# Downloading Historical OHLC Mid, Bid, and Ask Data for EUR/USD

In [1]:
import warnings
# NOTE(review): with no category or module filter this hides every warning for the
# rest of the kernel session, including deprecation warnings from pandas/sklearn.
warnings.filterwarnings("ignore")

from api.oanda_api import OandaApi
from infrastructure.instrument_collection import instrumentCollection
from infrastructure.collect_data import run_collection
import pandas as pd

# Initializing our session
api = OandaApi()

# Downloading the instruments available to the account and storing them in a JSON file
# (written under ./data, judging by the path argument — confirm in CreateFile)
instruments = api.get_account_instruments()
instrumentCollection.CreateFile(instruments, './data')

# Load the instrument definitions back from ./data before collecting candles
instrumentCollection.LoadInstruments('./data')

# Downloading OHLC prices: 2 years' worth of 5-minute candles (this might take a while)
run_collection(instrumentCollection, api, currencies=["EUR", "USD"], years=2)

# Load the EUR/USD M5 candles; presumably this is the file run_collection just wrote — confirm.
# NOTE(review): read_pickle executes whatever the pickle contains, so only load files
# produced locally by this project.
df = pd.read_pickle("./data/EUR_USD_M5.pkl")

EUR_USD M5
EUR_USD M5 2023-01-25 18:46:38.614685+00:00 2023-02-05 04:46:38.614685+00:00 --> 2055 candles loaded
EUR_USD M5 2023-02-05 04:46:38.614685+00:00 2023-02-15 14:46:38.614685+00:00 --> 2218 candles loaded
EUR_USD M5 2023-02-15 14:46:38.614685+00:00 2023-02-26 00:46:38.614685+00:00 --> 2103 candles loaded
EUR_USD M5 2023-02-26 00:46:38.614685+00:00 2023-03-08 10:46:38.614685+00:00 --> 2170 candles loaded
EUR_USD M5 2023-03-08 10:46:38.614685+00:00 2023-03-18 20:46:38.614685+00:00 --> 2151 candles loaded
EUR_USD M5 2023-03-18 20:46:38.614685+00:00 2023-03-29 06:46:38.614685+00:00 --> 2134 candles loaded
EUR_USD M5 2023-03-29 06:46:38.614685+00:00 2023-04-08 16:46:38.614685+00:00 --> 2186 candles loaded
EUR_USD M5 2023-04-08 16:46:38.614685+00:00 2023-04-19 02:46:38.614685+00:00 --> 2085 candles loaded
EUR_USD M5 2023-04-19 02:46:38.614685+00:00 2023-04-29 12:46:38.614685+00:00 --> 2235 candles loaded
EUR_USD M5 2023-04-29 12:46:38.614685+00:00 2023-05-09 22:46:38.614685+00:00 -->

# Preview Of Our DataFrame

In [2]:
# Show the first five rows to check the time, volume and mid/bid/ask OHLC columns
df.head()

Unnamed: 0,time,volume,mid_o,mid_h,mid_l,mid_c,bid_o,bid_h,bid_l,bid_c,ask_o,ask_h,ask_l,ask_c
0,2023-01-25 18:45:00+00:00,245,1.09116,1.09147,1.09112,1.09142,1.0911,1.0914,1.09105,1.09135,1.09123,1.09155,1.0912,1.09148
1,2023-01-25 18:50:00+00:00,309,1.09142,1.09157,1.09102,1.09102,1.09135,1.0915,1.09094,1.09094,1.09149,1.09164,1.09108,1.09109
2,2023-01-25 18:55:00+00:00,181,1.09101,1.0911,1.09084,1.09092,1.09093,1.09102,1.09076,1.09085,1.09109,1.09117,1.09091,1.09098
3,2023-01-25 19:00:00+00:00,317,1.09093,1.09101,1.09068,1.09098,1.09086,1.09094,1.09061,1.09091,1.091,1.09108,1.09075,1.09106
4,2023-01-25 19:05:00+00:00,359,1.09098,1.09108,1.09088,1.09092,1.09092,1.09101,1.0908,1.09085,1.09105,1.09116,1.09094,1.09099


# Dataset Structure and Overview

In [3]:
# Print each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149167 entries, 0 to 149166
Data columns (total 14 columns):
 #   Column  Non-Null Count   Dtype                  
---  ------  --------------   -----                  
 0   time    149167 non-null  datetime64[ns, tzutc()]
 1   volume  149167 non-null  int64                  
 2   mid_o   149167 non-null  float64                
 3   mid_h   149167 non-null  float64                
 4   mid_l   149167 non-null  float64                
 5   mid_c   149167 non-null  float64                
 6   bid_o   149167 non-null  float64                
 7   bid_h   149167 non-null  float64                
 8   bid_l   149167 non-null  float64                
 9   bid_c   149167 non-null  float64                
 10  ask_o   149167 non-null  float64                
 11  ask_h   149167 non-null  float64                
 12  ask_l   149167 non-null  float64                
 13  ask_c   149167 non-null  float64                
dtypes: datetime64[ns, tz

# Statistical Summary of the Dataset

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,volume,mid_o,mid_h,mid_l,mid_c,bid_o,bid_h,bid_l,bid_c,ask_o,ask_h,ask_l,ask_c
count,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0,149167.0
mean,334.96876,1.080475,1.080638,1.080311,1.080475,1.080394,1.080558,1.080228,1.080394,1.080556,1.08072,1.080391,1.080556
std,307.151709,0.018964,0.018952,0.018975,0.018964,0.018964,0.018953,0.018975,0.018964,0.018964,0.018952,0.018975,0.018964
min,1.0,1.01843,1.01908,1.01779,1.01844,1.01836,1.01901,1.01771,1.01836,1.0185,1.01916,1.01786,1.01852
25%,151.0,1.06984,1.07001,1.0697,1.06984,1.06976,1.06993,1.06961,1.06976,1.06993,1.0701,1.06977,1.06993
50%,263.0,1.08287,1.08302,1.08273,1.08288,1.0828,1.08294,1.08265,1.0828,1.08295,1.08309,1.08281,1.08295
75%,421.0,1.09266,1.0928,1.09251,1.09266,1.09258,1.09272,1.09243,1.09258,1.09274,1.09288,1.09259,1.09274
max,10707.0,1.12756,1.12757,1.12675,1.12755,1.12748,1.12749,1.12667,1.12747,1.12765,1.12765,1.12683,1.12763


# Transforming Data from Time Bars into Dollar Value Bars

In [5]:
from technicals.dollar_value_bars import generate_dollar_bars

# 360 is presumably the dollar-value threshold per bar — confirm in generate_dollar_bars.
# It equals the 'dollar_threshold': 360 printed in the live bot's EUR_USD settings,
# so change both together.
# NOTE(review): this overwrites df, so running the cell twice feeds already-converted
# bars back into generate_dollar_bars; re-run from the data-loading cell instead.
df = generate_dollar_bars(df, 360)

# Feature Engineering: Adding Technical Indicators

In [6]:
from models.add_indicators import apply_indicators

# Adds the technical-indicator columns. Judging by the df.columns output further down,
# these are presumably the BB_*, ATR_*, EMA*, Ke*, RSI_* and MACD/SIGNAL/HIST columns —
# confirm in models/add_indicators.
df = apply_indicators(df)

# Feature Engineering: Adding Technical Patterns

In [7]:
from technicals.patterns import apply_candle_props

# Adds candle-shape features. Presumably these are the direction, body_* and *_prev
# columns seen in the df.columns output further down — confirm in technicals/patterns.
df = apply_candle_props(df)

# Adding Labels for Model Training using the Triple Barrier Method

In [8]:
from technicals.labeling import tripple_barrier_labeling

# win=4 / loss=2 equal the 'reward': 4 / 'risk': 2 printed in the live bot's EUR_USD
# settings, so the training labels and live trades presumably share the same barriers.
# .dropna() then removes every row that still contains a NaN in any column
# (presumably indicator warm-up rows and bars that never received a label — confirm).
df = tripple_barrier_labeling(df, win=4, loss=2).dropna()

# Overview of Dataset Features

In [9]:
# List every column now in the frame (raw prices, engineered features, Label, trade_duration)
df.columns

Index(['time', 'volume', 'mid_o', 'mid_h', 'mid_l', 'mid_c', 'bid_o', 'bid_h',
       'bid_l', 'bid_c', 'ask_o', 'ask_h', 'ask_l', 'ask_c', 'spread', 'hour',
       'day_of_week', 'month', 'minute', 'BB_MA10', 'BB_UP10', 'BB_LW10',
       'BB_MA30', 'BB_UP30', 'BB_LW30', 'BB_MA50', 'BB_UP50', 'BB_LW50',
       'ATR_7', 'ATR_14', 'ATR_40', 'EMA20', 'KeUp20_10', 'KeLo20_10', 'EMA50',
       'KeUp50_50', 'KeLo50_50', 'EMA200', 'KeUp200_50', 'KeLo200_50', 'RSI_7',
       'RSI_14', 'RSI_50', 'MACD26_12', 'SIGNAL26_12', 'HIST26_12',
       'MACD52_24', 'SIGNAL52_24', 'HIST52_24', 'direction', 'body_size',
       'body_perc', 'body_lower', 'body_upper', 'body_bottom_perc',
       'body_top_perc', 'mid_point', 'low_change', 'high_change',
       'body_size_change', 'body_size_prev', 'direction_prev',
       'direction_prev_2', 'body_perc_prev', 'body_perc_prev_2',
       'mid_point_prev_2', 'Label', 'trade_duration'],
      dtype='object')

# Applying Stationarization to Selected Columns

In [10]:
from technicals.stationarize_data import stationarize_data

# Columns that we want to stationarize: volume, the mid/bid/ask OHLC prices and the
# technical indicators. 'spread', the calendar fields (hour, day_of_week, month, minute),
# the candle-shape features, 'Label' and 'trade_duration' are left untouched.
stationary_cols = ['volume', 'mid_o', 'mid_h', 'mid_l', 'mid_c', 'bid_o', 'bid_h',
       'bid_l', 'bid_c', 'ask_o', 'ask_h', 'ask_l', 'ask_c', 'BB_MA10', 'BB_UP10', 'BB_LW10',
       'BB_MA30', 'BB_UP30', 'BB_LW30', 'BB_MA50', 'BB_UP50', 'BB_LW50',
       'ATR_7', 'ATR_14', 'ATR_40', 'EMA20', 'KeUp20_10', 'KeLo20_10', 'EMA50',
       'KeUp50_50', 'KeLo50_50', 'EMA200', 'KeUp200_50', 'KeLo200_50', 'RSI_7',
       'RSI_14', 'RSI_50', 'MACD26_12', 'SIGNAL26_12', 'HIST26_12',
       'MACD52_24', 'SIGNAL52_24', 'HIST52_24']

# Drop rows left with NaNs after the transform (presumably the leading rows that have
# no previous value to compare against — confirm in stationarize_data).
df = stationarize_data(df, stationary_cols).dropna()

# Selecting Features for Prediction

In [11]:
# Model inputs. Compared with df.columns this leaves out 'time', 'body_size_change',
# the 'Label' target and 'trade_duration'.
# NOTE(review): 'body_size_change' is the only engineered feature not listed here —
# confirm the omission is intentional.
# The same list appears in the live bot's printed EUR_USD settings; keep the two in sync.
predictors = ['volume', 'mid_o', 'mid_h', 'mid_l', 'mid_c', 'bid_o', 'bid_h',
       'bid_l', 'bid_c', 'ask_o', 'ask_h', 'ask_l', 'ask_c', 'spread', 'hour',
       'day_of_week', 'month', 'minute', 'BB_MA10', 'BB_UP10', 'BB_LW10',
       'BB_MA30', 'BB_UP30', 'BB_LW30', 'BB_MA50', 'BB_UP50', 'BB_LW50',
       'ATR_7', 'ATR_14', 'ATR_40', 'EMA20', 'KeUp20_10', 'KeLo20_10', 'EMA50',
       'KeUp50_50', 'KeLo50_50', 'EMA200', 'KeUp200_50', 'KeLo200_50', 'RSI_7',
       'RSI_14', 'RSI_50', 'MACD26_12', 'SIGNAL26_12', 'HIST26_12',
       'MACD52_24', 'SIGNAL52_24', 'HIST52_24', 'direction', 'body_size',
       'body_perc', 'body_lower', 'body_upper', 'body_bottom_perc',
       'body_top_perc', 'mid_point', 'low_change', 'high_change',
       'body_size_prev', 'direction_prev',
       'direction_prev_2', 'body_perc_prev', 'body_perc_prev_2',
       'mid_point_prev_2']

# Splitting Dataset for Hyperparameter Tuning and Model Evaluation

In [12]:
# Chronological split: the first 30,000 rows are used for hyperparameter tuning,
# everything after them is held back for the model evaluation phase.
split_at = 30_000
validation_set, test_set = df[:split_at].copy(), df[split_at:].copy()

# Hyperparameter Tuning Using Halving Random Search

In [13]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
# Keep this import above HalvingRandomSearchCV: it switches on the experimental estimator
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from scipy.stats import randint

# Search space sampled by the halving search
param_distributions = dict(
    n_estimators=randint(100, 500),
    max_depth=randint(5, 20),
    min_samples_split=randint(2, 11),
    min_samples_leaf=randint(1, 5),
    max_features=['sqrt', 'log2'],
)

base_rf = RandomForestClassifier(random_state=42)
tscv = TimeSeriesSplit(n_splits=5)

# Candidates are ranked by precision, the metric reported against the benchmark later on
halving_search = HalvingRandomSearchCV(
    estimator=base_rf,
    param_distributions=param_distributions,
    factor=3,
    resource='n_samples',
    max_resources=10_000,
    scoring='precision',
    cv=tscv,
    verbose=0,
    random_state=42,
)

# Tune on the validation slice only; the test slice stays unseen
X = validation_set[predictors]
y = validation_set['Label']
halving_search.fit(X, y)

best_rf = halving_search.best_estimator_
print("Best Score:", halving_search.best_score_)
print("Best Params:", halving_search.best_params_)

Best Score: 0.45323481116584563
Best Params: {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 224}


# Evaluating the Optimized Random Forest Model

In [14]:
from sklearn.ensemble import RandomForestClassifier
from technicals.backtesting import model_evaluation

# Rebind rather than mutate in place so the cell can be re-run safely
test_set = test_set.dropna()

# NOTE(review): the meaning of start, step and memory is defined in
# technicals/backtesting.model_evaluation — confirm there before changing them.
rf_predictions = model_evaluation(
    test_set,
    best_rf,
    predictors,
    start=10_000,
    step=10_000,
    memory='off',
)

17.25% there...
34.50% there...
51.75% there...
69.00% there...
86.25% there...


# Model Precision vs. Benchmark Performance

In [15]:
# Only rows where the model signalled a trade (Predictions == 1) count towards precision.
took_trade = rf_predictions['Predictions'] == 1
loosing_trades = int((took_trade & (rf_predictions['Label'] == 0)).sum())
winning_trades = int((took_trade & (rf_predictions['Label'] == 1)).sum())

# Precision = winning trades / all trades taken. If the model never predicts 1 the
# denominator is zero, so report NaN instead of raising ZeroDivisionError.
total_trades = winning_trades + loosing_trades
precision = (winning_trades / total_trades) * 100 if total_trades else float('nan')

# Benchmark = share of all rows labelled 1, i.e. the hit rate of taking every trade.
# Guarded the same way for an empty predictions frame.
total_rows = len(rf_predictions)
benchmark = (
    (len(rf_predictions[rf_predictions['Label'] == 1]) / total_rows) * 100
    if total_rows
    else float('nan')
)

print(f"Precision: {precision:.3f} %")
print(f"BenchMark: {benchmark:.3f} %")
# Last expression: counts for each (Label, Predictions) combination
rf_predictions.value_counts()

Precision: 35.041 %
BenchMark: 34.098 %


Label  Predictions
0      0.0            26705
1      0.0            13709
0      1.0             4907
1      1.0             2647
Name: count, dtype: int64

# 

# Saving the model

In [19]:
from pathlib import Path

from joblib import dump

# Save the model. The live bot's EUR_USD settings point at this same path, so keep
# the two in sync.
MODEL_PATH = 'ML_models/random_forest.pkl'

# dump() writes straight to the target path and fails if the folder is missing,
# so create it first (no-op when it already exists).
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

dump(best_rf, MODEL_PATH)

['ML_models/random_forest.pkl']

## Running the Live Trading Bot with our ML Model

In [20]:
from infrastructure.instrument_collection import instrumentCollection
from stream_bot.stream_bot import run_bot

# Reload the instrument definitions from ./data so this cell does not depend on the
# download cell having been run in the same kernel session.
instrumentCollection.LoadInstruments("./data")
# Start the live bot. Its printed settings show it uses 'ML_models/random_forest.pkl'
# together with the same dollar threshold, reward/risk and predictor list as above.
# NOTE(review): presumably this call blocks while prices stream — confirm in stream_bot.
run_bot()

Granularity: M1
EUR_USD: {'pair': 'EUR_USD', 'dollar_threshold': 360, 'model': 'ML_models/random_forest.pkl', 'reward': 4, 'risk': 2, 'probability': 0.6, 'predictors': [['volume', 'mid_o', 'mid_h', 'mid_l', 'mid_c', 'bid_o', 'bid_h', 'bid_l', 'bid_c', 'ask_o', 'ask_h', 'ask_l', 'ask_c', 'spread', 'hour', 'day_of_week', 'month', 'minute', 'BB_MA10', 'BB_UP10', 'BB_LW10', 'BB_MA30', 'BB_UP30', 'BB_LW30', 'BB_MA50', 'BB_UP50', 'BB_LW50', 'ATR_7', 'ATR_14', 'ATR_40', 'EMA20', 'KeUp20_10', 'KeLo20_10', 'EMA50', 'KeUp50_50', 'KeLo50_50', 'EMA200', 'KeUp200_50', 'KeLo200_50', 'RSI_7', 'RSI_14', 'RSI_50', 'MACD26_12', 'SIGNAL26_12', 'HIST26_12', 'MACD52_24', 'SIGNAL52_24', 'HIST52_24', 'direction', 'body_size', 'body_perc', 'body_lower', 'body_upper', 'body_bottom_perc', 'body_top_perc', 'mid_point', 'low_change', 'high_change', 'body_size_prev', 'direction_prev', 'direction_prev_2', 'body_perc_prev', 'body_perc_prev_2', 'mid_point_prev_2']]}
dict_keys(['EUR_USD'])
PriceProcessor: 2025-01-25 1