In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [3]:
def transformSupervised(df, n_prev_candles, n_target_candles):
    df.drop('time',axis=1)
    # Shift the close prices down by the number of target candles to get the target values
    features_target = ['open', 'high', 'low', 'close']
    for i in range( n_target_candles ,0,-1):
        for feature in features_target:
            df[f'{feature}_target{i}'] = df[feature].shift(-i)

    # Use the previous candle data as features
    features = ['open', 'high', 'low', 'close', 'tick_volume', 'spread']

    # Create lag features for the previous candles
    for i in range(n_prev_candles,0,-1):
        for feature in features:
            df[f'{feature}_lag{n_prev_candles-i+1}'] = df[feature].shift(i)

    # Drop rows with NaN values due to shifting
    df.dropna(inplace=True)

    X_cols = [f'{feature}_lag{i}' for i in range(1, n_prev_candles + 1) for feature in features]
    X = df[X_cols]

    Y_cols =  [f'{feature}_target{i}' for i in range(1, n_target_candles + 1) for feature in features_target]
    y = df[Y_cols]

    return X, y

In [4]:
def trainTestSplit(X,y, testPorcent=0.3):
	splitPoint = int(X.shape[0]*0.3)
	trainX,trainY = X[:splitPoint], y[:splitPoint]
	testX,testY = X[splitPoint:], y[splitPoint:]
	return trainX,trainY,testX,testY

In [5]:
def randomForestForecastBuilder(X, y,parameters:dict):
	model = RandomForestRegressor(**parameters)
	model.fit(X, y)
	return model

In [6]:
def candleFlowAccuracy(yTrue,yPred):
    bullBearCounter = 0
    flowCounter = 0
    for i in range(0,len(yTrue)):
        predCandle = yPred[i]
        trueCandle = yTrue[i]
        candleFlowMatch = False
        candleBullBearMatch = False
        predOpen,predHigh,predLow,predClose = predCandle[0],predCandle[1],predCandle[2],predCandle[3]
        trueOpen,trueHigh,trueLow,trueClose = trueCandle[0],trueCandle[1],trueCandle[2],trueCandle[3]

        candleBullBearMatch  = ((predClose-predOpen)<0) == ((trueClose-trueOpen)<0)
        candleFlowMatch = ((predOpen-predLow)>=0) == ((trueOpen-trueLow)>=0) and \
                          ((predHigh-predLow)>=0) == ((trueHigh-trueLow)>=0) and \
                          ((predHigh-predClose)>=0) == ((trueHigh-trueClose)>=0) 
        if (candleBullBearMatch):   bullBearCounter+=1
        if (candleFlowMatch): flowCounter+=1
    bullBearAcc = bullBearCounter/len(yTrue)
    flowAcc = flowCounter/len(yTrue)
    return bullBearAcc,flowAcc

In [7]:
macroCandles = pd.read_csv("XAUUSD_W1.csv")
microCandles = pd.read_csv("XAUUSD_M15.csv")

macroCandles['time'] = pd.to_datetime(macroCandles['time']) 
microCandles['time'] : pd.to_datetime(microCandles['time'])

macroCandles = macroCandles.drop('real_volume',axis=1)
microCandles = microCandles.drop('real_volume',axis=1)

In [8]:
macroCandlesInput = 3*4 # Data of 3 months (3 * 4 weeks)
macroCandlesOutput = 1 # Prediction of 1 week 
microCandlesInput = int(4 * 7 * 24 / 15) # Data of 4 week ( 1 week is 7 days which is 24 hours)
microCandlesOutput = int(7 * 24 / 15)  # Prediction of 1 week

In [9]:
macroCandlesShiftedX,macroCandlesShiftedY= transformSupervised(macroCandles.copy(), macroCandlesInput, macroCandlesOutput)

In [10]:
trainX,trainY,testX,testY = trainTestSplit(macroCandlesShiftedX,macroCandlesShiftedY)

In [11]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

validationList = []

estimatorsOptions = [5, 10, 100,1000,5000]
criterionOptions = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
bootstrapOptions = [False,True]

model = "random Forest"
for bootstrapOption in bootstrapOptions:
    for criterionOption in criterionOptions:
        for estimatorsOption in estimatorsOptions:
            randomForestParameters = {'n_estimators':estimatorsOption,
                                        'criterion':criterionOption,
                                        'max_depth':None,
                                        'min_samples_split':2, 
                                        'min_samples_leaf':1,
                                        'min_weight_fraction_leaf':0.0,
                                        'max_features':1.0,
                                        'max_leaf_nodes':None,
                                        'min_impurity_decrease':0.0,
                                        'bootstrap':bootstrapOption,
                                        'oob_score':False,
                                        'n_jobs':None,
                                        'random_state':None,
                                        'verbose':0,
                                        'warm_start':False,
                                        'ccp_alpha':0.0,
                                        'max_samples':None,
                                        'monotonic_cst':None}
            randomForest = randomForestForecastBuilder(trainX,trainY,randomForestParameters)
            predY = randomForest.predict(testX)
            testYList = testY.to_numpy()
            r2 = r2_score(testYList, predY)
            mae =mean_absolute_error(testYList, predY)
            mse = mean_squared_error(testYList, predY)
            bullBearAcc,flowAcc = candleFlowAccuracy(testYList,predY)
            validationList.append([model,estimatorsOption,criterionOption,bootstrapOption,r2,mae,mse,bullBearAcc])

validationDf = pd.DataFrame(columns=['model','Num Estimator','criterion','bootstrap','r2','mae','mse','bullBearAcc'],data=validationList)
validationDf
            

Unnamed: 0,model,Num Estimator,criterion,bootstrap,r2,mae,mse,bullBearAcc
0,random Forest,5,squared_error,False,0.036657,92.965129,13681.306966,0.598639
1,random Forest,10,squared_error,False,0.061489,90.881294,13334.81923,0.605442
2,random Forest,100,squared_error,False,0.064463,90.95305,13290.594123,0.585034
3,random Forest,1000,squared_error,False,0.068375,90.602905,13233.376511,0.619048
4,random Forest,5000,squared_error,False,0.068743,90.639833,13229.334715,0.605442
5,random Forest,5,absolute_error,False,0.165612,84.567755,11871.643043,0.503401
6,random Forest,10,absolute_error,False,0.1687,84.060032,11826.073803,0.496599
7,random Forest,100,absolute_error,False,0.178493,83.394115,11687.604959,0.482993
8,random Forest,1000,absolute_error,False,0.167209,84.063633,11845.015279,0.496599
9,random Forest,5000,absolute_error,False,0.168975,83.956229,11820.616811,0.503401


In [12]:
import pickle
bestRandomForestParameters = {'n_estimators':10,
                            'criterion':'squared_error',
                            'max_depth':None,
                            'min_samples_split':2, 
                            'min_samples_leaf':1,
                            'min_weight_fraction_leaf':0.0,
                            'max_features':1.0,
                            'max_leaf_nodes':None,
                            'min_impurity_decrease':0.0,
                            'bootstrap':True,
                            'oob_score':False,
                            'n_jobs':None,
                            'random_state':None,
                            'verbose':0,
                            'warm_start':False,
                            'ccp_alpha':0.0,
                            'max_samples':None,
                            'monotonic_cst':None}
bestRandomForest = randomForestForecastBuilder(trainX,trainY,bestRandomForestParameters)
pickle.dump(bestRandomForest,open('model.pkl', 'wb'))
