### Import modules

In [None]:
!pip install gplearn

In [None]:

!pip install yfinance

In [43]:
import graphviz
from gplearn.genetic import SymbolicTransformer
from gplearn.genetic import SymbolicRegressor
from sklearn.utils import check_random_state
from sklearn.datasets import load_diabetes
import numpy as np

In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')

### Load Data

In [72]:
forex_data_1 = yf.download('EURUSD=X', start='2020-01-01', end='2021-12-31')

[*********************100%***********************]  1 of 1 completed


In [73]:
#Removing unnecessary columns
df=forex_data_1.drop(['Open', 'High', 'Low', 'Close', 'Volume'],axis=1) 
df.rename(columns={'Adj Close': 'eur'},inplace=True) 
df.head(2)

Unnamed: 0_level_0,eur
Date,Unnamed: 1_level_1
2020-01-01,1.122083
2020-01-02,1.122083


In [74]:
df = df.dropna(axis=0)

In [75]:
#Scaling the values between 0 to 1
from sklearn.preprocessing import MinMaxScaler
ss= MinMaxScaler(feature_range=(0,1))
train = ss.fit_transform(np.array(df["eur"]).reshape(-1,1))

In [76]:
# Feature selection
xtrain=[]
ytrain=[]
for i in range(5,len(train)):
    xtrain.append(train[i-5:i,0])
    ytrain.append(train[i,0])

xtrain, ytrain = np.array(xtrain), np.array(ytrain)

### ML Model Prediction

In [77]:
from sklearn.linear_model import Ridge
est = Ridge()
est.fit(xtrain, ytrain)
print(est.score(xtrain, ytrain))

0.9820575757224339


In [78]:
test_df = yf.download('EURUSD=X', start='2022-01-01', end='2022-06-30')

[*********************100%***********************]  1 of 1 completed


In [79]:
test = pd.DataFrame()

In [80]:
test['eur'] = test_df['Adj Close']
test = test.dropna()

In [81]:
total= pd.concat([df['eur'],test['eur']],axis=0)

In [82]:
test_input = total[len(total)-len(test)-5:].values
test_input= test_input.reshape(-1,1) # reshaping it to get it transformed
test_input= ss.transform(test_input)

In [89]:
xtest= []
for i in range(60,80):
    xtest.append(test_input[i-5:i,0]) #creating input for lstm prediction

In [90]:
xtest= np.array(xtest)

In [91]:
predicted_value= est.predict(xtest)

In [92]:
predicted_value2 = ss.inverse_transform(predicted_value.reshape(-1,1))

In [93]:

test_actual = np.array(test['eur'][0:20])

In [94]:
from sklearn.metrics import r2_score
r2_score(test_actual, predicted_value2)

-24.0565126188782

### Symbolic Transformer Usage

In [95]:
function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                'abs', 'neg', 'inv', 'max', 'min']

In [96]:
### Check genetic programming performance
gp = SymbolicTransformer(generations=10, population_size=2000,
                         hall_of_fame=100, n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9, verbose=1,
                         random_state=0)
gp.fit(xtrain,ytrain)

gp_features = gp.transform(xtrain)
new_xtrain = np.hstack((xtrain, gp_features))

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    13.13         0.653299        2         0.994426         0.984291     18.73s
   1     6.03         0.880453        2         0.994403         0.987718     26.15s
   2     2.18         0.980443        2         0.994551         0.986662     13.34s
   3     1.21           0.9901        1          0.99447         0.986831     10.62s
   4     1.03         0.991014        1         0.994509         0.986821      8.84s
   5     1.04         0.990447        1         0.994674         0.985553      7.08s
   6     1.06         0.990658        1         0.994482         0.985879      6.80s
   7     1.05         0.990908        1         0.994546         0.984646      8.67s
   8     1.03         0.991883        1         0.994514         0.983878  

In [97]:
est2 = Ridge()
est2.fit(new_xtrain,ytrain)
print(est2.score(new_xtrain,ytrain))

0.987153309029447


We got some performance boost by adding new features using Symbolic Regression.
0.981-->0.987


In [107]:
# dot_data = gp._programs[0][10].export_graphviz()
# graph = graphviz.Source(dot_data)
# graph.render('images/ex3_fig1', format='png', cleanup=True)
# graph

### Symbolic Regressor Usage

In [108]:
sr = SymbolicRegressor(population_size=5000,
                           generations=20, stopping_criteria=0.01,
                           p_crossover=0.7, p_subtree_mutation=0.1,
                           p_hoist_mutation=0.05, p_point_mutation=0.1,
                           max_samples=0.9, verbose=1,
                           parsimony_coefficient=0.01, random_state=0)

In [109]:
sr.fit(xtrain, ytrain)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    29.43          19870.2        5         0.021346        0.0228942      1.84m
   1     6.61         0.833912        5        0.0206267        0.0297639      1.20m
   2     2.63         0.845658        1        0.0205119        0.0303695      1.02m
   3     1.48         0.453901        1        0.0204952         0.030519     58.05s
   4     1.53         0.272292        1        0.0203546        0.0317785     52.42s
   5     1.50         0.436497        1        0.0205656        0.0298879     49.17s
   6     1.45         0.314511        1        0.0203604        0.0317269     47.30s
   7     1.46         0.409417        1        0.0204383        0.0310288     41.94s
   8     1.56         0.619743        1        0.0203743        0.0316023  

SymbolicRegressor(max_samples=0.9, p_crossover=0.7, p_hoist_mutation=0.05,
                  p_point_mutation=0.1, p_subtree_mutation=0.1,
                  parsimony_coefficient=0.01, population_size=5000,
                  random_state=0, stopping_criteria=0.01, verbose=1)

In [110]:
sr_predict = sr.predict(xtest)

In [111]:
predicted_value_sr = ss.inverse_transform(sr_predict.reshape(-1,1))

In [112]:
r2_score(test_actual, predicted_value_sr)

-26.759489612667714

In [113]:
print(sr._program)

X4


We see a massive boost in r2_score but our model seems to be predicting last value only

In [114]:
xtrain.shape

(518, 5)

In [115]:
ytrain.shape

(518,)