# Walk Forward: A Realistic Approach to Backtesting

![](<src/10_Table_Validation Methods.png>)

In [1]:
import multiprocessing as mp

# Use the 'fork' start method for worker processes. force=True makes this cell
# safe to re-run: without it, a second call raises
# "RuntimeError: context has already been set".
mp.set_start_method('fork', force=True)

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load the data

In [65]:
import pandas as pd

# Read the processed spreadsheet using its first column as the index
# (with 'Date' parsed as datetimes), then discard the direction label column.
df = (
    pd.read_excel(
        'data/Microsoft_LinkedIn_Processed.xlsx',
        parse_dates=['Date'],
        index_col=0,
    )
    .drop(columns='change_tomorrow_direction')
)

## Walk Forward Validation

### How `TimeSeriesSplit` works

- Imagine we are at the end of 2020: we can only train the model on data up to 31st December 2020. How good would the model have been going forward?

In [52]:
from sklearn.model_selection import TimeSeriesSplit

In [53]:
# Time-ordered splitter: every fold tests on 200 rows and trains on at most
# the 600 rows that precede them. The number of splits is left at its default.
ts = TimeSeriesSplit(max_train_size=600, test_size=200)

In [54]:
# Materialise the (train, test) positional index pairs once, then slice the
# frame by position to get one train frame and one test frame per fold.
folds = list(ts.split(df))
list_df_train = [df.iloc[index_train] for index_train, _ in folds]
list_df_test = [df.iloc[index_test] for _, index_test in folds]

In [55]:
# Training window of the first fold.
list_df_train[0]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-08,56.325228,56.582507,55.902560,56.058762,21220800,1.549143
2016-12-09,56.214968,56.959234,56.169027,56.940857,27349400,0.321692
...,...,...,...,...,...,...
2019-03-25,111.863188,113.254764,111.632861,112.918869,27067100,0.212035
2019-03-26,113.840195,113.926565,112.141514,113.158806,26097700,-0.976298


In [56]:
# Test window of the first fold.
list_df_test[0]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-03-27,113.129993,113.446697,110.865089,112.064720,22733400,0.136831
2019-03-28,112.707721,112.842079,111.450503,112.218269,18334800,0.856386
...,...,...,...,...,...,...
2020-01-08,154.122900,155.936349,153.172546,155.247818,27746500,1.233897
2020-01-09,156.944909,157.313420,156.159411,157.187347,21385000,-0.464873


In [57]:
# Training window of the second fold.
list_df_train[1]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-08-22,67.623224,68.455078,67.623224,68.380310,14343700,-0.605051
2017-08-23,68.193381,68.370970,67.791474,67.969063,13766500,-0.041291
...,...,...,...,...,...,...
2020-01-08,154.122900,155.936349,153.172546,155.247818,27746500,1.233897
2020-01-09,156.944909,157.313420,156.159411,157.187347,21385000,-0.464873


In [58]:
# Test window of the second fold.
list_df_test[1]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-10,157.895252,158.283148,156.304843,156.460007,20725900,1.188150
2020-01-13,156.867310,158.370431,156.382433,158.341339,21626500,-0.709280
...,...,...,...,...,...,...
2020-10-22,209.109629,211.191639,206.929880,210.048004,22351500,0.619714
2020-10-23,210.184860,211.406695,208.357001,211.357819,18879600,-2.927458


## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [66]:
# Target: the value to predict. Explanatory variables: the columns used to predict it.
target_column = 'change_tomorrow'
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

y = df[target_column]
X = df[feature_columns]

In [60]:
# NOTE(review): these two lists are reset here but nothing in this cell appends to them.
list_df_train = []
list_df_test = []

# Every iteration overwrites X_train / y_train / X_test / y_test, so once the
# loop finishes they hold the data of the last fold only.
for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]


### Simulate one computation of the ML model

- Compute the model
- Calculate predictions on the test set
- Evaluate how good the model is

In [61]:
# Imported at first use: no other cell in this notebook brings
# DecisionTreeRegressor into scope, so a fresh kernel needs it here.
from sklearn.tree import DecisionTreeRegressor

# Regression tree limited to depth 15; the fixed random_state keeps runs repeatable.
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)

In [62]:
# Fit on whatever X_train / y_train currently hold (the last fold, if the
# split loop was the most recent cell to assign them).
model_dt.fit(X_train, y_train)

In [27]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out rows, then score the predictions against the
# true targets with mean squared error.
y_pred = model_dt.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)

### Add the procedure inside the for loop

In [28]:
# One estimator object, re-fitted on the training window of every fold.
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)

list_df_train, list_df_test = [], []
list_y_preds, list_errors = [], []

for index_train, index_test in ts.split(df):
    # Slice features and target by position for this fold.
    X_train, X_test = X.iloc[index_train], X.iloc[index_test]
    y_train, y_test = y.iloc[index_train], y.iloc[index_test]

    # Train on the fold's training window and score on its test window.
    y_pred = model_dt.fit(X_train, y_train).predict(X_test)
    error = mean_squared_error(y_test, y_pred)

    # Keep the predictions and the error of every fold, in split order.
    list_y_preds.append(y_pred)
    list_errors.append(error)

In [29]:
# Mean squared error of each fold, in split order.
list_errors

[9.37728476681153,
 11.93334281841984,
 33.695782586127095,
 4.759242524119641,
 4.713438223196523]

In [30]:
import  numpy as np

In [31]:
# Average of the per-fold mean squared errors.
np.mean(list_errors)

12.895818183734926

## Walk Forward evaluation in backtesting

### Create a new strategy in the library

- Within your library of strategies

### Import the strategy and perform the backtest

In [30]:
import strategies



In [36]:
from backtesting import Backtest

In [37]:
# Set up the backtest of the walk-forward strategy on the full data frame.
bt = Backtest(
    df,
    strategies.WalkForwardAnchored,
    cash=10_000,
    commission=0.002,
    exclusive_orders=True,
)

In [41]:
%%time

stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy = range(0, 5), limit_sell = range(-5, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]')
dff

CPU times: user 8.4 s, sys: 23.4 ms, total: 8.42 s
Wall time: 8.42 s


Unnamed: 0,limit_buy,limit_sell,Return [%]
8,2,-3,-61.384028
10,2,-1,-60.946987
...,...,...,...
17,4,-2,36.666346
18,4,-1,36.666346


### Interpret the strategy's performance

In [36]:
# Plot the backtest, passing 'a.html' as the output filename.
bt.plot(filename='a.html')

## Anchored Walk Forward

## [ ] How to improve the strategy?

- Adding new financial signals to the model