# Walk Forward: A Realistic Approach to Backtesting

In [14]:
import multiprocessing as mp

# Select the 'fork' start method for worker processes.
# - The availability check skips the call on platforms that do not offer
#   'fork', where set_start_method would raise ValueError.
# - force=True keeps this cell re-runnable: without it, a second call raises
#   RuntimeError because the start method has already been fixed.
# NOTE(review): presumably needed so that Backtest.optimize can use worker
# processes -- confirm against the backtesting version in use.
if 'fork' in mp.get_all_start_methods():
    mp.set_start_method('fork', force=True)

## Load the data

In [15]:
import pandas as pd

# Read the processed workbook (first column becomes the index, 'Date' is
# parsed as datetime) and drop the 'change_tomorrow_direction' column in a
# single chain, so re-running the cell always starts from the file.
df = (
    pd.read_excel(
        'data/Microsoft_LinkedIn_Processed.xlsx',
        parse_dates=['Date'],
        index_col=0,
    )
    .drop(columns='change_tomorrow_direction')
)

## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [16]:
# Target: the column the models are trained to predict.
y = df['change_tomorrow']

# Explanatory variables: the price/volume columns used as model inputs.
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
X = df[feature_columns]

In [31]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [41]:
# All three candidate regressors use the same depth limit and seed, so the
# settings are written once and unpacked into each constructor.
shared_params = {'max_depth': 15, 'random_state': 42}

model_dt = DecisionTreeRegressor(**shared_params)
model_rf = RandomForestRegressor(**shared_params)
model_gb = GradientBoostingRegressor(**shared_params)

In [42]:
# Time-ordered cross-validation: each fold trains on at most 600 rows and is
# scored on the 200 rows that follow them.
ts = TimeSeriesSplit(max_train_size=600, test_size=200)

# One DataFrame per model, with columns 'model' (class name) and 'error'
# (mean squared error of each fold, in fold order).
list_model_evaluation = []

# The loop variable is deliberately NOT named `model_dt`: reusing that name
# rebinds the notebook-level `model_dt` to the last estimator in the list, so
# later cells that pass `model_dt` would silently use a different model.
for model in [model_dt, model_rf, model_gb]:

    list_errors = []

    # X and y have the same rows as df, so the positional indices produced by
    # the splitter apply to both.
    for index_train, index_test in ts.split(X):
        X_train, y_train = X.iloc[index_train], y.iloc[index_train]
        X_test, y_test = X.iloc[index_test], y.iloc[index_test]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        list_errors.append(mean_squared_error(y_test, y_pred))

    df_model_errors = pd.DataFrame({
        'model': type(model).__name__,
        'error': list_errors,
    })

    list_model_evaluation.append(df_model_errors)


In [48]:
# Stack the per-model error frames, then spread them so that each model is a
# column and each row is one cross-validation fold.
df_evaluation = (
    pd.concat(list_model_evaluation)
    .pivot(columns='model', values='error')
)
df_evaluation

model,DecisionTreeRegressor,GradientBoostingRegressor,RandomForestRegressor
0,9.377285,8.862129,3.183943
1,11.933343,11.226811,9.690221
2,33.695783,29.572435,12.946489
3,4.759243,4.411631,3.772971
4,4.713438,5.194259,4.781834


In [50]:
# Average error per model (column-wise mean over the folds).
df_evaluation.mean(axis=0)

model
DecisionTreeRegressor        12.895818
GradientBoostingRegressor    11.853453
RandomForestRegressor         6.875092
dtype: float64

In [17]:
from backtesting import Strategy

In [52]:
class Regression(Strategy):
    """Trade on a regression forecast made from the latest bar.

    The model is fitted once, in ``init``, on the first ``N_TRAIN`` rows of
    ``self.data.df``. Every column except the last is used as a feature and
    the last column is the target.

    On each ``next`` call the last row's features are fed to the model:
    * not holding and forecast > ``limit_buy``  -> ``self.buy()``
    * holding and forecast < ``limit_sell``     -> ``self.sell()``
    * otherwise nothing happens.
    """

    # Estimator used for the forecast. It is a class attribute, so the same
    # object is shared (and re-fitted in place) unless a run overrides `model`.
    model = RandomForestRegressor(max_depth=15, random_state=42)

    # Forecast thresholds that trigger an entry / an exit.
    limit_buy = 1
    limit_sell = -5

    # Number of rows used to fit the model.
    N_TRAIN = 600

    def init(self):
        """Reset the holding flag and fit the model on the first N_TRAIN rows."""
        self.already_bought = False

        training_rows = self.data.df.iloc[:self.N_TRAIN]
        features = training_rows.iloc[:, :-1]
        target = training_rows.iloc[:, -1]

        self.model.fit(features, target)

    def next(self):
        """Forecast from the last row of ``self.data.df`` and act on it."""
        # Double brackets keep the row two-dimensional for predict().
        latest_features = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(latest_features)[0]

        if self.already_bought:
            # Holding: leave only when the forecast is below the sell limit.
            # NOTE(review): self.sell() places a sell order rather than
            # explicitly closing the position -- confirm this is the intended exit.
            if forecast_tomorrow < self.limit_sell:
                self.sell()
                self.already_bought = False
        elif forecast_tomorrow > self.limit_buy:
            # Not holding: enter when the forecast is above the buy limit.
            self.buy()
            self.already_bought = True

In [53]:
class WalkForward(Regression):
    """Regression strategy whose model is periodically re-fitted.

    No action is taken until ``N_TRAIN`` bars are available. After that,
    whenever the number of available bars is a multiple of ``N_RETRAIN``, the
    model is re-fitted on the most recent ``N_TRAIN`` rows before the usual
    buy/sell decision of ``Regression.next`` is made.
    """

    # Refit period in bars. Previously the literal 200 inside next(); exposing
    # it as a class attribute keeps the default behaviour and lets a run
    # override it like the other strategy parameters.
    N_RETRAIN = 200

    def next(self):
        """Optionally re-fit on the trailing window, then trade as the parent does."""
        n_bars = len(self.data)

        if n_bars < self.N_TRAIN:
            return  # not enough history yet: skip this bar entirely

        if n_bars % self.N_RETRAIN == 0:
            # Refit on the last N_TRAIN rows; every column but the last is a
            # feature and the last column is the target.
            window = self.data.df.iloc[-self.N_TRAIN:]
            self.model.fit(window.iloc[:, :-1], window.iloc[:, -1])

        super().next()

In [54]:
from backtesting import Backtest
# Backtest of the walk-forward strategy on the full table:
# 10,000 starting cash, 0.2% commission, exclusive orders enabled.
bt = Backtest(
    df,
    WalkForward,
    cash=10_000,
    commission=0.002,
    exclusive_orders=True,
)

In [57]:
%%time

# Strategy parameters to search over. `model` could be added here as a
# further dimension, e.g. model=[model_rf].
# NOTE(review): the saved results contain N_TRAIN values such as 306, so the
# 'skopt' method appears to sample between each range's bounds rather than
# only the listed steps -- confirm in the backtesting documentation.
search_space = dict(
    N_TRAIN=range(200, 1000, 200),
    limit_buy=range(0, 5),
    limit_sell=range(-5, 0),
)

stats_skopt, heatmap, optimize_result = bt.optimize(
    **search_space,
    method='skopt',
    maximize='Return [%]',
    max_tries=500,
    random_state=0,
    return_heatmap=True,
    return_optimization=True,
)

CPU times: user 2min 44s, sys: 578 ms, total: 2min 44s
Wall time: 2min 45s


In [60]:
# Turn the heatmap's index levels (the searched parameters) into ordinary
# columns next to 'Return [%]'.
dff = heatmap.reset_index(drop=False)

In [61]:
# Ten parameter combinations with the highest return, best first.
dff.sort_values(by='Return [%]', ascending=False).iloc[:10]

Unnamed: 0,N_TRAIN,limit_buy,limit_sell,Return [%]
10,306,2,-3,194.173358
11,307,2,-3,194.173358
7,283,0,-2,184.660849
9,305,2,-3,184.083945
8,299,2,-3,184.083945
14,315,1,-3,183.393347
20,354,1,-3,181.24104
18,349,1,-5,180.893777
22,359,1,-4,178.2818
24,371,1,-4,173.244107


In [None]:
# Ten parameter combinations with the highest return, best first.
# NOTE(review): the saved output of this cell has a `model` column, so it
# presumably comes from an earlier optimize run that searched over `model`;
# re-run to refresh it or drop the cell.
dff.sort_values(by='Return [%]', ascending=False).iloc[:10]

Unnamed: 0,N_TRAIN,model,limit_buy,limit_sell,Return [%]
1,200,"DecisionTreeRegressor(max_depth=15, random_sta...",0,-4,310.866395
6,201,"DecisionTreeRegressor(max_depth=15, random_sta...",0,-4,310.866395
11,202,"DecisionTreeRegressor(max_depth=15, random_sta...",0,-4,308.181746
16,203,"DecisionTreeRegressor(max_depth=15, random_sta...",0,-4,302.889076
20,204,"DecisionTreeRegressor(max_depth=15, random_sta...",0,-4,300.0814
25,208,"DecisionTreeRegressor(max_depth=15, random_sta...",0,-4,299.487162
14,202,"DecisionTreeRegressor(max_depth=15, random_sta...",1,-4,296.9686
31,211,"DecisionTreeRegressor(max_depth=15, random_sta...",0,-4,294.169261
4,200,"DecisionTreeRegressor(max_depth=15, random_sta...",1,-4,278.793174
24,207,"DecisionTreeRegressor(max_depth=15, random_sta...",1,-4,278.006079


In [239]:
# Re-run the backtest with the best decision-tree configuration from the table.
# A fresh estimator is passed instead of the notebook-level `model_dt`: that
# name is reused as the loop variable of the cross-validation cell, so on a
# top-to-bottom run it no longer refers to the decision tree.
result = bt.run(
    N_TRAIN=200,
    model=DecisionTreeRegressor(max_depth=15, random_state=42),
    limit_buy=0,
    limit_sell=-4,
)

In [185]:
# Write the interactive chart of the latest run to an HTML file.
plot_path = 'a.html'
bt.plot(filename=plot_path)

In [240]:
# Show the statistics from the first entry up to and including 'Return [%]'
# as a styled one-column table.
summary = result.to_frame(name='Value')
summary.loc[:'Return [%]'].style

Unnamed: 0,Value
Start,2016-12-08 00:00:00
End,2023-03-15 00:00:00
Duration,2288 days 00:00:00
Exposure Time [%],87.182741
Equity Final [$],41086.639496
Equity Peak [$],49179.374064
Return [%],310.866395
