# Challenge: Walk Forward on Other Datasets

## Download data from `yfinance`

In [3]:
import yfinance as yf

ticker = 'AAPL'
df = yf.download(ticker)
df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.099058,469033600
1980-12-15,0.122210,0.122210,0.121652,0.121652,0.093890,175884800
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086999,105728000
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089152,86441600
1980-12-18,0.118862,0.119420,0.118862,0.118862,0.091737,73449600
...,...,...,...,...,...,...
2024-06-03,192.899994,194.990005,192.520004,194.029999,194.029999,50080500
2024-06-04,194.639999,195.320007,193.029999,194.350006,194.350006,47471400
2024-06-05,195.399994,196.899994,194.869995,195.869995,195.869995,54156800
2024-06-06,195.690002,196.500000,194.169998,194.479996,194.479996,41181800


## Preprocess the data

### Filter the date range

In [4]:
df = df.loc['2018-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [5]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [6]:
df = df.dropna().copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-02,42.540001,43.075001,42.314999,43.064999,40.615883,102223600,-0.017378
2018-01-03,43.132500,43.637501,42.990002,43.057499,40.608826,118071600,0.462290
2018-01-04,43.134998,43.367500,43.020000,43.257500,40.797428,89738400,1.125764
2018-01-05,43.360001,43.842499,43.262501,43.750000,41.261940,94640000,-0.372834
2018-01-08,43.587502,43.902500,43.482498,43.587502,41.108673,82271200,-0.011471
...,...,...,...,...,...,...,...
2024-05-31,191.440002,192.570007,189.910004,192.250000,192.250000,75158300,0.917383
2024-06-03,192.899994,194.990005,192.520004,194.029999,194.029999,50080500,0.164655
2024-06-04,194.639999,195.320007,193.029999,194.350006,194.350006,47471400,0.776019
2024-06-05,195.399994,196.899994,194.869995,195.869995,195.869995,54156800,-0.714726


## Machine Learning modelling

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [7]:
y = df.change_tomorrow
X = df[['Open','High','Low','Close','Volume']]

### Time Series Split

In [8]:
from sklearn.model_selection import TimeSeriesSplit

In [9]:
ts = TimeSeriesSplit(test_size = 200)

### Compute and evaluate model in a for loop

1. Separate the data in train and test
2. Compute the model on the train set
3. Evaluate the model (mse) on the test set
4. Append the errors (mse) in an empty list

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [11]:
list_rmse = []

for index_train, index_test in ts.split(X=df):
  X_train, y_train = X.iloc[index_train], y.iloc[index_train]
  X_test, y_test = X.iloc[index_test], y.iloc[index_test]
  
  model = RandomForestRegressor()
  model.fit(X_train, y_train)

  y_pred_test = model.predict(X_test)

  mse = mean_squared_error(y_test, y_pred_test)
  rmse = mse ** .5
  
  list_rmse.append(rmse)
list_rmse

[2.9527575135353614,
 1.97867746018195,
 2.3609652842759608,
 1.8502509287499302,
 1.405120666557147]

In [12]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [13]:
splits = ts.split(X = df)
split = next(splits)

index_train = split[0]
index_test = split[1]

X_train, y_train = X.iloc[index_train], y.iloc[index_train]
X_test, y_test = X.iloc[index_test], y.iloc[index_test]


model = RandomForestRegressor()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)

mse = mean_squared_error(y_train, y_pred_train)
rmse = mse ** .5

print("Training Set Error:", rmse)

y_pred_test = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = mse ** .5

print("Testing Set Error:", rmse)




Training Set Error: 0.86321791647874
Testing Set Error: 3.1258350616941715


In [14]:
list_rmse_train = []
list_rmse_test = []

for index_train, index_test in ts.split(X=df):
  X_train, y_train = X.iloc[index_train], y.iloc[index_train]
  X_test, y_test = X.iloc[index_test], y.iloc[index_test]

  model = RandomForestRegressor()
  model.fit(X_train, y_train)

  y_pred_train = model.predict(X_train)

  mse = mean_squared_error(y_train, y_pred_train)
  rmse = mse ** .5

  list_rmse_train.append(rmse)

  y_pred_test = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred_test)
  rmse = mse ** .5

  list_rmse_test.append(rmse)
  
print("Training:",list_rmse_train)
print("Testing:", list_rmse_test)

print()
print("Training:", np.mean(list_rmse_train))
print("Testing:", np.mean(list_rmse_test))

Training: [0.8688447054097643, 0.9019589569116868, 0.8291792018631354, 0.8503819117386842, 0.8279675258688046]
Testing: [3.071997817295114, 1.9797576786130937, 2.3495061406688875, 1.8674428928174693, 1.4193968022884664]

Training: 0.8556664603584151
Testing: 2.137620266336606


## Anchored Walk Forward evaluation in backtesting

![](<src/10_Table_Validation Methods.png>)

### Create a new strategy

In [15]:
from backtesting import Strategy

In [16]:
class Regression(Strategy):
    n_train = 600
    coef_retrain = 200

    limit_buy = 4
    limit_sell = -4
    
    def init(self):
        self.already_bought = False
        self.model =  RandomForestRegressor(random_state = 42, max_depth = 15)

        X_train = self.data.df.iloc[:self.n_train, :-1]
        y_train = self.data.df.iloc[:self.n_train, -1]
        
        self.model.fit(X_train, y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

In [17]:
class WalkForwardAnchored(Regression):
  
    def next(self):
      
      # conditions to retrain the model
      if(len(self.data) < self.n_train):
         return
      
      if(len(self.data) % self.coef_retrain == 0):
                
        X_train = self.data.df.iloc[:, :-1]
        y_train = self.data.df.iloc[:, -1]

        self.model.fit(X_train, y_train)
        super().next()
         
      else:
        super().next()

### Run the backtest with optimization

In [18]:
import multiprocessing as mp
mp.set_start_method('fork', True)

In [19]:
from backtesting import Backtest
bt = Backtest(df, WalkForwardAnchored, cash=10000, commission=.002, exclusive_orders=True)

In [20]:
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'  
    )


In [21]:
dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

Unnamed: 0,limit_buy,limit_sell,Return [%]
0,0,-6,135.604729
1,0,-3,125.727771
4,1,-5,78.568706
5,1,-4,78.568706
6,1,-3,71.029124
2,0,-2,2.724296
14,3,-4,0.0
21,5,-5,0.0
20,4,-1,0.0
19,4,-2,0.0


## Unanchored Walk Forward

### Create a library of strategies

[strategies.py](strategies.py)

### Create the unanchored walk forward class

In the previously created library

![](<src/10_Table_Validation Methods.png>)

### Import the strategy and perform the backtest with optimization

In [22]:
%load_ext autoreload
%autoreload 2

In [23]:
from strategies2 import WalkForwardUnanchored

In [24]:
bt_unanchored = Backtest(df, WalkForwardUnanchored, cash=10000, commission=.002, exclusive_orders=True)

stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'  
    )

In [None]:
dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

Unnamed: 0,limit_buy,limit_sell,Return [%]
0,0,-6,135.604729
2,0,-3,135.604729
1,0,-5,135.604729
4,1,-5,78.568706
5,1,-4,78.568706
6,1,-3,78.568706
14,3,-4,0.0
21,5,-5,0.0
20,4,-1,0.0
19,4,-2,0.0


### Interpret the strategies' performance

In [None]:
bt.plot(filename='reports_backtesting/walk_forward_anchored.html')

NameError: name 'bt' is not defined

In [None]:
bt_unanchored.plot(filename='reports_backtesting/walk_forward_unanchored.html')

Start                     2018-01-02 00:00:00
End                       2024-06-06 00:00:00
Duration                   2347 days 00:00:00
Exposure Time [%]                   62.793572
Equity Final [$]                 23560.472867
Equity Peak [$]                  23850.872647
Return [%]                         135.604729
Buy & Hold Return [%]              351.596428
Return (Ann.) [%]                   14.279111
Volatility (Ann.) [%]               26.607549
Sharpe Ratio                         0.536656
Sortino Ratio                        0.922789
Calmar Ratio                         0.457656
Max. Drawdown [%]                   -31.20051
Avg. Drawdown [%]                   -5.013573
Max. Drawdown Duration      525 days 00:00:00
Avg. Drawdown Duration       41 days 00:00:00
# Trades                                    3
Win Rate [%]                        66.666667
Best Trade [%]                      78.956227
Worst Trade [%]                      -1.06396
Avg. Trade [%]                    

## Course Conclusion

Watch video → [Next steps]()