# Challenge: Overfitting on Other Datasets

## Download data from `yfinance`

In [1]:
import yfinance as yf

ticker = 'META'
df = yf.download(ticker, multi_level_index=False, auto_adjust=False)
df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-05-18,42.049999,45.000000,38.000000,38.230000,38.230000,573576400
2012-05-21,36.529999,36.660000,33.000000,34.029999,34.029999,168192700
...,...,...,...,...,...,...
2023-05-11,233.050003,238.210007,232.300003,235.789993,235.789993,20449000
2023-05-12,236.740005,236.960007,231.449997,233.809998,233.809998,16155300


## Preprocess the data

### Filter the date range

- Since 1 year ago at least

In [2]:
df = df.loc['2020-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [3]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [4]:
df = df.dropna().copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02,206.750000,209.789993,206.270004,209.779999,209.779999,12077100,-0.531941
2020-01-03,207.210007,210.399994,206.949997,208.669998,208.669998,11188400,1.848546
...,...,...,...,...,...,...,...
2023-05-10,236.169998,236.750000,230.720001,233.080002,233.080002,19119000,1.149324
2023-05-11,233.050003,238.210007,232.300003,235.789993,235.789993,20449000,-0.846840


## Machine Learning modelling

### Feature selection

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [5]:
y = df.change_tomorrow
X = df.drop(columns='change_tomorrow')

### Train test split

In [6]:
n_days = len(df.index)
n_days

846

In [7]:
n_days_split = int(n_days*0.7)
n_days_split

592

In [8]:
X_train, y_train = X.iloc[:n_days_split], y.iloc[:n_days_split]
X_test, y_test = X.iloc[n_days_split:], y.iloc[n_days_split:]

### Fit the model on train set

In [9]:
from sklearn.tree import DecisionTreeRegressor

In [10]:
model_dt_split = DecisionTreeRegressor(max_depth=15, random_state=42)

In [11]:
model_dt_split.fit(X=X_train, y=y_train)

### Evaluate model

#### On test set

In [12]:
from sklearn.metrics import mean_squared_error

y_pred_test = model_dt_split.predict(X=X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred_test)

30.190822994915397

#### On train set

In [13]:
y_pred_train = model_dt_split.predict(X=X_train)
mean_squared_error(y_true=y_train, y_pred=y_pred_train)

2.9786228619427435

## Backtesting

In [14]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [15]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

### Run the backtest on `test` data

In [21]:
bt_test = Backtest(X_test, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

In [22]:
results = bt_test.run(limit_buy=1, limit_sell=-5)

df_results_test = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'Out of Sample (Test)'}, axis=1)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2022-05-09 00:00:00
End,2023-05-11 00:00:00
Duration,367 days 00:00:00
Exposure Time [%],98.425197
Equity Final [$],11777.599015
Equity Peak [$],15048.499192
Return [%],17.77599


### Run the backtest on `train` data

In [23]:
bt_train = Backtest(X_train, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

results = bt_train.run(limit_buy=1, limit_sell=-5)

df_results_train = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'In Sample (Train)'}, axis=1)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2020-01-02 00:00:00
End,2022-05-06 00:00:00
Duration,855 days 00:00:00
Exposure Time [%],93.412162
Equity Final [$],18421.327962
Equity Peak [$],18976.204845
Return [%],84.21328


### Compare both backtests

#### In the same DataFrame

- HINT: Concatenate the previous `DataFrames`

In [24]:
import pandas as pd

In [25]:
df_results = pd.concat([df_results_train, df_results_test], axis=1)
df_results

Unnamed: 0,In Sample (Train),Out of Sample (Test)
Start,2020-01-02 00:00:00,2022-05-09 00:00:00
End,2022-05-06 00:00:00,2023-05-11 00:00:00
Duration,855 days 00:00:00,367 days 00:00:00
Exposure Time [%],93.412162,98.425197
Equity Final [$],18421.327962,11777.599015
Equity Peak [$],18976.204845,15048.499192
Return [%],84.21328,17.77599


#### Plot both backtest reports

In [27]:
bt_test.plot(filename='reports_backtesting/regression_test_set.html')
bt_train.plot(filename='reports_backtesting/regression_train_set.html')

## How to solve the overfitting problem?

> Walk Forward Validation as a realistic approach to backtesting.

Next tutorial → [Walk Forward Validation]()

![](<src/10_Table_Validation Methods.png>)