# Challenge: Overfitting on Other Datasets

## Download data from `yfinance`

In [1]:
import yfinance as yf

ticker = 'AAPL'
df = yf.download(ticker)
df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.099722,469033600
1980-12-15,0.122210,0.122210,0.121652,0.121652,0.094519,175884800
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087582,105728000
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089749,86441600
1980-12-18,0.118862,0.119420,0.118862,0.118862,0.092351,73449600
...,...,...,...,...,...,...
2023-05-05,170.979996,174.300003,170.759995,173.570007,173.570007,113316400
2023-05-08,172.479996,173.850006,172.110001,173.500000,173.500000,55962800
2023-05-09,173.050003,173.539993,171.600006,171.770004,171.770004,45326900
2023-05-10,173.020004,174.029999,171.899994,173.559998,173.559998,53724500


## Preprocess the data

### Filter the date range

- Since 1 year ago at least

In [2]:
df = df.loc['2020-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [3]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [4]:
df = df.dropna().copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02,74.059998,75.150002,73.797501,75.087502,73.449387,135480400,-0.981742
2020-01-03,74.287498,75.144997,74.125000,74.357498,72.735313,146322800,0.790528
2020-01-06,73.447502,74.989998,73.187500,74.949997,73.314888,118387200,-0.472537
2020-01-07,74.959999,75.224998,74.370003,74.597504,72.970078,108872000,1.583165
2020-01-08,74.290001,76.110001,74.290001,75.797501,74.143898,132079200,2.079900
...,...,...,...,...,...,...,...
2023-05-04,164.889999,167.039993,164.309998,165.789993,165.789993,81235400,4.482349
2023-05-05,170.979996,174.300003,170.759995,173.570007,173.570007,113316400,-0.040350
2023-05-08,172.479996,173.850006,172.110001,173.500000,173.500000,55962800,-1.007158
2023-05-09,173.050003,173.539993,171.600006,171.770004,171.770004,45326900,1.031340


## Machine Learning modelling

### Feature selection

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [5]:
y = df.change_tomorrow
X = df.drop(columns='change_tomorrow')

### Train test split

In [6]:
n_days = len(df.index)
n_days

845

In [7]:
n_days_split = int(n_days*0.7)
n_days_split

591

In [9]:
X_train, y_train = X.iloc[:n_days_split], y.iloc[:n_days_split]
X_test, y_test = X.iloc[n_days_split:], y.iloc[n_days_split:]

### Fit the model on train set

In [10]:
from sklearn.tree import DecisionTreeRegressor

In [11]:
model_dt_split = DecisionTreeRegressor(max_depth=15, random_state=42)

In [12]:
model_dt_split.fit(X=X_train, y=y_train)

### Evaluate model

#### On test set

In [13]:
from sklearn.metrics import mean_squared_error

y_pred_test = model_dt_split.predict(X=X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred_test)

5.357520725392334

#### On train set

In [14]:
y_pred_train = model_dt_split.predict(X=X_train)
mean_squared_error(y_true=y_train, y_pred=y_pred_train)

1.5665932948394417

## Backtesting

In [15]:
from backtesting import Backtest, Strategy



### Create the `Strategy`

In [16]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

### Run the backtest on `test` data

In [17]:
bt = Backtest(X_test, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

In [18]:
results = bt.run(limit_buy=1, limit_sell=-5)

df_results_test = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'Out of Sample (Test)'}, axis=1)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2022-05-06 00:00:00
End,2023-05-10 00:00:00
Duration,369 days 00:00:00
Exposure Time [%],98.031496
Equity Final [$],11941.717128
Equity Peak [$],12047.287044
Return [%],19.417171


### Run the backtest on `train` data

In [19]:
bt = Backtest(X_train, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

results = bt.run(limit_buy=1, limit_sell=-5)

df_results_train = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'In Sample (Train)'}, axis=1)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2020-01-02 00:00:00
End,2022-05-05 00:00:00
Duration,854 days 00:00:00
Exposure Time [%],96.277496
Equity Final [$],23402.254628
Equity Peak [$],26045.994071
Return [%],134.022546


### Compare both backtests

In [21]:
import pandas as pd

In [22]:
df_results = pd.concat([df_results_train, df_results_test], axis=1)
df_results

Unnamed: 0,In Sample (Train),Out of Sample (Test)
Start,2020-01-02 00:00:00,2022-05-06 00:00:00
End,2022-05-05 00:00:00,2023-05-10 00:00:00
Duration,854 days 00:00:00,369 days 00:00:00
Exposure Time [%],96.277496,98.031496
Equity Final [$],23402.254628,11941.717128
Equity Peak [$],26045.994071,12047.287044
Return [%],134.022546,19.417171


## Continue with the tutorials on the following chapter

**How to solve the overfitting problem?**

Walk Forward Validation: A Realistic Approach to Algorithmic Trading

[LinkedIn Course Chapter]()

![](<src/10_Table_Validation Methods.png>)