# The overfitting problem

![](<src/09_Table_The Overfitting.png>)

## Load the data

In [1]:
import pandas as pd

# Read the preprocessed price table; the first column becomes the index and
# the 'Date' column is parsed as datetimes.
df = pd.read_excel(
    'data/Microsoft_LinkedIn_Processed.xlsx',
    index_col=0,
    parse_dates=['Date'],
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow,change_tomorrow_direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-12-08,56.325228,56.582507,55.902560,56.058762,21220800,1.549143,UP
2016-12-09,56.214968,56.959234,56.169027,56.940857,27349400,0.321692,UP
...,...,...,...,...,...,...,...
2023-03-14,256.750000,261.070007,255.860001,260.790009,33620300,1.751806,UP
2023-03-15,259.980011,266.480011,259.209991,265.440002,46028000,3.895731,UP


## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [2]:
# Target: the value the model should predict.
target = df['change_tomorrow']

# Explanatory: the columns used to compute the prediction.
explanatory = df.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']]

## Train test split

### Split the dataset

In [3]:
# Total number of rows available in the dataset.
n_days = df.shape[0]
n_days

1576

In [4]:
# Row position where the train set ends: the first 70% of rows, truncated to an integer.
n_days_split = int(0.7 * n_days)
n_days_split

1103

In [5]:
# Positional split without shuffling: the first `n_days_split` rows form the
# train set and the remaining rows form the test set.
X_train = explanatory.iloc[:n_days_split]
y_train = target.iloc[:n_days_split]

X_test = explanatory.iloc[n_days_split:]
y_test = target.iloc[n_days_split:]

### Fit the model on train set

In [6]:
from sklearn.tree import DecisionTreeRegressor

In [7]:
# Depth-15 regression tree; the fixed random_state keeps re-runs reproducible.
model_dt_split = DecisionTreeRegressor(
    random_state=42,
    max_depth=15,
)

In [8]:
# Fit on the train split only; the test split stays unseen for evaluation.
model_dt_split.fit(X_train, y_train)

### Evaluate model

#### On test set

In [9]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test split (rows the model was not fitted on).
y_pred_test = model_dt_split.predict(X_test)
mean_squared_error(y_pred=y_pred_test, y_true=y_test)

9.937085384182808

#### On train set

In [10]:
# Mean squared error on the train split (the same rows the model was fitted on).
y_pred_train = model_dt_split.predict(X_train)
mean_squared_error(y_pred=y_pred_train, y_true=y_train)

0.7335232619210788

## [ ] Backtesting

In [12]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [13]:
class Regression(Strategy):
    """Strategy that trades on a decision-tree forecast compared with two thresholds.

    On every `next` call the model predicts a value from the last row of
    `self.data.df`. A forecast above `limit_buy` triggers `buy()` when no
    purchase is flagged; a forecast below `limit_sell` triggers `sell()` when
    a purchase is flagged. Otherwise nothing is done.
    """

    # Forecast thresholds; the notebook also passes them to `Backtest.run(...)`.
    limit_buy = 1
    limit_sell = -5

    def init(self):
        """Create the regression tree, fit it, and reset the bought flag."""
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False

        # Always fitted on the notebook-level train split (X_train, y_train),
        # whichever dataset the Backtest itself is replaying.
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        """Forecast from the latest row and place an order if a threshold is crossed."""
        # Double brackets keep the last row as a one-row DataFrame for `predict`.
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]

        if forecast_tomorrow > self.limit_buy and not self.already_bought:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought:
            # NOTE(review): with exclusive_orders=True this presumably closes the
            # long and opens a short rather than only exiting — confirm intent.
            self.sell()
            self.already_bought = False

### Run the backtest on `test` data

In [14]:
# Backtest over the test split, i.e. rows the model was not fitted on.
bt = Backtest(
    X_test,
    Regression,
    cash=10_000,
    commission=0.002,
    exclusive_orders=True,
)

In [15]:
results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]' and label the column.
df_results_test = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'Out of Sample (Test)'})
)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2021-04-29 00:00:00
End,2023-03-15 00:00:00
Duration,685 days 00:00:00
Exposure Time [%],92.811839
Equity Final [$],11034.131324
Equity Peak [$],13371.824828
Return [%],10.341313


### Run the backtest on `train` data

In [16]:
# Backtest over the train split, i.e. the same rows the model was fitted on.
bt = Backtest(
    X_train,
    Regression,
    cash=10_000,
    commission=0.002,
    exclusive_orders=True,
)

results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]' and label the column.
df_results_train = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'In Sample (Train)'})
)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2016-12-08 00:00:00
End,2021-04-28 00:00:00
Duration,1602 days 00:00:00
Exposure Time [%],99.365367
Equity Final [$],62957.753308
Equity Peak [$],64404.1712
Return [%],529.577533


### Compare both backtests

In [17]:
# Place the in-sample and out-of-sample summaries side by side.
df_results = pd.concat(
    [df_results_train, df_results_test],
    axis='columns',
)
df_results

Unnamed: 0,In Sample (Train),Out of Sample (Test)
Start,2016-12-08 00:00:00,2021-04-29 00:00:00
End,2021-04-28 00:00:00,2023-03-15 00:00:00
Duration,1602 days 00:00:00,685 days 00:00:00
Exposure Time [%],99.365367,92.811839
Equity Final [$],62957.753308,11034.131324
Equity Peak [$],64404.1712,13371.824828
Return [%],529.577533,10.341313


## How to solve the overfitting problem?

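Walk Forward Validation: a realistic approach to algorithmic trading, in which the model is repeatedly re-fitted on past data and evaluated only on the period that immediately follows.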

![](<src/10_Table_Validation Methods.png>)