# The overfitting problem

![Summary table: the overfitting problem](<src/09_Table_The Overfitting.png>)

## Load the data

In [1]:
import pandas as pd

# Read the processed price history; the first spreadsheet column becomes the index.
df = pd.read_excel(
    'data/Microsoft_LinkedIn_Processed.xlsx',
    index_col=0,
    parse_dates=['Date'],
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow,change_tomorrow_direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-12-08,56.325228,56.582507,55.902560,56.058762,21220800,1.549143,UP
2016-12-09,56.214968,56.959234,56.169027,56.940857,27349400,0.321692,UP
...,...,...,...,...,...,...,...
2023-03-14,256.750000,261.070007,255.860001,260.790009,33620300,1.751806,UP
2023-03-15,259.980011,266.480011,259.209991,265.440002,46028000,3.895731,UP


## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [2]:
# Target: the column to predict. Explanatory: the columns used to predict it.
target = df['change_tomorrow']
explanatory = df.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']]

## Train test split

### Split the dataset

In [3]:
# Total number of rows (one per date) in the dataset.
n_days = df.shape[0]
n_days

1576

In [4]:
# Row position that closes the first 70% of the data (fraction truncated to an int).
n_days_split = int(0.7 * n_days)
n_days_split

1103

In [5]:
# Positional split: rows before the cut-off train the model, the remaining rows test it.
X_train = explanatory.iloc[:n_days_split]
X_test = explanatory.iloc[n_days_split:]

y_train = target.iloc[:n_days_split]
y_test = target.iloc[n_days_split:]

### Fit the model on train set

In [6]:
from sklearn.tree import DecisionTreeRegressor

In [7]:
# Decision tree limited to a depth of 15; the fixed seed keeps repeated fits identical.
model_dt_split = DecisionTreeRegressor(
    random_state=42,
    max_depth=15,
)

In [8]:
# Fit on the training rows only, so the test rows stay unseen.
model_dt_split.fit(X_train, y_train)

### Evaluate model

#### On test set

In [9]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the rows that were held out from fitting.
y_pred_test = model_dt_split.predict(X_test)
mean_squared_error(y_test, y_pred_test)

9.937085384182808

#### On train set

In [10]:
# Mean squared error on the same rows the model was fitted on.
y_pred_train = model_dt_split.predict(X_train)
mean_squared_error(y_train, y_pred_train)

0.7335232619210788

## Backtesting

In [11]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [12]:
class Regression(Strategy):
    """Strategy that trades on a decision tree's forecast of the target.

    The tree is fitted once, in ``init``, on the notebook-level ``X_train`` and
    ``y_train``. On every bar the latest row of the backtest data is fed to the
    model and the forecast is compared with ``limit_buy`` / ``limit_sell``.
    """

    # Forecast above which a buy order is placed (same units as ``y_train``).
    limit_buy = 1
    # Forecast below which a sell order is placed (same units as ``y_train``).
    limit_sell = -5

    def init(self):
        """Create and fit the model, and reset the position flag."""
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False

        # Always trained on the global training split, whatever data the
        # enclosing ``Backtest`` was given.
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        """Forecast from the most recent bar and place an order if a limit is crossed."""
        # Double brackets keep the last row as a one-row DataFrame for ``predict``.
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]

        if forecast_tomorrow > self.limit_buy and not self.already_bought:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought:
            # NOTE(review): in backtesting.py ``sell()`` places a short order
            # rather than only closing the long; confirm this is intended
            # (``self.position.close()`` would just exit the position).
            self.sell()
            self.already_bought = False

### Run the backtest on `test` data

In [13]:
# Simulate the strategy over the held-out rows only.
bt = Backtest(
    X_test,
    Regression,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)

In [14]:
results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]' and label the column.
df_results_test = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'Out of Sample (Test)'})
)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2021-04-29 00:00:00
End,2023-03-15 00:00:00
Duration,685 days 00:00:00
Exposure Time [%],92.811839
Equity Final [$],11034.131324
Equity Peak [$],13371.824828
Return [%],10.341313


### Run the backtest on `train` data

In [15]:
# Same simulation, this time over the rows the model was fitted on.
bt = Backtest(
    X_train,
    Regression,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)

results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]' and label the column.
df_results_train = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'In Sample (Train)'})
)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2016-12-08 00:00:00
End,2021-04-28 00:00:00
Duration,1602 days 00:00:00
Exposure Time [%],99.365367
Equity Final [$],62957.753308
Equity Peak [$],64404.1712
Return [%],529.577533


### Compare both backtests

In [17]:
# Place both summaries side by side, one column per backtest.
df_results = pd.concat(
    objs=[df_results_test, df_results_train],
    axis='columns',
)
df_results

Unnamed: 0,Out of Sample (Test),In Sample (Train)
Start,2021-04-29 00:00:00,2016-12-08 00:00:00
End,2023-03-15 00:00:00,2021-04-28 00:00:00
Duration,685 days 00:00:00,1602 days 00:00:00
Exposure Time [%],92.811839,99.365367
Equity Final [$],11034.131324,62957.753308
Equity Peak [$],13371.824828,64404.1712
Return [%],10.341313,529.577533


The strategy returns about 530 % on the data the model was trained on, but only about 10 % on data it has never seen. The mean squared error tells the same story: 0.73 on the train set versus 9.94 on the test set. The model has memorised the training data instead of learning a pattern that generalises — this is the overfitting problem.

## Practice to master the knowledge

Work on the challenge with another dataset:

1. Learn the <a>mental models</a> to solve the challenge faster.
2. Complete the <a href="09D_The Overfitting Problem.ipynb">notebook</a>.