# The overfitting problem

![](<src/09_Table_The Overfitting.png>)

## Load the data

In [None]:
import pandas as pd

# Read the processed workbook; the first column becomes the index and
# the 'Date' column is parsed as dates.
df = pd.read_excel(
    'data/Microsoft_LinkedIn_Processed.xlsx',
    parse_dates=['Date'],
    index_col=0,
)
df

## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [None]:
# Variable to predict.
target = df['change_tomorrow']

# Variables used to calculate the prediction.
explanatory = df[[
    'Open',
    'High',
    'Low',
    'Close',
    'Volume',
]]

## Train test split

### Split the dataset

In [None]:
# Total number of rows (one per index entry) in the dataset.
n_days = df.shape[0]
n_days

In [None]:
# Exercise placeholder: replace ??? with the fraction of n_days at which the
# dataset is cut (presumably the share of rows kept for training -- confirm).
# int() truncates the product to a whole number of rows.
n_days_split = int(n_days*???)
n_days_split

In [None]:
# Exercise placeholders: each ??? must evaluate to a pair (features, target)
# for that side of the split -- presumably slices of `explanatory` and
# `target` around n_days_split (confirm).
# X_train and y_train are later read as globals by Regression.init().
X_train, y_train = ???
X_test, y_test = ???

### Fit the model on train set

### Evaluate model

#### On test set

In [None]:
from sklearn.metrics import ???

In [None]:
# Exercise placeholder: pass the arguments for the test-set evaluation.
# The name called here must be the one imported from sklearn.metrics in the
# previous cell.
mean_squared_error(???)

#### On train set

## [ ] Backtesting

In [None]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [None]:
class Regression(Strategy):
    """Trading strategy driven by a decision-tree regression forecast.

    On every bar the model predicts a value from the latest row of the
    backtest data. The strategy buys when the forecast is above
    ``limit_buy`` and sells when it is below ``limit_sell``, alternating
    between the two states via the ``already_bought`` flag.

    The model is trained on the module-level ``X_train`` / ``y_train``,
    which must exist before the backtest is run.
    """

    # Forecast thresholds; the notebook also passes values with these names
    # to ``bt.run(limit_buy=..., limit_sell=...)``.
    limit_buy = 1
    limit_sell = -5

    def init(self):
        """Create and fit the regressor and reset the position flag."""
        # Imported here so the class does not depend on another notebook
        # cell having imported the estimator first.
        from sklearn.tree import DecisionTreeRegressor

        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False

        # X_train / y_train are read from the enclosing (notebook) scope.
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        """Forecast from the latest row and trade on the thresholds."""
        # [[-1]] keeps the last row as a one-row DataFrame, which is the 2-D
        # input ``predict`` is given. All columns of the backtest data are
        # passed, so they must match the columns the model was fitted on.
        # NOTE(review): relies on backtesting.py exposing only the bars seen
        # so far through ``self.data`` -- confirm against the library docs.
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]

        if forecast_tomorrow > self.limit_buy and not self.already_bought:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought:
            self.sell()
            self.already_bought = False
        # Otherwise: no order is placed on this bar.

### Run the backtest on `test` data

In [None]:
# Exercise placeholder: replace ??? with the data to backtest on (the test
# portion, per the section heading).
# Regression.next() feeds every column of this data to the model, so its
# columns must match the ones the model was fitted on.
bt = Backtest(???, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

In [None]:
# Run the strategy with explicit buy/sell thresholds.
results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the statistics from the start through 'Return [%]' and label the
# single column as the out-of-sample run.
df_results_test = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'Out of Sample (Test)'})
)
df_results_test

### Run the backtest on `train` data

In [None]:
# Exercise placeholder: replace ??? with the data to backtest on (the train
# portion, per the section heading). Same settings as the test backtest.
bt = Backtest(???, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

# Run the strategy with the same thresholds used for the test backtest.
results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the statistics from the start through 'Return [%]' and label the
# single column as the in-sample run.
df_results_train = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'In Sample (Train)'}, axis=1)
df_results_train

### Compare both backtests

In [None]:
# Exercise placeholder: replace ??? with the arguments that combine the two
# result frames (presumably df_results_test and df_results_train placed
# side by side -- confirm).
df_results = pd.concat(???)
df_results

## Practice to master the knowledge

Work on the challenge with another dataset:

1. Learn the <a>mental models</a> to solve the challenge faster.
2. Complete the <a href="09C_The Overfitting Problem.ipynb">notebook</a>.