# Challenge: Overfitting on Other Datasets

## Download data from `yfinance`

In [None]:
import yfinance as yf

# Set `ticker` to the symbol you want to analyse before running this cell.
ticker = '???'

# Ask for flat (single-level) column labels and unadjusted prices; the
# `Adj Close` column is used further down to build the target.
df = yf.download(
    ticker,
    auto_adjust=False,
    multi_level_index=False,
)
df

## Preprocess the data

### Filter the date range

- Keep at least the most recent year of data (the cell below keeps everything from 2020-01-01 onwards)

In [None]:
# Keep the rows dated 2020-01-01 or later (all columns). `.copy()` returns an
# independent frame so new columns can be added to it afterwards.
df = df.loc['2020-01-01':, :].copy()

### Create the target variable

#### Percentage change

- Percentage change in `Adj Close` from today to tomorrow

In [None]:
# Target: tomorrow's percentage change in `Adj Close`, measured against
# today's price:  (close[t+1] - close[t]) / close[t] * 100.
#
# `pct_change()` gives each row's change relative to the previous row, and
# `shift(-1)` moves that value up one row so it is stored on "today".
# Using `pct_change(-1) * -1` instead would divide by tomorrow's price,
# which is not the usual definition of a return.
# The last row has no "tomorrow", so its value is NaN.
df['change_tomorrow'] = df['Adj Close'].pct_change().shift(-1) * 100

#### Remove rows with any missing data

In [None]:
# Discard every row that contains at least one missing value, keep the result
# as an independent copy, and show it.
df = df.dropna(axis=0, how='any').copy()
df

## Machine Learning modelling

### Feature selection

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [None]:
# Explanatory variables: every column except the target.
X = df.drop('change_tomorrow', axis=1)
# Target: the `change_tomorrow` column.
y = df['change_tomorrow']

### Train test split

### Fit the model on train set

### Evaluate model

#### On test set

In [None]:
from sklearn.metrics import ???

#### On train set

## Backtesting

In [None]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [None]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        ???

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

### Run the backtest on `test` data

In [None]:
bt = Backtest(???, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

In [None]:
# Run the backtest with explicit buy/sell limits.
results = bt.run(limit_buy=1, limit_sell=-5)

# Turn the report into a one-column frame, keep the rows up to and including
# 'Return [%]', and label the column as the out-of-sample result.
df_results_test = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'Out of Sample (Test)'})
)
df_results_test

### Run the backtest on `train` data

In [None]:
bt = Backtest(???, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

results = bt.run(limit_buy=1, limit_sell=-5)

df_results_train = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'In Sample (Train)'}, axis=1)
df_results_train

### Compare both backtests

- HINT: Concatenate the two result `DataFrame`s created above (`df_results_test` and `df_results_train`) side by side

#### Plot both backtest reports

## How to solve the overfitting problem?

> Walk Forward Validation as a realistic approach to backtesting.

Next tutorial → [Walk Forward Validation]()

![Table of validation methods](<src/10_Table_Validation Methods.png>)