# Challenge: Overfitting on Other Datasets

## Download data from `yfinance`

In [1]:
import yfinance as yf

ticker = 'INTC'
df = yf.download(ticker)
df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-03-17,0.325521,0.330729,0.325521,0.325521,0.182651,10924800
1980-03-18,0.325521,0.328125,0.322917,0.322917,0.181190,17068800
1980-03-19,0.330729,0.335938,0.330729,0.330729,0.185573,18508800
1980-03-20,0.330729,0.334635,0.329427,0.329427,0.184843,11174400
1980-03-21,0.322917,0.322917,0.317708,0.317708,0.178267,12172800
...,...,...,...,...,...,...
2024-05-21,31.980000,32.240002,31.620001,31.740000,31.740000,42975400
2024-05-22,31.910000,32.070000,31.100000,31.420000,31.420000,36706400
2024-05-23,31.450001,31.570000,29.870001,30.080000,30.080000,62014500
2024-05-24,30.290001,31.020000,30.129999,30.719999,30.719999,42408200


## Preprocess the data

### Filter the date range

- Since 1 year ago at least

In [2]:
df = df.loc['2020-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [3]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [4]:
df = df.dropna().copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02,60.240002,60.970001,60.220001,60.840000,54.006916,18056000,-1.231292
2020-01-03,59.810001,60.700001,59.810001,60.099998,53.350021,15293900,-0.283676
2020-01-06,59.590000,60.200001,59.330002,59.930000,53.199108,17755200,-1.696936
2020-01-07,59.779999,59.799999,58.889999,58.930000,52.311417,21876100,0.067852
2020-01-08,58.889999,59.320000,58.520000,58.970001,52.346935,23133500,0.556488
...,...,...,...,...,...,...,...
2024-05-20,31.850000,32.419998,31.760000,32.099998,32.099998,31989000,-1.134211
2024-05-21,31.980000,32.240002,31.620001,31.740000,31.740000,42975400,-1.018459
2024-05-22,31.910000,32.070000,31.100000,31.420000,31.420000,36706400,-4.454788
2024-05-23,31.450001,31.570000,29.870001,30.080000,30.080000,62014500,2.083331


## Machine Learning modelling

### Feature selection

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [5]:
y = df.change_tomorrow
X = df.drop(columns='change_tomorrow')

### Train test split

In [6]:
n_days_split = int(len(df) * .7)
n_days_split

774

In [7]:
X_train, y_train = X[:n_days_split], y[:n_days_split]
X_test, y_test = X[n_days_split:], y[n_days_split:]

In [8]:
print(X_train.shape)
print(y_train.shape)

print()

print(X_test.shape)
print(y_test.shape)

(774, 6)
(774,)

(333, 6)
(333,)


### Fit the model on train set

In [18]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state = 42, max_depth = 15)

In [19]:
dt.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=15, random_state=42)

### Evaluate model

#### On test set

In [20]:
from sklearn.metrics import mean_squared_error

forecast_test = dt.predict(X_test)

In [21]:
mse = mean_squared_error(forecast_test, y_test)

rmse = mse ** .5

print(mse)
print(rmse)

10.390616008695735
3.2234478448853077


#### On train set

In [22]:
forecast_train = dt.predict(X_train)

## Backtesting

In [23]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [24]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        self.model.fit(X_train, y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

### Run the backtest on `test` data

In [25]:
bt = Backtest(X_test, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

In [26]:
results = bt.run(limit_buy=1, limit_sell=-5)

df_results_test = results.to_frame(name='Values').loc[:'Buy & Hold Return [%]']\
    .rename({'Values':'Out of Sample (Test)'}, axis=1)
df_results_test
#results

Unnamed: 0,Out of Sample (Test)
Start,2023-01-30 00:00:00
End,2024-05-24 00:00:00
Duration,480 days 00:00:00
Exposure Time [%],97.597598
Equity Final [$],8579.548462
Equity Peak [$],12823.277094
Return [%],-14.204515
Buy & Hold Return [%],9.910549


### Run the backtest on `train` data

In [132]:
bt = Backtest(X_train, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

results = bt.run(limit_buy=1, limit_sell=-5)

df_results_train = results.to_frame(name='Values').loc[:'Buy & Hold Return [%]']\
    .rename({'Values':'In Sample (Train)'}, axis=1)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2020-01-02 00:00:00
End,2022-12-01 00:00:00
Duration,1064 days 00:00:00
Exposure Time [%],98.233696
Equity Final [$],13262.87671
Equity Peak [$],20607.133113
Return [%],32.628767
Buy & Hold Return [%],-50.969757


### Compare both backtests

- HINT: Concatenate the previous `DataFrames`

In [133]:
import pandas as pd
pd.concat([df_results_train, df_results_test], axis=1)

Unnamed: 0,In Sample (Train),Out of Sample (Test)
Start,2020-01-02 00:00:00,2022-12-02 00:00:00
End,2022-12-01 00:00:00,2024-03-07 00:00:00
Duration,1064 days 00:00:00,461 days 00:00:00
Exposure Time [%],98.233696,97.151899
Equity Final [$],13262.87671,14762.897614
Equity Peak [$],20607.133113,14782.338059
Return [%],32.628767,47.628976
Buy & Hold Return [%],-50.969757,56.919421


#### Plot both backtest reports

In [134]:
#bt.plot(filename='backtest_report_9C.html')

## How to solve the overfitting problem?

> Walk Forward Validation as a realistic approach to backtesting.

Next tutorial → [Walk Forward Validation]()

![](<src/10_Table_Validation Methods.png>)