# Decision Trees for Regression PFE

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import yfinance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'PFE'
start = '2020-01-01'
end = '2022-09-03'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,37.28653,37.333965,36.888046,37.134724,33.538963,16514072
2020-01-03,36.736244,37.229603,36.688805,36.935486,33.35902,14922848
2020-01-06,36.83112,37.0019,36.71727,36.888046,33.316177,15771951
2020-01-07,37.115749,37.125237,36.698292,36.764706,33.204781,20108107
2020-01-08,36.774193,37.210625,36.764706,37.058823,33.470413,16403507


In [3]:
# Create more data
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,-1)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,-1)
dataset['Return'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-03,36.736244,37.229603,36.688805,36.935486,33.35902,14922848,1,1,-1,-0.005365
2020-01-06,36.83112,37.0019,36.71727,36.888046,33.316177,15771951,1,1,-1,-0.001284
2020-01-07,37.115749,37.125237,36.698292,36.764706,33.204781,20108107,0,-1,1,-0.003344
2020-01-08,36.774193,37.210625,36.764706,37.058823,33.470413,16403507,1,1,-1,0.008
2020-01-09,37.258064,37.258064,36.802658,36.897533,33.324745,21971895,0,-1,1,-0.004352


In [4]:
dataset.shape

(673, 10)

In [5]:
X = dataset.drop(['Adj Close', 'Close'], axis=1)  
y = dataset['Adj Close'] 

In [6]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [7]:
from sklearn.tree import DecisionTreeRegressor  
regressor = DecisionTreeRegressor()  
regressor.fit(X_train, y_train)

DecisionTreeRegressor()

In [8]:
from mlxtend.evaluate import bias_variance_decomp

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(regressor, X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', random_seed=123)

print('MSE: %.3f' % avg_expected_loss)
print('Bias: %.3f' % avg_bias)
print('Variance: %.3f' % avg_var)

MSE: 0.722
Bias: 0.405
Variance: 0.317


In [9]:
y_pred = regressor.predict(X_test)

In [10]:
df = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})  
print(df.head())
print(df.tail())

               Actual  Predicted
Date                            
2022-03-09  47.983334  48.426258
2021-12-30  57.047832  56.246819
2021-11-02  44.011349  44.234066
2021-04-30  36.728748  36.719246
2020-01-24  34.121662  37.608868
               Actual  Predicted
Date                            
2020-04-24  32.358025  33.649803
2022-08-11  48.290001  46.848602
2020-09-30  32.399586  33.503120
2020-10-08  32.567326  32.505531
2021-10-22  41.793831  41.813202


In [11]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  

Mean Absolute Error: 0.697496979324906
Mean Squared Error: 0.9597733136342194
Root Mean Squared Error: 0.9796802098818876


In [12]:
print(y_test.shape)
print(y_pred.shape)

(135,)
(135,)


In [13]:
from sklearn.model_selection import cross_val_score

dt_fit = regressor.fit(X_train, y_train)
dt_scores = cross_val_score(dt_fit, X_train, y_train, cv = 5)

print("Mean cross validation score: {}".format(np.mean(dt_scores)))
print("Score without cv: {}".format(dt_fit.score(X_train, y_train)))

Mean cross validation score: 0.9916452345060606
Score without cv: 1.0


In [14]:
from sklearn.metrics import r2_score

print('r2 score:', r2_score(y_test, dt_fit.predict(X_test)))
print('Accuracy Score:', dt_fit.score(X_test, y_test))

r2 score: 0.9898089088670187
Accuracy Score: 0.9898089088670187
