# Decision Trees for Regression JNJ

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import yfinance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'JNJ'
start = '2020-01-01'
end = '2022-09-03'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,145.869995,146.020004,145.080002,145.970001,135.936539,5777000
2020-01-03,143.5,145.369995,143.0,144.279999,134.362701,5752400
2020-01-06,144.0,144.199997,142.850006,144.100006,134.195114,7731300
2020-01-07,144.009995,145.449997,141.380005,144.979996,135.014587,7382900
2020-01-08,144.869995,145.979996,143.699997,144.960007,134.995956,6605800


In [3]:
# Create more data
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,-1)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,-1)
dataset['Return'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-03,143.5,145.369995,143.0,144.279999,134.362701,5752400,1,1,-1,-0.011578
2020-01-06,144.0,144.199997,142.850006,144.100006,134.195114,7731300,0,1,1,-0.001247
2020-01-07,144.009995,145.449997,141.380005,144.979996,135.014587,7382900,0,1,-1,0.006107
2020-01-08,144.869995,145.979996,143.699997,144.960007,134.995956,6605800,0,1,1,-0.000138
2020-01-09,145.759995,146.029999,144.990005,145.389999,135.396378,6112700,0,-1,-1,0.002966


In [4]:
dataset.shape

(673, 10)

In [5]:
X = dataset.drop(['Adj Close', 'Close'], axis=1)  
y = dataset['Adj Close'] 

In [6]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [7]:
from sklearn.tree import DecisionTreeRegressor  
regressor = DecisionTreeRegressor()  
regressor.fit(X_train, y_train)

DecisionTreeRegressor()

In [8]:
from mlxtend.evaluate import bias_variance_decomp

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(regressor, X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', random_seed=123)

print('MSE: %.3f' % avg_expected_loss)
print('Bias: %.3f' % avg_bias)
print('Variance: %.3f' % avg_var)

MSE: 6.806
Bias: 3.708
Variance: 3.098


In [9]:
y_pred = regressor.predict(X_test)

In [10]:
df = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})  
print(df.head())
print(df.tail())

                Actual   Predicted
Date                              
2022-03-09  167.155533  164.150040
2021-12-30  168.982391  166.387390
2021-11-02  161.335403  162.696198
2021-04-30  156.635071  160.205246
2020-01-24  138.125031  139.912766
                Actual   Predicted
Date                              
2020-04-24  145.135101  144.600906
2022-08-11  166.024475  160.324265
2020-09-30  141.432968  138.320602
2020-10-08  141.442459  139.223907
2021-10-22  159.513474  159.084763


In [11]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  

Mean Absolute Error: 2.309075701678241
Mean Squared Error: 8.559503145089925
Root Mean Squared Error: 2.925662855677312


In [12]:
print(y_test.shape)
print(y_pred.shape)

(135,)
(135,)


In [13]:
from sklearn.model_selection import cross_val_score

dt_fit = regressor.fit(X_train, y_train)
dt_scores = cross_val_score(dt_fit, X_train, y_train, cv = 5)

print("Mean cross validation score: {}".format(np.mean(dt_scores)))
print("Score without cv: {}".format(dt_fit.score(X_train, y_train)))

Mean cross validation score: 0.9718841419769569
Score without cv: 1.0


In [14]:
from sklearn.metrics import r2_score

print('r2 score:', r2_score(y_test, dt_fit.predict(X_test)))
print('Accuracy Score:', dt_fit.score(X_test, y_test))

r2 score: 0.9715083905992271
Accuracy Score: 0.9715083905992271
