# Decision Trees for Regression MRNA

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import yfinance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'MRNA'
start = '2020-01-01'
end = '2022-09-03'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,19.57,19.809999,18.879999,19.23,19.23,1233600
2020-01-03,19.02,19.360001,18.719999,18.889999,18.889999,1751000
2020-01-06,18.700001,18.74,18.01,18.129999,18.129999,1606500
2020-01-07,18.15,18.32,17.68,17.780001,17.780001,1461400
2020-01-08,17.99,18.08,17.76,17.98,17.98,1041600


In [3]:
# Create more data
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,-1)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,-1)
dataset['Return'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-03,19.02,19.360001,18.719999,18.889999,18.889999,1751000,0,-1,-1,-0.017681
2020-01-06,18.700001,18.74,18.01,18.129999,18.129999,1606500,0,-1,-1,-0.040233
2020-01-07,18.15,18.32,17.68,17.780001,17.780001,1461400,0,-1,1,-0.019305
2020-01-08,17.99,18.08,17.76,17.98,17.98,1041600,1,1,1,0.011249
2020-01-09,18.0,18.549999,17.9,18.4,18.4,1495700,1,1,1,0.023359


In [4]:
dataset.shape

(673, 10)

In [5]:
X = dataset.drop(['Adj Close', 'Close'], axis=1)  
y = dataset['Adj Close'] 

In [6]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [7]:
from sklearn.tree import DecisionTreeRegressor  
regressor = DecisionTreeRegressor()  
regressor.fit(X_train, y_train)

DecisionTreeRegressor()

In [8]:
from mlxtend.evaluate import bias_variance_decomp

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(regressor, X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', random_seed=123)

print('MSE: %.3f' % avg_expected_loss)
print('Bias: %.3f' % avg_bias)
print('Variance: %.3f' % avg_var)

MSE: 51.994
Bias: 22.628
Variance: 29.367


In [9]:
y_pred = regressor.predict(X_test)

In [10]:
df = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})  
print(df.head())
print(df.tail())

                Actual   Predicted
Date                              
2022-03-09  142.490005  133.880005
2021-12-30  251.600006  247.880005
2021-11-02  348.640015  349.559998
2021-04-30  178.820007  179.520004
2020-01-24   21.120001   20.940001
                Actual   Predicted
Date                              
2020-04-24   50.500000   50.500000
2022-08-11  171.809998  173.250000
2020-09-30   70.750000   71.279999
2020-10-08   72.930000   75.309998
2021-10-22  326.540009  325.839996


In [11]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  

Mean Absolute Error: 4.091926659478082
Mean Squared Error: 46.55669878026184
Root Mean Squared Error: 6.823246938244419


In [12]:
print(y_test.shape)
print(y_pred.shape)

(135,)
(135,)


In [13]:
from sklearn.model_selection import cross_val_score

dt_fit = regressor.fit(X_train, y_train)
dt_scores = cross_val_score(dt_fit, X_train, y_train, cv = 5)

print("Mean cross validation score: {}".format(np.mean(dt_scores)))
print("Score without cv: {}".format(dt_fit.score(X_train, y_train)))

Mean cross validation score: 0.9956884196568676
Score without cv: 1.0


In [14]:
from sklearn.metrics import r2_score

print('r2 score:', r2_score(y_test, dt_fit.predict(X_test)))
print('Accuracy Score:', dt_fit.score(X_test, y_test))

r2 score: 0.9964731457444133
Accuracy Score: 0.9964731457444133
