# Multioutput Regression With Cross-Validation

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# fetch yahoo data
import yfinance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '2014-01-01'
end = '2018-08-27'

# Read data 
dataset = yf.download(symbol,start,end)

# Only keep close columns 
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.95,3.95,3.98,3.84,3.85,20548400
2014-01-03,4.0,4.0,4.0,3.88,3.98,22887200
2014-01-06,4.13,4.13,4.18,3.99,4.01,42398300
2014-01-07,4.18,4.18,4.25,4.11,4.19,42932100
2014-01-08,4.18,4.18,4.26,4.14,4.23,30678700


In [3]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()

In [4]:
dataset.tail()

Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-08-20,19.98,19.98,20.08,19.35,19.790001,62983200,0,1,1,0.010622
2018-08-21,20.4,20.4,20.42,19.860001,19.98,55629000,1,1,1,0.021021
2018-08-22,20.9,20.9,20.92,20.209999,20.280001,62002700,1,1,1,0.02451
2018-08-23,22.290001,22.290001,22.32,21.139999,21.190001,113444100,1,1,1,0.066507
2018-08-24,23.98,23.98,24.0,22.67,22.91,164328200,0,0,0,0.075819


In [5]:
X = dataset.drop(['Adj Close', 'Open'], axis=1)
Y = dataset[['Adj Close', 'Open']]

In [6]:
print(X.shape, Y.shape)

(1170, 8) (1170, 2)


In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

model = DecisionTreeRegressor()
model.fit(X, Y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [8]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [9]:
print(dataset.describe())

         Adj Close        Close         High          Low         Open  \
count  1170.000000  1170.000000  1170.000000  1170.000000  1170.000000   
mean      7.002803     7.002803     7.146650     6.855419     7.001342   
std       4.861608     4.861608     4.953517     4.754049     4.855620   
min       1.620000     1.620000     1.690000     1.610000     1.620000   
25%       2.702500     2.702500     2.780000     2.660000     2.710000   
50%       4.275000     4.275000     4.350000     4.175000     4.250000   
75%      11.550000    11.550000    11.775000    11.307500    11.572500   
max      23.980000    23.980000    24.000000    22.670000    22.910000   

             Volume  Increase_Decrease  Buy_Sell_on_Open     Buy_Sell  \
count  1.170000e+03        1170.000000       1170.000000  1170.000000   
mean   3.763371e+07           0.454701          0.505128     0.499145   
std    3.355409e+07           0.498157          0.500188     0.500213   
min    0.000000e+00           0.000000   

In [10]:
from numpy import absolute, mean, std
n_scores = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force the scores to be positive
n_scores = absolute(n_scores)

In [11]:
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 0.086 (0.012)


In [12]:
model.score(X, Y)

1.0