# Setup

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers. Comparing the raw version strings is
# lexicographic, so a release such as "0.9" would wrongly satisfy >= "0.20".
_sklearn_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert (
    _sklearn_version_match is not None
    and tuple(int(part) for part in _sklearn_version_match.groups()) >= (0, 20)
), "Scikit-Learn >= 0.20 is required, found " + sklearn.__version__
# NOTE(review): the grid search further down uses the
# 'neg_root_mean_squared_error' scorer, which I believe needs scikit-learn
# >= 0.22 -- confirm and raise this floor if so.

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure into ``IMAGES_PATH``.

    The file is named ``<fig_id>.<fig_extension>``. ``fig_extension`` is also
    passed to ``plt.savefig`` as ``format`` and ``resolution`` as ``dpi``.
    When ``tight_layout`` is true, ``plt.tight_layout()`` runs before saving.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Import Database

## read data from excel

In [2]:
import pandas as pd
# The workbook location can be overridden with the AHI_DATA_PATH environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset, the original location is used. (`os` is imported in the setup cell.)
AHI_DATA_PATH = os.environ.get(
    "AHI_DATA_PATH",
    r'C:\Users\ASUS\Desktop\machine learning\MLfinalReport\AHIdata.xlsx',
)
data = pd.read_excel(AHI_DATA_PATH)
print(data)

     Aligned CVHR-OR-CEI   BMI  Age  supine index  neck  desaturation index  \
0                   24.9  23.2   32          19.9  37.0                 0.2   
1                   16.9  23.9   50           5.4  35.0                 5.8   
2                   11.5  28.0   63           9.8  42.0                15.8   
3                   53.7  30.4   43          41.7  40.0                56.1   
4                   13.5  26.4   27           7.3  37.0                 6.7   
..                   ...   ...  ...           ...   ...                 ...   
114                 37.9  30.7   36           NaN  43.0                26.7   
115                 76.8  39.4   44           NaN  44.0                79.9   
116                 35.6  26.1   56           NaN  38.0                27.9   
117                 28.7  27.2   47           NaN  39.0                65.0   
118                 30.0  29.4   22           NaN  37.0                 1.7   

     rescored AHI  CVHRI   CEI  
0            15.2 

## extract y from excel

In [3]:
# Keep only the target column; the result is still a one-column DataFrame.
df = pd.DataFrame(data, columns=['rescored AHI'])
print(type(df))  # shows a pandas DataFrame, not a NumPy array
df_2D = df.to_numpy()  # convert to a NumPy array
print(df_2D)  # now a 2-D NumPy array with a single column

<class 'pandas.core.frame.DataFrame'>
[[ 15.2]
 [ 12. ]
 [ 27.8]
 [ 71.2]
 [ 34.3]
 [ 86.9]
 [ 17.5]
 [ 78.3]
 [ 40.5]
 [ 21.2]
 [ 94.7]
 [ 40.2]
 [ 41.1]
 [ 11.5]
 [ 61.4]
 [108.8]
 [ 59.6]
 [ 24.1]
 [  9.5]
 [ 33.2]
 [ 11.9]
 [ 76.7]
 [ 29.7]
 [ 23.7]
 [ 38. ]
 [ 79.9]
 [ 13.8]
 [ 61.9]
 [  8.8]
 [  8.5]
 [ 37.1]
 [ 38.7]
 [ 33.9]
 [ 46.4]
 [ 34.5]
 [ 25.8]
 [ 82.3]
 [ 13.4]
 [ 85.4]
 [ 42.9]
 [ 24.3]
 [ 37.8]
 [ 16.9]
 [ 47.6]
 [ 50. ]
 [ 21.1]
 [  1.6]
 [  2.1]
 [ 12.3]
 [ 66.3]
 [ 27.9]
 [ 17. ]
 [  2. ]
 [ 11. ]
 [ 53. ]
 [ 20.8]
 [ 50.9]
 [ 15.6]
 [ 66.2]
 [ 56.1]
 [ 49.8]
 [ 10.2]
 [ 77.7]
 [  7.2]
 [  3. ]
 [ 47.3]
 [ 78.9]
 [ 15.6]
 [  9. ]
 [ 47.2]
 [ 12. ]
 [  7.6]
 [ 42.3]
 [ 26.3]
 [  7.5]
 [ 31.5]
 [  1.9]
 [ 15.8]
 [ 11.6]
 [  8.5]
 [ 20.5]
 [ 52.4]
 [ 20.2]
 [ 16.6]
 [ 19.8]
 [ 37.9]
 [ 60.2]
 [ 36.1]
 [ 24.5]
 [ 24.6]
 [ 22.4]
 [ 30. ]
 [ 16.9]
 [  7.7]
 [  8.8]
 [  7.1]
 [ 39.1]
 [  4.6]
 [  1.8]
 [ 97.3]
 [ 60.5]
 [  6.9]
 [ 28.3]
 [ 41. ]
 [ 75. ]
 [ 13.7]
 [ 89.2]

In [4]:
y = df_2D.flatten()  # collapse the single-column 2-D array into the 1-D target vector
print(y)

[ 15.2  12.   27.8  71.2  34.3  86.9  17.5  78.3  40.5  21.2  94.7  40.2
  41.1  11.5  61.4 108.8  59.6  24.1   9.5  33.2  11.9  76.7  29.7  23.7
  38.   79.9  13.8  61.9   8.8   8.5  37.1  38.7  33.9  46.4  34.5  25.8
  82.3  13.4  85.4  42.9  24.3  37.8  16.9  47.6  50.   21.1   1.6   2.1
  12.3  66.3  27.9  17.    2.   11.   53.   20.8  50.9  15.6  66.2  56.1
  49.8  10.2  77.7   7.2   3.   47.3  78.9  15.6   9.   47.2  12.    7.6
  42.3  26.3   7.5  31.5   1.9  15.8  11.6   8.5  20.5  52.4  20.2  16.6
  19.8  37.9  60.2  36.1  24.5  24.6  22.4  30.   16.9   7.7   8.8   7.1
  39.1   4.6   1.8  97.3  60.5   6.9  28.3  41.   75.   13.7  89.2  76.5
  85.   77.   51.2   3.3  78.1   7.9  26.9  80.5  30.9  66.9   2.3]


## extract input from excel

In [5]:
def column_as_array(frame, column):
    """Return one column of ``frame`` as a fresh 1-D NumPy array.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to read from.
    column : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        A copy of the column's values, one entry per row.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``frame``.
    """
    # Index by name so a mistyped column fails loudly; building
    # pd.DataFrame(frame, columns=[...]) would instead give an all-NaN column.
    return frame[column].to_numpy(copy=True)


x_bmi = column_as_array(data, 'BMI')                        # BMI
x_age = column_as_array(data, 'Age')                        # Age
x_neck = column_as_array(data, 'neck')                      # neck
x_dsi = column_as_array(data, 'desaturation index')         # desaturation index
x_cvhrorcei = column_as_array(data, 'Aligned CVHR-OR-CEI')  # CVHR-OR-CEI
x_cvhri = column_as_array(data, 'CVHRI')                    # CVHRI
x_cei = column_as_array(data, 'CEI')                        # CEI

# training a decision tree model

## decide your inputs

In [6]:
# Feature matrix with one row per record and two columns:
# x_cvhrorcei first, x_neck second.
# Single-feature alternative: X = x_cvhrorcei.reshape(-1, 1)
X = np.column_stack((x_cvhrorcei, x_neck))

## train test split

In [7]:
from sklearn.model_selection import train_test_split 
# Hold out 15% of the rows as the test set; shuffle=False keeps the rows in file order.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)
# Split the remainder again: 40% becomes the validation set, the rest is the training set.
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.4, shuffle=False)

# create a decision tree regressor

## decide max_depth

In [8]:
from sklearn.model_selection import GridSearchCV
# Candidate tree depths for the grid search: 1 through 19.
param_dist = dict(max_depth=range(1, 20))

In [9]:
from sklearn.tree import DecisionTreeRegressor
# Base estimator for the search; random_state is fixed to 42.
tree_reg = DecisionTreeRegressor(random_state=42)
# 3-fold cross-validated search over param_dist, scored by negative RMSE, with n_jobs=-1.
grid = GridSearchCV(tree_reg, param_dist, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
# NOTE(review): the search is fitted on the validation split only, while the
# final tree is trained on X_train in a later cell -- confirm this is intended,
# since GridSearchCV already cross-validates internally.
grid.fit(X_val, y_val)

GridSearchCV(cv=3, estimator=DecisionTreeRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(1, 20)},
             scoring='neg_root_mean_squared_error')

In [10]:
# Display the hyper-parameter setting selected by the grid search.
grid.best_params_

{'max_depth': 2}

In [11]:
from sklearn.tree import DecisionTreeRegressor
# Fit the final tree on the training split with the depth picked by the grid search.
best_depth = grid.best_params_.get('max_depth')
tree_reg = DecisionTreeRegressor(random_state=42, max_depth=best_depth)
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=2, random_state=42)

# Data visualization

## export_text visualization

In [12]:
from sklearn import tree
# X was built from the 'Aligned CVHR-OR-CEI' and 'neck' columns (in that
# order), so the printed split rules must carry those names, not "CVHRI"/"CEI".
text_representation = tree.export_text(tree_reg, feature_names=["Aligned CVHR-OR-CEI", "neck"])
print(text_representation)

## plot tree visualization

In [13]:
# Label the nodes with the columns X was actually built from
# ('Aligned CVHR-OR-CEI' first, then 'neck'), not "CVHRI"/"CEI".
fig = plt.figure(figsize=(60, 20))
_ = tree.plot_tree(tree_reg, feature_names=["Aligned CVHR-OR-CEI", "neck"], filled=True)

# testing data evaluation

reference: https://towardsdatascience.com/train-a-regression-model-using-a-decision-tree-70012c22bcc1

In [14]:
from sklearn.metrics import mean_squared_error as mse #import library of mse

max_depths = range(1, 20)

# Fit one tree per depth and score that same tree on both splits, so the
# training and testing curves always describe identical models.
training_error = []
testing_error = []
for max_depth in max_depths:
    # Fixed seed (same value as tree_reg) keeps the curves reproducible.
    depth_model = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    depth_model.fit(X_train, y_train)
    training_error.append(mse(y_train, depth_model.predict(X_train)))  # training error
    testing_error.append(mse(y_test, depth_model.predict(X_test)))  # testing error

# Position of the lowest test MSE and the depth it corresponds to. Looking the
# depth up in max_depths avoids a hand-written "+1" that would silently break
# if the range changed.
# NOTE(review): this optimum is chosen on the test split, so treat it as
# illustrative rather than as an unbiased model-selection result.
mini_mse = np.argmin(testing_error)
best_test_depth = max_depths[mini_mse]
optimized_max_depth = grid.best_params_.get('max_depth')

# graph setting
plt.plot(max_depths, training_error, color='blue', label='Training error')
plt.plot(max_depths, testing_error, color='green', label='Testing error')
plt.xlabel('Tree depth')
plt.axvline(best_test_depth, color='orange', linestyle='--')
plt.annotate('optimum =' + str(best_test_depth), xy=(best_test_depth, min(testing_error)), color='red')
plt.axvline(optimized_max_depth, color='green', linestyle='--')
# The y value 0.9 is given in data (MSE) units.
plt.annotate('max_depth =' + str(optimized_max_depth), xy=(optimized_max_depth, 0.9), color='green')
plt.ylabel('Mean squared error')
plt.title('Hyperparameter Tuning', pad=15, size=15)
plt.legend()
plt.savefig('error.png')

## MAE

In [15]:
from sklearn.metrics import mean_absolute_error as mae
# Predict on the test split with the fitted tree and report the mean absolute error.
y_test_expect = tree_reg.predict(X_test)
print(mae(y_test, y_test_expect))

20.70952380952381


## RMSE

In [16]:
from sklearn.metrics import mean_squared_error
# Take the square root of the MSE explicitly instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(y_test, y_test_expect)))

24.066021983610963
