# set up

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# import dataset

## read data from excel

In [2]:
import pandas as pd
data = pd.read_excel(r'C:\Users\ASUS\Desktop\machine learning\MLfinalReport\AHIdata.xlsx')

## extract y from excel

In [3]:
df = pd.DataFrame(data, columns=['rescored AHI'])
print(type(df))#datatype is pd.dataframe not numpy array
df_2D = df.to_numpy()#convert into numpy array
#print(df_2D)#now it is numpy 2d array

<class 'pandas.core.frame.DataFrame'>


In [4]:
y_stack = df_2D.flatten()#flatten into 1d array
print(y_stack)

[ 15.2  12.   27.8  71.2  34.3  86.9  17.5  78.3  40.5  21.2  94.7  40.2
  41.1  11.5  61.4 108.8  59.6  24.1   9.5  33.2  11.9  76.7  29.7  23.7
  38.   79.9  13.8  61.9   8.8   8.5  37.1  38.7  33.9  46.4  34.5  25.8
  82.3  13.4  85.4  42.9  24.3  37.8  16.9  47.6  50.   21.1   1.6   2.1
  12.3  66.3  27.9  17.    2.   11.   53.   20.8  50.9  15.6  66.2  56.1
  49.8  10.2  77.7   7.2   3.   47.3  78.9  15.6   9.   47.2  12.    7.6
  42.3  26.3   7.5  31.5   1.9  15.8  11.6   8.5  20.5  52.4  20.2  16.6
  19.8  37.9  60.2  36.1  24.5  24.6  22.4  30.   16.9   7.7   8.8   7.1
  39.1   4.6   1.8  97.3  60.5   6.9  28.3  41.   75.   13.7  89.2  76.5
  85.   77.   51.2   3.3  78.1   7.9  26.9  80.5  30.9  66.9   2.3]


## extract inputs from excel

In [5]:
#BMI
x1 = pd.DataFrame(data, columns=['BMI'])
x1_2D = x1.to_numpy()#convert into numpy array
x_bmi = x1_2D.flatten()#flatten into 1d array
#Age
x2 = pd.DataFrame(data, columns=['Age'])
x2_2D = x2.to_numpy()#convert into numpy array
x_age = x2_2D.flatten()#flatten into 1d array
#neck
x3 = pd.DataFrame(data, columns=['neck'])
x3_2D = x3.to_numpy()#convert into numpy array
x_neck = x3_2D.flatten()#flatten into 1d array
#desaturatino index
x4 = pd.DataFrame(data, columns=['desaturation index'])
x4_2D = x4.to_numpy()#convert into numpy array
x_dsi = x4_2D.flatten()#flatten into 1d array
#CVHR-OR_CEI
x5 = pd.DataFrame(data, columns=['Aligned CVHR-OR-CEI'])
x5_2D = x5.to_numpy()#convert into numpy array
x_cvhrorcei = x5_2D.flatten()#flatten into 1d array
#CVHRI
x6 = pd.DataFrame(data, columns=['CVHRI'])
x6_2D = x6.to_numpy()#convert into numpy array
x_cvhri = x6_2D.flatten()#flatten into 1d array
#CEI
x7 = pd.DataFrame(data, columns=['CEI'])
x7_2D = x7.to_numpy()#convert into numpy array
x_cei = x7_2D.flatten()#flatten into 1d array

# Data preprocessing

## decide your inputs

In [6]:
X_stack = np.stack((x_cvhrorcei, x_bmi), axis=-1)

## data rescaling

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_transpose = scaler.fit_transform(X_stack.T)
X = X_transpose.T

## train test split

In [8]:
from sklearn.model_selection import train_test_split 
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y_stack, test_size=0.15, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.4, shuffle=False)

# Training SVR model

## decide hyperparameters

In [9]:
from sklearn.model_selection import GridSearchCV
param_dist = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
              'epsilon':[0.01, 0.1, 1, 10, 100]}

In [10]:
from sklearn import svm
model = svm.SVR()
grid =GridSearchCV(model, param_dist, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_val, y_val)

GridSearchCV(cv=3, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'epsilon': [0.01, 0.1, 1, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
             scoring='neg_root_mean_squared_error')

In [11]:
grid.best_params_

{'C': 10, 'epsilon': 1, 'gamma': 1, 'kernel': 'sigmoid'}

## create a SVR model

In [12]:
from sklearn.svm import SVR
model = svm.SVR(kernel = grid.best_params_.get('kernel'), 
                C = grid.best_params_.get('C'), 
                gamma = grid.best_params_.get('gamma'), 
                epsilon = grid.best_params_.get('epsilon'))

In [13]:
model.fit(X_train, y_train)

SVR(C=10, epsilon=1, gamma=1, kernel='sigmoid')

# testing data evaluation

## MAE

In [14]:
from sklearn.metrics import mean_absolute_error as mae
y_test_expect = model.predict(X_test)
print(mae(y_test, y_test_expect))

22.70000010532749


## RMSE

In [15]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_test_expect, squared=False))

25.486554272166277


#### from sklearn website: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [16]:
# from sklearn.svm import SVR
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# import numpy as np
# regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
# regr.fit(X_train, y_train)

In [17]:
# def plot_svm_regression(svm_reg, X, y, axes):
#     x1s = np.linspace(axes[0], axes[1], 100).reshape(100, 1)
#     y_pred = svm_reg.predict(x1s)
#     plt.plot(x1s, y_pred, "k-", linewidth=2, label=r"$\hat{y}$")
#     plt.plot(x1s, y_pred + svm_reg.epsilon, "k--")
#     plt.plot(x1s, y_pred - svm_reg.epsilon, "k--")
#     plt.scatter(X[svm_reg.support_], y[svm_reg.support_], s=180, facecolors='#FFAAAA')
#     plt.plot(X, y, "bo")
#     plt.xlabel(r"$x_1$", fontsize=18)
#     plt.legend(loc="upper left", fontsize=18)
#     plt.axis(axes)

# fig, axes = plt.subplots(ncols=2, figsize=(9, 4), sharey=True)
# plt.sca(axes[0])
# plot_svm_regression(svm_reg1, X, y, [0, 2, 3, 11])
# plt.title(r"$\epsilon = {}$".format(svm_reg1.epsilon), fontsize=18)
# plt.ylabel(r"$y$", fontsize=18, rotation=0)
# #plt.plot([eps_x1, eps_x1], [eps_y_pred, eps_y_pred - svm_reg1.epsilon], "k-", linewidth=2)
# plt.annotate(
#         '', xy=(eps_x1, eps_y_pred), xycoords='data',
#         xytext=(eps_x1, eps_y_pred - svm_reg1.epsilon),
#         textcoords='data', arrowprops={'arrowstyle': '<->', 'linewidth': 1.5}
#     )
# plt.text(0.91, 5.6, r"$\epsilon$", fontsize=20)
# plt.sca(axes[1])
# plot_svm_regression(svm_reg2, X, y, [0, 2, 3, 11])
# plt.title(r"$\epsilon = {}$".format(svm_reg2.epsilon), fontsize=18)
# save_fig("svm_regression_plot")
# plt.show()