# set up

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# import dataset

In [2]:
import pandas as pd
data = pd.read_excel(r'C:\Users\ASUS\Desktop\machine learning\MLfinalReport\AHIdata.xlsx')

In [3]:
df = pd.DataFrame(data, columns=['rescored AHI'])
print(type(df))#datatype is pd.dataframe not numpy array
df_2D = df.to_numpy()#convert into numpy array

<class 'pandas.core.frame.DataFrame'>


In [4]:
y_stack = df_2D.flatten()#flatten into 1d array

In [5]:
for count in range (0, 119):
    if y_stack[count]>=30:#AHI>=30, class:3
        y_stack[count]=3
    elif y_stack[count]>=15:#30>AHI>=15, class:2
        y_stack[count]=2
    elif y_stack[count]>=5:#15>AHI>=5, class: 1
        y_stack[count]=1
    else:
        y_stack[count]=0#5>AHI, class:0

# extract inputs form excel

In [6]:
#BMI
x1 = pd.DataFrame(data, columns=['BMI'])
x1_2D = x1.to_numpy()#convert into numpy array
x_bmi = x1_2D.flatten()#flatten into 1d array
#Age
x2 = pd.DataFrame(data, columns=['Age'])
x2_2D = x2.to_numpy()#convert into numpy array
x_age = x2_2D.flatten()#flatten into 1d array
#neck
x3 = pd.DataFrame(data, columns=['neck'])
x3_2D = x3.to_numpy()#convert into numpy array
x_neck = x3_2D.flatten()#flatten into 1d array
#desaturatino index
x4 = pd.DataFrame(data, columns=['desaturation index'])
x4_2D = x4.to_numpy()#convert into numpy array
x_dsi = x4_2D.flatten()#flatten into 1d array
#CVHR-OR_CEI
x5 = pd.DataFrame(data, columns=['Aligned CVHR-OR-CEI'])
x5_2D = x5.to_numpy()#convert into numpy array
x_cvhrorcei = x5_2D.flatten()#flatten into 1d array
#CVHRI
x6 = pd.DataFrame(data, columns=['CVHRI'])
x6_2D = x6.to_numpy()#convert into numpy array
x_cvhri = x6_2D.flatten()#flatten into 1d array
#CEI
x7 = pd.DataFrame(data, columns=['CEI'])
x7_2D = x7.to_numpy()#convert into numpy array
x_cei = x7_2D.flatten()#flatten into 1d array

In [7]:
X_stack = np.stack((x_cvhri, x_cei), axis=-1)

In [8]:
from sklearn.model_selection import train_test_split 
X_trainval, X_test, y_trainval, y_test = train_test_split(X_stack, y_stack, test_size=0.15, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.4, shuffle=False)

### code from hands on ML

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
rnd_clf1 = RandomForestClassifier(n_estimators=500, max_depth=2, random_state=42)
rnd_clf2 = RandomForestClassifier(n_estimators=1300, max_depth=3, random_state=42)
svm_clf1 = SVC(kernel='sigmoid', gamma=1, C=10, decision_function_shape='ovo', random_state=42, probability=True)
svm_clf2 = SVC(kernel='rbf', gamma=1, C=1000, decision_function_shape='ovo', random_state=42, probability=True)


voting_clf = VotingClassifier(
    estimators=[('tree', tree_clf), ('rf1', rnd_clf1), ('rf2', rnd_clf2), ('svc1', svm_clf1), ('svc2', svm_clf2)],
    voting='hard')

In [10]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('tree',
                              DecisionTreeClassifier(max_depth=5,
                                                     random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=2,
                                                     n_estimators=500,
                                                     random_state=42)),
                             ('rf2',
                              RandomForestClassifier(max_depth=3,
                                                     n_estimators=1300,
                                                     random_state=42)),
                             ('svc1',
                              SVC(C=10, decision_function_shape='ovo', gamma=1,
                                  kernel='sigmoid', probability=True,
                                  random_state=42)),
                             ('svc2',
                              SVC(C=1000, decision_f

In [11]:
from sklearn.metrics import accuracy_score

for clf in (tree_clf, rnd_clf1, rnd_clf2, svm_clf1, svm_clf2, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

DecisionTreeClassifier 0.6666666666666666
RandomForestClassifier 0.8333333333333334
RandomForestClassifier 0.8333333333333334
SVC 0.6111111111111112
SVC 0.6666666666666666
VotingClassifier 0.6666666666666666


# testing data evaluation

In [12]:
from sklearn import metrics
expected = y_test
predicted = voting_clf.predict(X_test)

print("classification for classifier %s:\n%s\n"
      % (voting_clf, metrics.classification_report(expected, predicted)))

classification for classifier VotingClassifier(estimators=[('tree',
                              DecisionTreeClassifier(max_depth=5,
                                                     random_state=42)),
                             ('rf1',
                              RandomForestClassifier(max_depth=2,
                                                     n_estimators=500,
                                                     random_state=42)),
                             ('rf2',
                              RandomForestClassifier(max_depth=3,
                                                     n_estimators=1300,
                                                     random_state=42)),
                             ('svc1',
                              SVC(C=10, decision_function_shape='ovo', gamma=1,
                                  kernel='sigmoid', probability=True,
                                  random_state=42)),
                             ('svc2',
                      

In [13]:
from sklearn.metrics import accuracy_score
y_pred = voting_clf.predict(X_test)
print(voting_clf.__class__.__name__, accuracy_score(y_test, y_pred))

VotingClassifier 0.6666666666666666


In [14]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print("voting classifier confusion matrix:\n", cm)

voting classifier confusion matrix:
 [[ 2  5]
 [ 1 10]]
