## 15/02/2019

- Compare the performance of different classification models on patient normalised feature vectors. 
- Plot the segments for which classification mistakes were made.

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from epdata_tools import epdata_main, get_ep_features
from IPython.display import HTML
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm, naive_bayes, neighbors, gaussian_process
from sklearn.gaussian_process.kernels import RBF
import xgboost

from IPython.display import display, clear_output
import pdb

plt.style.use('default')

In [55]:
X = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/Data/X_af.pkl')

In [56]:
# Create individual dataframes for the data we will be training seperate models on
X_CS12_df = X[(X['Channel']=='CS1-2') + (X['S1/S2']=='S2')]
X_CS34_df = X[(X['Channel']=='CS3-4') + (X['S1/S2']=='S2')]
X_CS56_df = X[(X['Channel']=='CS3-4') + (X['S1/S2']=='S2')]
X_CS78_df = X[(X['Channel']=='CS3-4') + (X['S1/S2']=='S2')]

In [65]:
X_12, X_34, X_56, X_78, y_12, y_34, y_56, y_78 = [], [], [], [], [], [], [], []
ci_12, ci_34, ci_56, ci_78, idx_12, idx_34, idx_56, idx_78 = [], [], [], [], [], [], [], []

for i, row in X[(X['S1/S2']=='S2')].iterrows():
    clear_output(wait=True)
    display('Extracting Features: ' + str(round(100*i/X[(X['S1/S2']=='S2')].index[-1],3)) + '%')
    coupling_interval = row['Coupling Interval']
    channel = row['Channel']
    data = row['Data']
    label = row['Label']
    patient = row['Patient']
    typical_s1 = X[(X['S1/S2']=='S1') & (X['Channel']==channel) & (X['Patient']==patient)].iloc[0]['Data']
    typical_s1_fv = get_ep_features(typical_s1)
    
    fv = get_ep_features(data)
    fv -= typical_s1_fv
    
    if(channel == 'CS1-2'):
        X_12.append(fv)
        y_12.append(int(label))
        ci_12.append(int(coupling_interval))
        idx_12.append(i)
    elif(channel == 'CS3-4'):
        X_34.append(fv)
        y_34.append(int(label))
        ci_34.append(int(coupling_interval))
        idx_34.append(i)
    elif(channel == 'CS5-6'):
        X_56.append(fv)
        y_56.append(int(label))
        ci_56.append(int(coupling_interval))
        idx_56.append(i)
    elif(channel == 'CS7-8'):
        X_78.append(fv)
        y_78.append(int(label))
        ci_78.append(int(coupling_interval))
        idx_78.append(i)
            

'Extracting Features: 100.0%'

In [66]:
X_12 = np.asarray(X_12); y_12 = np.asarray(y_12); ci_12 = np.asarray(ci_12); idx_12 = np.asarray(idx_12)
X_34 = np.asarray(X_34); y_34 = np.asarray(y_34); ci_34 = np.asarray(ci_34); idx_34 = np.asarray(idx_34)
X_56 = np.asarray(X_56); y_56 = np.asarray(y_56); ci_56 = np.asarray(ci_56); idx_56 = np.asarray(idx_56)
X_78 = np.asarray(X_78); y_78 = np.asarray(y_78); ci_78 = np.asarray(ci_78); idx_78 = np.asarray(idx_78)

print(X_12.shape); print(X_34.shape); print(X_56.shape); print(X_78.shape)
print(y_12.shape); print(y_34.shape); print(y_56.shape); print(y_78.shape)

(128, 11)
(128, 11)
(128, 11)
(128, 11)
(128,)
(128,)
(128,)
(128,)


In [67]:
no_errors = (y_12 != -1) & (y_34 != -1) & (y_34 != -1)

In [68]:
# X_array = np.concatenate((ci_12.reshape(-1,1), X_12, X_34, X_56),axis=1) including coupling intervals as features appears to have a negative impact on performance
X_array = np.concatenate((X_12, X_34, X_56),axis=1)
# Removes occurances of errors
X_array = X_array[no_errors]
y = np.logical_or(y_12, y_34, y_56).astype(int)
# Removes occurances of errors
y = y[no_errors]
idx = np.concatenate((idx_12.reshape(-1,1),idx_34.reshape(-1,1), idx_56.reshape(-1,1)),axis=1)
# Removes occurances of errors
idx = idx[no_errors]

In [69]:
# Remove occurances of errors from original labels
X_12 = X_12[y_12 != -1]; idx_12 = idx_12[y_12 != -1]; y_12 = y_12[y_12 != -1]
y_34 = y_34[y_34 != -1]; idx_34 = idx_34[y_34 != -1]; y_34 = y_34[y_34 != -1]
y_56 = y_56[y_56 != -1]; idx_56 = idx_56[y_56 != -1]; y_56 = y_56[y_56 != -1]

In [70]:
X_12_train, X_12_test, y_12_train, y_12_test, idx_12_train, idx_12_test = train_test_split(X_12, y_12, idx_12, test_size=0.3)
X_34_train, X_34_test, y_34_train, y_34_test, idx_34_train, idx_34_test = train_test_split(X_34, y_34, idx_34, test_size=0.3)
X_56_train, X_56_test, y_56_train, y_56_test, idx_56_train, idx_56_test = train_test_split(X_56, y_56, idx_56, test_size=0.3)
X_78_train, X_78_test, y_78_train, y_78_test, idx_78_train, idx_78_test = train_test_split(X_78, y_78, idx_78, test_size=0.3)


X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X_array, y, idx, test_size=0.3)

In [71]:
import warnings
warnings.filterwarnings('ignore')

models = (svm.SVC(), naive_bayes.GaussianNB(), neighbors.KNeighborsClassifier(), 
          gaussian_process.GaussianProcessClassifier(kernel=1.0*RBF(1)), xgboost.XGBClassifier())
model_names = ('SVM', 'Naive Bayes', 'KNN', 'GP', 'XGBoost')
print('Cross validation scores on combined data:')
for clf, model_name in zip(models, model_names):
    print(model_name)
    print(cross_val_score(clf, X_train, y_train, cv=3))
    
print('Cross validation scores on CS1-2 data:')
for clf, model_name in zip(models, model_names):
    print(model_name)
    print(cross_val_score(clf, X_12_train, y_12_train, cv=3))
    
print('Cross validation scores on CS3-4 data:')
for clf, model_name in zip(models, model_names):
    print(model_name)
    print(cross_val_score(clf, X_34_train, y_34_train, cv=3))
    
print('Cross validation scores on CS5-6 data:')
for clf, model_name in zip(models, model_names):
    print(model_name)
    print(cross_val_score(clf, X_56_train, y_56_train, cv=3))
    
print('Cross validation scores on CS7-8 data:')
for clf, model_name in zip(models, model_names):
    print(model_name)
    print(cross_val_score(clf, X_78_train, y_78_train, cv=3))

Cross validation scores on combined data:
SVM
[0.77777778 0.77777778 0.77777778]
Naive Bayes
[0.81481481 0.96296296 0.88888889]
KNN
[0.77777778 0.81481481 0.77777778]
GP
[0.77777778 0.77777778 0.77777778]
XGBoost
[0.88888889 0.92592593 1.        ]
Cross validation scores on CS1-2 data:
SVM
[0.81481481 0.81481481 0.81481481]
Naive Bayes
[0.92592593 1.         0.92592593]
KNN
[0.7037037  0.77777778 0.85185185]
GP
[0.85185185 0.81481481 0.88888889]
XGBoost
[0.96296296 0.96296296 0.96296296]
Cross validation scores on CS3-4 data:
SVM
[0.83333333 0.83333333 0.86206897]
Naive Bayes
[0.9        0.8        0.82758621]
KNN
[0.9        0.93333333 0.89655172]
GP
[0.9        0.93333333 0.89655172]
XGBoost
[0.9        0.86666667 0.89655172]
Cross validation scores on CS5-6 data:
SVM
[0.66666667 0.66666667 0.68965517]
Naive Bayes
[0.83333333 0.86666667 0.86206897]
KNN
[0.76666667 0.73333333 0.75862069]
GP
[0.7        0.63333333 0.79310345]
XGBoost
[0.76666667 0.76666667 0.86206897]
Cross validation 

In [72]:
clf = xgboost.XGBClassifier()
print('Test score on CS1-2 data:')
clf.fit(X_12_train, y_12_train)
print(clf.score(X_12_test, y_12_test))
print('Test score on CS3-4 data:')
clf.fit(X_34_train, y_34_train)
print(clf.score(X_34_test, y_34_test))
print('Test score on CS5-6 data:')
clf.fit(X_56_train, y_56_train)
print(clf.score(X_56_test, y_56_test))
print('Test score on CS7-8 data:')
clf.fit(X_78_train, y_78_train)
print(clf.score(X_78_test, y_78_test))
print('Test score on combined data:')
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
predictions = clf.predict(X_test)
mistake_idxs = idx_test[predictions != y_test]
mistake_labels = np.concatenate((predictions[predictions != y_test].reshape(-1,1), y_test[predictions != y_test].reshape(-1,1)), axis=1)

mistake_idxs = np.squeeze(mistake_idxs)

print(mistake_labels)

Test score on CS1-2 data:
0.9444444444444444
Test score on CS3-4 data:
0.9230769230769231
Test score on CS5-6 data:
0.8461538461538461
Test score on CS7-8 data:
1.0
Test score on combined data:
0.9444444444444444
[[0 1]
 [0 1]]


In [73]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    
    # Begin CHANGES
    fst_empty_cell = (columnwidth-3)//2 * " " + "t/p" + (columnwidth-3)//2 * " "
    
    if len(fst_empty_cell) < len(empty_cell):
        fst_empty_cell = " " * (len(empty_cell) - len(fst_empty_cell)) + fst_empty_cell
    # Print header
    print("    " + fst_empty_cell, end=" ")
    # End CHANGES
    
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
        
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()

In [74]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
print_cm(cm, ['Not Fractionated','Fractionated'])

           t/p       Not Fractionated     Fractionated 
    Not Fractionated             27.0              0.0 
        Fractionated              2.0              7.0 


In [75]:
mistake_idxs = np.squeeze(mistake_idxs)
mistake_idxs

array([[  96,   99,  102],
       [1156, 1159, 1162]])

In [76]:
%matplotlib qt 
for i, [cs12_idx, cs34_idx, cs56_idx] in enumerate(mistake_idxs):
    patient = X['Patient'].loc[cs12_idx]
    coupling_interval = X['Coupling Interval'].loc[cs12_idx]
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12,4))
    [ax1, ax2, ax3] = axes.flatten()
    ax1.plot(X['Data'].loc[cs12_idx])
    ax1.set_title('CS1-2')
    ax2.plot(X['Data'].loc[cs34_idx])
    ax2.set_title('CS3-4')
    ax3.plot(X['Data'].loc[cs56_idx])
    ax3.set_title('CS5-6')
    plt.suptitle('Patient: ' + patient + ' Coupling Interval: ' + coupling_interval + '\n Predicted label: ' + str(mistake_labels[i,0]) + ' True label: ' + str(mistake_labels[i,1]))
    plt.subplots_adjust(top = 0.8, bottom = 0.2)
    plt.draw()
    plt.waitforbuttonpress()
    plt.close()