## 26/02/2019

- Investigating the performance of a single model trained on segments from all electrodes.
- Investigate performance of different machine learning models.

In [1]:
# Import necessary modules. Set settings. Import data.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from epdata_tools import epdata_main, get_ep_features, get_ep_feature_dict
from IPython.display import HTML

from Augmentation import data_augmentation
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn import svm, naive_bayes, neighbors, gaussian_process
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process.kernels import RBF
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

import xgboost

from IPython.display import display, clear_output
import pdb

plt.style.use('default')

X = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/Data/X_all_channel_labels.pkl')

  from pandas.core import datetools


## Data Format
Each row corresponds to a recording of the patient's response to either an S1 or S2 pulse at a particular coupling interval (as identified by the 'Coupling Interval' column) in a particular channel (as identified by the 'Channel' column).

In [2]:
# Remove bad files with bad labels
X = X[~(X['Label 1']=='-1') & ~(X['Label 2']=='-1')]
X.head()

Unnamed: 0,Channel,Coupling Interval,Data,Patient,S1/S2,Type,Label 1,Label 2
0,CS1-2,340,"[-636, -617, -652, -560, -482, -415, -383, -46...",1,S2,af,0.0,0.0
1,CS1-2,340,"[-903.0, -873.0, -935.0, -941.0, -910.0, -845....",1,S1,af,,
2,CS1-2,340,"[-931.0, -896.0, -896.0, -906.0, -858.0, -839....",1,S1,af,,
3,CS3-4,340,"[472, 464, 491, 523, 553, 706, 1019, 1404, 164...",1,S2,af,0.0,0.0
4,CS3-4,340,"[298.0, 292.0, 303.0, 311.0, 299.0, 395.0, 451...",1,S1,af,,


Split the data into S1 and S2 DataFrames. 

In [3]:
X_S1 = X.loc[X['S1/S2']=='S1']
X_S2 = X.loc[X['S1/S2']=='S2']

## Training/Test Split
Divide the patients into training/test datasets

In [4]:
# Perform training test split on patients. i.e., 3 af_patients in the test set and 7 in training.
af_patients = X[(X['Type']=='af') & (X['S1/S2']=='S2')]['Patient'].unique()
at_patients = X[(X['Type']=='at') & (X['S1/S2']=='S2')]['Patient'].unique()
avnrt_patients = X[(X['Type']=='avnrt') & (X['S1/S2']=='S2')]['Patient'].unique()
avrt_patients = X[(X['Type']=='avrt') & (X['S1/S2']=='S2')]['Patient'].unique()
ep_patients = X[(X['Type']=='ep') & (X['S1/S2']=='S2')]['Patient'].unique()

random.shuffle(af_patients); random.shuffle(at_patients); random.shuffle(avrt_patients); random.shuffle(avnrt_patients); random.shuffle(ep_patients)

train_af_patients, test_af_patients = train_test_split(af_patients, test_size=0.3)
train_at_patients, test_at_patients = train_test_split(at_patients, test_size=0.3)
train_avnrt_patients, test_avnrt_patients = train_test_split(avnrt_patients, test_size=0.3)
train_avrt_patients, test_avrt_patients = train_test_split(avrt_patients, test_size=0.3)
train_ep_patients, test_ep_patients = train_test_split(ep_patients, test_size=0.3)

# Store trining and test patients in dictionaries
training_patients = {}
training_patients['af'] = train_af_patients
training_patients['at'] = train_at_patients
training_patients['avnrt'] = train_avnrt_patients
training_patients['avrt'] = train_avrt_patients
training_patients['ep'] = train_ep_patients

test_patients = {}
test_patients['af'] = test_af_patients
test_patients['at'] = test_at_patients
test_patients['avnrt'] = test_avnrt_patients
test_patients['avrt'] = test_avrt_patients
test_patients['ep'] = test_ep_patients

In [5]:
X_train = pd.concat([X_S2[(X_S2['Type']=='af') & ([(x in train_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='at') & ([(x in train_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='avnrt') & ([(x in train_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='avrt') & ([(x in train_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='ep') & ([(x in train_af_patients) for x in X_S2['Patient'].values])]])

X_test = pd.concat([X_S2[(X_S2['Type']=='af') & ([(x in test_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='at') & ([(x in test_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='avnrt') & ([(x in test_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='avrt') & ([(x in test_af_patients) for x in X_S2['Patient'].values])],
                     X_S2[(X_S2['Type']=='ep') & ([(x in test_af_patients) for x in X_S2['Patient'].values])]])

# Training the Models
Choice of either training a single model to predict fractionation given an input segment from any channel, or training three seperate models to predict fractionation in for segments of a particular channel. Let's start with a single model.

In [6]:
# Proportion of responses that are fractionated
prop_fractionated = np.sum(np.float64(X_train['Label 1'].values))/X_train.shape[0]
prop_fractionated

0.08571428571428572

## Data Augmentation
As the dataset is very asymetric (many more non-fractionated examples), here I augmented more fractionated examples as described in my technical milestone report. I have found that this improves classification performance very significantly - especially in reducing the number of false negatives (i.e. fractionated responses being predicted as non-fractionated).

In [7]:
# %matplotlib qt 
# Create new augmented data for each S2 row of X_compact
X_train['Augmented'] = 0
# Use for storing augmented_rows in the form of dicts
augmented_list = []
for _, row in X_train[X_train['Label 1']=='1'].iterrows():
    
    augmented_data = data_augmentation.augment_fractionation(row['Data'], 7, False)
    for data in augmented_data:
        augmented_row = {}
        augmented_row['Data'] = data
        augmented_row['Channel'] = row['Channel']
        augmented_row['Coupling Interval'] = row['Coupling Interval']
        augmented_row['Label 1'] = row['Label 1']
        augmented_row['Label 2'] = row['Label 2']
        augmented_row['Patient'] = row['Patient']
        augmented_row['S1/S2'] = row['S1/S2']
        augmented_row['Type'] = row['Type']
        augmented_row['Augmented'] = 1
        augmented_list.append(augmented_row)
    

augmented_data = pd.DataFrame(augmented_list)

In [8]:
# X_train = pd.concat([X_train, augmented_data], ignore_index=True)
X_train = pd.concat([X_train], ignore_index=True)

## Feature Extraction
Here the feature vectors are extracted for each row S2 response (including the augmented responses). A reference feature vector is subtracted (as described - the first S1 response of the patient (i.e. in the file with the largest S1/S2 coupling interval) is extracted and the feature vector calculated. This is subtracted from all feature vectors corresponding to all other S2 responses for that patient).

In [25]:
X_train_feature_list = []
X_test_feature_list = []
for i, row in X_train.iterrows():
    clear_output(wait=True)
    display('Extracting Training Features: ' + str(round(100*i/X_train.index[-1],3)) + '%')
    
    # Get typical response for this patient and channel
    typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                           (X_S1['Patient']==row['Patient']) &
                           (X_S1['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[0]
    typical_feature_dict = get_ep_feature_dict(typical_response['Data'])
    feature_dict = get_ep_feature_dict(row['Data'])
    
    # Normalise by subtracting 'typical' feature values
    for k, v in feature_dict.items():
        feature_dict[k] = v - typical_feature_dict[k]
        
    # Add DTW distance
#     distance, path = fastdtw(row['Data']/max(abs(row['Data'])), typical_response['Data']/max(abs(typical_response['Data'])), dist=euclidean)
#     feature_dict['DTW'] = distance
        
    # Fill in the other column values
    for col, value in row.iteritems():
        feature_dict[col] = value
        
    X_train_feature_list.append(feature_dict)
    
for i, row in X_test.iterrows():
    clear_output(wait=True)
    display('Extracting Test Features: ' + str(round(100*i/X_test.index[-1],3)) + '%')
    
    # Get typical response for this patient and channel
    typical_response = X_S1[(X_S1['Type']==row['Type']) & 
                           (X_S1['Patient']==row['Patient']) &
                           (X_S1['Channel']==row['Channel'])
                           ].sort_values(by=['Coupling Interval'], ascending=False).iloc[0]
    typical_feature_dict = get_ep_feature_dict(typical_response['Data'])
    feature_dict = get_ep_feature_dict(row['Data'])
    
    # Normalise by subtracting 'typical' feature values
    for k, v in feature_dict.items():
        feature_dict[k] = v - typical_feature_dict[k]
        
    # Add DTW distance
#     distance, path = fastdtw(row['Data']/max(abs(row['Data'])), typical_response['Data']/max(abs(typical_response['Data'])), dist=euclidean)
#     feature_dict['DTW'] = distance
        
    # Fill in the other column values
    for col, value in row.iteritems():
        feature_dict[col] = value
        
    X_test_feature_list.append(feature_dict)

'Extracting Test Features: 100.0%'

In [26]:
X_train_features = pd.DataFrame(X_train_feature_list)
X_test_features = pd.DataFrame(X_test_feature_list)

In [27]:
x_training = X_train_features.drop(['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label 1', 'Label 2', 'Patient', 'Type', 'S1/S2'], axis=1)
y_training = X_train_features['Label 1']
info_training = X_train_features[['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label 1', 'Label 2', 'Patient', 'Type', 'S1/S2']]

x_test = X_test_features.drop(['Channel', 'Coupling Interval', 'Data', 'Label 1', 'Label 2', 'Patient', 'Type', 'S1/S2'], axis=1)
y_test = X_test_features['Label 1']
info_test = X_test_features[['Channel', 'Coupling Interval', 'Data', 'Label 1', 'Label 2', 'Patient', 'Type', 'S1/S2']]

In [28]:
x_training[y_training.values == '1']

Unnamed: 0,Approximate Entropy: m=2 r=0.7,Conduction Delay,Number of Peaks,Percentage Fractionation
36,0.188661,44,5,13.333333
38,0.265399,9,8,25.333333
41,0.147361,13,6,16.666667
70,0.060564,11,4,10.0
71,0.150661,7,4,9.333333
74,0.13974,19,7,18.666667
104,0.092506,0,5,16.0
107,0.146783,1,6,11.333333
108,0.156046,30,6,21.333333
110,0.236426,6,7,15.333333


In [29]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    
    # Begin CHANGES
    fst_empty_cell = (columnwidth-3)//2 * " " + "t/p" + (columnwidth-3)//2 * " "
    
    if len(fst_empty_cell) < len(empty_cell):
        fst_empty_cell = " " * (len(empty_cell) - len(fst_empty_cell)) + fst_empty_cell
    # Print header
    print("    " + fst_empty_cell, end=" ")
    # End CHANGES
    
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
        
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()

## Results of Classifiers - Feature Selection Using Feature Importance
Here features are selected using feature importance/recursive feature elimination. 

In [31]:
# Get cross validation scores on training data, following by test score.
import warnings
warnings.filterwarnings('ignore')

models = (LogisticRegression(penalty='l2', C=0.5, random_state=0, solver='liblinear', class_weight="balanced"), 
          svm.SVC(class_weight="balanced"), 
          svm.LinearSVC(penalty='l2', C=1, dual=False, class_weight="balanced"),
          naive_bayes.GaussianNB(),  
          gaussian_process.GaussianProcessClassifier(kernel=1.0*RBF(1)), 
          xgboost.XGBClassifier(scale_pos_weight=(1/prop_fractionated)))
model_names = ('Logistic Regression', 'RBF SVC', 'Linear SVC', 'Naive Bayes', 'GP', 'XGBoost')

feature_names = x_training.columns

print('Original number of features: ' + str(x_training.shape[1]))

# Select according to feature importance. 
xgb = xgboost.XGBClassifier()
linear_svc = svm.LinearSVC(penalty='l2', C=1, dual=False)
log_regression = LogisticRegression(penalty='l2', C=1, random_state=0, solver='liblinear')


sfm = RFECV(log_regression, step=1, cv=3)
sfm.fit(x_training.values, y_training.values)
x_training_sparse = sfm.transform(x_training.values)
print('Number of features selected: ' + str(x_training_sparse.shape[1]))
mask = sfm.get_support() #list of booleans
selected_features = [] # The list of your K best features

for bool, feature in zip(mask, feature_names):
    if bool:
        selected_features.append(feature)
        
print(selected_features)

for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    print('Cross validation score:')
    print(cross_val_score(clf, x_training_sparse, y_training.values, cv=3))

# print('Test Score:')
# X_test_sparse = sfm.transform(X_test.values)
# xgb.fit(X_training_sparse, y_training.values)
# print(xgb.score(X_test_sparse, y_test.values))

Original number of features: 4
Number of features selected: 4
['Approximate Entropy: m=2 r=0.7', 'Conduction Delay', 'Number of Peaks', 'Percentage Fractionation']
~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
Cross validation score:
[0.85238095 0.89047619 0.92380952]
~~~~~~~~~~~~~~~~~~~~~~~~~
RBF SVC
Cross validation score:
[0.85714286 0.87619048 0.86666667]
~~~~~~~~~~~~~~~~~~~~~~~~~
Linear SVC
Cross validation score:
[0.87142857 0.8952381  0.93333333]
~~~~~~~~~~~~~~~~~~~~~~~~~
Naive Bayes
Cross validation score:
[0.92857143 0.94285714 0.93333333]
~~~~~~~~~~~~~~~~~~~~~~~~~
GP
Cross validation score:
[0.94761905 0.90952381 0.92857143]
~~~~~~~~~~~~~~~~~~~~~~~~~
XGBoost
Cross validation score:
[0.93809524 0.9        0.91904762]


In [32]:
x_test_sparse = sfm.transform(x_test.values)

for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    clf.fit(x_training_sparse, y_training.values)
    print('Test data score:')
    print(clf.score(x_test_sparse, y_test.values))
    
    predictions = clf.predict(x_test_sparse)
    cm = confusion_matrix(y_test.values, predictions)
    print_cm(cm, ['Not Fractionated','Fractionated'])

~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
Test data score:
0.8259587020648967
           t/p       Not Fractionated     Fractionated 
    Not Fractionated            256.0             46.0 
        Fractionated             13.0             24.0 
~~~~~~~~~~~~~~~~~~~~~~~~~
RBF SVC
Test data score:
0.8672566371681416
           t/p       Not Fractionated     Fractionated 
    Not Fractionated            279.0             23.0 
        Fractionated             22.0             15.0 
~~~~~~~~~~~~~~~~~~~~~~~~~
Linear SVC
Test data score:
0.8407079646017699
           t/p       Not Fractionated     Fractionated 
    Not Fractionated            260.0             42.0 
        Fractionated             12.0             25.0 
~~~~~~~~~~~~~~~~~~~~~~~~~
Naive Bayes
Test data score:
0.8761061946902655
           t/p       Not Fractionated     Fractionated 
    Not Fractionated            276.0             26.0 
        Fractionated             16.0             21.0 
~~~~~~~~~~~~~~~~~~~~~~~~~
GP
