In [27]:
import sys
sys.path.insert(0, '/Users/matthewashman/github/MasterProject2018')

# Import necessary modules. Set settings. Import data.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import pywt
import math
from IPython.display import HTML

# For model building
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, recall_score, make_scorer
from sklearn import svm, naive_bayes, neighbors, gaussian_process
from sklearn.linear_model import LogisticRegression
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from scipy.spatial.distance import euclidean
from sklearn.decomposition import PCA

# For feature extraction
from scipy.interpolate import CubicSpline      # for warping
from statsmodels.robust import mad
from tsfresh.feature_extraction import feature_calculators
from FeatureExtraction.feature_tools import detect_peaks
from sklearn.utils import resample
import fastdtw

# Miscelaneous
from IPython.display import display, clear_output
import pdb

plt.style.use('default')

X = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/extracted_segments_with_labels_updated_2.pkl')

In [28]:
print(X.shape)
X = X[~(X['Label']=='-1') & ~(X['Label']=='?')]
print(X.shape)

(7398, 7)
(7174, 7)


In [29]:
# Perform training test split on patients. i.e., 3 af_patients in the test set and 7 in training.
af_patients = X[(X['Type']=='af') & (X['S1/S2']=='S2')]['Patient'].unique()
at_patients = X[(X['Type']=='at') & (X['S1/S2']=='S2')]['Patient'].unique()
avnrt_patients = X[(X['Type']=='avnrt') & (X['S1/S2']=='S2')]['Patient'].unique()
avrt_patients = X[(X['Type']=='avrt') & (X['S1/S2']=='S2')]['Patient'].unique()
ep_patients = X[(X['Type']=='ep') & (X['S1/S2']=='S2')]['Patient'].unique()

af_test_patients = [str(x) for x in range(11, 20)]
at_test_patients = ['4']
avnrt_test_patients = [str(x) for x in range(23, 37)]
avrt_test_patients = [str(x) for x in range(9, 21)]
ep_test_patients = [str(x) for x in range(9, 16)]

af_train_patients = [x for x in af_patients if x not in af_test_patients]
at_train_patients = [x for x in af_patients if x not in at_test_patients]
avnrt_train_patients = [x for x in avnrt_patients if x not in avnrt_test_patients]
avrt_train_patients = [x for x in avrt_patients if x not in avrt_test_patients]
ep_train_patients = [x for x in ep_patients if x not in ep_test_patients]


train_af_patients, val_af_patients = train_test_split(af_train_patients, test_size=0.2, random_state=2)
train_at_patients, val_at_patients = train_test_split(at_train_patients, test_size=0.2, random_state=3)
train_avnrt_patients, val_avnrt_patients = train_test_split(avnrt_train_patients, test_size=0.2, random_state=4)
train_avrt_patients, val_avrt_patients = train_test_split(avrt_train_patients, test_size=0.2, random_state=5)
train_ep_patients, val_ep_patients = train_test_split(ep_train_patients, test_size=0.2, random_state=6)

In [30]:
train_af_patients

['8', '3', '4', '1', '6', '9', '10']

In [31]:
training_data = pd.concat([X[(X['Type']=='af') & ([(x in train_af_patients) for x in X['Patient'].values])],
                          X[(X['Type']=='at') & ([(x in train_at_patients) for x in X['Patient'].values])],
                          X[(X['Type']=='avnrt') & ([(x in train_avnrt_patients) for x in X['Patient'].values])],
                          X[(X['Type']=='avrt') & ([(x in train_avrt_patients) for x in X['Patient'].values])],
                          X[(X['Type']=='ep') & ([(x in train_ep_patients) for x in X['Patient'].values])]],
                          ignore_index=True)

test_data = pd.concat([X[(X['Type']=='af') & ([(x in af_test_patients) for x in X['Patient'].values])],
                      X[(X['Type']=='at') & ([(x in at_test_patients) for x in X['Patient'].values])],
                      X[(X['Type']=='avnrt') & ([(x in avnrt_test_patients) for x in X['Patient'].values])],
                      X[(X['Type']=='avrt') & ([(x in avrt_test_patients) for x in X['Patient'].values])],
                      X[(X['Type']=='ep') & ([(x in ep_test_patients) for x in X['Patient'].values])]],
                      ignore_index=True)

validation_data = pd.concat([X[(X['Type']=='af') & ([(x in val_af_patients) for x in X['Patient'].values])],
                             X[(X['Type']=='at') & ([(x in val_at_patients) for x in X['Patient'].values])],
                             X[(X['Type']=='avnrt') & ([(x in val_avnrt_patients) for x in X['Patient'].values])],
                             X[(X['Type']=='avrt') & ([(x in val_avrt_patients) for x in X['Patient'].values])],
                             X[(X['Type']=='ep') & ([(x in val_ep_patients) for x in X['Patient'].values])]],
                             ignore_index=True)

In [32]:
print(np.sum((test_data['Label']=='0').astype(int)))
print(np.sum((training_data['Label']=='0').astype(int)))
print(np.sum((validation_data['Label']=='0').astype(int)))

951
830
246


In [33]:
print(train_af_patients)

['8', '3', '4', '1', '6', '9', '10']


In [34]:
print(len(train_at_patients) + len(train_avrt_patients) + len(train_avnrt_patients) + len(train_ep_patients))
print(len(val_af_patients) + len(val_avrt_patients) + len(val_avnrt_patients) + len(val_ep_patients))
print(len(avrt_test_patients) + len(avnrt_test_patients) + len(ep_test_patients))

38
10
33


In [35]:
training_data.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/training_data.pkl')
validation_data.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/validation_data.pkl')
test_data.to_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/test_data.pkl')