# Investigating the use of TSFresh
## 16/02/19
Here the use of TSFresh for feature selection will be investigated. Feature selection is performed only on the training data, preventing bias as much as possible. Unlike the previous feature based classification method, in which a much more manual approach towards feature selection was adopted, this method (in it's 'out of the box' implementation) does not attempt to normalise with respect to each patient.

In [261]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tsfresh
from sklearn.model_selection import train_test_split

import pdb

plt.style.use('default')

In [262]:
X_orig = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/Data/X_af.pkl')

In [263]:
X_orig.head()

Unnamed: 0,Channel,Coupling Interval,Data,Patient,S1/S2,Type,Label
0,CS1-2,340,"[-636, -617, -652, -560, -482, -415, -383, -46...",1,S2,af,0.0
1,CS1-2,340,"[-903.0, -873.0, -935.0, -941.0, -910.0, -845....",1,S1,af,
2,CS1-2,340,"[-931.0, -896.0, -896.0, -906.0, -858.0, -839....",1,S1,af,
3,CS3-4,340,"[472, 464, 491, 523, 553, 706, 1019, 1404, 164...",1,S2,af,0.0
4,CS3-4,340,"[298.0, 292.0, 303.0, 311.0, 299.0, 395.0, 451...",1,S1,af,


In [268]:
# X = pd.DataFrame(columns=['Type', 'Patient', 'Coupling Interval', 'Label', 'CS1-2', 'CS3-4', 'CS5-6', 'CS7-8'])
X_list = []
patients = ('1', '2', '3', '4', '5', '6', '8', '9', '10')
patient_type = 'af'
for patient in patients:
    for pulse_type in ('S1', 'S2'):
        patient_X = X_orig[(X_orig['Patient']==patient) & (X_orig['S1/S2']==pulse_type)]
        cis = patient_X['Coupling Interval'].unique()
        for ci in cis:
            ci_patient_X = patient_X[patient_X['Coupling Interval']==ci]
            # Extract data
            if 1 in np.float64(ci_patient_X['Label'].values):
                label = 1
            elif 0 in np.float64(ci_patient_X['Label'].values):
                label = 0
            else:
                label = np.nan

            # Sometimes 2 rows of data if S1.
            cs12_datas = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS1-2']['Data'])
            cs34_datas = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS3-4']['Data'])
            cs56_datas = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS5-6']['Data'])
            cs78_datas = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS7-8']['Data'])
            
            if isinstance(cs12_datas, pd.core.series.Series):
                for cs12_data, cs34_data, cs56_data, cs78_data in zip(cs12_datas, cs34_datas, cs56_datas, cs78_datas):
                    temp_dict = {}
                    temp_dict.update({'Type':'af', 'Patient':patient, 'Coupling Interval':ci, 'S1/S2':pulse_type,
                                      'Label':label, 'CS1-2':cs12_data, 'CS3-4':cs34_data, 'CS5-6':cs56_data, 
                                      'CS7-8':cs78_data})
                    X_list.append(temp_dict)
            else:
                temp_dict = {}
                temp_dict.update({'Type':'af', 'Patient':patient, 'Coupling Interval':ci, 'S1/S2':pulse_type,
                                  'Label':label, 'CS1-2':cs12_datas, 'CS3-4':cs34_datas, 'CS5-6':cs56_datas, 
                                  'CS7-8':cs78_datas})
            
            X_list.append(temp_dict)
    
X = pd.DataFrame(X_list)

In [269]:
X.head()

Unnamed: 0,CS1-2,CS3-4,CS5-6,CS7-8,Coupling Interval,Label,Patient,S1/S2,Type
0,"[-903.0, -873.0, -935.0, -941.0, -910.0, -845....","[298.0, 292.0, 303.0, 311.0, 299.0, 395.0, 451...","[1776.0, 2220.0, 1790.0, -4058.0, -15027.0, -2...","[6283.0, 6464.0, 5952.0, 4507.0, 3694.0, 2873....",340,,1,S1,af
1,"[-931.0, -896.0, -896.0, -906.0, -858.0, -839....","[339.0, 328.0, 348.0, 328.0, 299.0, 361.0, 349...","[1678.0, 2049.0, 2080.0, -1681.0, -11648.0, -1...","[6161.0, 6208.0, 5897.0, 4703.0, 3732.0, 2980....",340,,1,S1,af
2,"[-931.0, -896.0, -896.0, -906.0, -858.0, -839....","[339.0, 328.0, 348.0, 328.0, 299.0, 361.0, 349...","[1678.0, 2049.0, 2080.0, -1681.0, -11648.0, -1...","[6161.0, 6208.0, 5897.0, 4703.0, 3732.0, 2980....",340,,1,S1,af
3,"[-985.0, -925.0, -943.0, -919.0, -935.0, -877....","[255.0, 287.0, 321.0, 335.0, 396.0, 400.0, 491...","[2053.0, 2296.0, -448.0, -11020.0, -19196.0, -...","[5944.0, 5725.0, 4579.0, 3615.0, 2952.0, 1393....",330,,1,S1,af
4,"[-1025.0, -913.0, -885.0, -903.0, -864.0, -947...","[262.0, 285.0, 336.0, 349.0, 342.0, 490.0, 506...","[1726.0, 2111.0, 2289.0, -736.0, -11322.0, -19...","[6270.0, 6584.0, 6191.0, 4876.0, 3749.0, 2986....",330,,1,S1,af


## Reformatting the Data
In order to apply the TSFresh package to this dataset, the data must be somewhat re-formatted. First, group together the data such that each row corresponds to a single file (with four columns containing the data in CS1-2, CS3-4, CS5-6 and CS7-8 respectively).

In [270]:
X.to_pickle('/Users/matthewashman/github/MasterProject2018/Data/X_af_compact.pkl')

In [225]:
# X = pd.DataFrame(columns=['Type', 'Patient', 'Coupling Interval', 'Label', 'CS1-2', 'CS3-4', 'CS5-6', 'CS7-8'])
X_list = []
patients = ('1', '2', '3', '4', '5', '6', '8', '9', '10')
patient_type = 'af'
for patient in patients:
    patient_X = X_orig[(X_orig['Patient']==patient) & (X_orig['S1/S2']=='S2')]
    cis = patient_X['Coupling Interval'].unique()
    for ci in cis:
        ci_patient_X = patient_X[patient_X['Coupling Interval']==ci]
        # Extract data
        if 1 in np.float64(ci_patient_X['Label'].values):
            label = 1
        else:
            label = 0
            
        cs12_data = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS1-2']['Data'])
        cs34_data = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS3-4']['Data'])
        cs56_data = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS5-6']['Data'])
        cs78_data = np.squeeze(ci_patient_X[ci_patient_X['Channel']=='CS7-8']['Data'])
        
        temp_dict = {}
        temp_dict.update({'Type':'af', 'Patient':patient, 'Coupling Interval':ci, 
                                 'Label':label, 'CS1-2':cs12_data, 'CS3-4':cs34_data, 
                                 'CS5-6':cs56_data, 'CS7-8':cs78_data})
        X_list.append(temp_dict)
        
X = pd.DataFrame(X_list)

In [226]:
X.head()

Unnamed: 0,CS1-2,CS3-4,CS5-6,CS7-8,Coupling Interval,Label,Patient,Type
0,"[-636, -617, -652, -560, -482, -415, -383, -46...","[472, 464, 491, 523, 553, 706, 1019, 1404, 164...","[4815, 1298, -9337, -27488, -32768, -26740, -9...","[16884, 13247, 9806, 7552, 5619, 2160, -1810, ...",340,0,1,af
1,"[-696, -663, -601, -642, -557, -566, -480, -44...","[447, 513, 501, 515, 692, 1031, 1448, 1809, 18...","[871, -6797, -19677, -31155, -29543, -16492, 5...","[13795, 10194, 7900, 5749, 2226, -1913, -4211,...",330,0,1,af
2,"[-555, -526, -595, -680, -540, -489, -422, -43...","[418, 374, 436, 549, 640, 852, 1166, 1395, 139...","[825, -3065, -9139, -18461, -26311, -24543, -9...","[10616, 9608, 7508, 5167, 2660, -463, -2864, -...",320,0,1,af
3,"[-619, -524, -514, -438, -463, -436, -425, -43...","[483, 445, 459, 587, 816, 1149, 1551, 1844, 20...","[1671, -5160, -12348, -18340, -23745, -21941, ...","[13584, 12742, 9975, 5823, 1403, -2069, -3543,...",310,0,1,af
4,"[-577, -532, -513, -480, -492, -506, -270, -26...","[337, 344, 320, 366, 502, 739, 1129, 1489, 164...","[2513, -32, -6171, -12617, -15992, -16099, -14...","[10876, 8574, 8187, 7449, 4346, 151, -3071, -3...",300,0,1,af


Here the data is split into training a test prior to feature selection. This prevents unrealistic performance estimates.

In [171]:
X_train, X_test = train_test_split(X, test_size=0.3)

TSFresh requires the data to appear in columns, i.e. not in a single cell as in X. Here, X is unpacked such that each row unravels its contents (150 dimensional arrays in CS1-2, CS3-4, CS5-6) into 150 rows. Unique ID's indicate the original row in X that has been unpacked.

In [187]:
X_train_ts = pd.DataFrame(columns=['CS1-2', 'CS3-4', 'CS5-6','ID'])
X_test_ts = pd.DataFrame(columns=['CS1-2', 'CS3-4', 'CS5-6','ID'])
y_train = pd.DataFrame(columns=['ID', 'Label'])
y_test = pd.DataFrame(columns=['ID', 'Label'])
patients = ('1', '2', '3', '4', '5', '6', '8', '9', '10')
patient_type = 'af'
for patient in patients:
    patient_X_train = X_train[(X_train['Patient']==patient)]
    patient_X_test = X_test[(X_test['Patient']==patient)]
    train_cis = patient_X_train['Coupling Interval'].unique()
    test_cis = patient_X_test['Coupling Interval'].unique()
    
    for ci in train_cis:
        ci_patient_X = patient_X_train[patient_X_train['Coupling Interval']==ci]
        # Extract data
        cs12_data = np.float64(np.squeeze(ci_patient_X['CS1-2']))
        cs34_data = np.float64(np.squeeze(ci_patient_X['CS3-4']))
        cs56_data = np.float64(np.squeeze(ci_patient_X['CS5-6']))
        
        X_train_ts_row = pd.DataFrame({'CS1-2':cs12_data, 'CS3-4':cs34_data, 'CS5-6':cs56_data, 'ID':int(patient + ci)})
        X_train_ts = X_train_ts.append(X_train_ts_row, ignore_index=True)

        labels = ci_patient_X['Label'].values
        y_train_row = pd.DataFrame({'ID':int(patient + ci), 'Label':labels})
        y_train = y_train.append(y_train_row, ignore_index=True)
        
    for ci in test_cis:
        ci_patient_X = patient_X_test[patient_X_test['Coupling Interval']==ci]
        # Extract data
        cs12_data = np.float64(np.squeeze(ci_patient_X['CS1-2']))
        cs34_data = np.float64(np.squeeze(ci_patient_X['CS3-4']))
        cs56_data = np.float64(np.squeeze(ci_patient_X['CS5-6']))
        
        X_test_ts_row = pd.DataFrame({'CS1-2':cs12_data, 'CS3-4':cs34_data, 'CS5-6':cs56_data, 'ID':int(patient + ci)})
        X_test_ts = X_test_ts.append(X_test_ts_row, ignore_index=True)

        labels = ci_patient_X['Label'].values
        y_test_row = pd.DataFrame({'ID':int(patient + ci), 'Label':labels})
        y_test = y_test.append(y_test_row, ignore_index=True)

## Feature Extraction/Selection
First we extract all the features, with rows grouped by the column 'ID'.

In [174]:
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
X_train_features = extract_features(X_train_ts, column_id="ID", impute_function= impute)












Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  10%|█         | 1/10 [00:12<01:52, 12.54s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  20%|██        | 2/10 [00:12<01:10,  8.84s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  30%|███       | 3/10 [00:25<01:10, 10.13s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  40%|████      | 4/10 [00:26<00:43,  7.31s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  50%|█████     | 5/10 [00:38<00:43,  8.65s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  60%|██████    | 6/10 [00:39<00:25,  6.25s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  70%|███████   | 7/10 [00:50<00:23,  7.78s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  80%|████████  | 8/10 [00:51<00:11,  5.66s/it][A[A[A[A[A[A[A[A[A[A[A










Feature

In [188]:
X = X_train_features.sort_index()
y_train = y_train.set_index('ID')
y_train = y_train.sort_index()

Select the most relevant features. fdr_level = 0.01 = 1% probability of features being irrelevant

In [191]:
from tsfresh import select_features
X_train_features_select = select_features(X_train_features, y_train['Label'], fdr_level = 0.01)





















Interestingly, most of the selected relevant features are those of channel CS5-6. This indicates that when CS5-6 show fractionation, all the others are very likely to show fractionation too (and not visa-versa). Thus, CS5-6 is the best indicator of fractionation across all channels.

In [192]:
X_train_features_select.head()

variable,CS5-6__ratio_beyond_r_sigma__r_0.5,CS5-6__ratio_beyond_r_sigma__r_1,CS5-6__energy_ratio_by_chunks__num_segments_10__segment_focus_2,CS5-6__energy_ratio_by_chunks__num_segments_10__segment_focus_3,CS5-6__energy_ratio_by_chunks__num_segments_10__segment_focus_0,CS5-6__index_mass_quantile__q_0.6,CS5-6__approximate_entropy__m_2__r_0.7,CS5-6__sample_entropy,CS5-6__kurtosis,CS5-6__approximate_entropy__m_2__r_0.5,...,CS1-2__index_mass_quantile__q_0.8,CS1-2__energy_ratio_by_chunks__num_segments_10__segment_focus_3,CS5-6__binned_entropy__max_bins_10,CS5-6__index_mass_quantile__q_0.4,CS5-6__ratio_beyond_r_sigma__r_1.5,CS5-6__approximate_entropy__m_2__r_0.3,CS5-6__index_mass_quantile__q_0.3,CS5-6__energy_ratio_by_chunks__num_segments_10__segment_focus_1,CS5-6__energy_ratio_by_chunks__num_segments_10__segment_focus_4,CS1-2__approximate_entropy__m_2__r_0.9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1230,0.306667,0.226667,0.389351,0.309102,0.102294,0.273333,0.272799,0.966525,4.136314,0.251302,...,0.346667,0.017104,1.507773,0.233333,0.146667,0.209728,0.166667,0.183249,0.014467,0.10986
1250,0.346667,0.18,0.512333,0.152297,0.15272,0.293333,0.232184,1.126929,6.941752,0.250532,...,0.3,0.024837,1.268454,0.26,0.12,0.226867,0.18,0.14404,0.036094,0.17275
1260,0.333333,0.206667,0.170208,0.132965,0.321111,0.22,0.238401,0.962065,3.719279,0.224825,...,0.26,0.001805,1.367337,0.146667,0.153333,0.177601,0.12,0.371199,0.001559,0.110333
1270,0.18,0.1,0.014638,0.000275,0.452284,0.113333,0.078348,0.637033,14.048838,0.123491,...,0.34,0.024028,0.83065,0.1,0.073333,0.119954,0.08,0.529753,5.2e-05,0.106461
1280,0.14,0.1,0.000415,2.2e-05,0.796787,0.1,0.088265,0.346816,18.545891,0.073473,...,0.246667,0.005284,0.74308,0.053333,0.08,0.066305,0.053333,0.202223,5e-05,0.043524


Now we extract all the features for the test dataset and extract the ones that were identified as relevant.

In [193]:
X_test_features = extract_features(X_test_ts, column_id="ID", impute_function= impute)












Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  10%|█         | 1/10 [00:06<00:57,  6.42s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  20%|██        | 2/10 [00:06<00:36,  4.54s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  30%|███       | 3/10 [00:12<00:34,  4.99s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  40%|████      | 4/10 [00:12<00:21,  3.57s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  50%|█████     | 5/10 [00:17<00:20,  4.01s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  60%|██████    | 6/10 [00:18<00:12,  3.13s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  70%|███████   | 7/10 [00:23<00:10,  3.52s/it][A[A[A[A[A[A[A[A[A[A[A










Feature Extraction:  80%|████████  | 8/10 [00:24<00:05,  2.84s/it][A[A[A[A[A[A[A[A[A[A[A










Feature

In [194]:
X_test_features_select = X_test_features[X_train_features_select.columns]

In [195]:
y_test = y_test.set_index('ID')

In [196]:
y_train = y_train.sort_index()
y_test = y_test.sort_index()

In [202]:
X_train_array = X_train_features_select.values
X_test_array = X_test_features_select.values

y_train_array = np.float64(y_train.values)
y_test_array = np.float64(y_test.values)
y_train_array = np.squeeze(y_train_array)
y_test_array = np.squeeze(y_test_array)

## Model Training
Here, we train a selection of common Machine Learning algorithms. The cross validation score in the training dataset is very good. However, as the features were selected using the training dataset these results are expected to be optimistic relative to the performance on the test dataset.

In [205]:
from sklearn import svm, naive_bayes, neighbors, gaussian_process
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import cross_val_score
import xgboost

models = (svm.SVC(), naive_bayes.GaussianNB(), neighbors.KNeighborsClassifier(), 
          gaussian_process.GaussianProcessClassifier(kernel=1.0*RBF(1)), xgboost.XGBClassifier())
model_names = ('SVM', 'Naive Bayes', 'KNN', 'GP', 'XGBoost')
print('Cross validation scores on combined data:')
for clf, model_name in zip(models, model_names):
    print(model_name)
    print(cross_val_score(clf, X_train_array, y_train_array, cv=3))

Cross validation scores on combined data:
SVM
[0.96428571 0.92592593 0.80769231]
Naive Bayes
[0.92857143 0.88888889 0.84615385]
KNN
[0.96428571 0.81481481 0.80769231]
GP
[0.96428571 0.88888889 0.84615385]
XGBoost
[0.85714286 0.88888889 0.88461538]


  if diff:
  if diff:
  if diff:


Here the models are evaluated on the test dataset. As expected, the test scores are significantly lower than the cross validation scores. Considering that no patient normalisation was attempted, the scores are very good. It will be worth looking at which type of features were deemed as relevant, such that they can be used in the more 'manual' feature extraction and selection method.

In [206]:
print('Test score on combined data:')
for clf, model_name in zip(models, model_names):
    print(model_name)
    clf.fit(X_train_array, y_train_array)
    print(clf.score(X_test_array, y_test_array))

Test score on combined data:
SVM
0.8055555555555556
Naive Bayes
0.8333333333333334
KNN
0.75
GP
0.8611111111111112
XGBoost
0.8611111111111112


  " state: %s" % convergence_dict)
  if diff:


In [209]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    
    # Begin CHANGES
    fst_empty_cell = (columnwidth-3)//2 * " " + "t/p" + (columnwidth-3)//2 * " "
    
    if len(fst_empty_cell) < len(empty_cell):
        fst_empty_cell = " " * (len(empty_cell) - len(fst_empty_cell)) + fst_empty_cell
    # Print header
    print("    " + fst_empty_cell, end=" ")
    # End CHANGES
    
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
        
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()

And finally, the confusion matrix. 4/14 examples of fractionated responses were identified as non-fractionated. This is somewhat expected given the unbalanced nature of the dataset. Perhaps augmenting fractionated responses would help here.

In [210]:
from sklearn.metrics import confusion_matrix

predictions = clf.predict(X_test_array)
cm = confusion_matrix(y_test_array, predictions)
print_cm(cm, ['Not Fractionated','Fractionated'])

           t/p       Not Fractionated     Fractionated 
    Not Fractionated             21.0              1.0 
        Fractionated              4.0             10.0 


  if diff:
