### Import Packages

In [None]:
import numpy as np
import pandas as pd

### Get Date for documentation

In [None]:
def get_date_yyyymmdd():
    """Return the current local date as a zero-padded ``YYYYMMDD`` string.

    Returns
    -------
    str
        Eight characters, e.g. ``'20191219'``; day and month are zero-padded.
    """
    import datetime

    # strftime already zero-pads day and month, so no manual padding is needed
    return datetime.datetime.now().strftime('%Y%m%d')


# Date stamp used in the name of the results workbook
date = get_date_yyyymmdd()


### Data Files Path

In [None]:
# Path of the saved VT MAP data file; it is read further down with np.load.
'''VT MAP FILES'''
VTMAP_data_file_path = r'../Data/VT_MAP_Data_20191219.npz'

# Outcome selection: exactly one (excelSheetPath, output_label) pair must be active.
# excelSheetPath is the workbook holding the cross-validation sheets and
# output_label is the name of the column read as the label of each patient.
'''Choose one from below (uncomment one outcome)'''
'''VTVF Outcome'''
excelSheetPath = r'../Data/VTVF Split.xlsx'
output_label = 'VT_VF in 3Y'

#OR

'''Mortality Outcome'''
#excelSheetPath = r'../Data/Mortality split.xlsx'
#output_label = 'Mortality'

### Configurables

In [None]:
'''Choose number of features to build model'''
# How many features are kept for the SVM: those with the largest absolute
# coefficients of the L1-penalised logistic regression. Also part of the results file name.
numberOfFeatures = 40 

### Load Data

In [None]:
def load_VT_MAP_data(file):
    """Load the Python object stored in a ``.npy`` or ``.npz`` file.

    Parameters
    ----------
    file : str or os.PathLike
        Path of the file. For a ``.npz`` archive the object is read from the
        member named ``'data'``.

    Returns
    -------
    object
        The object wrapped in the stored 0-d array (unwrapped with ``.item()``).
    """
    import os

    path = os.fspath(file)  # accept pathlib.Path as well as str

    # SECURITY: allow_pickle=True unpickles the file content, which can execute
    # arbitrary code. Only load files that come from a trusted source.
    loaded = np.load(path, allow_pickle = True)
    if path.endswith('.npz'):
        # An .npz archive keeps its file handle open until it is closed, so
        # read the member inside a context manager.
        with loaded as archive:
            return archive['data'].item()
    return loaded.item()

# Load the VT MAP data once; further down it is indexed with the patient name taken from the outcome sheet
data = load_VT_MAP_data(VTMAP_data_file_path)

### Initialize Excel Writer

In [None]:
#Excel writer that gathers every DataFrame and result table into one .xlsx workbook.
#The file name carries the run date, the number of features and the outcome label.
results_file_name = ''.join([
    '../Results/SVC_Results_', date, '_', str(numberOfFeatures),
    '_features_', output_label, '.xlsx',
])
writer = pd.ExcelWriter(results_file_name, engine='xlsxwriter')

### Create function to transform data into tsfresh format

In [None]:
def data_to_tsfresh_input_format(key,value,output):
    """Convert the recordings of one patient into tsfresh long-format frames.

    Parameters
    ----------
    key : str or number
        Numeric patient identifier. ``float(key) * 10000`` is the id of the
        first recording; recording ``i`` gets that id plus ``i``.
    value : array-like, 2-D
        One time series per row (rows are recordings, columns are samples).
    output : scalar
        Outcome label assigned to every recording of this patient.

    Returns
    -------
    x_df : pandas.DataFrame
        Columns ``['id', 'time', 'voltages']`` with one row per sample.
    y_ds : pandas.DataFrame
        Columns ``['id', 'outcome']`` with one row per recording.

    Raises
    ------
    ValueError
        If ``value`` is not two-dimensional or ``key`` is not numeric.
    """
    value = np.asarray(value)
    n_series, n_samples = value.shape

    # Round instead of truncating: a product such as float('x.2') * 10000 can
    # end up a hair below the intended integer, and truncation would then
    # shift every id of the patient by one.
    first_id = int(round(float(key) * 10000))
    series_ids = first_id + np.arange(n_series)

    ids_time = np.column_stack((
        np.repeat(series_ids, n_samples),         # recording id, once per sample
        np.tile(np.arange(n_samples), n_series),  # 0..n_samples-1 for each recording
    )).astype(int)

    # Appending the samples promotes the integer columns to the dtype of value
    x = np.append(ids_time, value.reshape(-1, 1), axis = 1)
    x_df = pd.DataFrame(data = x, columns = ['id','time','voltages'])

    y = np.append(series_ids.reshape(-1, 1).astype(int),
                  np.ones((n_series, 1))*output, axis = 1)
    y_ds = pd.DataFrame(y, columns = ['id','outcome'])

    return  x_df, y_ds

### Transform data to tsfresh format

In [None]:
#load outcome excel sheet into pd object (kept open: the cross-validation loop reads more sheets from it)
outcome_sheet_all = pd.ExcelFile(excelSheetPath)

#Collect the per-patient frames in lists and concatenate once at the end.
#DataFrame.append copied the whole frame on every call and no longer exists in pandas 2.
x_frames = []
y_frames = []
for sheet in ['CV1']:#Choose any sheet in the excel sheet to populate the database
    long_term_outcome = pd.read_excel(outcome_sheet_all, sheet_name = sheet)

    #iterate through the rows to get the name of each file
    for index, row in long_term_outcome.iterrows():
        output = row[output_label]#Get Label associated with patient id
        patient_name = str(row['patient'])
        #Some patients have a second file marked with '_B'; it gets the numeric key '<id>.2'
        #so that its tsfresh ids stay numeric and distinct from those of the first file
        if patient_name.endswith('_B'):
            tsfresh_key = patient_name[:-2] + '.2'
        else:
            tsfresh_key = patient_name
        x_df,y = data_to_tsfresh_input_format(tsfresh_key,data[patient_name],output)

        #Add each file to the entire database
        x_frames.append(x_df)
        y_frames.append(y)

#pd.concat raises on an empty list, so fall back to the empty frames the original code started from
all_df = pd.concat(x_frames) if x_frames else pd.DataFrame()
all_y = pd.concat(y_frames) if y_frames else pd.DataFrame()

### Feature extraction using tsfresh

In [None]:
#tsfresh entry points used in this cell
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

#Feature extraction is skipped when a previous run of this cell already left the result in the session
if 'extracted_features' not in globals():
    extracted_features = extract_features(all_df, column_id="id", column_sort='time')

#impute works in place: -inf, +inf and nan are replaced by the min, max and median of each feature
impute(extracted_features)

#label series indexed by recording id; labels and features are both sorted by id
all_y_series = pd.Series(all_y['outcome'].values, index=all_y['id']).sort_index(axis = 0)
extracted_features = extracted_features.sort_index(axis = 0)

#keep only the features that tsfresh selects for the classification task
features_filtered = select_features(extracted_features, all_y_series, ml_task='classification')


### Remove all features with correlation 0.9 or more

In [None]:
#drop all but one features with correlation of 0.9 or more
#drop_input_corr_columns is a project-local helper whose code is not part of this file.
#NOTE(review): presumably it returns (frame without the highly correlated columns, the dropped columns) - confirm.
from drop_corr_features import drop_input_corr_columns
features_filtered_no_colnr, to_drop = drop_input_corr_columns(features_filtered, corr_fac = 0.9)
#remove every column that still contains a missing value
features_filtered_no_colnr = features_filtered_no_colnr.dropna(axis='columns')

### Standardize feature using mean and standard deviation

In [None]:
#Z-score every feature column: subtract the column mean, then divide by the column standard deviation
features_mean = features_filtered_no_colnr.mean()
features_std = features_filtered_no_colnr.std()
features_centered = features_filtered_no_colnr.sub(features_mean, axis='columns')
features_filtered_no_colnr_norm = features_centered.div(features_std, axis='columns')

### Apply logistic regression with L1 penalty to choose features (features with highest coefficients in absolute value)

In [None]:
#Use logistic regression with L1 penalty to get features with highest coefficient in absolute value
from sklearn.linear_model import LogisticRegression

#multi_class='ovr' is no longer passed: the keyword is deprecated in recent scikit-learn
#releases, and with solver='liblinear' one-vs-rest is what the default selects anyway.
#NOTE(review): the model is fitted on every recording, including those that later form the
#validation part of each split, so the feature ranking has seen the validation labels - confirm this is intended.
log_reg = LogisticRegression(penalty='l1', random_state = 0, solver='liblinear',
                             C = 0.05, max_iter = 1000)
log_reg = log_reg.fit(features_filtered_no_colnr_norm, all_y_series)

#coefficient matrix; the training loop ranks the features by the absolute values of its first row
logReg_coef = log_reg.coef_


### Train and test SVM classification model for all CVs

In [None]:
#Imports are needed only once, so they are hoisted out of the cross-validation loop
from sklearn.svm import SVC
from calc_metrics_v2 import calc_classification_metrics

#initialize panda dataframe to save results
all_y_results = pd.DataFrame()
SVM_results_Train = pd.DataFrame()
SVM_results_Val = pd.DataFrame()
#One dict per split; turned into the result frames after the loop
#(DataFrame.append no longer exists in pandas 2)
train_result_rows = []
val_result_rows = []

#get features with highest coefficients - in absolute value.
#logReg_coef does not change between splits, so the ranking is computed once.
topFeaturesIdx = np.argsort(np.abs(logReg_coef[0,:]))[-numberOfFeatures:]

#Patient id of every feature row = first five characters of its tsfresh id.
#NOTE(review): this assumes five-character patient ids; a patient listed with a '_B' suffix in the
#split sheet never matches such a prefix and is only covered through the base id - confirm.
row_patient_ids = [str(row_id)[:5] for row_id in features_filtered_no_colnr_norm.index]


def _svm_result_row(cv_name, split_name, metrics):
    """Build one summary row from the value returned by calc_classification_metrics."""
    #NOTE(review): positions 1-5 are read as accuracy, sensitivity, specificity, PPV and NPV,
    #exactly as the original code did; confirm against calc_metrics_v2.
    return {'num of top features': numberOfFeatures, 'Cross Validation': cv_name, 'split': split_name,
            'Accuracy': metrics[1], 'Sensitivity': metrics[2],
            'Specificity': metrics[3], 'PPV': metrics[4],
            'NPV': metrics[5]}


#Train and test model for all CVs splits
for sheet in ['CV1','CV2','CV3','CV4','CV5','CV6','CV7','CV8','CV9','CV10']:
    print(sheet)
    long_term_outcome = pd.read_excel(outcome_sheet_all, sheet_name = sheet)

    #get train and validation splits (split_num 0 = train, 1 = validation, anything else is ignored)
    Train_dict = {}
    Val_dict = {}
    for _, row in long_term_outcome.iterrows():
        split_num = row['split_num']
        if split_num == 0:
            Train_dict[str(row['patient'])] = 1
        elif split_num == 1:
            Val_dict[str(row['patient'])] = 1

    #positions of the feature rows belonging to each split; train wins if a patient is in both
    train_rows = [pos for pos, pid in enumerate(row_patient_ids) if pid in Train_dict]
    val_rows = [pos for pos, pid in enumerate(row_patient_ids)
                if pid not in Train_dict and pid in Val_dict]

    #populate training and validation inputs/outputs with one positional selection each
    #instead of appending row by row; labels are matched to features by position
    Train_pd = features_filtered_no_colnr_norm.iloc[train_rows]
    Train_Y = all_y_series.iloc[train_rows]
    Val_pd = features_filtered_no_colnr_norm.iloc[val_rows]
    Val_Y = all_y_series.iloc[val_rows]

    #remove all features but top x coefficients as selected from train/val inputs
    Train_pd_top = Train_pd.iloc[:,topFeaturesIdx]
    Val_pd_top = Val_pd.iloc[:,topFeaturesIdx]

    #SVC model
    svc_model = SVC(kernel='linear', C=1).fit(Train_pd_top,Train_Y)#train model

    #get model predictions for train/val datasets
    y_pred_Train_Features = svc_model.predict(Train_pd_top)
    y_pred_Val_Features = svc_model.predict(Val_pd_top)

    #calculate metrics for results
    Train_met_Features = calc_classification_metrics(Train_Y.values,y_pred_Train_Features)
    Val_met_Features = calc_classification_metrics(Val_Y.values,y_pred_Val_Features)

    #populate results
    train_result_rows.append(_svm_result_row(sheet, 'Train', Train_met_Features))
    val_result_rows.append(_svm_result_row(sheet, 'Val', Val_met_Features))

    #per-recording true and predicted labels, one worksheet per split and CV sheet
    Val_results = Val_Y.to_frame().rename(columns={0: "True Labels"})
    Val_results['Predicted Labels'] = y_pred_Val_Features

    Train_results = Train_Y.to_frame().rename(columns={0: "True Labels"})
    Train_results['Predicted Labels'] = y_pred_Train_Features

    Val_results.to_excel(writer, sheet_name='Validation single labels '+ sheet)
    Train_results.to_excel(writer, sheet_name='Training single labels '+ sheet)

#summary tables with one row per cross-validation sheet
SVM_results_Train = pd.DataFrame(train_result_rows)
SVM_results_Val = pd.DataFrame(val_result_rows)

In [None]:
### Save results to Excel file

In [None]:
# Write each dataframe to a different worksheet.
SVM_results_Val.to_excel(writer, sheet_name='Results_Val')
SVM_results_Train.to_excel(writer, sheet_name='Results_Train')

# Close the Pandas Excel writer and output the Excel file.
# close() is used because ExcelWriter.save() was deprecated and then removed from pandas;
# close() writes the workbook in old and new versions alike.
writer.close()