In [6]:
import pandas as pd
import seaborn
import numpy as np
from pylab import *
import glob, os

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, make_scorer

In [7]:
basedir = os.getcwd()
os.chdir(basedir)

In [14]:
def get_best_channels(channelsdf, nchannels=2):
    
    list_of_rows = [list(row) for row in channelsdf.values]
    
    channel_score_dict = {}
    for i, elem in enumerate(list_of_rows[0]):
        channel_score_dict[elem] = i

    for row, rowlist in enumerate(list_of_rows[1:]):
        for i, elem in enumerate(rowlist):
            channel_score_dict[elem] = channel_score_dict[elem] + i

    sorted_channels = sorted(channel_score_dict, key=channel_score_dict.get)
    best_channels = sorted_channels[:nchannels]
    
    return best_channels

def get_expression(channels):
    expression = 'df['
    for channel in best_channels:
        expression +='(df[\'channel\'] == \'' + channel + '\')|'
    return expression[:-1] + ']'

def split_train_test(data, test_ratio):
    test_set_size = int(len(data) * test_ratio)
    test_set = data.iloc[:test_set_size]
    train_set = data.iloc[test_set_size:]
    return train_set, test_set

def split_proportional(dataframe, test_ratio, target):
    
    df_seizures = dataframe[dataframe[target] == True]
    df_normal = dataframe[dataframe[target] == False]
    
    train_seizures, test_seizures = split_train_test(df_seizures, test_ratio)
    train_normal, test_normal = split_train_test(df_normal, test_ratio)
    
    df_train = pd.concat([train_normal, train_seizures], axis=0).reset_index()
    x_train, y_train = df_train[df_train.columns.difference([target])], df_train[target]
    df_test = pd.concat([test_normal, test_seizures], axis=0).reset_index()
    x_test, y_test = df_test[df_test.columns.difference([target])], df_test[target]
    
    return x_train, y_train, x_test, y_test

def train_linear_svm(x_train, y_train, scorers):
    # Define simple pipeline
    pipe_svc = Pipeline((
        ("scl", StandardScaler()),
        ("clf", SVC())
    ))
    # Establish params
    param_grid = [#{'clf__kernel': ['linear'], 'clf__C': [0.1,1,10,100]},
                 {'clf__kernel': ['rbf'], 'clf__gamma': [1e-1,1e-2, 1e-3, 1e-4], 'clf__C': [0.01,0.1,1,10, 100, 1000]}]

    model = GridSearchCV(estimator=pipe_svc,param_grid=param_grid, cv = 5, scoring=make_scorer(roc_auc_score), return_train_score=True, n_jobs = -1)
    model.fit(x_train, y_train)
    return model

def read_last_results(file, ncolumns):
    if os.path.isfile('svm_rbf_' + str(nchannels) + 'channels.csv'):
        past_df = pd.read_csv('svm_rbf_' + str(nchannels) + 'channels.csv', delimiter=',')
    else:
        zero_data = np.zeros([len(patients), ncolumns])
        past_df = pd.DataFrame(zero_data, columns = column_names)
    return past_df

In [15]:
patients = ['chb01', 'chb02', 'chb03', 'chb05', 'chb07', 'chb08', 'chb09','chb10', 'chb11']

In [16]:
nchannels = 1
column_names = ['patient', 'model', 'hyperparameters', 'sensitivity', 'specificity', 'roc_auc', 'precision', 'accuracy']

past_df = read_last_results('svm_rbf_' + str(nchannels) + 'channels.csv', len(column_names))

# Open file to store the results in a txt format, more detailed than the csv
f = open(basedir + '\\results' + str(nchannels) + 'channels.txt',"w+")

text = ''
data = np.zeros(8)
for i, patient in enumerate(patients):
    # Calculate most significant channels for patient
    channelsdf = pd.read_csv('..\DataSetCreation\Datasets\\' + patient + '_channel_order.csv', delimiter=',')
    best_channels = get_best_channels(channelsdf, nchannels)
    
    # Read the patient dataframe
    df = pd.read_hdf('..\DataSetCreation\Datasets\\' + patient + 'features.h5', key = 'fullpatient')
    
    # Extract only the best channels
    df = eval(get_expression(best_channels))
    df = df.drop(['channel'], axis=1)
    
    # Split the dataframe into train and test
    x_train, y_train, x_test, y_test = split_proportional(df, 0.2, 'seizure')
    
    # Train models
    svc_clf = train_linear_svm(x_train, y_train, scorers)
    
    f.write('-------------------------- Patient ' + patient + ' ------------------------------------\n')
    f.write(str(svc_clf.best_params_) + '\n')
    
    # Predict and see final model
    y_train_pred = svc_clf.predict(x_train)
    f.write(classification_report(y_train, y_train_pred) + '\n')
    y_true, y_pred = y_test, svc_clf.predict(x_test)
    f.write(classification_report(y_true, y_pred) + '\n')
    
    print('---------------------- Finished report for patient ' + patient + ' -----------------------')
    f.write('---------------------- Finished report for patient ' + patient + ' -----------------------\n\n\n')
    
    # Add result to the csv file
    cm = confusion_matrix(y_true, y_pred)
    sensitivity = cm[1][1]/(cm[1][1] + cm[1][0])
    specificity = cm[0][0]/(cm[0][0] + cm[0][1])
    precision = cm[1][1]/(cm[1][1] + cm[0][1])
    roc_score = roc_auc_score(y_true, y_pred)
    accuracy = (cm[0][0]+cm[1][1])/(sum(cm))
    
    # Replace if better results
    if (sensitivity + specificity + precision)/3 > (past_df.iloc[i]['sensitivity'] + past_df.iloc[i]['specificity'] + past_df.iloc[i]['precision'])/3:
        data = np.vstack((data, np.array([patient, 'svm_rbf', str(svc_clf.best_params_), sensitivity, specificity, precision, roc_score, accuracy])))
    else: 
        data = np.vstack((data, past_df.iloc[i].values))
        
# Close  text file
f.close() 

# Save df to csv file
data = data[1:, :]
dataframe = pd.DataFrame(data, columns = column_names)
dataframe.set_index('patient')
dataframe.to_csv('svm_rbf_' + str(nchannels) + 'channels.csv', index=False)

---------------------- Finished report for patient chb01 -----------------------
---------------------- Finished report for patient chb02 -----------------------
---------------------- Finished report for patient chb03 -----------------------
---------------------- Finished report for patient chb05 -----------------------
---------------------- Finished report for patient chb07 -----------------------
---------------------- Finished report for patient chb08 -----------------------
---------------------- Finished report for patient chb09 -----------------------
---------------------- Finished report for patient chb10 -----------------------
---------------------- Finished report for patient chb11 -----------------------


## Fine tune patient model

In [None]:
patient = 'chb08'
nchannels = 1
column_names = ['patient', 'model', 'hyperparameters', 'sensitivity', 'specificity', 'roc_auc', 'precision', 'accuracy']


In [None]:
# Read the results last dataframe and extracts the row
past_df = read_last_results('svm_rbf_' + str(nchannels) + 'channels.csv', len(column_names))
past_patient_data = past_df[past_df['patient'] == patient].values

# Calculate most significant channels for patient
channelsdf = pd.read_csv('..\DataSetCreation\Datasets\\' + patient + '_channel_order.csv', delimiter=',')
best_channels = get_best_channels(channelsdf, nchannels)
    
# Read the patient dataframe
df = pd.read_hdf('..\DataSetCreation\Datasets\\' + patient + 'features.h5', key = 'fullpatient')

# Extract only the best channels
df = eval(get_expression(best_channels))
df = df.drop(['channel'], axis=1)

In [42]:
# Split the dataframe into train and test
x_train, y_train, x_test, y_test = split_proportional(df, 0.2, 'seizure')

# Train models
svm_clf = Pipeline((
    ("scaler", StandardScaler()),
    ("linear_svc", svm.SVC(kernel = 'rbf', C=100, gamma = 0.01, max_iter = 100000000))
))
svm_clf.fit(x_train, y_train)

#y_scores = svm_clf.decision_function()

y_train_pred = cross_val_predict(svm_clf, x_train, y_train, cv=5, n_jobs = 12)

# Predict
y_train_pred = svm_clf.predict(x_train)
y_true, y_pred = y_test, svm_clf.predict(x_test)

In [43]:
print('Precision score: ' + str(precision_score(y_train, y_train_pred)))
print('Recall score: ' + str(recall_score(y_train, y_train_pred)))
print('F1 score: ' + str(f1_score(y_train, y_train_pred)))
print('Roc_auc score: ' + str(roc_auc_score(y_train, y_train_pred)))

Precision score: 0.9621993127147767
Recall score: 0.7650273224043715
F1 score: 0.852359208523592
Roc_auc score: 0.8817089794172553


In [44]:
print(classification_report(y_train, y_train_pred))
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       False       0.99      1.00      0.99      6835
        True       0.96      0.77      0.85       366

    accuracy                           0.99      7201
   macro avg       0.97      0.88      0.92      7201
weighted avg       0.99      0.99      0.99      7201

              precision    recall  f1-score   support

       False       0.98      0.99      0.99      1708
        True       0.86      0.68      0.76        91

    accuracy                           0.98      1799
   macro avg       0.92      0.84      0.87      1799
weighted avg       0.98      0.98      0.98      1799

