In [1]:
import pandas as pd
import seaborn
import numpy as np
from pylab import *
import glob, os
import pickle

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, make_scorer, average_precision_score
from joblib import dump, load

In [2]:
# Remember the directory the kernel was started in; the result-file paths
# used further down are built by prefixing this value.
basedir = os.getcwd()
# NOTE(review): basedir is the current directory already, so this chdir does
# not move anywhere. It only matters if basedir is later edited by hand.
os.chdir(basedir)

In [7]:
def get_best_channels(channelsdf, nchannels=2):
    """Pick the channels with the lowest summed column position.

    Every row of ``channelsdf`` is read as an ordering of channel names.
    A channel's score is the sum of the column positions at which it appears
    across all rows; the ``nchannels`` names with the smallest scores are
    returned, lowest score first. Ties keep the order of the first row.

    Parameters
    ----------
    channelsdf : pandas.DataFrame
        One ordering of channel names per row (presumably best-to-worst
        rankings -- confirm against the code that writes these files).
    nchannels : int, default 2
        Number of channel names to return.

    Returns
    -------
    list
        Up to ``nchannels`` channel names.

    Raises
    ------
    IndexError
        If ``channelsdf`` has no rows.
    KeyError
        If a later row contains a name that is absent from the first row.
    """
    orderings = [list(values) for values in channelsdf.values]

    # Seed the scores with the positions found in the first row.
    scores = {name: position for position, name in enumerate(orderings[0])}

    # Accumulate the positions found in every remaining row.
    for ordering in orderings[1:]:
        for position, name in enumerate(ordering):
            scores[name] += position

    # sorted() is stable, so equal scores keep first-row order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1])
    return [name for name, _ in ranked[:nchannels]]

def get_expression(channels):
    """Build a pandas filter expression selecting rows of the given channels.

    The result is source text of the form
    ``df[(df['channel'] == 'A')|(df['channel'] == 'B')]`` and refers to a
    variable literally named ``df`` in the scope where it is evaluated.

    The channel names are taken from the ``channels`` argument. Earlier the
    function ignored its argument and read a global ``best_channels``.

    NOTE(review): the string is meant to be passed to ``eval``. Channel names
    are spliced in unescaped, so this is only safe for trusted names.
    ``df[df['channel'].isin(channels)]`` gives the same selection without
    ``eval``.

    Parameters
    ----------
    channels : sequence of str
        Channel names to keep; must not be empty.

    Returns
    -------
    str
        The filter expression.

    Raises
    ------
    ValueError
        If ``channels`` is empty (no valid expression can be built).
    """
    if len(channels) == 0:
        raise ValueError('channels must contain at least one channel name')

    conditions = ['(df[\'channel\'] == \'' + channel + '\')' for channel in channels]
    return 'df[' + '|'.join(conditions) + ']'

def split_train_test(data, test_ratio):
    """Cut ``data`` positionally into a train part and a test part.

    The first ``int(len(data) * test_ratio)`` rows become the test set and
    the remaining rows the train set. No shuffling happens here.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    test_ratio : float
        Fraction of rows assigned to the test set.

    Returns
    -------
    tuple
        ``(train_set, test_set)``.
    """
    n_test = int(len(data) * test_ratio)
    return data.iloc[n_test:], data.iloc[:n_test]

def split_proportional(dataframe, test_ratio, target):
    """Shuffle and split so both classes keep the same train/test ratio.

    The frame is shuffled, separated into rows where ``target`` equals True
    and rows where it equals False, and each group is cut with
    ``split_train_test``. The train halves and the test halves are then
    concatenated, shuffled again and re-indexed from 0.

    Notes
    -----
    * Rows whose target is neither True nor False are dropped.
    * Shuffling uses ``DataFrame.sample`` without a seed, so the split
      differs between runs.
    * Feature columns are chosen with ``columns.difference([target])``,
      which returns them in sorted order rather than original order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features plus the target column.
    test_ratio : float
        Fraction of each class assigned to the test set.
    target : str
        Name of the boolean target column.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.
    """
    shuffled = dataframe.sample(frac=1)
    positives = shuffled[shuffled[target] == True]
    negatives = shuffled[shuffled[target] == False]

    train_pos, test_pos = split_train_test(positives, test_ratio)
    train_neg, test_neg = split_train_test(negatives, test_ratio)

    def _assemble(negative_part, positive_part):
        # Merge both classes, mix them and return (features, labels).
        merged = pd.concat([negative_part, positive_part], axis=0)
        merged = merged.sample(frac=1).reset_index(drop=True)
        feature_columns = merged.columns.difference([target])
        return merged[feature_columns], merged[target]

    # Train is assembled before test to keep the order of the random draws.
    x_train, y_train = _assemble(train_neg, train_pos)
    x_test, y_test = _assemble(test_neg, test_pos)

    return x_train, y_train, x_test, y_test

def train_random_forest(x_train, y_train):
    """Grid-search a scaled random-forest classifier.

    A ``StandardScaler`` -> ``RandomForestClassifier`` pipeline is tuned over
    the number of trees and the maximum number of leaf nodes using 5-fold
    cross-validation on all available cores.

    The scorer is ``make_scorer(roc_auc_score)`` with default arguments, so
    the AUC is computed from the pipeline's predicted labels rather than from
    probabilities.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    forest_pipeline = Pipeline((
        ("scl", StandardScaler()),
        ("clf", RandomForestClassifier()),
    ))

    # Candidate hyperparameters (6 x 5 combinations).
    search_space = [{
        'clf__n_estimators': [50, 100, 200, 300, 400, 500],
        'clf__max_leaf_nodes': [4, 8, 16, 32, 64],
    }]

    search = GridSearchCV(
        estimator=forest_pipeline,
        param_grid=search_space,
        cv=5,
        scoring=make_scorer(roc_auc_score),
        return_train_score=True,
        n_jobs=-1,
    )
    return search.fit(x_train, y_train)

def train_knn(x_train, y_train):
    """Grid-search a scaled k-nearest-neighbours classifier.

    A ``StandardScaler`` -> ``KNeighborsClassifier`` pipeline is tuned over
    the neighbour count (1 to 10), the Minkowski power ``p`` (1 or 2) and the
    weighting scheme, using 5-fold cross-validation on all available cores.

    The neighbour grid starts at 1: ``n_neighbors=0`` is rejected by
    ``KNeighborsClassifier``, so the previous ``range(11)`` grid produced
    candidates that could never be fitted.

    The scorer is ``make_scorer(roc_auc_score)`` with default arguments, so
    the AUC is computed from the pipeline's predicted labels rather than from
    probabilities.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    pipe_knn = Pipeline([
        ("scl", StandardScaler()),
        ("clf", KNeighborsClassifier())
    ])

    param_grid = [{
        'clf__n_neighbors': list(range(1, 11)),  # 1..10; 0 is not a valid neighbour count
        'clf__p': [1, 2],
        'clf__weights': ['uniform', 'distance'],
    }]

    model = GridSearchCV(estimator=pipe_knn, param_grid=param_grid, cv=5,
                         scoring=make_scorer(roc_auc_score),
                         return_train_score=True, n_jobs=-1)
    model.fit(x_train, y_train)
    return model

def train_linear_svm(x_train, y_train):
    """Grid-search a scaled support-vector classifier.

    Despite the function name, only an RBF kernel is searched: the
    linear-kernel grid is disabled. A ``StandardScaler`` -> ``SVC`` pipeline
    is tuned over ``gamma`` and ``C`` with probability estimates enabled,
    using 5-fold cross-validation on all available cores.

    The scorer is ``make_scorer(roc_auc_score)`` with default arguments, so
    the AUC is computed from the pipeline's predicted labels rather than from
    probabilities.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    svm_pipeline = Pipeline((
        ("scl", StandardScaler()),
        ("clf", SVC()),
    ))

    # Disabled alternative grid, kept for reference:
    # {'clf__kernel': ['linear'], 'clf__C': [0.1,1,10,100]}
    rbf_grid = {
        'clf__kernel': ['rbf'],
        'clf__gamma': [1e-1, 1e-2, 1e-3, 1e-4],
        'clf__C': [0.01, 0.1, 1, 10, 100, 1000],
        'clf__probability': [True],
    }

    search = GridSearchCV(
        estimator=svm_pipeline,
        param_grid=[rbf_grid],
        cv=5,
        scoring=make_scorer(roc_auc_score),
        return_train_score=True,
        n_jobs=-1,
    )
    return search.fit(x_train, y_train)

def read_last_results(file, ncolumns, npatients, columns=None):
    """Load previously saved results, or build an all-zero placeholder table.

    Parameters
    ----------
    file : str
        Path of the CSV file holding earlier results.
    ncolumns : int
        Number of columns of the placeholder table.
    npatients : int
        Number of rows of the placeholder table.
    columns : list of str, optional
        Column labels for the placeholder table. When omitted, the
        module-level ``column_names`` is used, which must then be defined
        before the call (this was previously the only behaviour).

    Returns
    -------
    pandas.DataFrame
        The CSV contents if ``file`` exists; otherwise an
        ``npatients`` x ``ncolumns`` frame filled with zeros.

    Raises
    ------
    ValueError
        If the number of column labels does not match ``ncolumns``
        (raised by pandas when the placeholder is built).
    """
    if os.path.isfile(file):
        return pd.read_csv(file, delimiter=',')

    if columns is None:
        # Backward-compatible fallback to the notebook-level global.
        columns = column_names

    zero_data = np.zeros([npatients, ncolumns])
    return pd.DataFrame(zero_data, columns=columns)

In [4]:
# Patient identifiers chb01 .. chb23; number 12 is deliberately left out,
# exactly as in the hand-written list this replaces.
patients = ['chb{:02d}'.format(number) for number in range(1, 24) if number != 12]

In [8]:
# Per-patient training and evaluation.
#
# For every patient: keep only the best-ranked channel(s), split the data,
# grid-search an SVM, a random forest and a KNN model, and keep each model's
# metrics only when its mean of sensitivity and specificity beats the value
# stored by a previous run. A detailed SVM report goes to a text file.
nchannels = 1
column_names = ['patient', 'model', 'hyperparameters', 'sensitivity', 'specificity', 'roc_auc', 'precision', 'accuracy']

# Paths are assembled with os.path.join rather than hard-coded backslashes,
# which also removes the invalid '\D' escape sequences.
results_dir = os.path.join(basedir, 'results_detection')
datasets_dir = os.path.join('..', 'DataSetCreation', 'DataSetsDetection')
os.makedirs(results_dir, exist_ok=True)

past_df_svm = read_last_results(os.path.join(results_dir, 'svm_rbf_' + str(nchannels) + 'channels.csv'), len(column_names), len(patients))
past_df_rnf = read_last_results(os.path.join(results_dir, 'rnf_' + str(nchannels) + 'channels.csv'), len(column_names), len(patients))
past_df_knn = read_last_results(os.path.join(results_dir, 'knn_' + str(nchannels) + 'channels.csv'), len(column_names), len(patients))


def _record_if_better(data, past_row, patient, model_name, clf, y_true, y_pred,
                      pickle_path=None, joblib_path=None):
    """Append this run's metrics, or the stored ones, to ``data``.

    Metrics are derived from the confusion matrix of ``y_true`` against
    ``y_pred``. The new row replaces the stored one only when the mean of
    sensitivity and specificity is strictly higher than in ``past_row``; in
    that case the fitted search object ``clf`` is also written to
    ``pickle_path`` and/or ``joblib_path`` when those are given.

    Returns ``data`` with one more row stacked underneath.
    """
    cm = confusion_matrix(y_true, y_pred)
    sensitivity = cm[1][1] / (cm[1][1] + cm[1][0])
    specificity = cm[0][0] / (cm[0][0] + cm[0][1])
    precision = cm[1][1] / (cm[1][1] + cm[0][1])
    roc_score = roc_auc_score(y_true, y_pred)
    # cm.sum() totals every cell. The bare sum(cm) used before only did so
    # because `from pylab import *` shadows the builtin sum.
    accuracy = (cm[0][0] + cm[1][1]) / cm.sum()

    if (sensitivity + specificity) / 2 > (past_row['sensitivity'] + past_row['specificity']) / 2:
        new_row = np.array([patient, model_name, str(clf.best_params_), sensitivity, specificity, roc_score, precision, accuracy])
        if pickle_path is not None:
            # The context manager closes the handle even if pickling fails.
            with open(pickle_path, 'wb') as model_file:
                pickle.dump(clf, model_file)
        if joblib_path is not None:
            dump(clf, joblib_path)
    else:
        new_row = past_row.values

    return np.vstack((data, new_row))


# Each table starts with a dummy zero row that the saving code strips off.
data_svm = np.zeros(len(column_names))
data_rnf = np.zeros(len(column_names))
data_knn = np.zeros(len(column_names))

# Results are also stored in txt format, more detailed than the csv. The
# with-block closes the file even when the long loop is interrupted.
with open(os.path.join(results_dir, 'results' + str(nchannels) + 'channels.txt'), "w+") as f:
    for i, patient in enumerate(patients):
        # Calculate most significant channels for patient
        channelsdf = pd.read_csv(os.path.join(datasets_dir, patient + '_channel_order.csv'), delimiter=',')
        best_channels = get_best_channels(channelsdf, nchannels)

        # Read the patient dataframe
        df = pd.read_hdf(os.path.join(datasets_dir, patient + 'features.h5'), key='fullpatient')

        # Extract only the best channels (same selection as the former
        # eval-based expression, without executing generated code)
        df = df[df['channel'].isin(best_channels)]
        df = df.drop(['channel'], axis=1)

        # Split the dataframe into train and test
        x_train, y_train, x_test, y_test = split_proportional(df, 0.2, 'seizure')

        # Train models
        svc_clf = train_linear_svm(x_train, y_train)
        rnf_clf = train_random_forest(x_train, y_train)
        knn_clf = train_knn(x_train, y_train)

        # Results for SVM model, including the detailed text report
        y_train_pred = svc_clf.predict(x_train)
        y_true, y_pred = y_test, svc_clf.predict(x_test)
        f.write('-------------------------- Patient ' + patient + ' ------------------------------------\n')
        f.write(str(svc_clf.best_params_) + '\n')
        f.write(classification_report(y_train, y_train_pred) + '\n')
        f.write(classification_report(y_true, y_pred) + '\n')

        print('---------------------- Finished report for patient ' + patient + ' -----------------------')
        f.write('---------------------- Finished report for patient ' + patient + ' -----------------------\n\n\n')

        # NOTE(review): the .sav pickles go to results_dir while the .joblib
        # files go to the current directory; locations are kept as they were.
        data_svm = _record_if_better(
            data_svm, past_df_svm.iloc[i], patient, 'svm_rbf', svc_clf, y_true, y_pred,
            pickle_path=os.path.join(results_dir, 'svm_{}.sav'.format(patient)),
            joblib_path='svm_{}.joblib'.format(patient))

        # Results for RandomForest model (stored with joblib only)
        y_true, y_pred = y_test, rnf_clf.predict(x_test)
        data_rnf = _record_if_better(
            data_rnf, past_df_rnf.iloc[i], patient, 'rnf', rnf_clf, y_true, y_pred,
            joblib_path='rnf_{}.joblib'.format(patient))

        # Results for KNN model (stored with pickle only)
        y_true, y_pred = y_test, knn_clf.predict(x_test)
        data_knn = _record_if_better(
            data_knn, past_df_knn.iloc[i], patient, 'knn', knn_clf, y_true, y_pred,
            pickle_path=os.path.join(results_dir, 'knn_{}.sav'.format(patient)))

def _results_to_csv(data, csv_name):
    """Turn a results array into a DataFrame and write it to results_detection.

    ``data`` must already have its dummy first row removed. The frame is
    written without an index column and returned to the caller.
    """
    frame = pd.DataFrame(data, columns=column_names)
    # The earlier `frame.set_index('patient')` call discarded its result and
    # so changed nothing; it is dropped and the CSV layout stays the same.
    frame.to_csv(os.path.join(basedir, 'results_detection', csv_name), index=False)
    return frame


# Save each model's table to its csv file. The first row of every array is
# the zero placeholder it was initialised with, so it is sliced off first.
data_svm = data_svm[1:, :]
dataframe_svm = _results_to_csv(data_svm, 'svm_rbf_' + str(nchannels) + 'channels.csv')

data_rnf = data_rnf[1:, :]
dataframe_rnf = _results_to_csv(data_rnf, 'rnf_' + str(nchannels) + 'channels.csv')

data_knn = data_knn[1:, :]
dataframe_knn = _results_to_csv(data_knn, 'knn_' + str(nchannels) + 'channels.csv')

---------------------- Finished report for patient chb01 -----------------------


KeyboardInterrupt: 