In [1]:
import pandas as pd
import seaborn
import numpy as np
from pylab import *
import glob, os

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, make_scorer, average_precision_score

In [2]:
# Remember the current working directory; the result files written further
# down are placed in a sub-folder of this path.
basedir = os.getcwd()
# Changing into the directory that was just read back leaves the working
# directory where it already was.
os.chdir(basedir)

In [3]:
def get_best_channels(channelsdf, nchannels=2):
    """Pick the channels with the lowest summed position across all rows.

    Every row of ``channelsdf`` is read as one ordering of channel names. A
    channel's score is the sum of the column positions at which it appears,
    so a channel that tends to appear early gets a low score.

    Parameters
    ----------
    channelsdf : pandas.DataFrame
        One ordering of channel names per row.
    nchannels : int, default 2
        How many channel names to return.

    Returns
    -------
    list
        Up to ``nchannels`` channel names, lowest score first. Ties keep the
        order of the first row. An empty frame yields an empty list.

    Raises
    ------
    KeyError
        If a later row contains a channel that is absent from the first row.
    """
    rankings = [list(row) for row in channelsdf.values]

    # Nothing to rank: return early instead of failing on rankings[0].
    if not rankings:
        return []

    # The first row seeds the score table (and fixes the tie-break order).
    channel_scores = {channel: position for position, channel in enumerate(rankings[0])}

    # Every further row adds the position at which each channel appears.
    for ranking in rankings[1:]:
        for position, channel in enumerate(ranking):
            channel_scores[channel] += position

    ordered_channels = sorted(channel_scores, key=channel_scores.get)
    return ordered_channels[:nchannels]

def get_expression(channels):
    """Build the text of a row filter over the global ``patient_df``.

    The result ORs together one ``patient_df['channel'] == <name>`` test per
    entry of ``channels`` and is meant to be handed to ``eval``.

    Parameters
    ----------
    channels : iterable of str
        Channel names to keep.

    Returns
    -------
    str
        For example ``"patient_df[(patient_df['channel'] == 'A')]"``.

    Raises
    ------
    ValueError
        If ``channels`` is empty (no valid filter expression exists).
    """
    # Use the argument, not the module-level ``best_channels``: the function
    # must filter on whatever the caller passes in.
    # repr() quotes ordinary names exactly like the hand-written '...' form
    # and additionally escapes names that contain quote characters.
    clauses = ['(patient_df[\'channel\'] == ' + repr(channel) + ')' for channel in channels]
    if not clauses:
        raise ValueError('get_expression needs at least one channel')

    # NOTE(review): the string is evaluated with eval(); an equivalent filter
    # without eval is patient_df[patient_df['channel'].isin(channels)].
    return 'patient_df[' + '|'.join(clauses) + ']'

def split_train_test(data, test_ratio):
    """Cut ``data`` positionally into a train part and a test part.

    The leading ``int(len(data) * test_ratio)`` rows become the test set and
    the remaining rows the train set. No shuffling happens here.

    Returns
    -------
    tuple
        ``(train_set, test_set)``
    """
    cut = int(len(data) * test_ratio)
    return data.iloc[cut:], data.iloc[:cut]

def split_proportional(dataframe, test_ratio, target, random_state=None):
    """Shuffle and split a frame so both classes keep the same test ratio.

    The rows are shuffled, separated by the boolean ``target`` column, and
    each class is cut with ``split_train_test`` using the same ``test_ratio``.
    The per-class pieces are then concatenated (negative rows first).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature columns plus the ``target`` column.
    test_ratio : float
        Fraction of each class that goes to the test set.
    target : str
        Name of the label column; rows are grouped by ``== True`` / ``== False``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        The default ``None`` keeps the original non-deterministic shuffle.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``; the ``x_*`` frames hold every
        column except ``target``.
    """
    shuffled = dataframe.sample(frac=1, random_state=random_state)
    df_seizures = shuffled[shuffled[target] == True]
    df_normal = shuffled[shuffled[target] == False]

    train_seizures, test_seizures = split_train_test(df_seizures, test_ratio)
    train_normal, test_normal = split_train_test(df_normal, test_ratio)

    # drop=True is essential: a plain reset_index() would re-insert the old
    # row labels as an "index" column, which would then be handed to the
    # models as if it were a feature.
    df_train = pd.concat([train_normal, train_seizures], axis=0).reset_index(drop=True)
    df_test = pd.concat([test_normal, test_seizures], axis=0).reset_index(drop=True)

    feature_columns = df_train.columns.difference([target])
    x_train, y_train = df_train[feature_columns], df_train[target]
    x_test, y_test = df_test[df_test.columns.difference([target])], df_test[target]

    return x_train, y_train, x_test, y_test

def train_random_forest(x_train, y_train, random_state=None):
    """Grid-search a scaled random-forest pipeline with 5-fold CV.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed straight to ``GridSearchCV.fit``.
    random_state : optional
        Forwarded to ``RandomForestClassifier`` so a run can be reproduced.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # scikit-learn documents ``steps`` as a list of (name, estimator) tuples.
    pipe_rnf = Pipeline([
        ("scl", StandardScaler()),
        ("clf", RandomForestClassifier(random_state=random_state)),
    ])

    param_grid = [{
        'clf__n_estimators': [50, 100, 200, 300, 400, 500],
        'clf__max_leaf_nodes': [4, 8, 16, 32, 64],
    }]

    # 'roc_auc' ranks candidates on continuous scores. Wrapping roc_auc_score
    # in a plain make_scorer would feed it hard predict() labels instead, so
    # the area would be computed from a single operating point.
    model = GridSearchCV(
        estimator=pipe_rnf,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        return_train_score=True,
        n_jobs=-1,
    )
    model.fit(x_train, y_train)
    return model

def train_knn(x_train, y_train):
    """Grid-search a scaled k-nearest-neighbours pipeline with 5-fold CV.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # scikit-learn documents ``steps`` as a list of (name, estimator) tuples.
    pipe_knn = Pipeline([
        ("scl", StandardScaler()),
        ("clf", KNeighborsClassifier()),
    ])

    # Start at 1: n_neighbors=0 is not a valid setting, so every candidate
    # built from it would fail to fit.
    param_grid = [{
        'clf__n_neighbors': list(range(1, 11)),
        'clf__p': [1, 2],
        'clf__weights': ['uniform', 'distance'],
    }]

    # 'roc_auc' ranks candidates on continuous scores. Wrapping roc_auc_score
    # in a plain make_scorer would feed it hard predict() labels instead.
    model = GridSearchCV(
        estimator=pipe_knn,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        return_train_score=True,
        n_jobs=-1,
    )
    model.fit(x_train, y_train)
    return model

def train_linear_svm(x_train, y_train):
    """Grid-search a scaled SVC pipeline with 5-fold CV.

    Despite the function name, only the RBF-kernel grid is active; the
    linear-kernel grid is kept below as a disabled entry.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # scikit-learn documents ``steps`` as a list of (name, estimator) tuples.
    pipe_svc = Pipeline([
        ("scl", StandardScaler()),
        ("clf", SVC()),
    ])

    param_grid = [
        # Disabled linear-kernel grid:
        # {'clf__kernel': ['linear'], 'clf__C': [0.1,1,10,100]},
        {
            'clf__kernel': ['rbf'],
            'clf__gamma': [1e-1, 1e-2, 1e-3, 1e-4],
            'clf__C': [0.01, 0.1, 1, 10, 100, 1000],
        },
    ]

    # 'roc_auc' ranks candidates on continuous scores. Wrapping roc_auc_score
    # in a plain make_scorer would feed it hard predict() labels instead.
    model = GridSearchCV(
        estimator=pipe_svc,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        return_train_score=True,
        n_jobs=-1,
    )
    model.fit(x_train, y_train)
    return model

def read_last_results(file, ncolumns, npatients, columns=None):
    """Load a previous results CSV, or create a zero-filled placeholder.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    ncolumns : int
        Number of columns of the placeholder frame.
    npatients : int
        Number of rows of the placeholder frame.
    columns : sequence of str, optional
        Column labels for the placeholder frame. When omitted, the
        module-level ``column_names`` is used, which therefore has to be
        defined before the call.

    Returns
    -------
    pandas.DataFrame
        The CSV contents if ``file`` exists, otherwise an
        ``npatients`` x ``ncolumns`` frame of zeros.
    """
    if os.path.isfile(file):
        return pd.read_csv(file, delimiter=',')

    if columns is None:
        # Backward-compatible fallback to the notebook-level column list.
        columns = column_names

    zero_data = np.zeros([npatients, ncolumns])
    return pd.DataFrame(zero_data, columns=columns)

In [4]:
# Identifiers of the patient records that are loaded and pooled below.
patients = [
    'chb01', 'chb02', 'chb03', 'chb05', 'chb07',
    'chb08', 'chb09', 'chb10', 'chb11', 'chb13',
    'chb14', 'chb15', 'chb16', 'chb17', 'chb18',
    'chb19', 'chb20', 'chb21', 'chb22', 'chb23',
]

In [6]:
nchannels = 1

# Folder holding the per-patient channel rankings and feature tables. Built
# with os.path.join so the notebook does not depend on Windows separators
# (the former '..\D...' literals also contained an invalid escape sequence).
dataset_dir = os.path.join('..', 'DataSetCreation', 'DataSetsWelch')

# Open file to store the results in a txt format, more detailed than the csv.
# NOTE(review): nothing in the cells shown writes to or closes this handle;
# close it (or switch to a `with` block) once the detailed log is written.
results_dir = os.path.join(basedir, 'results_welch')
os.makedirs(results_dir, exist_ok=True)
f = open(os.path.join(results_dir, 'results' + str(nchannels) + 'channels.txt'), "w+")

# Collect the per-patient frames and concatenate once at the end: growing a
# frame inside the loop is quadratic, and DataFrame.append no longer exists
# in pandas 2.x.
patient_frames = []
for patient in patients:
    # Calculate most significant channels for patient
    channelsdf = pd.read_csv(os.path.join(dataset_dir, patient + '_channel_order.csv'), delimiter=',')
    best_channels = get_best_channels(channelsdf, nchannels)

    # Read the patient dataframe
    patient_df = pd.read_hdf(os.path.join(dataset_dir, patient + 'features.h5'), key='fullpatient')

    # Extract only the best channels (same rows as OR-ing one equality test
    # per channel, but without building and eval-ing source text).
    patient_df = patient_df[patient_df['channel'].isin(best_channels)]
    patient_df = patient_df.drop(['channel'], axis=1)
    patient_frames.append(patient_df)

dataframe = pd.concat(patient_frames, ignore_index=True) if patient_frames else pd.DataFrame()


In [7]:
# Show the pooled frame assembled from all patients (pandas truncates the repr).
dataframe

Unnamed: 0,mean,variance,skewness,kurtosis,std,zero_crossings,peak2peak,total_energy,delta,theta,alpha,beta,gamma,seizure
0,0.163932,3154.293104,0.132924,-0.368701,56.163094,62.0,294.850406,5181.014869,2212.371990,2262.965153,884.035069,164.850940,97.218538,True
1,-3.363003,12616.597818,0.257843,1.596332,112.323630,50.0,734.158928,25164.604639,21904.924142,3090.354751,582.918383,202.580573,156.068159,True
2,-2.393524,33270.889695,-0.220567,-0.275883,182.403097,26.0,936.005650,71700.884842,64830.573651,6569.391804,561.658893,482.070473,420.365466,True
3,4.462458,35728.106631,-0.498677,-0.470689,189.018800,23.0,936.005650,72582.043554,67424.964543,4927.892459,624.006343,496.495812,480.370363,True
4,5.919438,29569.069241,-0.502099,-0.464093,171.956591,30.0,823.965651,64397.121875,61265.210019,2762.673411,605.275561,364.707896,249.885274,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174082,-0.271013,3564.393389,1.268979,5.233157,59.702541,32.0,446.509767,8725.740848,8371.484274,349.646641,75.770720,37.133015,11.331364,False
174083,1.075107,3628.772178,0.503217,3.639426,60.239291,36.0,428.498851,6177.260433,5869.016319,320.436035,58.575865,42.385116,11.007761,False
174084,1.128110,5218.158539,0.348837,1.296146,72.236823,116.0,453.529925,11701.546846,8917.557022,501.591204,119.130557,1226.429930,1220.274505,False
174085,-4.879393,3965.977528,0.591510,1.221693,62.976008,136.0,434.273711,9269.266473,3681.251974,340.533554,236.832740,2046.884185,3170.397058,False


In [8]:
# Split the dataframe into train and test
x_train, y_train, x_test, y_test = split_proportional(dataframe, 0.2, 'seizure')

# Train models
svc_clf = train_linear_svm(x_train, y_train)
print('SVM finished')
rnf_clf = train_random_forest(x_train, y_train)
print('RNF finished')
knn_clf = train_knn(x_train, y_train)
print('KNN finished')


def summarize_test_performance(label, clf, x_test, y_test):
    """Score a fitted grid search on the held-out set.

    Parameters
    ----------
    label : str
        Short model name stored in the first position of the result.
    clf : GridSearchCV
        Fitted search object; ``predict`` and ``best_params_`` are used.
    x_test, y_test
        Held-out features and true labels.

    Returns
    -------
    numpy.ndarray
        ``[label, best params, sensitivity, specificity, roc_auc, precision,
        accuracy]``. Mixing text and numbers in one array stores every entry
        as a string.
    """
    y_pred = clf.predict(x_test)

    # Rows are true classes, columns are predicted classes.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp = cm[0][0], cm[0][1]
    fn, tp = cm[1][0], cm[1][1]

    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    # NOTE(review): computed from hard predictions, not continuous scores.
    roc_score = roc_auc_score(y_test, y_pred)
    # cm.sum() is the grand total regardless of whether `sum` is the builtin
    # or the numpy version pulled in by `from pylab import *`.
    accuracy = (tn + tp) / cm.sum()

    return np.array([label, str(clf.best_params_), sensitivity, specificity, roc_score, precision, accuracy])


# One result row per model, evaluated on the same held-out split.
data_svm = summarize_test_performance('svm_rbf', svc_clf, x_test, y_test)
data_rnf = summarize_test_performance('rnf', rnf_clf, x_test, y_test)
data_knn = summarize_test_performance('knn', knn_clf, x_test, y_test)


SVM finished
RNF finished
KNN finished


In [9]:
column_names = ['model', 'hyperparameters', 'sensitivity', 'specificity', 'roc_auc', 'precision', 'accuracy']

# One row per model, in the column order above.
results = np.array([data_svm, data_rnf, data_knn])
results_df = pd.DataFrame(results, columns = column_names)

# Save df to csv file. 'model' stays an ordinary column and the positional
# index is omitted; the former `results_df.set_index('model')` call threw its
# result away and therefore had no effect.
results_dir = os.path.join(basedir, 'results_welch')
os.makedirs(results_dir, exist_ok=True)  # first run: the folder may not exist yet
results_df.to_csv(os.path.join(results_dir, 'general_' + str(nchannels) + 'channels.csv'), index=False)


In [16]:
# Scratch data: three separate arrays holding the same five integers.
a, b, c = (np.array([1, 2, 3, 4, 5]) for _ in range(3))

In [17]:
# Use the three vectors as the rows of one 2-D array.
mix = np.array((a, b, c))

In [18]:
# Display the stacked array.
mix

array([[1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5]])

In [19]:
# Wrap the array in a DataFrame, one named column per array column.
x = pd.DataFrame(
    data=mix,
    columns=['uno', 'dos', 'tres', 'cuatro', 'cinco'],
)

In [20]:
# Display the labelled frame.
x

Unnamed: 0,uno,dos,tres,cuatro,cinco
0,1,2,3,4,5
1,1,2,3,4,5
2,1,2,3,4,5
