In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import random

In [2]:
#Metadaten einlesen
df = pd.read_csv("NIHMS841832-supplement-1.csv", sep=',')
    
#Ergebnisse des Feature Tables einlesen
feature = pd.read_csv('feature_table.txt', sep='\t').T
feature = feature[1:][:-1]

feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3101,3102,3103,3104,3105,3106,3107,3108,3109,3110
1629.SubjectIBD335,34292,20670,18413,9981,7071,6881,5411,5335,5289,4741,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD643,15243,64328,0,0,0,4,4507,3216,15630,199,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD539,22182,21589,0,1365,0,11501,33619,3638,5053,0,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD078,0,805,0,0,0,4,330,2305,0,8,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD671,0,19734,0,0,0,0,215,0,0,699,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629.SubjectIBD421,5154,12101,1572,62,190,1448,6316,2205,1885,0,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD202,14565,24920,3543,0,0,0,63,0,37768,48660,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD544,32,52,31,0,0,2,18543,0,3,45,...,0,0,0,0,0,0,0,0,946,31
1629.SubjectIBD422,5718,18420,9534,0,0,0,4791,2770,0,588,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Gesunde Kontrollgruppe
HC = df[df.ibd_subtype.eq("HC")]

#CCD
CCD = df[df.ibd_subtype.eq("CCD")]
    
#ICD-r
ICD_r = df[df.ibd_subtype.eq("ICD_r")]

#ICD-nr
ICD_nr = df[df.ibd_subtype.eq("ICD_nr")]
    
#UCD
UC = df[df.ibd_subtype.eq("UC")]
   

In [4]:
#Splitt nach Patient. Alle Zeitpunkte eines Patienten entweder in Train oder Test.
def split_function(tSize, random_state, table, metadata, hc_group):
    patientSamples = {}
    liste = []

    for row in metadata.index:
        liste.append(metadata['patientnumber'][row])

    menge = set(liste)  

    for e in menge:
        newPatient = metadata[metadata.patientnumber.eq(e)]
        patientSamples[e] = list(newPatient['sample_name'])

    shuffleListe = list(menge)
    random.Random(random_state).shuffle(shuffleListe)

    #Split
    trainSize = int(np.round(tSize * len(shuffleListe), 0))
    testSize = len(shuffleListe) - trainSize

    X_trainEntry = shuffleListe[0:trainSize]
    X_testEntry = shuffleListe[trainSize:len(shuffleListe)]

    X_train = []
    y_train = []
    for i in X_trainEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_train.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_train.append(1)
                        else:
                            y_train.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))


    X_test = []   
    y_test = []
    for i in X_testEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_test.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_test.append(1)
                        else:
                            y_test.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train, X_test, y_train, y_test

In [5]:
f1Sum = 0.0
n = 100
for i in range(0, n):
    
    #Split Test und Trainingsdaten für jede Gruppe
    X_trainHC, X_testHC, y_trainHC, y_testHC = split_function(tSize=0.75, random_state=i, table=feature, metadata=HC, hc_group=HC)
    X_trainCCD, X_testCCD, y_trainCCD, y_testCCD = split_function(tSize=0.75, random_state=i, table=feature, metadata=CCD, hc_group=HC)
    X_trainICD_r, X_testICD_r, y_trainICD_r, y_testICD_r = split_function(tSize=0.75, random_state=i, table=feature, metadata=ICD_r, hc_group=HC)
    X_trainICD_nr, X_testICD_nr, y_trainICD_nr, y_testICD_nr = split_function(tSize=0.75, random_state=i, table=feature, metadata=ICD_nr, hc_group=HC)
    X_trainUC, X_testUC, y_trainUC, y_testUC = split_function(tSize=0.75, random_state=i, table=feature, metadata=UC, hc_group=HC)
    
    X_train = np.concatenate((X_trainHC,  X_trainCCD,  X_trainICD_r,  X_trainICD_nr, X_trainUC), axis=0)
    X_test = np.concatenate((X_testHC,  X_testCCD,  X_testICD_r,  X_testICD_nr, X_testUC), axis=0)
    y_train = np.concatenate((y_trainHC,  y_trainCCD,  y_trainICD_r,  y_trainICD_nr, y_trainUC), axis=0)
    y_test = np.concatenate((y_testHC,  y_testCCD,  y_testICD_r,  y_testICD_nr, y_testUC), axis=0)
    
    
    trainSize = len(X_train)/(len(X_train)+len(X_test))
    testSize =len(X_test)/(len(X_train)+len(X_test)) 
    
    HCTrainSize = len(X_trainHC)/(len(X_trainHC)+len(X_testHC))
    HCTestSize = len(X_testHC)/(len(X_trainHC)+len(X_testHC))                          
        
    CCDTrainSize = len(X_trainCCD)/(len(X_trainCCD)+len(X_testCCD))
    CCDTestSize = len(X_testCCD)/(len(X_trainCCD)+len(X_testCCD))   
    
    ICD_rTrainSize = len(X_trainICD_r)/(len(X_trainICD_r)+len(X_testICD_r))
    ICD_rTestSize = len(X_testICD_r)/(len(X_trainICD_r)+len(X_testICD_r)) 
    
    ICD_nrTrainSize = len(X_trainICD_nr)/(len(X_trainICD_nr)+len(X_testICD_nr))
    ICD_nrTestSize = len(X_testICD_nr)/(len(X_trainICD_nr)+len(X_testICD_nr)) 
    
    UCTrainSize = len(X_trainUC)/(len(X_trainUC)+len(X_testUC))
    UCTestSize = len(X_testUC)/(len(X_trainUC)+len(X_testUC))   
                                

    #Featuer Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    #Train
    mlp = MLPClassifier(hidden_layer_sizes=(8,8), activation='relu', alpha=5, solver='lbfgs', max_iter=1000,warm_start=True, random_state=i)
    mlp.fit(X_train,y_train)
    
    #Test
    y_pred = mlp.predict(X_test)
    predict_train = mlp.predict(X_train)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1Sum += f1
    
    print(str(i) + ": " + "f1_score: " + str(f1) + "    " + str(np.round(trainSize, 2)) +"/"+ str(np.round(testSize, 2)) + " Gesamt-Split")
    print(str(np.round(HCTrainSize, 2)) +"/"+ str(np.round(HCTestSize, 2)) + " HC-Split")
    print(str(np.round(CCDTrainSize, 2)) +"/"+ str(np.round(CCDTestSize, 2)) + " CCD-Split")
    print(str(np.round(ICD_rTrainSize, 2)) +"/"+ str(np.round(ICD_rTestSize, 2)) + " ICD_r-Split")
    print(str(np.round(ICD_nrTrainSize, 2)) +"/"+ str(np.round(ICD_nrTestSize, 2)) + " ICD_nr-Split")
    print(str(np.round(UCTrainSize, 2)) +"/"+ str(np.round(UCTestSize, 2)) + " UC-Split")
    
    #Evaluation Train Data
    print("Accuracy Train Data:")
    print(confusion_matrix(y_train,predict_train))
    print(classification_report(y_train,predict_train))

    #Evaluating Test Data
    print("Accuracy Test Data:")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("\n\n")

f1score = f1Sum/n
print("Avg f1-score = "+str(f1score))

0: f1_score: 0.5071460809387693    0.75/0.25 Gesamt-Split
0.77/0.23 HC-Split
0.64/0.36 CCD-Split
0.74/0.26 ICD_r-Split
0.62/0.38 ICD_nr-Split
0.8/0.2 UC-Split
Accuracy Train Data:
[[431   0]
 [  4  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       431
           1       1.00      0.92      0.96        48

    accuracy                           0.99       479
   macro avg       1.00      0.96      0.98       479
weighted avg       0.99      0.99      0.99       479

Accuracy Test Data:
[[134   8]
 [ 13   1]]
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       142
           1       0.11      0.07      0.09        14

    accuracy                           0.87       156
   macro avg       0.51      0.51      0.51       156
weighted avg       0.84      0.87      0.85       156




1: f1_score: 0.7735507246376812    0.76/0.24 Gesamt-Split
0.74/0.26 HC-Split
0.76/0.24 CCD-Split


10: f1_score: 0.7248201438848921    0.76/0.24 Gesamt-Split
0.76/0.24 HC-Split
0.73/0.27 CCD-Split
0.78/0.22 ICD_r-Split
0.81/0.19 ICD_nr-Split
0.75/0.25 UC-Split
Accuracy Train Data:
[[435   0]
 [  4  43]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       435
           1       1.00      0.91      0.96        47

    accuracy                           0.99       482
   macro avg       1.00      0.96      0.98       482
weighted avg       0.99      0.99      0.99       482

Accuracy Test Data:
[[132   6]
 [  8   7]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       138
           1       0.54      0.47      0.50        15

    accuracy                           0.91       153
   macro avg       0.74      0.71      0.72       153
weighted avg       0.90      0.91      0.91       153




11: f1_score: 0.5339065858901699    0.75/0.25 Gesamt-Split
0.85/0.15 HC-Split
0.73/0.27 CCD-Sp

20: f1_score: 0.6970099667774087    0.74/0.26 Gesamt-Split
0.74/0.26 HC-Split
0.77/0.23 CCD-Split
0.82/0.18 ICD_r-Split
0.5/0.5 ICD_nr-Split
0.71/0.29 UC-Split
Accuracy Train Data:
[[421   0]
 [  1  45]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       421
           1       1.00      0.98      0.99        46

    accuracy                           1.00       467
   macro avg       1.00      0.99      0.99       467
weighted avg       1.00      1.00      1.00       467

Accuracy Test Data:
[[141  11]
 [  8   8]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       152
           1       0.42      0.50      0.46        16

    accuracy                           0.89       168
   macro avg       0.68      0.71      0.70       168
weighted avg       0.90      0.89      0.89       168




21: f1_score: 0.5218750000000001    0.73/0.27 Gesamt-Split
0.74/0.26 HC-Split
0.68/0.32 CCD-Spli

30: f1_score: 0.601815253989167    0.75/0.25 Gesamt-Split
0.77/0.23 HC-Split
0.8/0.2 CCD-Split
0.73/0.27 ICD_r-Split
0.75/0.25 ICD_nr-Split
0.74/0.26 UC-Split
Accuracy Train Data:
[[427   0]
 [  2  46]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       427
           1       1.00      0.96      0.98        48

    accuracy                           1.00       475
   macro avg       1.00      0.98      0.99       475
weighted avg       1.00      1.00      1.00       475

Accuracy Test Data:
[[140   6]
 [ 11   3]]
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       146
           1       0.33      0.21      0.26        14

    accuracy                           0.89       160
   macro avg       0.63      0.59      0.60       160
weighted avg       0.88      0.89      0.88       160




31: f1_score: 0.47770700636942676    0.74/0.26 Gesamt-Split
0.84/0.16 HC-Split
0.76/0.24 CCD-Spli

40: f1_score: 0.4807692307692307    0.79/0.21 Gesamt-Split
0.84/0.16 HC-Split
0.7/0.3 CCD-Split
0.8/0.2 ICD_r-Split
0.81/0.19 ICD_nr-Split
0.8/0.2 UC-Split
Accuracy Train Data:
[[448   0]
 [  4  48]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       448
           1       1.00      0.92      0.96        52

    accuracy                           0.99       500
   macro avg       1.00      0.96      0.98       500
weighted avg       0.99      0.99      0.99       500

Accuracy Test Data:
[[125   0]
 [ 10   0]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       125
           1       0.00      0.00      0.00        10

    accuracy                           0.93       135
   macro avg       0.46      0.50      0.48       135
weighted avg       0.86      0.93      0.89       135






  _warn_prf(average, modifier, msg_start, len(result))


41: f1_score: 0.6242647058823529    0.77/0.23 Gesamt-Split
0.76/0.24 HC-Split
0.71/0.29 CCD-Split
0.77/0.23 ICD_r-Split
0.69/0.31 ICD_nr-Split
0.8/0.2 UC-Split
Accuracy Train Data:
[[442   0]
 [  3  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       442
           1       1.00      0.94      0.97        47

    accuracy                           0.99       489
   macro avg       1.00      0.97      0.98       489
weighted avg       0.99      0.99      0.99       489

Accuracy Test Data:
[[129   2]
 [ 12   3]]
              precision    recall  f1-score   support

           0       0.91      0.98      0.95       131
           1       0.60      0.20      0.30        15

    accuracy                           0.90       146
   macro avg       0.76      0.59      0.62       146
weighted avg       0.88      0.90      0.88       146




42: f1_score: 0.7433333333333334    0.74/0.26 Gesamt-Split
0.76/0.24 HC-Split
0.69/0.31 CCD-Spli

51: f1_score: 0.6279874213836478    0.73/0.27 Gesamt-Split
0.76/0.24 HC-Split
0.64/0.36 CCD-Split
0.8/0.2 ICD_r-Split
0.69/0.31 ICD_nr-Split
0.74/0.26 UC-Split
Accuracy Train Data:
[[419   0]
 [  3  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       419
           1       1.00      0.94      0.97        47

    accuracy                           0.99       466
   macro avg       1.00      0.97      0.98       466
weighted avg       0.99      0.99      0.99       466

Accuracy Test Data:
[[152   2]
 [ 12   3]]
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       154
           1       0.60      0.20      0.30        15

    accuracy                           0.92       169
   macro avg       0.76      0.59      0.63       169
weighted avg       0.90      0.92      0.90       169




52: f1_score: 0.5743589743589743    0.74/0.26 Gesamt-Split
0.76/0.24 HC-Split
0.59/0.41 CCD-Spli

61: f1_score: 0.5594627594627595    0.74/0.26 Gesamt-Split
0.82/0.18 HC-Split
0.74/0.26 CCD-Split
0.66/0.34 ICD_r-Split
0.62/0.38 ICD_nr-Split
0.77/0.23 UC-Split
Accuracy Train Data:
[[420   0]
 [  3  48]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       420
           1       1.00      0.94      0.97        51

    accuracy                           0.99       471
   macro avg       1.00      0.97      0.98       471
weighted avg       0.99      0.99      0.99       471

Accuracy Test Data:
[[152   1]
 [ 10   1]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       153
           1       0.50      0.09      0.15        11

    accuracy                           0.93       164
   macro avg       0.72      0.54      0.56       164
weighted avg       0.91      0.93      0.91       164




62: f1_score: 0.7884615384615385    0.77/0.23 Gesamt-Split
0.74/0.26 HC-Split
0.79/0.21 CCD-Sp

71: f1_score: 0.46739130434782605    0.77/0.23 Gesamt-Split
0.77/0.23 HC-Split
0.68/0.32 CCD-Split
0.87/0.13 ICD_r-Split
0.75/0.25 ICD_nr-Split
0.75/0.25 UC-Split
Accuracy Train Data:
[[440   0]
 [  3  45]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       440
           1       1.00      0.94      0.97        48

    accuracy                           0.99       488
   macro avg       1.00      0.97      0.98       488
weighted avg       0.99      0.99      0.99       488

Accuracy Test Data:
[[129   4]
 [ 14   0]]
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       133
           1       0.00      0.00      0.00        14

    accuracy                           0.88       147
   macro avg       0.45      0.48      0.47       147
weighted avg       0.82      0.88      0.85       147




72: f1_score: 0.730791788856305    0.76/0.24 Gesamt-Split
0.74/0.26 HC-Split
0.79/0.21 CCD-Sp

81: f1_score: 0.5602057516951133    0.73/0.27 Gesamt-Split
0.85/0.15 HC-Split
0.7/0.3 CCD-Split
0.7/0.3 ICD_r-Split
0.81/0.19 ICD_nr-Split
0.72/0.28 UC-Split
Accuracy Train Data:
[[411   0]
 [  4  49]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       411
           1       1.00      0.92      0.96        53

    accuracy                           0.99       464
   macro avg       1.00      0.96      0.98       464
weighted avg       0.99      0.99      0.99       464

Accuracy Test Data:
[[159   3]
 [  8   1]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       162
           1       0.25      0.11      0.15         9

    accuracy                           0.94       171
   macro avg       0.60      0.55      0.56       171
weighted avg       0.92      0.94      0.92       171




82: f1_score: 0.533050333535476    0.76/0.24 Gesamt-Split
0.82/0.18 HC-Split
0.72/0.28 CCD-Split
0

  _warn_prf(average, modifier, msg_start, len(result))


84: f1_score: 0.5166744222118815    0.74/0.26 Gesamt-Split
0.76/0.24 HC-Split
0.68/0.32 CCD-Split
0.75/0.25 ICD_r-Split
0.56/0.44 ICD_nr-Split
0.77/0.23 UC-Split
Accuracy Train Data:
[[424   0]
 [  2  45]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       424
           1       1.00      0.96      0.98        47

    accuracy                           1.00       471
   macro avg       1.00      0.98      0.99       471
weighted avg       1.00      1.00      1.00       471

Accuracy Test Data:
[[144   5]
 [ 14   1]]
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       149
           1       0.17      0.07      0.10        15

    accuracy                           0.88       164
   macro avg       0.54      0.52      0.52       164
weighted avg       0.84      0.88      0.86       164




85: f1_score: 0.8682693353638637    0.73/0.27 Gesamt-Split
0.76/0.24 HC-Split
0.78/0.22 CCD-Sp

94: f1_score: 0.4965576592082616    0.72/0.28 Gesamt-Split
0.77/0.23 HC-Split
0.71/0.29 CCD-Split
0.72/0.28 ICD_r-Split
0.69/0.31 ICD_nr-Split
0.71/0.29 UC-Split
Accuracy Train Data:
[[407   0]
 [  3  45]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       407
           1       1.00      0.94      0.97        48

    accuracy                           0.99       455
   macro avg       1.00      0.97      0.98       455
weighted avg       0.99      0.99      0.99       455

Accuracy Test Data:
[[153  13]
 [ 13   1]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       166
           1       0.07      0.07      0.07        14

    accuracy                           0.86       180
   macro avg       0.50      0.50      0.50       180
weighted avg       0.86      0.86      0.86       180




95: f1_score: 0.5212662652942318    0.77/0.23 Gesamt-Split
0.77/0.23 HC-Split
0.72/0.28 CCD-Sp

  _warn_prf(average, modifier, msg_start, len(result))


99: f1_score: 0.523219814241486    0.76/0.24 Gesamt-Split
0.77/0.23 HC-Split
0.74/0.26 CCD-Split
0.78/0.22 ICD_r-Split
0.62/0.38 ICD_nr-Split
0.76/0.24 UC-Split
Accuracy Train Data:
[[433   0]
 [  4  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       433
           1       1.00      0.92      0.96        48

    accuracy                           0.99       481
   macro avg       1.00      0.96      0.98       481
weighted avg       0.99      0.99      0.99       481

Accuracy Test Data:
[[136   4]
 [ 13   1]]
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       140
           1       0.20      0.07      0.11        14

    accuracy                           0.89       154
   macro avg       0.56      0.52      0.52       154
weighted avg       0.85      0.89      0.87       154




Avg f1-score = 0.6271173572577682
