In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import random


In [2]:
#Metadaten einlesen
df = pd.read_csv("NIHMS841832-supplement-1.csv", sep=',')
    
#Ergebnisse des Feature Tables einlesen
feature = pd.read_csv('feature_table.txt', sep='\t').T
feature = feature[1:][:]

feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,323,324,325,326,327,328,329,330,331,332
1629.SubjectIBD335,0,345,0,0,0,0,412,0,0,0,...,0,0,0,0,0,0,7,0,0,353
1629.SubjectIBD643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
1629.SubjectIBD539,0,2869,0,0,0,0,1665,0,0,0,...,0,746,0,0,0,3,21,0,0,88919
1629.SubjectIBD078,0,5,0,0,0,0,17,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1629.SubjectIBD671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629.SubjectIBD421,0,560,0,98,0,0,2368,0,0,0,...,0,68,0,0,6,22,259,0,650,2578
1629.SubjectIBD202,0,0,0,0,0,0,43,0,0,0,...,0,0,0,0,0,0,0,0,0,31
1629.SubjectIBD544,0,0,0,0,0,0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,464937
1629.SubjectIBD422,0,0,0,0,0,0,171,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Gesunde Kontrollgruppe
HC = df[df.ibd_subtype.eq("HC")]

#CCD
CCD = df[df.ibd_subtype.eq("CCD")]
    
#ICD-r
ICD_r = df[df.ibd_subtype.eq("ICD_r")]

#ICD-nr
ICD_nr = df[df.ibd_subtype.eq("ICD_nr")]
    
#UCD
UC = df[df.ibd_subtype.eq("UC")]

In [4]:
#Splitt nach Patient. Alle Zeitpunkte eines Patienten entweder in Train oder Test.
def split_function(tSize, random_state, table, metadata, hc_group):
    patientSamples = {}
    liste = []

    for row in metadata.index:
        liste.append(metadata['patientnumber'][row])

    menge = set(liste)  

    for e in menge:
        newPatient = metadata[metadata.patientnumber.eq(e)]
        patientSamples[e] = list(newPatient['sample_name'])

    shuffleListe = list(menge)
    random.Random(random_state).shuffle(shuffleListe)

    #Split
    trainSize = int(np.round(tSize * len(shuffleListe), 0))
    testSize = len(shuffleListe) - trainSize

    X_trainEntry = shuffleListe[0:trainSize]
    X_testEntry = shuffleListe[trainSize:len(shuffleListe)]

    X_train = []
    y_train = []
    for i in X_trainEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_train.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_train.append(1)
                        else:
                            y_train.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))


    X_test = []   
    y_test = []
    for i in X_testEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_test.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_test.append(1)
                        else:
                            y_test.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train, X_test, y_train, y_test

In [5]:
f1Sum = 0.0
n = 100
for i in range(0, n):
    
    #Split Test und Trainingsdaten für jede Gruppe
    X_trainHC, X_testHC, y_trainHC, y_testHC = split_function(tSize=0.75, random_state=i, table=feature, metadata=HC, hc_group=HC)
    X_trainCCD, X_testCCD, y_trainCCD, y_testCCD = split_function(tSize=0.75, random_state=i, table=feature, metadata=CCD, hc_group=HC)
    X_trainICD_r, X_testICD_r, y_trainICD_r, y_testICD_r = split_function(tSize=0.75, random_state=i, table=feature, metadata=ICD_r, hc_group=HC)
    X_trainICD_nr, X_testICD_nr, y_trainICD_nr, y_testICD_nr = split_function(tSize=0.75, random_state=i, table=feature, metadata=ICD_nr, hc_group=HC)
    X_trainUC, X_testUC, y_trainUC, y_testUC = split_function(tSize=0.75, random_state=i, table=feature, metadata=UC, hc_group=HC)
    
    X_train = np.concatenate((X_trainHC,  X_trainCCD,  X_trainICD_r,  X_trainICD_nr, X_trainUC), axis=0)
    X_test = np.concatenate((X_testHC,  X_testCCD,  X_testICD_r,  X_testICD_nr, X_testUC), axis=0)
    y_train = np.concatenate((y_trainHC,  y_trainCCD,  y_trainICD_r,  y_trainICD_nr, y_trainUC), axis=0)
    y_test = np.concatenate((y_testHC,  y_testCCD,  y_testICD_r,  y_testICD_nr, y_testUC), axis=0)
    
    
    trainSize = len(X_train)/(len(X_train)+len(X_test))
    testSize =len(X_test)/(len(X_train)+len(X_test)) 
    
    HCTrainSize = len(X_trainHC)/(len(X_trainHC)+len(X_testHC))
    HCTestSize = len(X_testHC)/(len(X_trainHC)+len(X_testHC))                          
        
    CCDTrainSize = len(X_trainCCD)/(len(X_trainCCD)+len(X_testCCD))
    CCDTestSize = len(X_testCCD)/(len(X_trainCCD)+len(X_testCCD))   
    
    ICD_rTrainSize = len(X_trainICD_r)/(len(X_trainICD_r)+len(X_testICD_r))
    ICD_rTestSize = len(X_testICD_r)/(len(X_trainICD_r)+len(X_testICD_r)) 
    
    ICD_nrTrainSize = len(X_trainICD_nr)/(len(X_trainICD_nr)+len(X_testICD_nr))
    ICD_nrTestSize = len(X_testICD_nr)/(len(X_trainICD_nr)+len(X_testICD_nr)) 
    
    UCTrainSize = len(X_trainUC)/(len(X_trainUC)+len(X_testUC))
    UCTestSize = len(X_testUC)/(len(X_trainUC)+len(X_testUC))   
                                

    #Featuer Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    #Train
    mlp = MLPClassifier(hidden_layer_sizes=(8,8), activation='relu', solver='lbfgs', alpha=0.3, max_iter=1000, batch_size=len(X_train), random_state=i)
    mlp.fit(X_train,y_train)
    
    #Test
    y_pred = mlp.predict(X_test)
    predict_train = mlp.predict(X_train)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1Sum += f1
    
    print(str(i) + ": " + "f1_score: " + str(f1) + "    " + str(np.round(trainSize, 2)) +"/"+ str(np.round(testSize, 2)) + " Gesamt-Split")
    print(str(np.round(HCTrainSize, 2)) +"/"+ str(np.round(HCTestSize, 2)) + " HC-Split")
    print(str(np.round(CCDTrainSize, 2)) +"/"+ str(np.round(CCDTestSize, 2)) + " CCD-Split")
    print(str(np.round(ICD_rTrainSize, 2)) +"/"+ str(np.round(ICD_rTestSize, 2)) + " ICD_r-Split")
    print(str(np.round(ICD_nrTrainSize, 2)) +"/"+ str(np.round(ICD_nrTestSize, 2)) + " ICD_nr-Split")
    print(str(np.round(UCTrainSize, 2)) +"/"+ str(np.round(UCTestSize, 2)) + " UC-Split")
    
    #Evaluation Train Data
    print("Accuracy Train Data:")
    print(confusion_matrix(y_train,predict_train))
    print(classification_report(y_train,predict_train))

    #Evaluating Test Data
    print("Accuracy Test Data:")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("\n\n")

f1score = f1Sum/n
print("Avg f1-score = "+str(f1score))

0: f1_score: 0.528344671201814    0.75/0.25 Gesamt-Split
0.77/0.23 HC-Split
0.64/0.36 CCD-Split
0.74/0.26 ICD_r-Split
0.62/0.38 ICD_nr-Split
0.8/0.2 UC-Split
Accuracy Train Data:
[[431   0]
 [  4  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       431
           1       1.00      0.92      0.96        48

    accuracy                           0.99       479
   macro avg       1.00      0.96      0.98       479
weighted avg       0.99      0.99      0.99       479

Accuracy Test Data:
[[139   3]
 [ 13   1]]
              precision    recall  f1-score   support

           0       0.91      0.98      0.95       142
           1       0.25      0.07      0.11        14

    accuracy                           0.90       156
   macro avg       0.58      0.53      0.53       156
weighted avg       0.85      0.90      0.87       156




1: f1_score: 0.5726495726495726    0.76/0.24 Gesamt-Split
0.74/0.26 HC-Split
0.76/0.24 CCD-Split
0

10: f1_score: 0.49907473309608535    0.76/0.24 Gesamt-Split
0.76/0.24 HC-Split
0.73/0.27 CCD-Split
0.78/0.22 ICD_r-Split
0.81/0.19 ICD_nr-Split
0.75/0.25 UC-Split
Accuracy Train Data:
[[435   0]
 [  4  43]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       435
           1       1.00      0.91      0.96        47

    accuracy                           0.99       482
   macro avg       1.00      0.96      0.98       482
weighted avg       0.99      0.99      0.99       482

Accuracy Test Data:
[[129   9]
 [ 14   1]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.92       138
           1       0.10      0.07      0.08        15

    accuracy                           0.85       153
   macro avg       0.50      0.50      0.50       153
weighted avg       0.82      0.85      0.84       153




11: f1_score: 0.7195767195767196    0.75/0.25 Gesamt-Split
0.85/0.15 HC-Split
0.73/0.27 CCD-S

20: f1_score: 0.5368572455951097    0.74/0.26 Gesamt-Split
0.74/0.26 HC-Split
0.77/0.23 CCD-Split
0.82/0.18 ICD_r-Split
0.5/0.5 ICD_nr-Split
0.71/0.29 UC-Split
Accuracy Train Data:
[[421   0]
 [  1  45]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       421
           1       1.00      0.98      0.99        46

    accuracy                           1.00       467
   macro avg       1.00      0.99      0.99       467
weighted avg       1.00      1.00      1.00       467

Accuracy Test Data:
[[143   9]
 [ 14   2]]
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       152
           1       0.18      0.12      0.15        16

    accuracy                           0.86       168
   macro avg       0.55      0.53      0.54       168
weighted avg       0.84      0.86      0.85       168




21: f1_score: 0.5700234192037471    0.73/0.27 Gesamt-Split
0.74/0.26 HC-Split
0.68/0.32 CCD-Spli

30: f1_score: 0.5260101907808982    0.75/0.25 Gesamt-Split
0.77/0.23 HC-Split
0.8/0.2 CCD-Split
0.73/0.27 ICD_r-Split
0.75/0.25 ICD_nr-Split
0.74/0.26 UC-Split
Accuracy Train Data:
[[427   0]
 [  3  45]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       427
           1       1.00      0.94      0.97        48

    accuracy                           0.99       475
   macro avg       1.00      0.97      0.98       475
weighted avg       0.99      0.99      0.99       475

Accuracy Test Data:
[[133  13]
 [ 12   2]]
              precision    recall  f1-score   support

           0       0.92      0.91      0.91       146
           1       0.13      0.14      0.14        14

    accuracy                           0.84       160
   macro avg       0.53      0.53      0.53       160
weighted avg       0.85      0.84      0.85       160




31: f1_score: 0.554686651460845    0.74/0.26 Gesamt-Split
0.84/0.16 HC-Split
0.76/0.24 CCD-Split

40: f1_score: 0.6169609079445144    0.79/0.21 Gesamt-Split
0.84/0.16 HC-Split
0.7/0.3 CCD-Split
0.8/0.2 ICD_r-Split
0.81/0.19 ICD_nr-Split
0.8/0.2 UC-Split
Accuracy Train Data:
[[448   0]
 [  4  48]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       448
           1       1.00      0.92      0.96        52

    accuracy                           0.99       500
   macro avg       1.00      0.96      0.98       500
weighted avg       0.99      0.99      0.99       500

Accuracy Test Data:
[[113  12]
 [  6   4]]
              precision    recall  f1-score   support

           0       0.95      0.90      0.93       125
           1       0.25      0.40      0.31        10

    accuracy                           0.87       135
   macro avg       0.60      0.65      0.62       135
weighted avg       0.90      0.87      0.88       135




41: f1_score: 0.7669858641130871    0.77/0.23 Gesamt-Split
0.76/0.24 HC-Split
0.71/0.29 CCD-Split
0.

50: f1_score: 0.533050333535476    0.76/0.24 Gesamt-Split
0.82/0.18 HC-Split
0.68/0.32 CCD-Split
0.84/0.16 ICD_r-Split
0.69/0.31 ICD_nr-Split
0.74/0.26 UC-Split
Accuracy Train Data:
[[430   0]
 [  2  49]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       430
           1       1.00      0.96      0.98        51

    accuracy                           1.00       481
   macro avg       1.00      0.98      0.99       481
weighted avg       1.00      1.00      1.00       481

Accuracy Test Data:
[[138   5]
 [ 10   1]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       143
           1       0.17      0.09      0.12        11

    accuracy                           0.90       154
   macro avg       0.55      0.53      0.53       154
weighted avg       0.88      0.90      0.89       154




51: f1_score: 0.6411889596602973    0.73/0.27 Gesamt-Split
0.76/0.24 HC-Split
0.64/0.36 CCD-Spl

60: f1_score: 0.5006211180124224    0.79/0.21 Gesamt-Split
0.77/0.23 HC-Split
0.81/0.19 CCD-Split
0.9/0.1 ICD_r-Split
0.56/0.44 ICD_nr-Split
0.77/0.23 UC-Split
Accuracy Train Data:
[[453   0]
 [  2  46]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       453
           1       1.00      0.96      0.98        48

    accuracy                           1.00       501
   macro avg       1.00      0.98      0.99       501
weighted avg       1.00      1.00      1.00       501

Accuracy Test Data:
[[112   8]
 [ 13   1]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       120
           1       0.11      0.07      0.09        14

    accuracy                           0.84       134
   macro avg       0.50      0.50      0.50       134
weighted avg       0.81      0.84      0.83       134




61: f1_score: 0.5885304659498208    0.74/0.26 Gesamt-Split
0.82/0.18 HC-Split
0.74/0.26 CCD-Spli

70: f1_score: 0.7334743521946061    0.74/0.26 Gesamt-Split
0.74/0.26 HC-Split
0.78/0.22 CCD-Split
0.75/0.25 ICD_r-Split
0.69/0.31 ICD_nr-Split
0.72/0.28 UC-Split
Accuracy Train Data:
[[421   0]
 [  3  43]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       421
           1       1.00      0.93      0.97        46

    accuracy                           0.99       467
   macro avg       1.00      0.97      0.98       467
weighted avg       0.99      0.99      0.99       467

Accuracy Test Data:
[[145   7]
 [  8   8]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       152
           1       0.53      0.50      0.52        16

    accuracy                           0.91       168
   macro avg       0.74      0.73      0.73       168
weighted avg       0.91      0.91      0.91       168




71: f1_score: 0.7877426600352959    0.77/0.23 Gesamt-Split
0.77/0.23 HC-Split
0.68/0.32 CCD-Sp

80: f1_score: 0.6196373286156569    0.73/0.27 Gesamt-Split
0.74/0.26 HC-Split
0.68/0.32 CCD-Split
0.74/0.26 ICD_r-Split
0.69/0.31 ICD_nr-Split
0.74/0.26 UC-Split
Accuracy Train Data:
[[417   0]
 [  2  44]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       417
           1       1.00      0.96      0.98        46

    accuracy                           1.00       463
   macro avg       1.00      0.98      0.99       463
weighted avg       1.00      1.00      1.00       463

Accuracy Test Data:
[[154   2]
 [ 13   3]]
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       156
           1       0.60      0.19      0.29        16

    accuracy                           0.91       172
   macro avg       0.76      0.59      0.62       172
weighted avg       0.89      0.91      0.89       172




81: f1_score: 0.4738461538461538    0.73/0.27 Gesamt-Split
0.85/0.15 HC-Split
0.7/0.3 CCD-Spli

90: f1_score: 0.4748427672955975    0.74/0.26 Gesamt-Split
0.76/0.24 HC-Split
0.81/0.19 CCD-Split
0.72/0.28 ICD_r-Split
0.62/0.38 ICD_nr-Split
0.73/0.27 UC-Split
Accuracy Train Data:
[[421   0]
 [ 42   5]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       421
           1       1.00      0.11      0.19        47

    accuracy                           0.91       468
   macro avg       0.95      0.55      0.57       468
weighted avg       0.92      0.91      0.88       468

Accuracy Test Data:
[[151   1]
 [ 15   0]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       152
           1       0.00      0.00      0.00        15

    accuracy                           0.90       167
   macro avg       0.45      0.50      0.47       167
weighted avg       0.83      0.90      0.86       167




91: f1_score: 0.5022950819672132    0.74/0.26 Gesamt-Split
0.74/0.26 HC-Split
0.74/0.26 CCD-Sp