In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import random

In [2]:
#Metadaten einlesen
df = pd.read_csv("NIHMS841832-supplement-1.csv", sep=',')
    
#Ergebnisse des Feature Tables einlesen
feature = pd.read_csv('feature_table.txt', sep='\t').T
feature = feature[1:][:-1]

feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,323,324,325,326,327,328,329,330,331,332
1629.SubjectIBD335,0,345,0,0,0,0,412,0,0,0,...,0,0,0,0,0,0,7,0,0,353
1629.SubjectIBD643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
1629.SubjectIBD539,0,2869,0,0,0,0,1665,0,0,0,...,0,746,0,0,0,3,21,0,0,88919
1629.SubjectIBD078,0,5,0,0,0,0,17,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1629.SubjectIBD671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629.SubjectIBD238,0,0,0,0,0,0,50,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD421,0,560,0,98,0,0,2368,0,0,0,...,0,68,0,0,6,22,259,0,650,2578
1629.SubjectIBD202,0,0,0,0,0,0,43,0,0,0,...,0,0,0,0,0,0,0,0,0,31
1629.SubjectIBD544,0,0,0,0,0,0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,464937


In [3]:
#Gesunde Kontrollgruppe
HC = df[df.ibd_subtype.eq("HC")]

y = []
for row in feature.index:
    if any(True for val in HC['sample_name'] if val == row):
        y.append(1)
    else:
        y.append(0)

X = feature.iloc[:, :].values  

In [4]:
#Splitt nach Patient. Alle Zeitpunkte eines Patienten entweder in Train oder Test.
def split_function(tSize, random_state, table, metadata, hc_group):
    patientSamples = {}
    liste = []

    for row in metadata.index:
        liste.append(metadata['patientnumber'][row])

    menge = set(liste)  

    for e in menge:
        newPatient = metadata[metadata.patientnumber.eq(e)]
        patientSamples[e] = list(newPatient['sample_name'])

    shuffleListe = list(menge)
    random.Random(random_state).shuffle(shuffleListe)

    #Split. 137 PatientenNummern
    trainSize = int(np.round(tSize * 137, 0))
    testSize = 137 - trainSize

    X_trainEntry = shuffleListe[0:trainSize]
    X_testEntry = shuffleListe[trainSize:138]

    X_train = []
    y_train = []
    for i in X_trainEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_train.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_train.append(1)
                        else:
                            y_train.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))


    X_test = []   
    y_test = []
    for i in X_testEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_test.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_test.append(1)
                        else:
                            y_test.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train, X_test, y_train, y_test

In [5]:
f1Sum = 0.0
n = 100
for i in range(0, n):
    
    #Split Test und Trainingsdaten
    X_train, X_test, y_train, y_test = split_function(tSize=0.7, random_state=i, table=feature, metadata=df, hc_group=HC)
    trainSize = len(X_train)/681
    testSize =len(X_test)/681   
    
    
    #Featuer Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    #Train
    mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500, random_state=i)
    mlp.fit(X_train,y_train)
    
    #Test
    y_pred = mlp.predict(X_test)
    predict_train = mlp.predict(X_train)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1Sum += f1
    
    print(str(i) + ": " + "f1_score: " + str(f1) + "    " + str(np.round(trainSize, 2)) +"/"+ str(np.round(testSize, 2)) + " Split")
    #Evaluation Train Data
    print("Accuracy Train Data:")
    print(confusion_matrix(y_train,predict_train))
    print(classification_report(y_train,predict_train))

    #Evaluating Test Data
    print("Accuracy Test Data:")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("\n\n")

f1score = f1Sum/n
print("Avg f1-score = "+str(f1score))

0: f1_score: 0.6388506677458519    0.71/0.28 Split
Accuracy Train Data:
[[448   0]
 [  2  36]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       448
           1       1.00      0.95      0.97        38

    accuracy                           1.00       486
   macro avg       1.00      0.97      0.99       486
weighted avg       1.00      1.00      1.00       486

Accuracy Test Data:
[[165   6]
 [ 17   6]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       171
           1       0.50      0.26      0.34        23

    accuracy                           0.88       194
   macro avg       0.70      0.61      0.64       194
weighted avg       0.86      0.88      0.86       194




1: f1_score: 0.5033259423503327    0.67/0.33 Split
Accuracy Train Data:
[[417   0]
 [  3  36]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       417
 

11: f1_score: 0.5790816326530612    0.69/0.31 Split
Accuracy Train Data:
[[418   0]
 [  4  48]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       418
           1       1.00      0.92      0.96        52

    accuracy                           0.99       470
   macro avg       1.00      0.96      0.98       470
weighted avg       0.99      0.99      0.99       470

Accuracy Test Data:
[[185  16]
 [  6   3]]
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       201
           1       0.16      0.33      0.21         9

    accuracy                           0.90       210
   macro avg       0.56      0.63      0.58       210
weighted avg       0.93      0.90      0.91       210




12: f1_score: 0.4661016949152542    0.72/0.28 Split
Accuracy Train Data:
[[447   0]
 [  4  40]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       447

  _warn_prf(average, modifier, msg_start, len(result))


14: f1_score: 0.7080168776371307    0.74/0.25 Split
Accuracy Train Data:
[[461   0]
 [ 10  36]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       461
           1       1.00      0.78      0.88        46

    accuracy                           0.98       507
   macro avg       0.99      0.89      0.93       507
weighted avg       0.98      0.98      0.98       507

Accuracy Test Data:
[[150   8]
 [  8   7]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       158
           1       0.47      0.47      0.47        15

    accuracy                           0.91       173
   macro avg       0.71      0.71      0.71       173
weighted avg       0.91      0.91      0.91       173




15: f1_score: 0.6977722772277227    0.67/0.33 Split
Accuracy Train Data:
[[418   0]
 [  5  35]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       418

  _warn_prf(average, modifier, msg_start, len(result))


21: f1_score: 0.5607843137254902    0.75/0.25 Split
Accuracy Train Data:
[[468   0]
 [  2  42]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       468
           1       1.00      0.95      0.98        44

    accuracy                           1.00       512
   macro avg       1.00      0.98      0.99       512
weighted avg       1.00      1.00      1.00       512

Accuracy Test Data:
[[141  10]
 [ 14   3]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       151
           1       0.23      0.18      0.20        17

    accuracy                           0.86       168
   macro avg       0.57      0.56      0.56       168
weighted avg       0.84      0.86      0.85       168




22: f1_score: 0.6328861493836113    0.67/0.33 Split
Accuracy Train Data:
[[425   0]
 [  2  28]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       425

  _warn_prf(average, modifier, msg_start, len(result))


31: f1_score: 0.531786941580756    0.68/0.32 Split
Accuracy Train Data:
[[439   0]
 [  2  21]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       439
           1       1.00      0.91      0.95        23

    accuracy                           1.00       462
   macro avg       1.00      0.96      0.98       462
weighted avg       1.00      1.00      1.00       462

Accuracy Test Data:
[[174   6]
 [ 34   4]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       180
           1       0.40      0.11      0.17        38

    accuracy                           0.82       218
   macro avg       0.62      0.54      0.53       218
weighted avg       0.76      0.82      0.77       218




32: f1_score: 0.5953079178885631    0.73/0.27 Split
Accuracy Train Data:
[[452   0]
 [  1  41]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       452


42: f1_score: 0.47507331378299117    0.74/0.26 Split
Accuracy Train Data:
[[446   0]
 [  5  50]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       446
           1       1.00      0.91      0.95        55

    accuracy                           0.99       501
   macro avg       0.99      0.95      0.97       501
weighted avg       0.99      0.99      0.99       501

Accuracy Test Data:
[[162  11]
 [  6   0]]
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       173
           1       0.00      0.00      0.00         6

    accuracy                           0.91       179
   macro avg       0.48      0.47      0.48       179
weighted avg       0.93      0.91      0.92       179




43: f1_score: 0.5742504409171076    0.69/0.31 Split
Accuracy Train Data:
[[420   0]
 [  3  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       42

53: f1_score: 0.5377207062600321    0.72/0.28 Split
Accuracy Train Data:
[[446   0]
 [  6  36]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       446
           1       1.00      0.86      0.92        42

    accuracy                           0.99       488
   macro avg       0.99      0.93      0.96       488
weighted avg       0.99      0.99      0.99       488

Accuracy Test Data:
[[166   7]
 [ 17   2]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       173
           1       0.22      0.11      0.14        19

    accuracy                           0.88       192
   macro avg       0.56      0.53      0.54       192
weighted avg       0.84      0.88      0.85       192




54: f1_score: 0.5708812260536399    0.72/0.28 Split
Accuracy Train Data:
[[443   0]
 [  5  40]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       443

  _warn_prf(average, modifier, msg_start, len(result))


57: f1_score: 0.5149638802889577    0.72/0.28 Split
Accuracy Train Data:
[[445   0]
 [  5  42]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       445
           1       1.00      0.89      0.94        47

    accuracy                           0.99       492
   macro avg       0.99      0.95      0.97       492
weighted avg       0.99      0.99      0.99       492

Accuracy Test Data:
[[156  18]
 [ 12   2]]
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       174
           1       0.10      0.14      0.12        14

    accuracy                           0.84       188
   macro avg       0.51      0.52      0.51       188
weighted avg       0.87      0.84      0.85       188




58: f1_score: 0.6785495716034272    0.72/0.28 Split
Accuracy Train Data:
[[450   0]
 [  4  35]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       450

68: f1_score: 0.5395555555555555    0.69/0.31 Split
Accuracy Train Data:
[[446   0]
 [  3  21]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       446
           1       1.00      0.88      0.93        24

    accuracy                           0.99       470
   macro avg       1.00      0.94      0.96       470
weighted avg       0.99      0.99      0.99       470

Accuracy Test Data:
[[169   4]
 [ 33   4]]
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       173
           1       0.50      0.11      0.18        37

    accuracy                           0.82       210
   macro avg       0.67      0.54      0.54       210
weighted avg       0.78      0.82      0.77       210




69: f1_score: 0.53125    0.68/0.32 Split
Accuracy Train Data:
[[430   0]
 [  3  31]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       430
          

79: f1_score: 0.5946969696969697    0.68/0.31 Split
Accuracy Train Data:
[[442   0]
 [  3  21]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       442
           1       1.00      0.88      0.93        24

    accuracy                           0.99       466
   macro avg       1.00      0.94      0.96       466
weighted avg       0.99      0.99      0.99       466

Accuracy Test Data:
[[176   1]
 [ 31   6]]
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       177
           1       0.86      0.16      0.27        37

    accuracy                           0.85       214
   macro avg       0.85      0.58      0.59       214
weighted avg       0.85      0.85      0.81       214




80: f1_score: 0.6026405241637519    0.68/0.32 Split
Accuracy Train Data:
[[431   0]
 [  2  29]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       431



86: f1_score: 0.5700354609929078    0.71/0.28 Split
Accuracy Train Data:
[[436   0]
 [  2  48]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       436
           1       1.00      0.96      0.98        50

    accuracy                           1.00       486
   macro avg       1.00      0.98      0.99       486
weighted avg       1.00      1.00      1.00       486

Accuracy Test Data:
[[183   0]
 [ 10   1]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       183
           1       1.00      0.09      0.17        11

    accuracy                           0.95       194
   macro avg       0.97      0.55      0.57       194
weighted avg       0.95      0.95      0.93       194




87: f1_score: 0.5409090909090909    0.7/0.3 Split
Accuracy Train Data:
[[447   0]
 [  4  27]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       447
 

97: f1_score: 0.7601994183631076    0.67/0.33 Split
Accuracy Train Data:
[[410   0]
 [  4  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       410
           1       1.00      0.92      0.96        48

    accuracy                           0.99       458
   macro avg       1.00      0.96      0.98       458
weighted avg       0.99      0.99      0.99       458

Accuracy Test Data:
[[201   8]
 [  5   8]]
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       209
           1       0.50      0.62      0.55        13

    accuracy                           0.94       222
   macro avg       0.74      0.79      0.76       222
weighted avg       0.95      0.94      0.94       222




98: f1_score: 0.552152819890843    0.69/0.31 Split
Accuracy Train Data:
[[436   0]
 [  2  31]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       436
