In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import random

In [2]:
#Metadaten einlesen
df = pd.read_csv("NIHMS841832-supplement-1.csv", sep=',')
    
#Ergebnisse des Feature Tables einlesen
feature = pd.read_csv('feature_table.txt', sep='\t').T
feature = feature[1:][:-1]

feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3101,3102,3103,3104,3105,3106,3107,3108,3109,3110
1629.SubjectIBD335,34292,20670,18413,9981,7071,6881,5411,5335,5289,4741,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD643,15243,64328,0,0,0,4,4507,3216,15630,199,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD539,22182,21589,0,1365,0,11501,33619,3638,5053,0,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD078,0,805,0,0,0,4,330,2305,0,8,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD671,0,19734,0,0,0,0,215,0,0,699,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629.SubjectIBD421,5154,12101,1572,62,190,1448,6316,2205,1885,0,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD202,14565,24920,3543,0,0,0,63,0,37768,48660,...,0,0,0,0,0,0,0,0,0,0
1629.SubjectIBD544,32,52,31,0,0,2,18543,0,3,45,...,0,0,0,0,0,0,0,0,946,31
1629.SubjectIBD422,5718,18420,9534,0,0,0,4791,2770,0,588,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Gesunde Kontrollgruppe
HC = df[df.ibd_subtype.eq("HC")]

y = []
for row in feature.index:
    if any(True for val in HC['sample_name'] if val == row):
        y.append(1)
    else:
        y.append(0)

X = feature.iloc[:, :].values  
X

array([[34292.0, 20670.0, 18413.0, ..., 0.0, 0.0, 0.0],
       [15243.0, 64328.0, 0.0, ..., 0.0, 0.0, 0.0],
       [22182.0, 21589.0, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       [32.0, 52.0, 31.0, ..., 0.0, 946.0, 31.0],
       [5718.0, 18420.0, 9534.0, ..., 0.0, 0.0, 0.0],
       [3151.0, 7071.0, 677.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [4]:
#Splitt nach Patient. Alle Zeitpunkte eines Patienten entweder in Train oder Test.
def split_function(tSize, random_state, table, metadata, hc_group):
    patientSamples = {}
    liste = []

    for row in metadata.index:
        liste.append(metadata['patientnumber'][row])

    menge = set(liste)  

    for e in menge:
        newPatient = metadata[metadata.patientnumber.eq(e)]
        patientSamples[e] = list(newPatient['sample_name'])

    shuffleListe = list(menge)
    random.Random(random_state).shuffle(shuffleListe)

    #Split. 137 PatientenNummern
    trainSize = int(np.round(tSize * 137, 0))
    testSize = 137 - trainSize

    X_trainEntry = shuffleListe[0:trainSize]
    X_testEntry = shuffleListe[trainSize:138]

    X_train = []
    y_train = []
    for i in X_trainEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_train.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_train.append(1)
                        else:
                            y_train.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))


    X_test = []   
    y_test = []
    for i in X_testEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_test.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_test.append(1)
                        else:
                            y_test.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train, X_test, y_train, y_test

In [5]:
f1Sum = 0.0
n = 100
for i in range(0, n):
    
    #Split Test und Trainingsdaten
    X_train, X_test, y_train, y_test = split_function(tSize=0.7, random_state=i, table=feature, metadata=df, hc_group=HC)
    trainSize = len(X_train)/681
    testSize =len(X_test)/681   
    
    
    #Featuer Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    #Train
    mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500, random_state=i)
    mlp.fit(X_train,y_train)
    
    #Test
    y_pred = mlp.predict(X_test)
    predict_train = mlp.predict(X_train)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1Sum += f1
    
    print(str(i) + ": " + "f1_score: " + str(f1) + "    " + str(np.round(trainSize, 2)) +"/"+ str(np.round(testSize, 2)) + " Split")
    #Evaluation Train Data
    print("Accuracy Train Data:")
    print(confusion_matrix(y_train,predict_train))
    print(classification_report(y_train,predict_train))

    #Evaluating Test Data
    print("Accuracy Test Data:")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("\n\n")

f1score = f1Sum/n
print("Avg f1-score = "+str(f1score))

0: f1_score: 0.5314009661835748    0.72/0.28 Split
Accuracy Train Data:
[[446   2]
 [  1  38]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       448
           1       0.95      0.97      0.96        39

    accuracy                           0.99       487
   macro avg       0.97      0.98      0.98       487
weighted avg       0.99      0.99      0.99       487

Accuracy Test Data:
[[152  19]
 [ 19   4]]
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       171
           1       0.17      0.17      0.17        23

    accuracy                           0.80       194
   macro avg       0.53      0.53      0.53       194
weighted avg       0.80      0.80      0.80       194




1: f1_score: 0.5507270600033428    0.67/0.33 Split
Accuracy Train Data:
[[417   0]
 [  1  39]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       417
 

11: f1_score: 0.45618556701030927    0.69/0.31 Split
Accuracy Train Data:
[[418   0]
 [  4  48]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       418
           1       1.00      0.92      0.96        52

    accuracy                           0.99       470
   macro avg       1.00      0.96      0.98       470
weighted avg       0.99      0.99      0.99       470

Accuracy Test Data:
[[177  24]
 [ 10   0]]
              precision    recall  f1-score   support

           0       0.95      0.88      0.91       201
           1       0.00      0.00      0.00        10

    accuracy                           0.84       211
   macro avg       0.47      0.44      0.46       211
weighted avg       0.90      0.84      0.87       211




12: f1_score: 0.4576576576576577    0.72/0.28 Split
Accuracy Train Data:
[[447   0]
 [  4  41]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       44

  _warn_prf(average, modifier, msg_start, len(result))


14: f1_score: 0.47293906810035846    0.75/0.25 Split
Accuracy Train Data:
[[461   0]
 [  3  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       461
           1       1.00      0.94      0.97        47

    accuracy                           0.99       508
   macro avg       1.00      0.97      0.98       508
weighted avg       0.99      0.99      0.99       508

Accuracy Test Data:
[[138  20]
 [ 14   1]]
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       158
           1       0.05      0.07      0.06        15

    accuracy                           0.80       173
   macro avg       0.48      0.47      0.47       173
weighted avg       0.83      0.80      0.82       173




15: f1_score: 0.4587378640776699    0.67/0.33 Split
Accuracy Train Data:
[[416   2]
 [  3  37]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       41

  _warn_prf(average, modifier, msg_start, len(result))


21: f1_score: 0.5802285146547441    0.75/0.25 Split
Accuracy Train Data:
[[468   0]
 [  2  42]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       468
           1       1.00      0.95      0.98        44

    accuracy                           1.00       512
   macro avg       1.00      0.98      0.99       512
weighted avg       1.00      1.00      1.00       512

Accuracy Test Data:
[[140  11]
 [ 14   4]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       151
           1       0.27      0.22      0.24        18

    accuracy                           0.85       169
   macro avg       0.59      0.57      0.58       169
weighted avg       0.84      0.85      0.85       169




22: f1_score: 0.46361940298507454    0.67/0.33 Split
Accuracy Train Data:
[[424   1]
 [  2  29]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       42

  _warn_prf(average, modifier, msg_start, len(result))


31: f1_score: 0.6429347826086956    0.68/0.32 Split
Accuracy Train Data:
[[439   0]
 [  1  22]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       439
           1       1.00      0.96      0.98        23

    accuracy                           1.00       462
   macro avg       1.00      0.98      0.99       462
weighted avg       1.00      1.00      1.00       462

Accuracy Test Data:
[[163  17]
 [ 25  14]]
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       180
           1       0.45      0.36      0.40        39

    accuracy                           0.81       219
   macro avg       0.66      0.63      0.64       219
weighted avg       0.79      0.81      0.80       219




32: f1_score: 0.500805152979066    0.73/0.27 Split
Accuracy Train Data:
[[451   1]
 [  1  42]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       452


42: f1_score: 0.4811594202898551    0.74/0.26 Split
Accuracy Train Data:
[[445   1]
 [  4  52]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       446
           1       0.98      0.93      0.95        56

    accuracy                           0.99       502
   macro avg       0.99      0.96      0.97       502
weighted avg       0.99      0.99      0.99       502

Accuracy Test Data:
[[166   7]
 [  6   0]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       173
           1       0.00      0.00      0.00         6

    accuracy                           0.93       179
   macro avg       0.48      0.48      0.48       179
weighted avg       0.93      0.93      0.93       179




43: f1_score: 0.5668016194331984    0.69/0.31 Split
Accuracy Train Data:
[[420   0]
 [  2  45]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       420

53: f1_score: 0.6519036519036518    0.72/0.28 Split
Accuracy Train Data:
[[446   0]
 [  3  40]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       446
           1       1.00      0.93      0.96        43

    accuracy                           0.99       489
   macro avg       1.00      0.97      0.98       489
weighted avg       0.99      0.99      0.99       489

Accuracy Test Data:
[[165   8]
 [ 13   6]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       173
           1       0.43      0.32      0.36        19

    accuracy                           0.89       192
   macro avg       0.68      0.63      0.65       192
weighted avg       0.88      0.89      0.88       192




54: f1_score: 0.6432552954292085    0.72/0.28 Split
Accuracy Train Data:
[[441   2]
 [  3  43]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       443

  _warn_prf(average, modifier, msg_start, len(result))


57: f1_score: 0.5434402332361516    0.72/0.28 Split
Accuracy Train Data:
[[445   0]
 [  3  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       445
           1       1.00      0.94      0.97        47

    accuracy                           0.99       492
   macro avg       1.00      0.97      0.98       492
weighted avg       0.99      0.99      0.99       492

Accuracy Test Data:
[[157  17]
 [ 12   3]]
              precision    recall  f1-score   support

           0       0.93      0.90      0.92       174
           1       0.15      0.20      0.17        15

    accuracy                           0.85       189
   macro avg       0.54      0.55      0.54       189
weighted avg       0.87      0.85      0.86       189




58: f1_score: 0.5573745981909247    0.72/0.28 Split
Accuracy Train Data:
[[449   1]
 [  3  37]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       450



66: f1_score: 0.46630727762803237    0.71/0.29 Split
Accuracy Train Data:
[[442   0]
 [  3  38]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       442
           1       1.00      0.93      0.96        41

    accuracy                           0.99       483
   macro avg       1.00      0.96      0.98       483
weighted avg       0.99      0.99      0.99       483

Accuracy Test Data:
[[173   4]
 [ 21   0]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       177
           1       0.00      0.00      0.00        21

    accuracy                           0.87       198
   macro avg       0.45      0.49      0.47       198
weighted avg       0.80      0.87      0.83       198




67: f1_score: 0.4602739726027397    0.71/0.29 Split
Accuracy Train Data:
[[443   0]
 [  3  38]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       44

77: f1_score: 0.48496732026143796    0.71/0.29 Split
Accuracy Train Data:
[[437   0]
 [  3  44]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       437
           1       1.00      0.94      0.97        47

    accuracy                           0.99       484
   macro avg       1.00      0.97      0.98       484
weighted avg       0.99      0.99      0.99       484

Accuracy Test Data:
[[164  18]
 [ 14   1]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       182
           1       0.05      0.07      0.06        15

    accuracy                           0.84       197
   macro avg       0.49      0.48      0.48       197
weighted avg       0.86      0.84      0.85       197




78: f1_score: 0.5940033581194532    0.68/0.32 Split
Accuracy Train Data:
[[431   0]
 [  0  33]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       43

88: f1_score: 0.5125661375661376    0.7/0.3 Split
Accuracy Train Data:
[[417   4]
 [  4  55]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       421
           1       0.93      0.93      0.93        59

    accuracy                           0.98       480
   macro avg       0.96      0.96      0.96       480
weighted avg       0.98      0.98      0.98       480

Accuracy Test Data:
[[178  20]
 [  2   1]]
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       198
           1       0.05      0.33      0.08         3

    accuracy                           0.89       201
   macro avg       0.52      0.62      0.51       201
weighted avg       0.97      0.89      0.93       201




89: f1_score: 0.5495652173913043    0.73/0.27 Split
Accuracy Train Data:
[[452   0]
 [ 11  33]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       452
 

99: f1_score: 0.5348837209302325    0.71/0.29 Split
Accuracy Train Data:
[[453   0]
 [  1  31]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       453
           1       1.00      0.97      0.98        32

    accuracy                           1.00       485
   macro avg       1.00      0.98      0.99       485
weighted avg       1.00      1.00      1.00       485

Accuracy Test Data:
[[118  48]
 [ 17  13]]
              precision    recall  f1-score   support

           0       0.87      0.71      0.78       166
           1       0.21      0.43      0.29        30

    accuracy                           0.67       196
   macro avg       0.54      0.57      0.53       196
weighted avg       0.77      0.67      0.71       196




Avg f1-score = 0.5395093455333637
