In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
import random


In [14]:
# Load the feature table.
feature = pd.read_csv("data_for_random_forest.csv")

# Load the metadata.
df = pd.read_csv("NIHMS841832-supplement-1.csv", sep=',')

# Index the features by sample name, remove the 'Gesund' column
# (presumably the label column -- TODO confirm) and replace missing
# values with the dummy value -999.
# Author's note: with this filling and 20 trees the f1-score was 0.67.
feature = (
    feature
    .set_index('sample_name')
    .drop(columns=['Gesund'])
    .fillna(-999)
)
feature

Unnamed: 0_level_0,bmi,calprotectin,sex,distance_Hp
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1629.SubjectIBD335,23.0,57.0,0,0.005381
1629.SubjectIBD643,-999.0,23.0,0,0.016346
1629.SubjectIBD539,22.0,63.0,0,0.039146
1629.SubjectIBD078,26.0,47.0,1,0.030779
1629.SubjectIBD671,-999.0,35.0,0,0.305698
...,...,...,...,...
1629.SubjectIBD421,26.0,1013.0,0,0.003157
1629.SubjectIBD202,28.0,220.0,1,0.344654
1629.SubjectIBD544,18.0,377.0,0,0.204402
1629.SubjectIBD422,26.0,67.0,1,0.040233


In [15]:
# One metadata subset per value of `ibd_subtype`:
# HC (healthy control group), CCD, ICD_r, ICD_nr and UC.
HC, CCD, ICD_r, ICD_nr, UC = (
    df[df["ibd_subtype"] == subtype]
    for subtype in ("HC", "CCD", "ICD_r", "ICD_nr", "UC")
)

In [16]:
# Patient-wise split: all time points of a patient end up either in train or in test.
def split_function(tSize, random_state, table, metadata, hc_group):
    """Split the samples listed in ``metadata`` into train and test sets by patient.

    Patients (distinct values of ``metadata['patientnumber']``) are shuffled
    with ``random.Random(random_state)``; the first ``round(tSize * n_patients)``
    of them form the training set and the rest the test set. Every sample of a
    patient therefore lands in exactly one of the two sets.

    Parameters
    ----------
    tSize : float
        Fraction of patients assigned to the training set.
    random_state : int
        Seed for the patient shuffle.
    table : pandas.DataFrame
        Feature table whose index holds the sample names.
    metadata : pandas.DataFrame
        Must provide the columns ``'patientnumber'`` and ``'sample_name'``.
    hc_group : pandas.DataFrame
        Must provide a ``'sample_name'`` column; samples listed there get
        label 1, all other samples label 0.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Feature rows and integer labels. An empty split yields an ``X`` with
        zero rows but the full number of columns and an empty integer ``y``,
        so the results can always be concatenated along axis 0.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    Samples named in ``metadata`` that are absent from ``table.index`` are
    skipped silently. If ``table.index`` contains a label more than once,
    every matching row is returned.
    """
    patient_ids = metadata['patientnumber'].tolist()
    sample_names = metadata['sample_name'].tolist()

    # Samples of each patient, in the row order of ``metadata``.
    patient_samples = {}
    for patient, sample_name in zip(patient_ids, sample_names):
        patient_samples.setdefault(patient, []).append(sample_name)

    # NOTE(review): the pre-shuffle order is the iteration order of a set. For
    # integer ids this is stable, but for string ids it depends on
    # PYTHONHASHSEED, so a given random_state is then only reproducible within
    # one interpreter session.
    shuffled_patients = list(set(patient_ids))
    random.Random(random_state).shuffle(shuffled_patients)

    train_count = int(np.round(tSize * len(shuffled_patients), 0))
    train_patients = shuffled_patients[:train_count]
    test_patients = shuffled_patients[train_count:]

    # Look-up structures built once instead of rescanning the table and the
    # control group for every single sample.
    row_positions = {}
    for position, label in enumerate(table.index):
        row_positions.setdefault(label, []).append(position)
    values = table.values
    hc_samples = set(hc_group['sample_name'])

    def collect(patients):
        """Return the feature rows and labels of all samples of ``patients``."""
        positions = []
        labels = []
        for patient in patients:
            for sample_name in patient_samples.get(patient, []):
                for position in row_positions.get(sample_name, ()):
                    positions.append(position)
                    labels.append(1 if sample_name in hc_samples else 0)
        # Integer-array indexing keeps the column dimension even when
        # ``positions`` is empty.
        X = values[np.asarray(positions, dtype=np.intp)]
        y = np.asarray(labels, dtype=int)
        return X, y

    X_train, y_train = collect(train_patients)
    X_test, y_test = collect(test_patients)

    return X_train, X_test, y_train, y_test

In [24]:
# Repeat the split / scale / train / score cycle n times, each time with a
# different seed, and report the mean macro-averaged F1 score.
fSum = 0.0
fAvg = 0.0
n = 100
for i in range(n):
    # Arguments shared by all per-group splits; tSize is the training fraction.
    split_kwargs = dict(tSize=0.75, random_state=i, table=feature, hc_group=HC)

    # Split train and test data separately for every group.
    X_trainHC, X_testHC, y_trainHC, y_testHC = split_function(metadata=HC, **split_kwargs)
    X_trainCCD, X_testCCD, y_trainCCD, y_testCCD = split_function(metadata=CCD, **split_kwargs)
    X_trainICD_r, X_testICD_r, y_trainICD_r, y_testICD_r = split_function(metadata=ICD_r, **split_kwargs)
    X_trainICD_nr, X_testICD_nr, y_trainICD_nr, y_testICD_nr = split_function(metadata=ICD_nr, **split_kwargs)
    X_trainUC, X_testUC, y_trainUC, y_testUC = split_function(metadata=UC, **split_kwargs)

    # Stack the per-group pieces into one train set and one test set.
    X_train = np.concatenate([X_trainHC, X_trainCCD, X_trainICD_r, X_trainICD_nr, X_trainUC], axis=0)
    X_test = np.concatenate([X_testHC, X_testCCD, X_testICD_r, X_testICD_nr, X_testUC], axis=0)
    y_train = np.concatenate([y_trainHC, y_trainCCD, y_trainICD_r, y_trainICD_nr, y_trainUC], axis=0)
    y_test = np.concatenate([y_testHC, y_testCCD, y_testICD_r, y_testICD_nr, y_testUC], axis=0)

    # Feature scaling: fit on the training data only, then apply to the test data.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Train the forest and score its predictions on the held-out patients.
    classifier = RandomForestClassifier(n_estimators=20, random_state=i)
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"{i!s}: {f1!s}")
    fSum += f1

fAvg = fSum / n
print(f"f1-score = {fAvg!s}")
    
#Evaluation
#The count of true negatives is C0,0 , false negatives is C1,0 , true positives is C1,1 and false positives is C0,1
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test, y_pred))
#print(accuracy_score(y_test, y_pred))

0: 0.46538461538461534
1: 0.4624505928853755
2: 0.45255474452554745
3: 0.622
4: 0.6441102756892231
5: 0.4602510460251046
6: 0.4585152838427947
7: 0.47766323024054985
8: 0.5279593318809005
9: 0.4641350210970464
10: 0.46899224806201545
11: 0.6040974529346622
12: 0.6197089947089947
13: 0.5605036855036856
14: 0.46747967479674796
15: 0.450381679389313
16: 0.4771784232365145
17: 0.46551724137931033
18: 0.47183098591549294
19: 0.4703703703703704
20: 0.5446327683615819
21: 0.5176258992805756
22: 0.5977011494252874
23: 0.44939271255060725
24: 0.5367647058823529
25: 0.5279693486590038
26: 0.5230148542086925
27: 0.5567765567765568
28: 0.4676258992805755
29: 0.5173618694037999
30: 0.6053558844256519
31: 0.5990694689555591
32: 0.4637096774193548
33: 0.5151162790697674
34: 0.5303064191456051
35: 0.666313559322034
36: 0.4725274725274725
37: 0.46785714285714286
38: 0.472668810289389
39: 0.4669117647058823
40: 0.5510860820595334
41: 0.5184950327626294
42: 0.5097402597402597
43: 0.5843935538592027
44: 0