In [19]:
# Setup

import numpy as np

# Ein Zufalls-Seed für Reproduzierbarkeit
np.random.seed(42)

from datetime import datetime
import pandas as pd
import seaborn as sns
from scipy import stats
import sklearn as sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.impute import KNNImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, utils

from IPython.display import display, clear_output

sns.set_theme()

print(f"GPUs für die Berechnung: {len(tf.config.experimental.list_physical_devices('GPU'))}")

GPUs für die Berechnung: 0


In [20]:
# OTU Metadaten einlesen
df = pd.read_csv("NIHMS841832-supplement-1.csv", sep=',')

# Ergebnisse des Feature Tables einlesen
feature = pd.read_csv('feature_table_otu.txt', sep='\t').T
feature = feature[1:][:-1]

In [21]:
# Gesunde Kontrollgruppe
HC = df[df.ibd_subtype.eq("HC")]

y = []
for row in feature.index:
    if any(True for val in HC['sample_name'] if val == row):
        y.append(1)
    else:
        y.append(0)

X = feature.iloc[:, :].values

In [22]:
feature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3101,3102,3103,3104,3105,3106,3107,3108,3109,3110
1629.SubjectIBD335,34292.0,20670.0,18413.0,9981.0,7071.0,6881.0,5411.0,5335.0,5289.0,4741.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD643,15243.0,64328.0,0.0,0.0,0.0,4.0,4507.0,3216.0,15630.0,199.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD539,22182.0,21589.0,0.0,1365.0,0.0,11501.0,33619.0,3638.0,5053.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD078,0.0,805.0,0.0,0.0,0.0,4.0,330.0,2305.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD671,0.0,19734.0,0.0,0.0,0.0,0.0,215.0,0.0,0.0,699.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
def nn(random_state):
    # Split
    # TODO: Andere Aufteilung
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = random_state)

    # Das Modell soll aufhören zu rechnen, falls es keine nennenswerten Verbesserungen mehr gibt
    early_stopping = callbacks.EarlyStopping(
        min_delta=0.001,
        patience=64,
        restore_best_weights=True
    )

    # Das NN besteht aus einer Mischung von Dense-, Normalization- und Dropout-Layern.
    # Dropout führt allem Anschein nach zu schlechterem F1
    # Weniger LUs führen zu besseren Ergebnissen
    network = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[X_train.shape[1]]),
        #layers.Dropout(rate=0.5),
        layers.Dense(64, activation='relu'),
        #layers.Dropout(rate=0.5),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])

    # NN kompilieren
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[keras.metrics.Accuracy()]
    )

    # NN trainieren
    history = network.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=64,
        epochs=512,
        callbacks=[early_stopping],
        verbose=0
    )

    # Scores berechnen
    # history_df = pd.DataFrame(history.history)

    y_pred = network.predict(X_test)
    y_pred_corrected = []
    
    for pred in y_pred:
        if pred > 0.7:
            y_pred_corrected.append(1)
        else: 
            y_pred_corrected.append(0)
    
    report = sklearn.metrics.classification_report(y_test, y_pred_corrected)
    f1 = f1_score(y_test, y_pred_corrected, average='macro')
    # confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred_corrected)
    # print(confusion_matrix)
    # print(report)
    return f1, report, network

    # history_df.loc[5:, ['loss', 'val_loss']].plot()

In [24]:
# 100 verschiedene Modelle trainieren und in einen DataFrame speichern
# Das kann ein paar Stunden dauern

results = {}

for i in np.arange(50):
    f1, report, model = nn(i)
    results[i] = [model, f1, report]
    clear_output()
    df_results = pd.DataFrame.from_dict(results, orient='index', columns=['model', 'f1', 'report'])
    display(df_results)


#df_results

Unnamed: 0,model,f1,report
0,<tensorflow.python.keras.engine.sequential.Seq...,0.871957,precision recall f1-score ...
1,<tensorflow.python.keras.engine.sequential.Seq...,0.823643,precision recall f1-score ...
2,<tensorflow.python.keras.engine.sequential.Seq...,0.826965,precision recall f1-score ...
3,<tensorflow.python.keras.engine.sequential.Seq...,0.785427,precision recall f1-score ...
4,<tensorflow.python.keras.engine.sequential.Seq...,0.778834,precision recall f1-score ...
5,<tensorflow.python.keras.engine.sequential.Seq...,0.794145,precision recall f1-score ...
6,<tensorflow.python.keras.engine.sequential.Seq...,0.811166,precision recall f1-score ...
7,<tensorflow.python.keras.engine.sequential.Seq...,0.749081,precision recall f1-score ...
8,<tensorflow.python.keras.engine.sequential.Seq...,0.87247,precision recall f1-score ...
9,<tensorflow.python.keras.engine.sequential.Seq...,0.758107,precision recall f1-score ...


In [25]:
# DataFrame nach bestem F1-Score sortieren
df_results_sorted = df_results.sort_values(by=['f1'], ascending=False)

# Die besten 10 Modelle speichern
for i in np.arange(10):
    df_results_sorted.iloc[i].model.save(f'models/best_otu/model{i}.h5')

In [26]:
df_results.describe()

Unnamed: 0,f1
count,50.0
mean,0.811205
std,0.042226
min,0.710059
25%,0.780722
50%,0.810671
75%,0.841964
max,0.895237
