In [1]:
# Setup

import numpy as np

# Ein Zufalls-Seed für Reproduzierbarkeit
np.random.seed(42)

from datetime import datetime
import pandas as pd
import seaborn as sns
from scipy import stats
import sklearn as sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.impute import KNNImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, utils

from IPython.display import display, clear_output

sns.set_theme()

print(f"GPUs für die Berechnung: {len(tf.config.experimental.list_physical_devices('GPU'))}")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


GPUs für die Berechnung: 0


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Taxonomie Metadaten einlesen
df = pd.read_csv("NIHMS841832-supplement-1.csv")

# Ergebnisse des Feature Tables einlesen
feature = pd.read_csv('feature_table_tax.txt', sep='\t').T
feature = feature[1:][:-1]

In [3]:
#Gesunde Kontrollgruppe
HC = df[df.ibd_subtype.eq("HC")]

y = []
for row in feature.index:
    if any(True for val in HC['sample_name'] if val == row):
        y.append(1)
    else:
        y.append(0)

X = feature.iloc[:, :].values

In [4]:
feature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,323,324,325,326,327,328,329,330,331,332
1629.SubjectIBD335,0.0,345.0,0.0,0.0,0.0,0.0,412.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,353.0
1629.SubjectIBD643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1629.SubjectIBD539,0.0,2869.0,0.0,0.0,0.0,0.0,1665.0,0.0,0.0,0.0,...,0.0,746.0,0.0,0.0,0.0,3.0,21.0,0.0,0.0,88919.0
1629.SubjectIBD078,0.0,5.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1629.SubjectIBD671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [5]:
def nn(random_state):
    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)

    # Das Modell soll aufhören zu rechnen, falls es keine nennenswerten Verbesserungen mehr gibt
    early_stopping = callbacks.EarlyStopping(
        min_delta=0.001,
        patience=64,
        restore_best_weights=True
    )

    # Das NN besteht aus einer Mischung von Dense-, Normalization- und Dropout-Layern.
    # Dropout führt allem Anschein nach zu schlechterem F1
    # Weniger LUs führen zu besseren Ergebnissen
    network = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=[X_train.shape[1]]),
        #layers.Dropout(rate=0.5),
        layers.Dense(128, activation='relu'),
        #layers.Dropout(rate=0.5),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])

    # NN kompilieren
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[keras.metrics.Accuracy()]
    )

    # NN trainieren
    history = network.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=128,
        epochs=512,
        callbacks=[early_stopping],
        verbose=0
    )

    # Scores berechnen
    # history_df = pd.DataFrame(history.history)

    y_pred = np.floor(network.predict(X_test))
    report = sklearn.metrics.classification_report(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    # confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
    # print(confusion_matrix)
    # print(report)
    return f1, report, network

    # history_df.loc[5:, ['loss', 'val_loss']].plot()

In [6]:
# 100 verschiedene Modelle trainieren und in einen DataFrame speichern
# Das kann ein paar Stunden dauern

results = {}

for i in np.arange(100):
    f1, report, model = nn(i)
    results[i] = [model, f1, report]
    clear_output()
    df_results = pd.DataFrame.from_dict(results, orient='index', columns=['model', 'f1', 'report'])
    display(df_results)


#df_results

Unnamed: 0,model,f1,report
0,<tensorflow.python.keras.engine.sequential.Seq...,0.583552,precision recall f1-score ...
1,<tensorflow.python.keras.engine.sequential.Seq...,0.780392,precision recall f1-score ...
2,<tensorflow.python.keras.engine.sequential.Seq...,0.529412,precision recall f1-score ...
3,<tensorflow.python.keras.engine.sequential.Seq...,0.745371,precision recall f1-score ...
4,<tensorflow.python.keras.engine.sequential.Seq...,0.690537,precision recall f1-score ...
...,...,...,...
95,<tensorflow.python.keras.engine.sequential.Seq...,0.744841,precision recall f1-score ...
96,<tensorflow.python.keras.engine.sequential.Seq...,0.643045,precision recall f1-score ...
97,<tensorflow.python.keras.engine.sequential.Seq...,0.821522,precision recall f1-score ...
98,<tensorflow.python.keras.engine.sequential.Seq...,0.801781,precision recall f1-score ...


In [7]:
# DataFrame nach bestem F1-Score sortieren
df_results_sorted = df_results.sort_values(by=['f1'], ascending=False)

# Die besten 10 Modelle speichern
for i in np.arange(10):
    df_results_sorted.iloc[i].model.save(f'models/best_tax/model{i}.h5')

In [8]:
# Das beste Modell über neue Random-States predicten lassen

for i in np.arange(50):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = np.random.randint(100000))

    y_pred = np.floor(df_results_sorted.iloc[0].model.predict(X_test))
    report = sklearn.metrics.classification_report(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
    print(confusion_matrix)
    print(report)
    print('\n\n\n')

[[122   0]
 [  5   9]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       122
           1       1.00      0.64      0.78        14

    accuracy                           0.96       136
   macro avg       0.98      0.82      0.88       136
weighted avg       0.96      0.96      0.96       136





[[125   1]
 [  2   8]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       126
           1       0.89      0.80      0.84        10

    accuracy                           0.98       136
   macro avg       0.94      0.90      0.92       136
weighted avg       0.98      0.98      0.98       136





[[121   1]
 [  1  13]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       122
           1       0.93      0.93      0.93        14

    accuracy                           0.99       136
   macro avg       0.96      0.96      0.96       1

[[121   1]
 [  0  14]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       122
           1       0.93      1.00      0.97        14

    accuracy                           0.99       136
   macro avg       0.97      1.00      0.98       136
weighted avg       0.99      0.99      0.99       136





[[120   0]
 [  1  15]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       120
           1       1.00      0.94      0.97        16

    accuracy                           0.99       136
   macro avg       1.00      0.97      0.98       136
weighted avg       0.99      0.99      0.99       136





[[121   1]
 [  2  12]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       122
           1       0.92      0.86      0.89        14

    accuracy                           0.98       136
   macro avg       0.95      0.92      0.94       1

In [9]:
df_results.describe()

Unnamed: 0,f1
count,100.0
mean,0.783412
std,0.072444
min,0.529412
25%,0.738781
50%,0.787367
75%,0.834369
max,0.933927
