In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import time, datetime
from IPython.display import display, clear_output
from sklearn import svm
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix, f1_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
import random

sns.set_style()
%matplotlib inline
%matplotlib widget

In [50]:

#Metadaten einlesen
df = pd.read_csv("NIHMS841832-supplement-1.csv", sep=',')
    
#Ergebnisse des Feature Tables einlesen
feature = pd.read_csv('feature_table.txt', sep='\t').T
feature = feature[1:][:-1]

feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3101,3102,3103,3104,3105,3106,3107,3108,3109,3110
1629.SubjectIBD335,34292.0,20670.0,18413.0,9981.0,7071.0,6881.0,5411.0,5335.0,5289.0,4741.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD643,15243.0,64328.0,0.0,0.0,0.0,4.0,4507.0,3216.0,15630.0,199.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD539,22182.0,21589.0,0.0,1365.0,0.0,11501.0,33619.0,3638.0,5053.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD078,0.0,805.0,0.0,0.0,0.0,4.0,330.0,2305.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD671,0.0,19734.0,0.0,0.0,0.0,0.0,215.0,0.0,0.0,699.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629.SubjectIBD421,5154.0,12101.0,1572.0,62.0,190.0,1448.0,6316.0,2205.0,1885.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD202,14565.0,24920.0,3543.0,0.0,0.0,0.0,63.0,0.0,37768.0,48660.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629.SubjectIBD544,32.0,52.0,31.0,0.0,0.0,2.0,18543.0,0.0,3.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,946.0,31.0
1629.SubjectIBD422,5718.0,18420.0,9534.0,0.0,0.0,0.0,4791.0,2770.0,0.0,588.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
#Gesunde Kontrollgruppe
HC = df[df.ibd_subtype.eq("HC")]

#CCD
CCD = df[df.ibd_subtype.eq("CCD")]
    
#ICD-r
ICD_r = df[df.ibd_subtype.eq("ICD_r")]

#ICD-nr
ICD_nr = df[df.ibd_subtype.eq("ICD_nr")]
    
#UCD
UC = df[df.ibd_subtype.eq("UC")]


In [52]:
#Split nach Patient. Alle Zeitpunkte eines Patienten entweder in Train oder Test.
def split_function(tSize, random_state, table, metadata, hc_group):
    patientSamples = {}
    liste = []

    for row in metadata.index:
        liste.append(metadata['patientnumber'][row])

    menge = set(liste)  

    for e in menge:
        newPatient = metadata[metadata.patientnumber.eq(e)]
        patientSamples[e] = list(newPatient['sample_name'])

    shuffleListe = list(menge)
    random.Random(random_state).shuffle(shuffleListe)

    #Split
    trainSize = int(np.round(tSize * len(shuffleListe), 0))
    testSize = len(shuffleListe) - trainSize

    X_trainEntry = shuffleListe[0:trainSize]
    X_testEntry = shuffleListe[trainSize:len(shuffleListe)]

    X_train = []
    y_train = []
    for i in X_trainEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_train.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_train.append(1)
                        else:
                            y_train.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))


    X_test = []   
    y_test = []
    for i in X_testEntry:
        try:
            value = patientSamples[i]
            for sample_name in value:
                for row in table.index:
                    if(row == sample_name):
                        temp = table.loc[[row]].values[0]
                        X_test.append(temp)
                        if any(True for val in hc_group['sample_name'] if val == row):
                            y_test.append(1)
                        else:
                            y_test.append(0)
        except KeyError as e:
            fehler += 1
            print('I got a KeyError - reason "%s"' % str(e))

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    return X_train, X_test, y_train, y_test

In [53]:
def make_split(i):
    #Split Test und Trainingsdaten für jede Gruppe
    X_trainHC, X_testHC, y_trainHC, y_testHC = split_function(tSize=0.5, random_state=i, table=feature, metadata=HC, hc_group=HC)
    X_trainCCD, X_testCCD, y_trainCCD, y_testCCD = split_function(tSize=0.75, random_state=i, table=feature, metadata=CCD, hc_group=HC)
    X_trainICD_r, X_testICD_r, y_trainICD_r, y_testICD_r = split_function(tSize=0.75, random_state=i, table=feature, metadata=ICD_r, hc_group=HC)
    X_trainICD_nr, X_testICD_nr, y_trainICD_nr, y_testICD_nr = split_function(tSize=0.75, random_state=i, table=feature, metadata=ICD_nr, hc_group=HC)
    X_trainUC, X_testUC, y_trainUC, y_testUC = split_function(tSize=0.75, random_state=i, table=feature, metadata=UC, hc_group=HC)

    X_train = np.concatenate((X_trainHC,  X_trainCCD,  X_trainICD_r,  X_trainICD_nr, X_trainUC), axis=0)
    X_test = np.concatenate((X_testHC,  X_testCCD,  X_testICD_r,  X_testICD_nr, X_testUC), axis=0)
    y_train = np.concatenate((y_trainHC,  y_trainCCD,  y_trainICD_r,  y_trainICD_nr, y_trainUC), axis=0)
    y_test = np.concatenate((y_testHC,  y_testCCD,  y_testICD_r,  y_testICD_nr, y_testUC), axis=0)

    """
    trainSize = len(X_train)/(len(X_train)+len(X_test))
    testSize =len(X_test)/(len(X_train)+len(X_test))

    HCTrainSize = len(X_trainHC)/(len(X_trainHC)+len(X_testHC))
    HCTestSize = len(X_testHC)/(len(X_trainHC)+len(X_testHC))

    CCDTrainSize = len(X_trainCCD)/(len(X_trainCCD)+len(X_testCCD))
    CCDTestSize = len(X_testCCD)/(len(X_trainCCD)+len(X_testCCD))

    ICD_rTrainSize = len(X_trainICD_r)/(len(X_trainICD_r)+len(X_testICD_r))
    ICD_rTestSize = len(X_testICD_r)/(len(X_trainICD_r)+len(X_testICD_r))

    ICD_nrTrainSize = len(X_trainICD_nr)/(len(X_trainICD_nr)+len(X_testICD_nr))
    ICD_nrTestSize = len(X_testICD_nr)/(len(X_trainICD_nr)+len(X_testICD_nr))

    UCTrainSize = len(X_trainUC)/(len(X_trainUC)+len(X_testUC))
    UCTestSize = len(X_testUC)/(len(X_trainUC)+len(X_testUC))
    
    print(f"{np.round(trainSize, 2)*100}/{np.round(testSize, 2)*100}")
    """
    
    """
    pca = MDS(n_components=3, random_state=0)
    
    pcas = [
        {'name': 'HC', 'model': pca.fit_transform(X_trainHC), 'x': [], 'y': [], 'z': [], 'color': 'red'},
        {'name': 'CCD', 'model': pca.fit_transform(X_trainCCD), 'x': [], 'y': [], 'z': [], 'color': 'blue'},
        {'name': 'ICD_r', 'model': pca.fit_transform(X_trainICD_r), 'x': [], 'y': [], 'z': [], 'color': 'green'},
        {'name': 'ICD_nr', 'model': pca.fit_transform(X_trainICD_nr), 'x': [], 'y': [], 'z': [], 'color': 'cyan'},
        {'name': 'UC', 'model': pca.fit_transform(X_trainUC), 'x': [], 'y': [], 'z': [], 'color': 'orange'}
    ]
    
    fig, ax = plt.subplots(subplot_kw={'projection': '3d'})
    
    for model in pcas:
        for vector in model['model']:
            model['x'].append(vector[0])
            model['y'].append(vector[1])
            model['z'].append(vector[2])
        
        ax.scatter(xs=model['x'], ys=model['y'], zs=model['z'], color = model['color'], label=model['name'])

    plt.legend()
    plt.show()
    """
    
    return X_train, X_test, y_train, y_test

In [54]:
classifiers = [
    {'name': 'Linear SVM', 'model': svm.SVC(kernel="linear", C=0.025, random_state=0), 'f1_best': 0, 'f1_avg': 0},
    {'name': 'Decision Tree', 'model': DecisionTreeClassifier(max_features='auto', max_depth=24, random_state=0), 'f1_best': 0, 'f1_avg': 0},
    {'name': 'MLP', 'model': MLPClassifier(alpha=5, max_iter=1000, random_state=0), 'f1_best': 0, 'f1_avg': 0},
    {'name': 'AdaBoost', 'model': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_features='auto', max_depth=3, random_state=0), n_estimators=50, random_state=0), 'f1_best': 0, 'f1_avg': 0},
    {'name': 'Gradient Boosting', 'model': GradientBoostingClassifier(n_estimators=200, random_state=0), 'f1_best': 0, 'f1_avg': 0},
    {'name': 'Logistic Regression', 'model': linear_model.LogisticRegression(solver='lbfgs', C=1, class_weight={0: 1, 1: 13}, max_iter=2000, random_state=0), 'f1_best': 0, 'f1_avg': 0},
    {'name': 'Ridge Regression', 'model': linear_model.RidgeClassifier(solver='auto', alpha=1, class_weight={0: 1, 1: 13}, max_iter=1300, random_state=0), 'f1_best': 0, 'f1_avg': 0},
]

n = 10
for i in range(0, n):
    start_time = time.time()
    print(f"Now: Iteration {i+1}/{n}")
    
    X_train, X_test, y_train, y_test = make_split(i)
    
    for model in classifiers:
        start_time_model = time.time()

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
    
        classifier = model['model']
        
        classifier.fit(X_train, y_train)

        #Test
        y_pred = classifier.predict(X_test)
        predict_train = classifier.predict(X_train)
        f1 = f1_score(y_test, y_pred, average='macro')
        
        if f1 > model['f1_best']:
            model['f1_best'] = f1
            
        model['f1_avg'] += f1
        
        end_time_model = time.time()
        time_difference_model = end_time_model - start_time_model
        print(f"Ran {model['name']} in {np.round(time_difference_model * 1000, 2)} milliseconds.")
        
        
    end_time = time.time()
    time_difference = end_time - start_time
    remaining_time = time_difference * (n-i+1)
    
    rest = datetime.timedelta(seconds=remaining_time)
    
    clear_output()
    print(f"The last iteration ({i+1}) took {round(time_difference, 2)} seconds.")
    print(f"Estimated total runtime remaining: {rest}")
        
    if i == n-1:
        print("\n")
        for model in classifiers:
            print(f"{model['name']}: \nBest: {model['f1_best']}\nAverage: {model['f1_avg']/n}\n")

The last iteration (10) took 4.44 seconds.
Estimated total runtime remaining: 0:00:08.874244


Stack: 
Best: 0.812638514525307
Average: 0.7311751620991273

