In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the two input tables: the per-bin feature matrix and the clinical table.
# Both files live in the same directory, so it is named once here.
DATA_DIR = "/labmed/workspace/lotta/finaletoolkit/dataframes_notebook"

# bin_size is interpolated into the feature-matrix file name
# (presumably the genomic bin width in base pairs -- TODO confirm).
bin_size = 5000000
matrix_path = f"{DATA_DIR}/final_feature_matrix_gc_corrected_{bin_size}.tsv"
df = pd.read_csv(matrix_path, sep="\t")

clinical_path = f"{DATA_DIR}/filtered_clinical_characteristics.csv"
clinical_df = pd.read_csv(clinical_path)

In [None]:
# Keep only the feature-matrix rows whose sample ID also appears in the
# clinical table. Both sides are compared as strings.
valid_samples = clinical_df['Extracted_ID'].astype(str).unique()
has_clinical_record = df["sample"].astype(str).isin(valid_samples)
df = df.loc[has_clinical_record].copy()
print("Samples in long matrix (after filtering):", df["sample"].nunique())

In [None]:
# Per-bin metric columns that are turned into model features further down.
metrics = [
    'mean_gc_corrected',
    'median_gc_corrected',
    'stdev_gc_corrected',
    'min_gc_corrected',
    'max_gc_corrected',
    'wps_value_gc_corrected',
]

# Stop early if the loaded matrix lacks any expected metric column.
missing_cols = [col for col in metrics if col not in df.columns]
if missing_cols:
    # One formatted message: passing two positional arguments to SystemExit
    # would be reported as a tuple instead of a readable sentence.
    raise SystemExit(f"Missing columns: {missing_cols}")

In [None]:
# Reshape the long matrix (one row per sample and bin) into a wide matrix with
# one row per sample and one column per "<metric>_<chrom>_<start>" feature.
# df_long is derived from df with assign(), so df itself is left unmodified.
df_long = df.assign(bin_id=df["chrom"].astype(str) + "_" + df["start"].astype(str))
melted = df_long.melt(id_vars=["sample","bin_id"], value_vars=metrics, var_name="metric", value_name="value")
melted["feature"] = melted["metric"] + "_" + melted["bin_id"]
# pivot() raises if a (sample, feature) pair occurs more than once.
pivot_df = melted.pivot(index="sample", columns="feature", values="value")

In [None]:
# Cast the clinical sample IDs to strings (stored back into clinical_df) and
# build a copy of the table indexed by that ID.
id_column = 'Extracted_ID'
clinical_df[id_column] = clinical_df[id_column].astype(str)
sample_id = clinical_df.set_index(id_column)

def label_from_row(row):
    """Return the binary class label for one clinical record.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get`` (e.g. a ``pandas.Series`` or a ``dict``)
        that carries a "Patient Type" entry.

    Returns
    -------
    int
        0 if the patient type contains "healthy" (case-insensitive), else 1.
    """
    # Lower-case before matching so that "Healthy" and "healthy" are treated alike.
    # A missing entry stringifies to "None"/"nan" and therefore maps to 1.
    patient_type = str(row.get("Patient Type")).lower()
    return 0 if 'healthy' in patient_type else 1

# Build the label vector in the row order of pivot_df, aborting on the first
# sample that has no entry in the clinical table.
sample_ids = pivot_df.index.astype(str).tolist()
labels = []
for sid in sample_ids:
    if sid in sample_id.index:
        labels.append(label_from_row(sample_id.loc[sid]))
    else:
        raise SystemExit(f"Sample {sid} nicht in clinical_df")

y = np.asarray(labels)
# Class counts, index 0 first and index 1 second.
print("Labelverteilung:", np.bincount(y))

In [None]:
# Stratified hold-out split of the wide feature matrix; no test_size is passed,
# so train_test_split uses its default proportions.
X = pivot_df.copy()
# The label vector is the lower-case `y` built above (an upper-case `Y` does not exist).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print("Train-Test Split:", X_train.shape, X_test.shape)

In [None]:
# Seed shared by every stochastic estimator defined in this cell.
RANDOM_STATE = 42

# One median imputer object is reused as the first step of all four pipelines.
imputer = SimpleImputer(strategy='median')

# Lasso: L1-penalised logistic regression; the regularisation strength is
# chosen by an inner 5-fold CV scored on ROC-AUC.
lasso_pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', StandardScaler()),
    ('lasso', LogisticRegressionCV(
        Cs=10,
        penalty="l1",
        solver="liblinear",
        cv=5,
        scoring="roc_auc",
        max_iter=5000,
        random_state=RANDOM_STATE
        ))
])

# SVM: RBF kernel; probability=True so the model exposes predict_proba.
svm_pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', StandardScaler()),
    ('svm', SVC(
        kernel="rbf",
        C=1.0,  # SVC's regularisation parameter is `C`; `Cs` is not accepted
        gamma="scale",
        probability=True,
        random_state=RANDOM_STATE
        ))
])

# Extra Trees: no scaler step in this pipeline.
et_pipeline = Pipeline([
    ('imputer', imputer),
    ('et', ExtraTreesClassifier(
        n_estimators=500,
        max_features="sqrt",
        n_jobs=-1,
        random_state=RANDOM_STATE
        ))
])

# XGBoost: no scaler step in this pipeline.
xgb_pipeline = Pipeline([
    ('imputer', imputer),
    ('xgb', XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.6,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=4
    ))
])

# Name -> pipeline registry.
models = {
    "lasso": lasso_pipeline,
    "svm": svm_pipeline,
    "et": et_pipeline,
    "xgb": xgb_pipeline
}


In [None]:
# Cross validation comparison
# Shuffled, stratified 5-fold CV; seed 42 matches the train/test split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model summary, keyed by model name (must be a dict, not a list).
results = {}
for name, model in models.items():
    # One cross_validate call yields both metrics from the same fitted folds,
    # instead of refitting every model once per metric.
    fold_scores = cross_validate(
        model, X, y, cv=cv,
        scoring={"auc": "roc_auc", "acc": "accuracy"},
        n_jobs=-1,
    )
    aucs = fold_scores["test_auc"]
    accuracy = fold_scores["test_acc"]
    results[name] = {"auc_mean": np.mean(aucs), "auc_std": np.std(aucs), "acc_mean": np.mean(accuracy), "acc_std": np.std(accuracy)}
    # Name the model so the two score lines below can be attributed.
    print(f"{name}:")
    print(f"  ROC-AUC: {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")
    print(f"  Accuracy: {np.mean(accuracy):.3f} ± {np.std(accuracy):.3f}")

In [None]:
# Ensemble Learning
# Soft-voting ensemble over all four pipelines registered in `models`.
voting = VotingClassifier(
    estimators=list(models.items()),
    voting="soft",
    n_jobs=-1
)
# Fit on the training split only; X_test / y_test are not touched in this cell.
voting.fit(X_train, y_train)

print("\nEvaluating Ensemble (soft voting) ...")
# NOTE(review): this cross-validation runs on the full X, y, i.e. including the
# rows held out in X_test -- confirm that this is intended.
# A single cross_validate call scores both metrics on the same fitted folds
# rather than refitting the whole ensemble once per metric.
ens_scores = cross_validate(
    voting, X, y, cv=cv,
    scoring={"auc": "roc_auc", "acc": "accuracy"},
    n_jobs=1,
)
ens_auc = ens_scores["test_auc"]
ens_accuracy = ens_scores["test_acc"]
print(f"  Ensemble ROC-AUC: {ens_auc.mean():.3f} ± {ens_auc.std():.3f}")
print(f"  Ensemble Accuracy: {ens_accuracy.mean():.3f} ± {ens_accuracy.std():.3f}")