In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 1. Loading of Dataframes

In [29]:
bin_size = 5000000
matrix_path = f"/labmed/workspace/lotta/finaletoolkit/dataframes_notebook/final_feature_matrix_gc_corrected_{bin_size}.tsv"
df = pd.read_csv(matrix_path, sep="\t")

clinical_path = "/labmed/workspace/lotta/finaletoolkit/dataframes_notebook/filtered_clinical_characteristics.csv"
clinical_df = pd.read_csv(clinical_path)

valid_samples = clinical_df["Extracted_ID"].unique()
df = df[df["sample"].isin(valid_samples)].copy()

print(f"Number of Samples in Matrix: {df['sample'].nunique()}")
print(f"Number of Bins per Sample: {len(df) / df['sample'].nunique()}")

Number of Samples in Matrix: 230
Number of Bins per Sample: 411.0


# 2. Feature Selektion for LASSO and Pivoting

In [None]:
df["bin_id"] = df["chrom"] + "_" + df["start"].astype(str)

metrics = [
    "mean_gc_corrected", 
   # "median_gc_corrected", 
#"stdev_gc_corrected", 
   # "wps_value_gc_corrected",
   # "min_gc_corrected",
   # "max_gc_corrected",	
]

pivot_df = df.pivot(index="sample", columns="bin_id", values=metrics)

pivot_df.columns = [f"{metric}_{bin_id}" for metric, bin_id in pivot_df.columns]

# Fehlende Werte werden nun später in der Pipeline aufgefüllt, um Data Leakage zu vermeiden
# pivot_df = pivot_df.fillna(pivot_df.median())

print(f"Shape of Feature Matrix: {pivot_df.shape} (Patienten x Features)")
pivot_df.to_csv("/labmed/workspace/lotta/finaletoolkit/dataframes_notebook/pivot_df.csv", index=True)
pivot_df.head()

Shape of Feature Matrix: (230, 411) (Patienten x Features)


Unnamed: 0_level_0,mean_gc_corrected_chr10_0,mean_gc_corrected_chr10_10000000,mean_gc_corrected_chr10_100000000,mean_gc_corrected_chr10_110000000,mean_gc_corrected_chr10_115000000,mean_gc_corrected_chr10_120000000,mean_gc_corrected_chr10_125000000,mean_gc_corrected_chr10_130000000,mean_gc_corrected_chr10_25000000,mean_gc_corrected_chr10_30000000,...,mean_gc_corrected_chr9_135000000,mean_gc_corrected_chr9_15000000,mean_gc_corrected_chr9_20000000,mean_gc_corrected_chr9_25000000,mean_gc_corrected_chr9_35000000,mean_gc_corrected_chr9_5000000,mean_gc_corrected_chr9_80000000,mean_gc_corrected_chr9_85000000,mean_gc_corrected_chr9_90000000,mean_gc_corrected_chr9_95000000
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EE86234,131.424227,131.516527,129.680803,128.00397,132.785399,129.324524,129.525927,127.484744,131.355433,127.63972,...,129.855102,130.306197,132.024244,130.877137,129.181096,129.499363,132.209633,146.444961,133.279417,128.960446
EE86255,120.464885,122.79677,119.14416,118.941439,120.33656,124.809618,124.803791,124.059804,120.284725,116.392398,...,123.625131,120.460709,116.287109,124.747981,121.486045,118.726657,121.258566,113.912497,121.554256,119.906925
EE86259,136.350794,138.581798,132.074107,140.143615,139.555218,134.932124,135.10139,129.614872,148.390247,141.852126,...,125.472325,102.927377,50.867228,38.02347,105.820532,157.564962,58.564698,84.555967,67.624996,95.397221
EE86268,128.114336,126.979881,127.143262,126.007503,125.331331,129.860459,124.383769,129.172492,125.076143,123.730099,...,127.461257,129.471194,127.877158,125.233097,126.115463,121.075683,122.777226,131.353678,127.850372,124.523536
EE86270,170.684069,170.173054,170.233669,171.348012,171.234269,172.665171,172.416406,170.547694,171.677562,167.909379,...,171.380198,170.803561,171.184159,172.557039,171.140638,171.206502,172.011567,169.87198,172.852743,170.083507


# 3. Stratification

In [31]:
# 3. Labels (y) und Stratifizierung vorbereiten

y = []
strata = []

for sample_id in pivot_df.index:
    row = clinical_df[clinical_df["Extracted_ID"] == sample_id].iloc[0]
    
    is_healthy = row["Patient Type"].lower() == "healthy"
    y.append(0 if is_healthy else 1)
    strata.append(row["Gender"])

y = np.array(y)
X = pivot_df

print(f"Number Cancer: {sum(y)}")
print(f"Number Healthy: {len(y) - sum(y)}")

Number Cancer: 116
Number Healthy: 114


# 4. Split: Train and Test


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 5. LASSO Training mit Cross-Validation und Pipeline


In [33]:
# Pipeline: Imputation -> Scaling -> LassoCV
for C in [2, 1, 10, 100]:
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('lasso', LogisticRegression(
            C=C,
            penalty='l1',
            solver='liblinear',
            max_iter=10000,
            random_state=42
        ))
    ])

pipeline.fit(X_train, y_train)

#best_c = pipeline.named_steps['lasso'].C_[0]
#print(f"Best possible C-Value: {best_c}")

y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.6739130434782609
ROC AUC Score: 0.6748582230623819

Confusion Matrix:
 [[18  5]
 [10 13]]

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.78      0.71        23
           1       0.72      0.57      0.63        23

    accuracy                           0.67        46
   macro avg       0.68      0.67      0.67        46
weighted avg       0.68      0.67      0.67        46



### 5.1 Visualisierung des Parameter-Tunings

Das `LogisticRegressionCV` Modell hat automatisch verschiedene Werte für den Parameter `C` ausprobiert. 
Hier visualisieren wir, wie sich die Genauigkeit des Modells mit `C` verändert.

- **Kleines C**: Starke Regularisierung (Modell wird "gezwungen", einfache Lösungen zu finden). Gefahr von Underfitting.
- **Großes C**: Schwache Regularisierung (Modell darf komplexer sein). Gefahr von Overfitting.
- **Bestes C**: Der Wert, der in der Cross-Validation (CV) die beste Balance und damit den höchsten Score erreicht hat.

In [None]:
'''
lasso = pipeline.named_steps['lasso']

# Scores extrahieren. Form: (n_folds, n_Cs)
# Wir nehmen den Durchschnitt über die Folds für jeden C-Wert
# scores_ ist ein Dict mit Klassen-Labels als Keys. Bei binärer Klassifikation (0, 1) nehmen wir 1.
mean_scores = np.mean(lasso.scores_[1], axis=0) 
std_scores = np.std(lasso.scores_[1], axis=0)
cs = lasso.Cs_

plt.figure(figsize=(10, 6))
plt.semilogx(cs, mean_scores, marker='o', label='Mean CV Score')
plt.fill_between(cs, mean_scores - std_scores, mean_scores + std_scores, alpha=0.2, color='gray', label='Std. Dev.')

# Bestes C markieren
plt.axvline(lasso.C_[0], linestyle='--', color='r', label=f'Best C: {lasso.C_[0]:.2e}')

plt.title("Lasso Parameter Tuning: Accuracy vs. Regularization Strength (C)")
plt.xlabel("C (Inverse Regularization Strength)")
plt.ylabel("Cross-Validation Accuracy")
plt.legend()
plt.grid(True, which="both", ls="-", alpha=0.5)
plt.show()'''

AttributeError: 'LogisticRegression' object has no attribute 'scores_'

# 6. Selected Features

In [35]:
# Zugriff auf das Lasso-Modell in der Pipeline
lasso_model = pipeline.named_steps['lasso']

coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": lasso_model.coef_[0]
})

# Filtere Features, die NICHT 0 sind
important_features = coef_df[coef_df["Coefficient"] != 0].sort_values(by="Coefficient", ascending=False)

print(f"Number of Important Features: {len(important_features)}")
print("\nTop Features (Positive = Indikative for Cancer, Negative = Indikative for Healthy):")
important_features.head(20)

Number of Important Features: 196

Top Features (Positive = Indikative for Cancer, Negative = Indikative for Healthy):


Unnamed: 0,Feature,Coefficient
53,mean_gc_corrected_chr12_55000000,5.476272
354,mean_gc_corrected_chr7_110000000,5.403191
51,mean_gc_corrected_chr12_5000000,4.580455
184,mean_gc_corrected_chr20_10000000,4.367042
94,mean_gc_corrected_chr15_100000000,4.298604
369,mean_gc_corrected_chr7_70000000,4.036749
277,mean_gc_corrected_chr4_115000000,4.001259
342,mean_gc_corrected_chr6_45000000,3.967287
336,mean_gc_corrected_chr6_170000000,3.835751
73,mean_gc_corrected_chr13_65000000,3.789944
