In [1]:
import pandas as pd

rna_path = r"C:\Users\glori\Desktop\Lung-Cancer\Data\multiomics_rna_logCPM_TMM.tsv"
label_path = r"C:\Users\glori\Desktop\Lung-Cancer\Data\multiomics_labels.tsv"
meth_path   = "multiomics_meth_beta_values.tsv"


rna_df   = pd.read_csv(rna_path, sep="\t")
meth_df  = pd.read_csv(meth_path, sep="\t")
labels   = pd.read_csv(label_path, sep="\t")


In [2]:
print("RNA shape:", rna_df.shape)
print("DNA shape:", meth_df.shape)
print("Labels shape:", labels.shape)

rna_df.head()
meth_df.head()
labels.head()

RNA shape: (831, 17811)
DNA shape: (831, 10001)
Labels shape: (831, 3)


Unnamed: 0,patient_id,subtype,subtype_simple
0,TCGA-MP-A4SV,TCGA-LUAD,LUAD
1,TCGA-55-8621,TCGA-LUAD,LUAD
2,TCGA-MN-A4N1,TCGA-LUAD,LUAD
3,TCGA-55-6986,TCGA-LUAD,LUAD
4,TCGA-86-6851,TCGA-LUAD,LUAD


In [3]:
print(rna_df.columns[:10])
print(meth_df.columns[:10])
print(labels.columns)

Index(['patient_id', 'ENSG00000000003.15', 'ENSG00000000419.13',
       'ENSG00000000457.14', 'ENSG00000000460.17', 'ENSG00000000938.13',
       'ENSG00000000971.16', 'ENSG00000001036.14', 'ENSG00000001084.13',
       'ENSG00000001167.14'],
      dtype='object')
Index(['patient_id', 'cg26841862', 'cg23057992', 'cg23091104', 'cg16602369',
       'cg21164095', 'cg04571941', 'cg20545544', 'cg18703601', 'cg18121066'],
      dtype='object')
Index(['patient_id', 'subtype', 'subtype_simple'], dtype='object')


In [4]:
# Keep only the columns we need from labels
labels_small = labels[["patient_id", "subtype_simple"]].copy()
labels_small = labels_small.rename(columns={"subtype_simple": "label"})

# Merge RNA + DNA first
fused = (
    rna_df
    .merge(meth_df, on="patient_id", how="inner")
    .merge(labels_small, on="patient_id", how="inner")
)

print("Fused shape (RNA + DNA + label):", fused.shape)
fused.head(3)


Fused shape (RNA + DNA + label): (831, 27812)


Unnamed: 0,patient_id,ENSG00000000003.15,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,cg25874079,cg16400631,cg14729148,cg02096296,cg11741189,cg26429499,cg09157320,cg17470497,cg26002259,label
0,TCGA-MP-A4SV,5.394902,4.952867,4.235412,3.557212,4.204274,5.350455,5.315406,3.271355,5.631599,...,0.055724,0.557203,0.430219,0.587486,0.310899,0.85814,0.644167,0.429284,0.130573,LUAD
1,TCGA-55-8621,5.106508,4.469044,3.573832,2.163817,6.016368,6.293136,5.263294,4.924332,4.504391,...,0.497577,0.265244,0.549588,0.870886,0.259644,0.933164,0.776085,0.80132,0.151321,LUAD
2,TCGA-MN-A4N1,6.038168,4.933833,3.890689,3.051575,3.176808,5.026605,5.884609,5.471267,4.950593,...,0.042874,0.742028,0.791193,0.836851,0.70066,0.950542,0.717134,0.818287,0.657435,LUAD


In [5]:
# Drop patient_id and label from the feature matrix
X = fused.drop(columns=["patient_id", "label"])
y = fused["label"]

print("Feature matrix X:", X.shape)
print("Label distribution:\n", y.value_counts())


Feature matrix X: (831, 27810)
Label distribution:
 label
LUAD    459
LUSC    372
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

X_train_scaled.shape, X_test_scaled.shape


((664, 27810), (167, 27810))

In [7]:
from sklearn.linear_model import LogisticRegression

fusion_clf = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1
)

fusion_clf.fit(X_train_scaled, y_train)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [8]:
import numpy as np

print("Total NaNs in X:", np.isnan(X).sum())
# Optional: percentage
print("NaN percentage:", np.isnan(X).sum() / X.size)


Total NaNs in X: ENSG00000000003.15      0
ENSG00000000419.13      0
ENSG00000000457.14      0
ENSG00000000460.17      0
ENSG00000000938.13      0
                     ... 
cg11741189             24
cg26429499              2
cg09157320            275
cg17470497              3
cg26002259              0
Length: 27810, dtype: int64
NaN percentage: ENSG00000000003.15    0.000000e+00
ENSG00000000419.13    0.000000e+00
ENSG00000000457.14    0.000000e+00
ENSG00000000460.17    0.000000e+00
ENSG00000000938.13    0.000000e+00
                          ...     
cg11741189            1.038507e-06
cg26429499            8.654221e-08
cg09157320            1.189955e-05
cg17470497            1.298133e-07
cg26002259            0.000000e+00
Length: 27810, dtype: float64


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Split (same as before)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Pipeline: Impute -> Scale -> Logistic Regression
fusion_clf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),   # handles NaNs
    ("scaler", StandardScaler()),                   # standardization
    ("clf", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

fusion_clf.fit(X_train, y_train)


0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [10]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

# Predict
y_pred = fusion_clf.predict(X_test)
y_prob = fusion_clf.predict_proba(X_test)

print("=== Early Fusion: RNA + DNA (with imputation) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Binary LUAD vs LUSC ROC–AUC
auc = roc_auc_score(y_test, y_prob[:, 1])
print("ROC–AUC:", auc)


=== Early Fusion: RNA + DNA (with imputation) ===
Accuracy: 0.9760479041916168

Confusion matrix:
 [[89  3]
 [ 1 74]]

Classification report:
               precision    recall  f1-score   support

        LUAD       0.99      0.97      0.98        92
        LUSC       0.96      0.99      0.97        75

    accuracy                           0.98       167
   macro avg       0.97      0.98      0.98       167
weighted avg       0.98      0.98      0.98       167

ROC–AUC: 0.997536231884058


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

def run_model(clf, name):
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", clf)
    ])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]  # prob for class index 1

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"{name}:  Accuracy = {acc:.3f},  ROC-AUC = {auc:.3f}")
    return name, acc, auc


In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

models = {
    "LDA":  LinearDiscriminantAnalysis(),
    "LR":   LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=-1),
    "CART": DecisionTreeClassifier(random_state=42),
    "NB":   GaussianNB(),
    "SVM":  SVC(kernel="rbf", probability=True, random_state=42),
    "KNN":  KNeighborsClassifier(n_neighbors=5)
}


In [None]:
results = []
for name, clf in models.items():
    results.append(run_model(clf, name))

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "ROC_AUC"])
results_df = results_df.sort_values("ROC_AUC", ascending=False)
print("\n=== Multi-omics (RNA + DNA) model comparison ===")
print(results_df)

results_df.to_csv("multiomics_six_model_comparison.csv", index=False)


LDA:  Accuracy = 0.982,  ROC-AUC = 0.997
LR:  Accuracy = 0.976,  ROC-AUC = 0.998
CART:  Accuracy = 0.904,  ROC-AUC = 0.908
NB:  Accuracy = 0.910,  ROC-AUC = 0.908
SVM:  Accuracy = 0.970,  ROC-AUC = 0.997
KNN:  Accuracy = 0.970,  ROC-AUC = 0.993

=== Multi-omics (RNA + DNA) model comparison ===
  Model  Accuracy   ROC_AUC
1    LR  0.976048  0.997536
0   LDA  0.982036  0.997246
4   SVM  0.970060  0.996667
5   KNN  0.970060  0.993188
2  CART  0.904192  0.908116
3    NB  0.910180  0.908043


# **LATE FUSION**

In [20]:
# RNA-only features (no patient_id)
X_rna = rna_df.drop(columns=["patient_id"])

# DNA-only features (no patient_id)
X_dna = meth_df.drop(columns=["patient_id"])

# Labels (LUAD / LUSC from subtype_simple)
y = labels["subtype_simple"]


In [21]:
import numpy as np
from sklearn.model_selection import train_test_split

indices = np.arange(len(y))

train_idx, test_idx = train_test_split(
    indices, test_size=0.2, stratify=y, random_state=42
)

Xr_tr, Xr_te = X_rna.iloc[train_idx], X_rna.iloc[test_idx]
Xd_tr, Xd_te = X_dna.iloc[train_idx], X_dna.iloc[test_idx]
y_tr,  y_te  = y.iloc[train_idx],    y.iloc[test_idx]


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# RNA: Logistic Regression
rna_clf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

# DNA: SVM with probability outputs
dna_clf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(
        kernel="rbf",
        probability=True,      # gives predict_proba
        random_state=42
    ))
])

rna_clf.fit(Xr_tr, y_tr)
dna_clf.fit(Xd_tr, y_tr)


0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [23]:
import numpy as np

# After rna_clf.fit(...)
class_labels = rna_clf.classes_       # e.g. array(['LUAD', 'LUSC'], dtype='<U4')
pos_label = "LUSC"                    # choose which one is "positive" for AUC
pos_idx = np.where(class_labels == pos_label)[0][0]


In [24]:
Pr = rna_clf.predict_proba(Xr_te)   # RNA posteriors
Pd = dna_clf.predict_proba(Xd_te)   # DNA posteriors

In [25]:
# 1) Fusion in probability space (unchanged)
P_sum  = Pr + Pd
P_max  = np.maximum(Pr, Pd)
P_prod = Pr * Pd

# 2) Convert argmax indices → string labels
y_sum  = class_labels[P_sum.argmax(axis=1)]
y_max  = class_labels[P_max.argmax(axis=1)]
y_prod = class_labels[P_prod.argmax(axis=1)]


In [26]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

def eval_fusion(name, y_true, P, y_pred, class_labels, pos_label):
    # accuracy & confusion matrix use string labels directly
    acc = accuracy_score(y_true, y_pred)
    cm  = confusion_matrix(y_true, y_pred, labels=class_labels)

    # For ROC–AUC we need a binary 0/1 vector for the positive class
    y_true_bin = (y_true == pos_label).astype(int)
    pos_idx = np.where(class_labels == pos_label)[0][0]
    auc = roc_auc_score(y_true_bin, P[:, pos_idx])

    print(f"\n{name} fusion")
    print("Accuracy:", acc)
    print("ROC–AUC:", auc)
    print("Confusion matrix (rows=true, cols=pred):\n", cm)
    return acc, auc


In [27]:
res_sum  = eval_fusion("SUM",     y_te, P_sum,  y_sum,  class_labels, pos_label)
res_max  = eval_fusion("MAX",     y_te, P_max,  y_max,  class_labels, pos_label)
res_prod = eval_fusion("PRODUCT", y_te, P_prod, y_prod, class_labels, pos_label)



SUM fusion
Accuracy: 0.9640718562874252
ROC–AUC: 0.9976811594202899
Confusion matrix (rows=true, cols=pred):
 [[87  5]
 [ 1 74]]

MAX fusion
Accuracy: 0.9640718562874252
ROC–AUC: 0.9957971014492754
Confusion matrix (rows=true, cols=pred):
 [[87  5]
 [ 1 74]]

PRODUCT fusion
Accuracy: 0.9640718562874252
ROC–AUC: 0.9981159420289856
Confusion matrix (rows=true, cols=pred):
 [[87  5]
 [ 1 74]]
