# PDAC Multi-omics Integration & Prediction Pipeline

**Contents:**

1. Load CSVs (circRNA, miRNA, mRNA, phosphoproteome gene, proteome gene, SCNA gene, clinical data)
2. Preprocess (restrict to samples shared by all datasets, filter sparse features, z-score each feature)

In [70]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score
from sklearn.metrics import pairwise_distances
print('ready')


ready


In [75]:
# Load files
# Directory holding the input CSVs, relative to the notebook's working directory
base_path = '../PDAC_data'
# List the directory so a missing or misnamed file is easy to spot
files = os.listdir(base_path)
# Build the label from base_path so it cannot drift from the directory actually listed
print(f'files in {base_path}:', files)

def try_read(path):
    """Load a CSV into a DataFrame, using its first column as the row index.

    Best-effort: if reading fails for any reason, the error is printed and
    None is returned instead of raising.
    """
    frame = None
    try:
        frame = pd.read_csv(path, index_col=0)
    except Exception as err:
        # Report the failure but keep going; callers must handle None
        print('Could not read', path, '->', err)
    return frame

# Read every dataset. try_read prints the error and returns None when a file
# cannot be read, so any of these names may be None afterwards.
circ = try_read(os.path.join(base_path, 'circRNA.csv'))
mir = try_read(os.path.join(base_path, 'miRNA.csv'))
mrna = try_read(os.path.join(base_path, 'mRNA.csv'))
phospho = try_read(os.path.join(base_path, 'phosphoproteome_gene.csv'))
proteome = try_read(os.path.join(base_path, 'proteome_gene.csv'))
scna = try_read(os.path.join(base_path, 'SCNA_gene.csv'))
clin = try_read(os.path.join(base_path, 'clinical_data.csv'))

# Report shapes. A dataset that failed to load is None, so guard the attribute
# access rather than failing with AttributeError on `.shape`.
for _label, _frame in [
    ('circ', circ),
    ('mir', mir),
    ('mrna', mrna),
    ('phospho', phospho),
    ('proteome', proteome),
    ('scna', scna),
    ('clinical data', clin),
]:
    print(f'{_label} shape', _frame.shape if _frame is not None else 'not loaded')


files in ../PDAC_data: ['circRNA.csv', 'clinical_data.csv', 'miRNA.csv', 'mRNA.csv', 'phosphoproteome_gene.csv', 'proteome_gene.csv', 'SCNA_gene.csv', 'unused data']
circ shape (3979, 137)
mir shape (2416, 137)
mrna shape (28057, 137)
phospho shape (8004, 137)
proteome shape (11662, 137)
scna shape (19906, 137)
clinical data shape (140, 24)


In [77]:
# Extract sample IDs from omics datasets (columns)
circ_ids = set(circ.columns.astype(str)) if circ is not None else set()
mir_ids = set(mir.columns.astype(str)) if mir is not None else set()
mrna_ids = set(mrna.columns.astype(str)) if mrna is not None else set()
phospho_ids = set(phospho.columns.astype(str)) if phospho is not None else set()
proteome_ids = set(proteome.columns.astype(str)) if proteome is not None else set()
scna_ids = set(scna.columns.astype(str)) if scna is not None else set()

# Extract sample IDs from clinical (row index)
clin_ids = set(clin.index.astype(str)) if clin is not None else set()

# Intersect across all datasets that exist. A dataset that failed to load is
# skipped here: intersecting with its empty placeholder set would otherwise
# force `common` to be empty.
_loaded_id_sets = [
    ids
    for frame, ids in [
        (circ, circ_ids),
        (mir, mir_ids),
        (mrna, mrna_ids),
        (phospho, phospho_ids),
        (proteome, proteome_ids),
        (scna, scna_ids),
        (clin, clin_ids),
    ]
    if frame is not None
]
common = set.intersection(*_loaded_id_sets) if _loaded_id_sets else set()

print("circ IDs:", len(circ_ids))
print("mir IDs:", len(mir_ids))
print("mrna IDs:", len(mrna_ids))
print("phospho IDs:", len(phospho_ids))
print("proteome IDs:", len(proteome_ids))
print("scna IDs:", len(scna_ids))
print("clinical IDs:", len(clin_ids))
print("common samples:", len(common))

circ IDs: 137
mir IDs: 137
mrna IDs: 137
phospho IDs: 137
proteome IDs: 137
scna IDs: 137
clinical IDs: 140
common samples: 137


In [78]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

def preprocess_omics(df, min_nonzero_frac=0.05, top_var=None):
    """Filter sparse features, optionally keep the most variable, then z-score.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix with one row per feature and one column per sample.
    min_nonzero_frac : float, default 0.05
        Minimum fraction of samples in which a feature must be both
        non-missing and non-zero for the feature to be kept.
    top_var : int or None, default None
        If given and more than ``top_var`` features remain after filtering,
        keep only the ``top_var`` features with the largest variance across
        samples.

    Returns
    -------
    pandas.DataFrame
        Standardized matrix with the retained features as rows and the same
        columns as ``df``. It has zero rows when no feature survives the
        filters.
    """
    # Fraction of samples in which each feature is present (not NaN) and non-zero.
    # No defensive copy of `df` is needed: every step below builds a new object.
    nz_frac = (df.notna() & (df != 0)).sum(axis=1) / df.shape[1]
    kept = df.loc[nz_frac >= min_nonzero_frac]

    # Select top variable features if requested
    if top_var is not None and kept.shape[0] > top_var:
        ranked = kept.var(axis=1).sort_values(ascending=False)
        kept = kept.loc[ranked.index[:top_var]]

    # Nothing left to scale: return an empty frame instead of handing
    # StandardScaler an input with zero features, which it rejects.
    if kept.shape[0] == 0:
        return pd.DataFrame(index=kept.index, columns=kept.columns, dtype=float)

    # Z-score each feature across samples. StandardScaler standardizes
    # columns, hence the transpose in and out.
    scaler = StandardScaler(with_mean=True, with_std=True)
    mat = scaler.fit_transform(kept.T).T

    return pd.DataFrame(mat, index=kept.index, columns=kept.columns)

# Subset datasets to common samples and preprocess.
# Sorting fixes one deterministic sample order shared by every matrix.
common_list = sorted(common)
if not common_list:
    # Fail here with a clear message instead of deep inside the scaler
    raise ValueError('no samples are shared by all datasets; nothing to preprocess')

circ_z = preprocess_omics(circ[common_list], min_nonzero_frac=0.05, top_var=None)
mir_z = preprocess_omics(mir[common_list], min_nonzero_frac=0.05, top_var=None)
mrna_z = preprocess_omics(mrna[common_list], min_nonzero_frac=0.05, top_var=None)
phospho_z = preprocess_omics(phospho[common_list], min_nonzero_frac=0.05, top_var=None)
proteome_z = preprocess_omics(proteome[common_list], min_nonzero_frac=0.05, top_var=None)
scna_z = preprocess_omics(scna[common_list], min_nonzero_frac=0.05, top_var=None)

# Print preprocessed shapes (features x samples)
print('preprocessed shapes:')
for _label, _frame in [
    ('circ_z', circ_z),
    ('mir_z ', mir_z),
    ('mrna_z', mrna_z),
    ('phospho_z', phospho_z),
    ('proteome_z', proteome_z),
    ('scna_z', scna_z),
]:
    print(_label, _frame.shape)

preprocessed shapes:
circ_z (1337, 137)
mir_z  (1920, 137)
mrna_z (25063, 137)
phospho_z (7525, 137)
proteome_z (11316, 137)
scna_z (19892, 137)
