# RNA-Seq Preparation for Early Fusion

**Goal**

Prepare TCGA RNA-Seq data into a clean, fixed-length numeric feature matrix (one vector per patient), aligned with subtype labels (LUAD/LUSC), ready for early fusion with WSI, DNA methylation and clinical data

1. Load RNA-Seq files
2. Map samples to patient IDs
3. Construct a patient x gene matrix
4. Normalize
5. Dimensionality reduction
6. Save the features

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

In [12]:
RNA_ROOTS = {
    "LUAD": Path(
        "/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/"
        "rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification"
    ),
    "LUSC": Path(
        "/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/"
        "rna/TCGA-LUSC/Transcriptome_Profiling/Gene_Expression_Quantification"
    ),
}
LABELS_PATH = Path("multiomics_labels.tsv")

OUT_DIR = Path("features_rna")
OUT_DIR.mkdir(exist_ok=True)

In [13]:
for k, v in RNA_ROOTS.items():
    print(k, "exists:", v.exists())

LUAD exists: True
LUSC exists: True


In [3]:
labels = pd.read_csv(LABELS_PATH, sep="\t")

labels = labels[["patient_id", "subtype_simple"]].copy()
labels["patient_id"] = labels["patient_id"].astype(str).str.strip()
labels["subtype_simple"] = labels["subtype_simple"].astype(str).str.strip()

labels = labels[labels["subtype_simple"].isin(["LUAD", "LUSC"])]
labels = labels.reset_index(drop=True)

print("Total labeled patients:", len(labels))
labels["subtype_simple"].value_counts()

Total labeled patients: 831


subtype_simple
LUAD    459
LUSC    372
Name: count, dtype: int64

## Inspect RNA-seq folder structure

Each subfolder corresponds to a single RNA-seq file (UUID-named). Inside is a gene expression quantification file

In [15]:
# 1) Confirm paths exist + count UUID folders
rna_folders = {}
for cancer, root in RNA_ROOTS.items():
    if not root.exists():
        raise FileNotFoundError(f"{cancer} root not found: {root}")
    folders = [p for p in root.iterdir() if p.is_dir()]
    rna_folders[cancer] = folders
    print(f"{cancer}: {len(folders)} UUID folders")

# 2) Collect the actual STAR gene count TSVs (1 per UUID folder)
rna_files = []  # list of dicts: {cancer, uuid_folder, filepath}
for cancer, folders in rna_folders.items():
    for folder in folders:
        tsvs = list(folder.glob("*.rna_seq.augmented_star_gene_counts.tsv"))
        if len(tsvs) == 1:
            rna_files.append({"cancer": cancer, "uuid_folder": folder.name, "path": tsvs[0]})

print("\nTotal RNA count files found:", len(rna_files))
print("First 3 examples:")
for x in rna_files[:3]:
    print(x["cancer"], x["path"])

LUAD: 455 UUID folders
LUSC: 370 UUID folders

Total RNA count files found: 825
First 3 examples:
LUAD /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/ffec28dd-cc44-4057-bcc3-ccc0cd2331c2/d21d91e9-6e19-4c5f-aa2e-cdd41d98e542.rna_seq.augmented_star_gene_counts.tsv
LUAD /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/fe20855f-d3ce-4e02-b2dd-3df15eab44bf/564e41d6-f9a4-4a6b-bb97-f8e5423b84bd.rna_seq.augmented_star_gene_counts.tsv
LUAD /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/fdcad604-c1b4-40ff-a652-3c1e5fe5f9a1/2301103f-5cbe-4cb6-b5da-0b83b39e4616.rna_seq.augmented_star_gene_counts.tsv


In [17]:
import os, json, time
import numpy as np
import pandas as pd
from pathlib import Path
import requests

from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

In [18]:
RNA_ROOTS = {
    "LUAD": Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification"),
    "LUSC": Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUSC/Transcriptome_Profiling/Gene_Expression_Quantification"),
}

LABELS_PATH = Path("multiomics_labels.tsv")  # in working dir; otherwise full path

OUT_DIR = Path("features_rna")
OUT_DIR.mkdir(exist_ok=True, parents=True)

for k, v in RNA_ROOTS.items():
    print(k, "exists:", v.exists())
print("labels exists:", LABELS_PATH.exists())
print("OUT_DIR:", OUT_DIR.resolve())

LUAD exists: True
LUSC exists: True
labels exists: True
OUT_DIR: /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/features_rna


Load labels and get the patients IDs

In [19]:
labels = pd.read_csv(LABELS_PATH, sep="\t")

# normalize expected columns
patient_col = "patient_id" if "patient_id" in labels.columns else ("submitter_id" if "submitter_id" in labels.columns else None)
label_col   = "subtype_simple" if "subtype_simple" in labels.columns else ("label" if "label" in labels.columns else None)

if patient_col is None:
    raise ValueError(f"Cannot find patient id column. Columns: {labels.columns.tolist()}")
if label_col is None:
    raise ValueError(f"Cannot find label column. Columns: {labels.columns.tolist()}")

labels = labels[[patient_col, label_col]].copy()
labels[patient_col] = labels[patient_col].astype(str).str.strip()
labels[label_col]   = labels[label_col].astype(str).str.strip()

labels = labels[labels[label_col].isin(["LUAD", "LUSC"])].reset_index(drop=True)

print("Total labeled patients:", len(labels))
print(labels[label_col].value_counts())
labels.head()


Total labeled patients: 831
subtype_simple
LUAD    459
LUSC    372
Name: count, dtype: int64


Unnamed: 0,patient_id,subtype_simple
0,TCGA-MP-A4SV,LUAD
1,TCGA-55-8621,LUAD
2,TCGA-MN-A4N1,LUAD
3,TCGA-55-6986,LUAD
4,TCGA-86-6851,LUAD


Collect the RNA count TSV files from LUAD and LUSC folders

In [20]:
def collect_rna_files(root: Path):
    uuid_folders = [p for p in root.iterdir() if p.is_dir()]
    files = []
    for folder in uuid_folders:
        hits = list(folder.glob("*.rna_seq.augmented_star_gene_counts.tsv"))
        if len(hits) == 1:
            files.append({"uuid_folder": folder.name, "path": hits[0]})
    return uuid_folders, files

rna_uuid_folders = {}
rna_files = []

for cancer, root in RNA_ROOTS.items():
    folders, files = collect_rna_files(root)
    rna_uuid_folders[cancer] = folders
    for d in files:
        d["cancer"] = cancer
        rna_files.append(d)

print("LUAD uuid folders:", len(rna_uuid_folders["LUAD"]))
print("LUSC uuid folders:", len(rna_uuid_folders["LUSC"]))
print("Total RNA count TSV files found:", len(rna_files))
print("First 3 examples:")
for x in rna_files[:3]:
    print(x["cancer"], x["uuid_folder"], x["path"])

LUAD uuid folders: 455
LUSC uuid folders: 370
Total RNA count TSV files found: 825
First 3 examples:
LUAD ffec28dd-cc44-4057-bcc3-ccc0cd2331c2 /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/ffec28dd-cc44-4057-bcc3-ccc0cd2331c2/d21d91e9-6e19-4c5f-aa2e-cdd41d98e542.rna_seq.augmented_star_gene_counts.tsv
LUAD fe20855f-d3ce-4e02-b2dd-3df15eab44bf /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/fe20855f-d3ce-4e02-b2dd-3df15eab44bf/564e41d6-f9a4-4a6b-bb97-f8e5423b84bd.rna_seq.augmented_star_gene_counts.tsv
LUAD fdcad604-c1b4-40ff-a652-3c1e5fe5f9a1 /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/fdcad604-c1b4-40ff-a652-3c1e5fe5f9a1/2301103f-5cbe-4cb6-b5da-0b83b39e4616.rna_seq.augmented_star_gene_counts.tsv


Map RNA UUID to patient_id usinf the GDC API

In [22]:
META_PATH = Path("RNASeq_metadata.tsv")

meta = pd.read_csv(META_PATH, sep="\t", dtype=str)
print("Metadata shape:", meta.shape)
print("Columns:", meta.columns.tolist())
meta.head(3)

Metadata shape: (1, 100)
Columns: ['barcode', 'patient', 'sample', 'shortLetterCode', 'definition', 'sample_submitter_id', 'tumor_descriptor', 'specimen_type', 'sample_id', 'submitter_id', 'state', 'sample_type', 'tissue_type', 'preservation_method', 'intermediate_dimension', 'pathology_report_uuid', 'shortest_dimension', 'longest_dimension', 'days_to_collection', 'initial_weight', 'synchronous_malignancy', 'ajcc_pathologic_stage', 'days_to_diagnosis', 'laterality', 'treatments', 'tissue_or_organ_of_origin', 'age_at_diagnosis', 'primary_diagnosis', 'prior_malignancy', 'year_of_diagnosis', 'prior_treatment', 'diagnosis_is_primary_disease', 'ajcc_staging_system_edition', 'ajcc_pathologic_t', 'morphology', 'ajcc_pathologic_n', 'ajcc_pathologic_m', 'residual_disease', 'classification_of_tumor', 'diagnosis_id', 'icd_10_code', 'site_of_resection_or_biopsy', 'sites_of_involvement', 'tumor_of_origin', 'tobacco_smoking_status', 'exposure_id', 'exposure_type', 'tobacco_smoking_onset_year', 'pack

Unnamed: 0,barcode,patient,sample,shortLetterCode,definition,sample_submitter_id,tumor_descriptor,specimen_type,sample_id,submitter_id,...,paper_Purity.ABSOLUTE.calls,paper_M.stage,paper_Pack.years,paper_Nonsilent.Mutatios,paper_Nonsilent.Mutatios.per.Mb,paper_Selected.Mutation.Summary,paper_High.Level.Amplifications,paper_Homozygous.Deletions,paper_Expression.Subtype,patient_id
0,TCGA-MP-A4SV-01A-11R-A24X-07,TCGA-MP-A4SV-01A-11R-A24X-07,TCGA-MP-A4SV,TCGA-MP-A4SV-01A,TP,Primary solid Tumor,TCGA-MP-A4SV-01A,Primary,Unknown,b5c9beed-da9d-4110-9b6d-54a66498d5cc,...,,,,,,,,,,


In [23]:
def pick_col(cols, candidates):
    cols_lower = {c.lower(): c for c in cols}
    for cand in candidates:
        if cand.lower() in cols_lower:
            return cols_lower[cand.lower()]
    # fallback: partial match
    for c in cols:
        cl = c.lower()
        for cand in candidates:
            if cand.lower() in cl:
                return c
    return None

uuid_col = pick_col(meta.columns, ["file_id", "id", "file_uuid", "uuid"])
patient_col = pick_col(meta.columns, ["cases.0.submitter_id", "cases.submitter_id", "submitter_id", "case_submitter_id", "patient_id"])

print("Detected uuid_col:", uuid_col)
print("Detected patient_col:", patient_col)

if uuid_col is None or patient_col is None:
    raise ValueError("Could not auto-detect uuid/patient columns. Paste meta.columns.tolist() output and we’ll map manually.")

Detected uuid_col: sample_submitter_id
Detected patient_col: submitter_id


In [24]:
import re

uuid_re = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I)

def looks_like_uuid_series(s):
    s = s.dropna().astype(str).str.strip()
    if len(s) == 0:
        return 0
    return (s.apply(lambda x: bool(uuid_re.match(x)))).mean()

scores = []
for c in meta.columns:
    frac = looks_like_uuid_series(meta[c])
    if frac > 0:
        scores.append((c, frac))

scores = sorted(scores, key=lambda x: x[1], reverse=True)
print("UUID-like columns (top):")
for c, frac in scores[:10]:
    print(f"{c:40s}  uuid_fraction={frac:.3f}")

UUID-like columns (top):
submitter_id                              uuid_fraction=1.000
shortest_dimension                        uuid_fraction=1.000


In [25]:
# quick peek at what those "uuid-like" columns actually contain
for c in ["submitter_id", "shortest_dimension"]:
    if c in meta.columns:
        vals = meta[c].dropna().astype(str).str.strip().head(10).tolist()
        print(f"\nColumn: {c}")
        for v in vals:
            print("  ", v)


Column: submitter_id
   b5c9beed-da9d-4110-9b6d-54a66498d5cc

Column: shortest_dimension
   8E809652-9783-4B7A-B92D-8C393A2940AB


In [26]:
def tcga_case_fraction(s):
    s = s.dropna().astype(str).str.strip()
    if len(s) == 0:
        return 0
    return s.str.match(r"^TCGA-[A-Z0-9]{2}-[A-Z0-9]{4}$").mean()

tcga_scores = []
for c in meta.columns:
    frac = tcga_case_fraction(meta[c])
    if frac > 0:
        tcga_scores.append((c, frac))

tcga_scores = sorted(tcga_scores, key=lambda x: x[1], reverse=True)

print("TCGA-case-like columns (top):")
for c, frac in tcga_scores[:10]:
    print(f"{c:40s}  tcga_case_fraction={frac:.3f}")

TCGA-case-like columns (top):
sample                                    tcga_case_fraction=1.000
state                                     tcga_case_fraction=1.000


In [28]:
# Define correct columns
uuid_col = "submitter_id"   # UUID (file-level)
patient_col = "sample"      # TCGA patient ID

# Build mapping
m = meta[[uuid_col, patient_col]].dropna().copy()
m[uuid_col] = m[uuid_col].astype(str).str.strip()
m[patient_col] = m[patient_col].astype(str).str.strip()

uuid_to_patient = dict(zip(m[uuid_col], m[patient_col]))

print("UUID → patient mappings:", len(uuid_to_patient))
print("Example mappings:")
for k, v in list(uuid_to_patient.items())[:5]:
    print(k, "→", v)

UUID → patient mappings: 1
Example mappings:
b5c9beed-da9d-4110-9b6d-54a66498d5cc → TCGA-MP-A4SV


In [29]:
# Attach patient IDs to RNA files
annotated = []

for d in rna_files:
    uuid = d["uuid_folder"]
    patient = uuid_to_patient.get(uuid)
    if patient is not None:
        annotated.append({
            "cancer": d["cancer"],
            "uuid": uuid,
            "patient_id": patient,
            "path": d["path"]
        })

rna_df = pd.DataFrame(annotated)

print("RNA files with patient IDs:", rna_df.shape[0])
rna_df.head()

RNA files with patient IDs: 0


In [30]:
meta.columns.tolist()

['barcode',
 'patient',
 'sample',
 'shortLetterCode',
 'definition',
 'sample_submitter_id',
 'tumor_descriptor',
 'specimen_type',
 'sample_id',
 'submitter_id',
 'state',
 'sample_type',
 'tissue_type',
 'preservation_method',
 'intermediate_dimension',
 'pathology_report_uuid',
 'shortest_dimension',
 'longest_dimension',
 'days_to_collection',
 'initial_weight',
 'synchronous_malignancy',
 'ajcc_pathologic_stage',
 'days_to_diagnosis',
 'laterality',
 'treatments',
 'tissue_or_organ_of_origin',
 'age_at_diagnosis',
 'primary_diagnosis',
 'prior_malignancy',
 'year_of_diagnosis',
 'prior_treatment',
 'diagnosis_is_primary_disease',
 'ajcc_staging_system_edition',
 'ajcc_pathologic_t',
 'morphology',
 'ajcc_pathologic_n',
 'ajcc_pathologic_m',
 'residual_disease',
 'classification_of_tumor',
 'diagnosis_id',
 'icd_10_code',
 'site_of_resection_or_biopsy',
 'sites_of_involvement',
 'tumor_of_origin',
 'tobacco_smoking_status',
 'exposure_id',
 'exposure_type',
 'tobacco_smoking_onset

In [31]:
# show UUID-like columns with examples
for c in meta.columns:
    if meta[c].astype(str).str.match(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        na=False
    ).mean() > 0.5:
        print(c)
        print(meta[c].dropna().iloc[:3])
        print()

submitter_id
0    b5c9beed-da9d-4110-9b6d-54a66498d5cc
Name: submitter_id, dtype: object



## 1) Collect all RNA STAR gene count files (LUAD + LUSC)
Each UUID folder contains one `*.rna_seq.augmented_star_gene_counts.tsv`.
We will build a list of file paths first.

In [32]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

RNA_ROOTS = {
    "LUAD": Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification"),
    "LUSC": Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUSC/Transcriptome_Profiling/Gene_Expression_Quantification"),
}

rna_files = []  # list of dicts: {cancer, uuid_folder, path}
for cancer, root in RNA_ROOTS.items():
    folders = [p for p in root.iterdir() if p.is_dir()]
    for folder in folders:
        tsvs = list(folder.glob("*.rna_seq.augmented_star_gene_counts.tsv"))
        if len(tsvs) == 1:
            rna_files.append({"cancer": cancer, "uuid_folder": folder.name, "path": tsvs[0]})

print("Total RNA count files found:", len(rna_files))
print("Example:", rna_files[0] if rna_files else None)

Total RNA count files found: 825
Example: {'cancer': 'LUAD', 'uuid_folder': 'ffec28dd-cc44-4057-bcc3-ccc0cd2331c2', 'path': PosixPath('/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/ffec28dd-cc44-4057-bcc3-ccc0cd2331c2/d21d91e9-6e19-4c5f-aa2e-cdd41d98e542.rna_seq.augmented_star_gene_counts.tsv')}


In [35]:
rna_files = [
  {
    "uuid_folder": "f0890456-1013-4e8a-9132-f05e2a1be5f2",
    "path": ".../f0890456.../xxx.tsv",
    "cancer": "LUAD"
  },
  ...
]

In [36]:
rna_files_df = pd.DataFrame(rna_files)

AttributeError: 'ellipsis' object has no attribute 'keys'

## 2) Extract TCGA patient ID from each RNA count file
We will read only the header/comments quickly (not the full file) and pull out the TCGA barcode.
Then we convert barcode → patient_id (first 3 blocks: TCGA-XX-YYYY).

In [33]:
TCGA_PAT_RE = re.compile(r"(TCGA-[A-Z0-9]{2}-[A-Z0-9]{4})")

def extract_patient_id_from_counts(path, max_lines=80):
    """
    Reads the first `max_lines` lines and searches for a TCGA case ID like TCGA-XX-YYYY.
    Returns patient_id or None.
    """
    try:
        with open(path, "r") as f:
            for _ in range(max_lines):
                line = f.readline()
                if not line:
                    break
                m = TCGA_PAT_RE.search(line)
                if m:
                    return m.group(1)
    except Exception as e:
        return None
    return None

# test on first few
for d in rna_files[:5]:
    pid = extract_patient_id_from_counts(d["path"])
    print(d["uuid_folder"], "->", pid)

ffec28dd-cc44-4057-bcc3-ccc0cd2331c2 -> None
fe20855f-d3ce-4e02-b2dd-3df15eab44bf -> None
fdcad604-c1b4-40ff-a652-3c1e5fe5f9a1 -> None
fd52d8a9-e044-4f17-861a-9780752b20ee -> None
fc757476-1d31-470b-b648-79608b201bdc -> None


## 3) Build the RNA mapping table: uuid_folder → patient_id → file_path
This gives us a clean index we can join with labels (LUAD/LUSC).

In [34]:
rows = []
for d in rna_files:
    pid = extract_patient_id_from_counts(d["path"])
    if pid is not None:
        rows.append({
            "cancer": d["cancer"],
            "uuid_folder": d["uuid_folder"],
            "patient_id": pid,
            "path": str(d["path"])
        })

rna_map = pd.DataFrame(rows)
print("Mapped RNA files to patient IDs:", rna_map.shape[0], "of", len(rna_files))
rna_map.head()

Mapped RNA files to patient IDs: 0 of 825


# Preparation

## Locate all RNA count files

We identify all RNA-seq gene count files downloaded from the GDC portal

- We scan both cancer subtype directories:
    - TCGA-LUAD
    - TCGA-LUSC
- Recursively search for files ending with `rna_seq.augmented_star_gene_counts.tsv` where each file corresponds to one RNA-seq sample

**Output of this step:**
- A list of file paths for LUAD samples
- A list of file paths for LUSC samples

In [1]:
from pathlib import Path

# Base directories
LUAD_ROOT = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD")
LUSC_ROOT = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUSC")

# File pattern used by GDC STAR workflow
FILE_PATTERN = "**/*.rna_seq.augmented_star_gene_counts.tsv"

# Locate RNA count files
luad_files = sorted(LUAD_ROOT.glob(FILE_PATTERN))
lusc_files = sorted(LUSC_ROOT.glob(FILE_PATTERN))

print(f"LUAD RNA files found: {len(luad_files)}")
print(f"LUSC RNA files found: {len(lusc_files)}")

# show a few example paths
print("\nExample LUAD file:")
print(luad_files[0] if luad_files else "None found")

print("\nExample LUSC file:")
print(lusc_files[0] if lusc_files else "None found")


LUAD RNA files found: 455
LUSC RNA files found: 370

Example LUAD file:
/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/0052ae83-7ae5-470a-a125-5cd94a9fa9e9/a6a6b9c6-9db7-42b3-a09f-770b7e126fbb.rna_seq.augmented_star_gene_counts.tsv

Example LUSC file:
/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUSC/Transcriptome_Profiling/Gene_Expression_Quantification/000b6b94-572d-4d06-a8f4-2e43829f83d4/e74f321c-217f-4bdc-ad17-f132501b5157.rna_seq.augmented_star_gene_counts.tsv


## Step 2: Decide the sample ID for RNA-seq samples

Each RNA-seq file must be assigned a unique sample identifier so that it can
index the RNA expression matrix and later be aligned with other modalities
(clinical, methylation, WSI).

Because GDC RNA-seq files are often stored using UUID-based folder and file names,
we proceed as follows:

- Use the **file UUID** (extracted from the parent folder name) as the `sample_id`
  for each RNA-seq file.
- This provides a unique and stable identifier for preprocessing.
- In a later step, these UUIDs can be mapped to TCGA barcodes using GDC metadata
  to enable multimodal harmonization.

**Output of this step:**
- A `sample_id` associated with each RNA-seq count file.

In [2]:
def extract_sample_id(rna_file_path: Path) -> str:
    """
    Extract sample ID from a GDC RNA-seq file path.
    Assumes the immediate parent directory is the file UUID.
    """
    return rna_file_path.parent.name


# Build file → sample_id mappings
luad_sample_ids = {extract_sample_id(f): f for f in luad_files}
lusc_sample_ids = {extract_sample_id(f): f for f in lusc_files}

print(f"LUAD samples: {len(luad_sample_ids)}")
print(f"LUSC samples: {len(lusc_sample_ids)}")

# show a few sample IDs
print("\nExample LUAD sample IDs:")
for sid in list(luad_sample_ids.keys())[:5]:
    print(sid)

print("\nExample LUSC sample IDs:")
for sid in list(lusc_sample_ids.keys())[:5]:
    print(sid)

LUAD samples: 455
LUSC samples: 370

Example LUAD sample IDs:
0052ae83-7ae5-470a-a125-5cd94a9fa9e9
023a34d9-c000-4053-b695-5b984b4a6fc1
030d778f-ecb4-44eb-8baa-4286e18c9fdd
0344d3bb-8bb2-459e-bc1f-ba31c7470d08
03680aea-84a2-4775-bb9c-24f8c5907247

Example LUSC sample IDs:
000b6b94-572d-4d06-a8f4-2e43829f83d4
00810078-a4fc-4e1d-bbdc-e5e0d3080025
00bc0fe4-481a-4893-b57e-d5113a6bcce2
01977422-7f46-41d2-bc19-ed0093af8f25
021326fa-7b6c-45c0-b078-4646c2260068


## Step 3: Extract gene-level unstranded counts from one RNA-seq file (GDC augmented STAR)

The GDC `*.rna_seq.augmented_star_gene_counts.tsv` file contains:
- STAR summary rows (e.g., N_unmapped, N_multimapping)
- Gene rows with Ensembl IDs that include version suffixes (e.g., ENSG...15)
- Multiple quantifications (counts + TPM/FPKM)

In this step we:
1. Read the file while skipping the comment line(s) starting with '#'
2. Remove STAR summary rows
3. Keep only Ensembl genes
4. Strip Ensembl version suffixes (ENSGxxxx.y → ENSGxxxx)
5. Extract the `unstranded` raw counts as our gene expression vector

In [7]:
from pathlib import Path

sample_id, sample_file = next(iter(luad_sample_ids.items()))
print("Sample ID:", sample_id)
print("File:", sample_file)

# Print first 25 lines to understand structure
with open(sample_file, "r", encoding="utf-8", errors="replace") as f:
    for i in range(25):
        line = f.readline()
        if not line:
            break
        print(f"{i+1:02d}: {line.rstrip()}")

Sample ID: 0052ae83-7ae5-470a-a125-5cd94a9fa9e9
File: /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/0052ae83-7ae5-470a-a125-5cd94a9fa9e9/a6a6b9c6-9db7-42b3-a09f-770b7e126fbb.rna_seq.augmented_star_gene_counts.tsv
01: # gene-model: GENCODE v36
02: gene_id	gene_name	gene_type	unstranded	stranded_first	stranded_second	tpm_unstranded	fpkm_unstranded	fpkm_uq_unstranded
03: N_unmapped			579900	579900	579900
04: N_multimapping			3656223	3656223	3656223
05: N_noFeature			636730	12987335	13046964
06: N_ambiguous			3124033	840639	837658
07: ENSG00000000003.15	TSPAN6	protein_coding	3715	1834	1881	94.7370	36.5986	46.3790
08: ENSG00000000005.6	TNMD	protein_coding	0	0	0	0.0000	0.0000	0.0000
09: ENSG00000000419.13	DPM1	protein_coding	2468	1232	1236	236.5221	91.3728	115.7908
10: ENSG00000000457.14	SCYL3	protein_coding	249	296	318	4.1846	1.6166	2.0486
11: ENSG00000000460.17	C1orf112	protein_coding	395	385	398	7.65

In [8]:
import pandas as pd

# Use the same example file you printed
sample_id, sample_file = next(iter(luad_sample_ids.items()))
print("Sample ID:", sample_id)
print("File:", sample_file)

# 1) Read file, skipping comment lines beginning with '#'
df = pd.read_csv(sample_file, sep="\t", comment="#")

print("\nParsed shape:", df.shape)
print("Columns:", list(df.columns))
display(df.head(8))

# 2) Keep only Ensembl gene rows
df["gene_id"] = df["gene_id"].astype(str)
mask = df["gene_id"].str.startswith("ENSG")
df_genes = df.loc[mask, ["gene_id", "unstranded"]].copy()

# 3) Strip Ensembl version suffix (ENSG000...15 -> ENSG000...)
df_genes["gene_id"] = df_genes["gene_id"].str.replace(r"\.\d+$", "", regex=True)

# 4) Convert counts to numeric
df_genes["unstranded"] = pd.to_numeric(df_genes["unstranded"], errors="coerce")

# 5) Sanity checks
print("\nGene rows:", df_genes.shape[0])
print("Any missing counts?:", df_genes["unstranded"].isna().any())
print("Library size (sum counts):", int(df_genes["unstranded"].sum()))

print("\nFirst 5 genes:")
display(df_genes.head())

Sample ID: 0052ae83-7ae5-470a-a125-5cd94a9fa9e9
File: /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/01_download/rna/TCGA-LUAD/Transcriptome_Profiling/Gene_Expression_Quantification/0052ae83-7ae5-470a-a125-5cd94a9fa9e9/a6a6b9c6-9db7-42b3-a09f-770b7e126fbb.rna_seq.augmented_star_gene_counts.tsv

Parsed shape: (60664, 9)
Columns: ['gene_id', 'gene_name', 'gene_type', 'unstranded', 'stranded_first', 'stranded_second', 'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded']


Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded
0,N_unmapped,,,579900,579900,579900,,,
1,N_multimapping,,,3656223,3656223,3656223,,,
2,N_noFeature,,,636730,12987335,13046964,,,
3,N_ambiguous,,,3124033,840639,837658,,,
4,ENSG00000000003.15,TSPAN6,protein_coding,3715,1834,1881,94.737,36.5986,46.379
5,ENSG00000000005.6,TNMD,protein_coding,0,0,0,0.0,0.0,0.0
6,ENSG00000000419.13,DPM1,protein_coding,2468,1232,1236,236.5221,91.3728,115.7908
7,ENSG00000000457.14,SCYL3,protein_coding,249,296,318,4.1846,1.6166,2.0486



Gene rows: 60660
Any missing counts?: False
Library size (sum counts): 23216388

First 5 genes:


Unnamed: 0,gene_id,unstranded
4,ENSG00000000003,3715
5,ENSG00000000005,0
6,ENSG00000000419,2468
7,ENSG00000000457,249
8,ENSG00000000460,395


## Step 4 (Best): Build the raw RNA count matrix using a 2-pass NumPy approach

To make this fast, reproducible, and memory-stable, we:

**Pass 1:** scan all RNA files and collect the union of Ensembl gene IDs (version-stripped).  
**Pass 2:** allocate a single NumPy matrix and fill it row-by-row using a gene→column index map.

Outputs:
- `rna_counts_raw.npy`        (samples × genes)
- `rna_sample_ids.npy`
- `rna_gene_ids.npy`
- `rna_labels.npy`            (LUAD=0, LUSC=1)

In [9]:
import numpy as np
import pandas as pd
from pathlib import Path

def extract_counts_series(path: Path) -> pd.Series:
    """
    Returns Series: index=ENSG (no version), values=unstranded counts (int)
    """
    df = pd.read_csv(path, sep="\t", comment="#", usecols=["gene_id", "unstranded"])
    df["gene_id"] = df["gene_id"].astype(str)

    # keep ENSG rows only
    df = df[df["gene_id"].str.startswith("ENSG")].copy()

    # strip version ENSG....15 -> ENSG....
    df["gene_id"] = df["gene_id"].str.replace(r"\.\d+$", "", regex=True)

    # numeric counts
    counts = pd.to_numeric(df["unstranded"], errors="coerce").fillna(0).astype(np.int32).values
    genes = df["gene_id"].values

    s = pd.Series(counts, index=genes)
    # if duplicates after stripping, sum them
    s = s.groupby(level=0).sum()
    return s


# ---- Build sample list + labels (deterministic order) ----
all_items = []
labels = []

for sid in sorted(luad_sample_ids.keys()):
    all_items.append((sid, luad_sample_ids[sid]))
    labels.append(0)

for sid in sorted(lusc_sample_ids.keys()):
    all_items.append((sid, lusc_sample_ids[sid]))
    labels.append(1)

sample_ids = np.array([sid for sid, _ in all_items], dtype=object)
y = np.array(labels, dtype=np.int64)

print("Total samples:", len(sample_ids))
print("Label counts:", {0: int((y==0).sum()), 1: int((y==1).sum())})


# ---- PASS 1: collect union of genes ----
gene_set = set()
for i, (_, fpath) in enumerate(all_items, start=1):
    s = extract_counts_series(fpath)
    gene_set.update(s.index.tolist())
    if i % 50 == 0:
        print(f"Pass 1: scanned {i}/{len(all_items)} files...")

gene_ids = np.array(sorted(gene_set), dtype=object)
gene_to_col = {g: j for j, g in enumerate(gene_ids)}

print("\nTotal unique genes:", len(gene_ids))


# ---- PASS 2: allocate matrix and fill ----
X = np.zeros((len(sample_ids), len(gene_ids)), dtype=np.float32)

for i, (sid, fpath) in enumerate(all_items):
    s = extract_counts_series(fpath)
    cols = [gene_to_col[g] for g in s.index]
    X[i, cols] = s.values.astype(np.float32)
    if (i + 1) % 50 == 0:
        print(f"Pass 2: filled {i+1}/{len(all_items)} samples...")

print("\nRaw RNA matrix built.")
print("X shape (samples x genes):", X.shape)
print("X dtype:", X.dtype)


# ---- Save outputs ----
OUT_DIR = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna")
OUT_DIR.mkdir(parents=True, exist_ok=True)

np.save(OUT_DIR / "rna_counts_raw.npy", X)
np.save(OUT_DIR / "rna_sample_ids.npy", sample_ids)
np.save(OUT_DIR / "rna_gene_ids.npy", gene_ids)
np.save(OUT_DIR / "rna_labels.npy", y)

print("\nSaved outputs to:", OUT_DIR)

Total samples: 825
Label counts: {0: 455, 1: 370}
Pass 1: scanned 50/825 files...
Pass 1: scanned 100/825 files...
Pass 1: scanned 150/825 files...
Pass 1: scanned 200/825 files...
Pass 1: scanned 250/825 files...
Pass 1: scanned 300/825 files...
Pass 1: scanned 350/825 files...
Pass 1: scanned 400/825 files...
Pass 1: scanned 450/825 files...
Pass 1: scanned 500/825 files...
Pass 1: scanned 550/825 files...
Pass 1: scanned 600/825 files...
Pass 1: scanned 650/825 files...
Pass 1: scanned 700/825 files...
Pass 1: scanned 750/825 files...
Pass 1: scanned 800/825 files...

Total unique genes: 60660
Pass 2: filled 50/825 samples...
Pass 2: filled 100/825 samples...
Pass 2: filled 150/825 samples...
Pass 2: filled 200/825 samples...
Pass 2: filled 250/825 samples...
Pass 2: filled 300/825 samples...
Pass 2: filled 350/825 samples...
Pass 2: filled 400/825 samples...
Pass 2: filled 450/825 samples...
Pass 2: filled 500/825 samples...
Pass 2: filled 550/825 samples...
Pass 2: filled 600/825 

## Step 5: Basic QC checks (before filtering)

Before filtering genes, we verify the raw RNA count matrix is consistent and usable.

Checks:
- Confirm matrix dimensions and dtypes
- Confirm sample IDs and gene IDs have no duplicates
- Compute per-sample library size (total counts)
- Inspect library size distribution (min/median/max)
- Optionally flag/remove extreme low-library samples (potential failed sequencing)

Output:
- Verified (and optionally cleaned) `X_raw`, `sample_ids`, `gene_ids`, `y`
- Saved QC summary statistics

In [10]:
import numpy as np
from pathlib import Path

# Load what we saved (ensures reproducibility)
IN_DIR = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna")

X_raw = np.load(IN_DIR / "rna_counts_raw.npy")          # (samples x genes)
sample_ids = np.load(IN_DIR / "rna_sample_ids.npy", allow_pickle=True)
gene_ids = np.load(IN_DIR / "rna_gene_ids.npy", allow_pickle=True)
y = np.load(IN_DIR / "rna_labels.npy")

print("X_raw:", X_raw.shape, X_raw.dtype)
print("sample_ids:", sample_ids.shape, sample_ids.dtype)
print("gene_ids:", gene_ids.shape, gene_ids.dtype)
print("y:", y.shape, y.dtype, " | label counts:", {0:int((y==0).sum()), 1:int((y==1).sum())})

# 1) Duplicates check
n_dup_samples = len(sample_ids) - len(np.unique(sample_ids))
n_dup_genes = len(gene_ids) - len(np.unique(gene_ids))
print("\nDuplicate sample IDs:", n_dup_samples)
print("Duplicate gene IDs:", n_dup_genes)

assert n_dup_samples == 0, "Duplicate sample IDs found!"
assert n_dup_genes == 0, "Duplicate gene IDs found!"

# 2) Library size per sample
lib_sizes = X_raw.sum(axis=1)  # float32 sum, okay
lib_sizes_int = lib_sizes.astype(np.int64)

print("\nLibrary size summary (total counts per sample):")
print("min:", int(lib_sizes_int.min()))
print("p1 :", int(np.percentile(lib_sizes_int, 1)))
print("p5 :", int(np.percentile(lib_sizes_int, 5)))
print("median:", int(np.median(lib_sizes_int)))
print("p95:", int(np.percentile(lib_sizes_int, 95)))
print("p99:", int(np.percentile(lib_sizes_int, 99)))
print("max:", int(lib_sizes_int.max()))

# 3) Flag extremely low-library samples (very conservative rule)
# Anything below the 1st percentile / 10 is usually an obvious failure.
threshold = np.percentile(lib_sizes, 1) / 10.0
flag_low = lib_sizes < threshold
n_flag = int(flag_low.sum())

print(f"\nLow-library threshold: {threshold:.2f}")
print("Flagged low-library samples:", n_flag)

# If none flagged (most likely), we keep everything.
# If some are flagged, we REMOVE them for safety.
if n_flag > 0:
    keep = ~flag_low
    X_raw = X_raw[keep]
    sample_ids = sample_ids[keep]
    y = y[keep]
    lib_sizes = lib_sizes[keep]
    print("After removing flagged samples:", X_raw.shape)
else:
    print("No samples removed.")

# 4) Save QC summary for reproducibility
OUT_QC = IN_DIR / "qc_summary"
OUT_QC.mkdir(exist_ok=True)

np.save(OUT_QC / "library_sizes.npy", lib_sizes.astype(np.float32))
with open(OUT_QC / "library_size_summary.txt", "w") as f:
    f.write(f"X_raw shape: {X_raw.shape}\n")
    f.write(f"min: {int(lib_sizes.min())}\n")
    f.write(f"p1: {int(np.percentile(lib_sizes,1))}\n")
    f.write(f"p5: {int(np.percentile(lib_sizes,5))}\n")
    f.write(f"median: {int(np.median(lib_sizes))}\n")
    f.write(f"p95: {int(np.percentile(lib_sizes,95))}\n")
    f.write(f"p99: {int(np.percentile(lib_sizes,99))}\n")
    f.write(f"max: {int(lib_sizes.max())}\n")
    f.write(f"removed_low_library_samples: {n_flag}\n")

print("\nQC summary saved to:", OUT_QC)


X_raw: (825, 60660) float32
sample_ids: (825,) object
gene_ids: (60660,) object
y: (825,) int64  | label counts: {0: 455, 1: 370}

Duplicate sample IDs: 0
Duplicate gene IDs: 0

Library size summary (total counts per sample):
min: 14305997
p1 : 20725502
p5 : 24957956
median: 45832184
p95: 79989892
p99: 94894554
max: 104891856

Low-library threshold: 2072550.25
Flagged low-library samples: 0
No samples removed.

QC summary saved to: /home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna/qc_summary


## Step 6: Filter low-expression genes (critical)

Raw RNA-seq matrices include many genes that are not reliably expressed and act as noise.

We apply a standard, library-size aware filter:
- Compute CPM (counts per million) for each gene in each sample
- Keep genes with **CPM ≥ 1** in at least **20% of samples**

This reduces dimensionality while preserving biologically meaningful signal.

Output:
- `X_counts_filt`  (samples × filtered_genes)
- `gene_ids_filt`
- Saved filtered raw counts for reproducibility

In [11]:
import numpy as np
from pathlib import Path

IN_DIR = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna")

X_raw = np.load(IN_DIR / "rna_counts_raw.npy")  # float32 (samples x genes)
gene_ids = np.load(IN_DIR / "rna_gene_ids.npy", allow_pickle=True)
sample_ids = np.load(IN_DIR / "rna_sample_ids.npy", allow_pickle=True)
y = np.load(IN_DIR / "rna_labels.npy")

n_samples, n_genes = X_raw.shape
print("Loaded X_raw:", X_raw.shape)

# Library sizes
lib_sizes = X_raw.sum(axis=1)  # shape (n_samples,)
# Avoid divide-by-zero (shouldn't happen, but safe)
lib_sizes = np.maximum(lib_sizes, 1.0)

# CPM computation (vectorized)
# CPM = counts / lib_size * 1e6
cpm = (X_raw / lib_sizes[:, None]) * 1e6

# Filter rule: CPM >= 1 in >= 20% of samples
min_prop = 0.20
min_n = int(np.ceil(min_prop * n_samples))

expressed = (cpm >= 1.0).sum(axis=0) >= min_n
kept = int(expressed.sum())

print(f"Filter rule: CPM >= 1 in at least {min_n}/{n_samples} samples (>= {min_prop*100:.0f}%)")
print("Genes kept:", kept, "/", n_genes)

# Apply filter on raw counts
X_counts_filt = X_raw[:, expressed]
gene_ids_filt = gene_ids[expressed]

print("Filtered raw counts shape:", X_counts_filt.shape)

# Save filtered raw counts (still counts; next step we log-transform)
OUT_DIR = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna")
np.save(OUT_DIR / "rna_counts_filtered.npy", X_counts_filt.astype(np.float32))
np.save(OUT_DIR / "rna_gene_ids_filtered.npy", gene_ids_filt)

print("Saved:")
print(" - rna_counts_filtered.npy")
print(" - rna_gene_ids_filtered.npy")

Loaded X_raw: (825, 60660)
Filter rule: CPM >= 1 in at least 165/825 samples (>= 20%)
Genes kept: 17850 / 60660
Filtered raw counts shape: (825, 17850)
Saved:
 - rna_counts_filtered.npy
 - rna_gene_ids_filtered.npy


## Step 7: Normalize and log-transform RNA expression (CPM → log2)

Raw RNA-seq counts cannot be compared directly across samples due to differences
in sequencing depth (library size).

We apply the following standard transformation:
1. Compute CPM (Counts Per Million) to normalize for library size
2. Apply log2(CPM + 1) to stabilize variance

This produces a matrix that behaves well for downstream machine learning models.

Output:
- `X_rna_logcpm`  (samples × genes, float32)
- Saved for reproducibility

In [12]:
import numpy as np
from pathlib import Path

IN_DIR = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna")

# Load filtered raw counts
X_counts_filt = np.load(IN_DIR / "rna_counts_filtered.npy")   # (samples x genes)
gene_ids_filt = np.load(IN_DIR / "rna_gene_ids_filtered.npy", allow_pickle=True)
sample_ids = np.load(IN_DIR / "rna_sample_ids.npy", allow_pickle=True)
y = np.load(IN_DIR / "rna_labels.npy")

print("Loaded filtered counts:", X_counts_filt.shape)

# 1) Compute library sizes from filtered counts
lib_sizes = X_counts_filt.sum(axis=1)
lib_sizes = np.maximum(lib_sizes, 1.0)  # safety

# 2) CPM normalization
cpm = (X_counts_filt / lib_sizes[:, None]) * 1e6

# 3) log2(CPM + 1)
X_rna_logcpm = np.log2(cpm + 1.0).astype(np.float32)

print("RNA logCPM matrix shape:", X_rna_logcpm.shape)
print("Value range: min =", float(X_rna_logcpm.min()), 
      "max =", float(X_rna_logcpm.max()))

# Save
np.save(IN_DIR / "rna_logcpm.npy", X_rna_logcpm)

print("Saved: rna_logcpm.npy")

Loaded filtered counts: (825, 17850)
RNA logCPM matrix shape: (825, 17850)
Value range: min = 0.0 max = 16.71758460998535
Saved: rna_logcpm.npy


## Step 8: Train/test split and feature standardization

To avoid data leakage, we:
1. Perform a single stratified train/test split using labels
2. Fit `StandardScaler` **only on the training data**
3. Apply the fitted scaler to both train and test sets

This ensures that test data statistics are never used during training.

Outputs:
- `X_rna_train_scaled`
- `X_rna_test_scaled`
- `y_train`, `y_test`
- `train_ids`, `test_ids`
- saved scaler object for reproducibility

In [13]:
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

IN_DIR = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna")

# Load logCPM data
X_log = np.load(IN_DIR / "rna_logcpm.npy")                 # (samples x genes)
sample_ids = np.load(IN_DIR / "rna_sample_ids.npy", allow_pickle=True)
y = np.load(IN_DIR / "rna_labels.npy")

print("X_log shape:", X_log.shape)
print("Labels:", {0: int((y==0).sum()), 1: int((y==1).sum())})

# 1) Train/test split (stratified, reproducible)
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
    X_log,
    y,
    sample_ids,
    test_size=0.20,
    stratify=y,
    random_state=42
)

print("\nTrain shape:", X_train.shape)
print("Test shape :", X_test.shape)
print("Train labels:", {0: int((y_train==0).sum()), 1: int((y_train==1).sum())})
print("Test labels :", {0: int((y_test==0).sum()), 1: int((y_test==1).sum())})

# 2) Standardize features (fit on train only)
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nScaled data check:")
print("Train mean (first 5 features):", X_train_scaled.mean(axis=0)[:5])
print("Train std  (first 5 features):", X_train_scaled.std(axis=0)[:5])

# 3) Save outputs
OUT_DIR = IN_DIR

np.save(OUT_DIR / "rna_train_scaled.npy", X_train_scaled.astype(np.float32))
np.save(OUT_DIR / "rna_test_scaled.npy", X_test_scaled.astype(np.float32))
np.save(OUT_DIR / "rna_y_train.npy", y_train)
np.save(OUT_DIR / "rna_y_test.npy", y_test)
np.save(OUT_DIR / "rna_train_ids.npy", ids_train)
np.save(OUT_DIR / "rna_test_ids.npy", ids_test)

joblib.dump(scaler, OUT_DIR / "rna_standard_scaler.joblib")

print("\nSaved:")
print(" - rna_train_scaled.npy")
print(" - rna_test_scaled.npy")
print(" - rna_y_train.npy / rna_y_test.npy")
print(" - rna_train_ids.npy / rna_test_ids.npy")
print(" - rna_standard_scaler.joblib")

X_log shape: (825, 17850)
Labels: {0: 455, 1: 370}

Train shape: (660, 17850)
Test shape : (165, 17850)
Train labels: {0: 364, 1: 296}
Test labels : {0: 91, 1: 74}

Scaled data check:
Train mean (first 5 features): [ 1.1469378e-08  3.9736432e-09 -1.1740309e-08 -4.4251935e-09
 -1.2643410e-09]
Train std  (first 5 features): [1.0000001 1.0000001 1.0000001 0.9999998 1.0000002]

Saved:
 - rna_train_scaled.npy
 - rna_test_scaled.npy
 - rna_y_train.npy / rna_y_test.npy
 - rna_train_ids.npy / rna_test_ids.npy
 - rna_standard_scaler.joblib


## Step 9: Dimensionality reduction with PCA (train-only)

RNA expression remains high-dimensional after filtering.
To reduce dimensionality while preserving variance, we apply PCA.

Procedure:
- Fit PCA on training data only
- Select number of components based on explained variance
- Transform both train and test using the fitted PCA

Outputs:
- `X_rna_train_pca`
- `X_rna_test_pca`
- saved PCA object
- explained variance ratios for reporting

In [14]:
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
import joblib

IN_DIR = Path("/home/steps4growth/gmriechi/Lung-Cancer-Subtyping/Data/02_preprocessed/rna")

# Load scaled data
X_train_scaled = np.load(IN_DIR / "rna_train_scaled.npy")
X_test_scaled = np.load(IN_DIR / "rna_test_scaled.npy")

print("Train scaled shape:", X_train_scaled.shape)
print("Test scaled shape :", X_test_scaled.shape)

# ---- Fit PCA on training data only ----
# Use enough components to explain ~90–95% variance
pca_full = PCA(random_state=42)
pca_full.fit(X_train_scaled)

cum_var = np.cumsum(pca_full.explained_variance_ratio_)

# Choose K = smallest number of components reaching 90% variance
K = int(np.searchsorted(cum_var, 0.90) + 1)

print(f"\nNumber of PCA components for 90% variance: {K}")
print("Cumulative variance at K:", cum_var[K-1])

# Fit final PCA with chosen K
pca = PCA(n_components=K, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("\nPCA-transformed shapes:")
print("X_rna_train_pca:", X_train_pca.shape)
print("X_rna_test_pca :", X_test_pca.shape)

# Save outputs
np.save(IN_DIR / "rna_train_pca.npy", X_train_pca.astype(np.float32))
np.save(IN_DIR / "rna_test_pca.npy", X_test_pca.astype(np.float32))
np.save(IN_DIR / "rna_pca_explained_variance.npy", pca.explained_variance_ratio_)

joblib.dump(pca, IN_DIR / "rna_pca_model.joblib")

print("\nSaved:")
print(" - rna_train_pca.npy")
print(" - rna_test_pca.npy")
print(" - rna_pca_explained_variance.npy")
print(" - rna_pca_model.joblib")

Train scaled shape: (660, 17850)
Test scaled shape : (165, 17850)

Number of PCA components for 90% variance: 278
Cumulative variance at K: 0.90033233

PCA-transformed shapes:
X_rna_train_pca: (660, 278)
X_rna_test_pca : (165, 278)

Saved:
 - rna_train_pca.npy
 - rna_test_pca.npy
 - rna_pca_explained_variance.npy
 - rna_pca_model.joblib
