In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Obstructive_sleep_apnea"

# Input paths
tcga_root_dir = "../../input/TCGA"

# Output paths
out_data_file = "../../output/preprocess/Obstructive_sleep_apnea/TCGA.csv"
out_gene_data_file = "../../output/preprocess/Obstructive_sleep_apnea/gene_data/TCGA.csv"
out_clinical_data_file = "../../output/preprocess/Obstructive_sleep_apnea/clinical_data/TCGA.csv"
json_path = "../../output/preprocess/Obstructive_sleep_apnea/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
import os

# Step 1: Look for directories related to Obstructive Sleep Apnea (OSA)
tcga_subdirs = os.listdir(tcga_root_dir)
print(f"Available TCGA subdirectories: {tcga_subdirs}")

# Look for directory related to Obstructive Sleep Apnea
# OSA is associated with head and neck morphology, obesity, and is more common in certain cancers
# It can be a comorbidity in various cancers, especially those affecting respiratory pathways

potential_matches = [
    'TCGA_Head_and_Neck_Cancer_(HNSC)',  # Head and neck anatomical relationship with upper airway
    'TCGA_Lung_Cancer_(LUNG)',           # Respiratory system relationship
    'TCGA_Lung_Adenocarcinoma_(LUAD)',   # Respiratory system relationship
    'TCGA_Lung_Squamous_Cell_Carcinoma_(LUSC)'  # Respiratory system relationship
]

# Select the most comprehensive dataset that might include sleep apnea data
target_dir = 'TCGA_Head_and_Neck_Cancer_(HNSC)'  # Head and neck cancer has strongest association with OSA
target_path = os.path.join(tcga_root_dir, target_dir)

print(f"Selected directory: {target_dir} - this dataset may contain clinical information about upper airway anatomy and potentially sleep apnea as a comorbidity")

# Step 2: Get the clinical and genetic data file paths
clinical_path, genetic_path = tcga_get_relevant_filepaths(target_path)

# Step 3: Load the datasets
clinical_df = pd.read_csv(clinical_path, sep='\t', index_col=0)
genetic_df = pd.read_csv(genetic_path, sep='\t', index_col=0)

# Step 4: Print column names of clinical data
print("\nClinical data columns:")
print(clinical_df.columns.tolist())

# Check if we have both gene data and potential trait data
has_gene_data = not genetic_df.empty
has_potential_trait_data = not clinical_df.empty

# Record our initial assessment
validate_and_save_cohort_info(
    is_final=False, 
    cohort="TCGA", 
    info_path=json_path, 
    is_gene_available=has_gene_data, 
    is_trait_available=has_potential_trait_data
)


Available TCGA subdirectories: ['TCGA_Liver_Cancer_(LIHC)', 'TCGA_Lower_Grade_Glioma_(LGG)', 'TCGA_lower_grade_glioma_and_glioblastoma_(GBMLGG)', 'TCGA_Lung_Adenocarcinoma_(LUAD)', 'TCGA_Lung_Cancer_(LUNG)', 'TCGA_Lung_Squamous_Cell_Carcinoma_(LUSC)', 'TCGA_Melanoma_(SKCM)', 'TCGA_Mesothelioma_(MESO)', 'TCGA_Ocular_melanomas_(UVM)', 'TCGA_Ovarian_Cancer_(OV)', 'TCGA_Pancreatic_Cancer_(PAAD)', 'TCGA_Pheochromocytoma_Paraganglioma_(PCPG)', 'TCGA_Prostate_Cancer_(PRAD)', 'TCGA_Rectal_Cancer_(READ)', 'TCGA_Sarcoma_(SARC)', 'TCGA_Stomach_Cancer_(STAD)', 'TCGA_Testicular_Cancer_(TGCT)', 'TCGA_Thymoma_(THYM)', 'TCGA_Thyroid_Cancer_(THCA)', 'TCGA_Uterine_Carcinosarcoma_(UCS)', '.DS_Store', 'CrawlData.ipynb', 'TCGA_Acute_Myeloid_Leukemia_(LAML)', 'TCGA_Adrenocortical_Cancer_(ACC)', 'TCGA_Bile_Duct_Cancer_(CHOL)', 'TCGA_Bladder_Cancer_(BLCA)', 'TCGA_Breast_Cancer_(BRCA)', 'TCGA_Cervical_Cancer_(CESC)', 'TCGA_Colon_and_Rectal_Cancer_(COADREAD)', 'TCGA_Colon_Cancer_(COAD)', 'TCGA_Endometrioid_Canc


Clinical data columns:
['_INTEGRATION', '_PANCAN_CNA_PANCAN_K8', '_PANCAN_Cluster_Cluster_PANCAN', '_PANCAN_DNAMethyl_HNSC', '_PANCAN_DNAMethyl_PANCAN', '_PANCAN_RPPA_PANCAN_K8', '_PANCAN_UNC_RNAseq_PANCAN_K16', '_PANCAN_miRNA_PANCAN', '_PANCAN_mirna_HNSC', '_PANCAN_mutation_PANCAN', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'additional_surgery_locoregional_procedure', 'additional_surgery_metastatic_procedure', 'age_at_initial_pathologic_diagnosis', 'alcohol_history_documented', 'amount_of_alcohol_consumption_per_day', 'anatomic_neoplasm_subdivision', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'clinical_M', 'clinical_N', 'clinical_T', 'clinical_stage', 'days_to_additional_surgery_locoregional_procedure', 'days_to_additional_surgery_metastatic_procedure', 'days_to_birth', 'days_to_collection', 'days_to_completion_of_curative_tx', 'days_to_death', 'days_to_initial_pathologic_

False

### Step 2: Find Candidate Demographic Features

In [3]:
# Find candidates for age and gender columns
clinical_file, genetic_file = tcga_get_relevant_filepaths(os.path.join(tcga_root_dir, 'TCGA_Head_and_Neck_Cancer_(HNSC)'))
clinical_df = pd.read_csv(clinical_file, sep='\t', index_col=0)

# Identify age-related columns
candidate_age_cols = ['age_at_initial_pathologic_diagnosis', 'days_to_birth']
candidate_gender_cols = ['gender']

# Preview age columns
if candidate_age_cols:
    age_preview = {col: clinical_df[col].head(5).tolist() for col in candidate_age_cols}
    print("Age column previews:")
    print(age_preview)

# Preview gender columns
if candidate_gender_cols:
    gender_preview = {col: clinical_df[col].head(5).tolist() for col in candidate_gender_cols}
    print("Gender column previews:")
    print(gender_preview)


Age column previews:
{'age_at_initial_pathologic_diagnosis': [66.0, 69.0, 49.0, 39.0, 45.0], 'days_to_birth': [-24222.0, -25282.0, -17951.0, -14405.0, -16536.0]}
Gender column previews:
{'gender': ['MALE', 'MALE', 'MALE', 'MALE', 'FEMALE']}


### Step 3: Select Demographic Features

In [4]:
# Examining the age column candidates
age_columns = {'age_at_initial_pathologic_diagnosis': [66.0, 69.0, 49.0, 39.0, 45.0], 
               'days_to_birth': [-24222.0, -25282.0, -17951.0, -14405.0, -16536.0]}

# Examining the gender column candidates
gender_columns = {'gender': ['MALE', 'MALE', 'MALE', 'MALE', 'FEMALE']}

# Select appropriate columns
# For age, prefer age_at_initial_pathologic_diagnosis as it's already in years
# and easier to interpret than days_to_birth
age_col = 'age_at_initial_pathologic_diagnosis' if 'age_at_initial_pathologic_diagnosis' in age_columns else None

# For gender, select the gender column if available
gender_col = 'gender' if 'gender' in gender_columns else None

# Print the chosen columns
print(f"Chosen age column: {age_col}")
print(f"Chosen gender column: {gender_col}")


Chosen age column: age_at_initial_pathologic_diagnosis
Chosen gender column: gender


### Step 4: Feature Engineering and Validation

In [5]:
# Step 1: Extract and standardize the clinical features
# Get file paths
cohort_dir = os.path.join(tcga_root_dir, 'TCGA_Head_and_Neck_Cancer_(HNSC)')
clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)

# Load data
clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)
genetic_df = pd.read_csv(genetic_file_path, sep='\t', index_col=0)

# Create standardized clinical features dataframe with trait, age, and gender
# The trait for Head and Neck Cancer is based on tumor/normal classification
clinical_features = tcga_select_clinical_features(
    clinical_df, 
    trait=trait,  # Using predefined trait variable
    age_col=age_col, 
    gender_col=gender_col
)

# Save clinical data
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
clinical_features.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")
print(f"Clinical data shape: {clinical_features.shape}")
print(clinical_features.head())

# Step 2: Normalize gene symbols in gene expression data
# Transpose the genetic data to have genes as rows
genetic_data = genetic_df.copy()

# Normalize gene symbols using the NCBI Gene database synonyms
normalized_gene_data = normalize_gene_symbols_in_index(genetic_data)

# Save normalized gene data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")
print(f"Normalized gene data shape: {normalized_gene_data.shape}")

# Step 3: Link clinical and genetic data
# Transpose genetic data to get samples as rows, genes as columns
genetic_data_transposed = normalized_gene_data.T

# Ensure clinical and genetic data have the same samples (index values)
common_samples = clinical_features.index.intersection(genetic_data_transposed.index)
clinical_subset = clinical_features.loc[common_samples]
genetic_subset = genetic_data_transposed.loc[common_samples]

# Combine clinical and genetic data
linked_data = pd.concat([clinical_subset, genetic_subset], axis=1)
print(f"Linked data shape: {linked_data.shape}")

# Step 4: Handle missing values
linked_data = handle_missing_values(linked_data, trait_col=trait)
print(f"After handling missing values - linked data shape: {linked_data.shape}")

# Step 5: Determine biased features
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait=trait)
print(f"After removing biased features - linked data shape: {linked_data.shape}")

# Step 6: Validate data quality and save cohort info
# First check if we have both gene and trait data
is_gene_available = linked_data.shape[1] > 3  # More than just trait, Age, Gender
is_trait_available = trait in linked_data.columns

# Take notes of special findings
notes = f"TCGA Head and Neck Cancer dataset processed. Used tumor/normal classification as the trait for Obstructive Sleep Apnea analysis."

# Validate the data quality
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort="TCGA",
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
    is_biased=is_biased,
    df=linked_data,
    note=notes
)

# Step 7: Save linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Linked data not saved due to quality concerns")

Clinical data saved to ../../output/preprocess/Obstructive_sleep_apnea/clinical_data/TCGA.csv
Clinical data shape: (604, 3)
                 Obstructive_sleep_apnea   Age  Gender
sampleID                                              
TCGA-4P-AA8J-01                        1  66.0       1
TCGA-BA-4074-01                        1  69.0       1
TCGA-BA-4075-01                        1  49.0       1
TCGA-BA-4076-01                        1  39.0       1
TCGA-BA-4077-01                        1  45.0       0


Normalized gene data saved to ../../output/preprocess/Obstructive_sleep_apnea/gene_data/TCGA.csv
Normalized gene data shape: (19848, 566)
Linked data shape: (566, 19851)


After handling missing values - linked data shape: (566, 19851)
For the feature 'Obstructive_sleep_apnea', the least common label is '0' with 44 occurrences. This represents 7.77% of the dataset.
The distribution of the feature 'Obstructive_sleep_apnea' in this dataset is fine.

Quartiles for 'Age':
  25%: 53.0
  50% (Median): 61.0
  75%: 68.0
Min: 19.0
Max: 90.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0' with 151 occurrences. This represents 26.68% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

After removing biased features - linked data shape: (566, 19851)


Linked data saved to ../../output/preprocess/Obstructive_sleep_apnea/TCGA.csv
