In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Polycystic_Kidney_Disease"

# Input paths
tcga_root_dir = "../../input/TCGA"

# Output paths
out_data_file = "../../output/preprocess/Polycystic_Kidney_Disease/TCGA.csv"
out_gene_data_file = "../../output/preprocess/Polycystic_Kidney_Disease/gene_data/TCGA.csv"
out_clinical_data_file = "../../output/preprocess/Polycystic_Kidney_Disease/clinical_data/TCGA.csv"
json_path = "../../output/preprocess/Polycystic_Kidney_Disease/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
# Step 1: Find the directory corresponding to Polycystic Kidney Disease
import os

# List all directories in TCGA root directory
tcga_dirs = os.listdir(tcga_root_dir)

# Find directories related to kidney disease
matching_dirs = [dir_name for dir_name in tcga_dirs 
                if "kidney" in dir_name.lower() or "renal" in dir_name.lower() or "polycystic" in dir_name.lower()]

if not matching_dirs:
    print(f"No matching directory found for trait: {trait}")
    # Record that this trait is not available and exit
    validate_and_save_cohort_info(
        is_final=False,
        cohort="TCGA",
        info_path=json_path,
        is_gene_available=False,
        is_trait_available=False
    )
else:
    # Print all matching directories
    print(f"Found matching directories: {matching_dirs}")
    
    # Select the most relevant directory - prioritize any that might mention polycystic
    selected_dir = matching_dirs[0]  # Default to first match
    for dir_name in matching_dirs:
        if "polycystic" in dir_name.lower():
            selected_dir = dir_name
            break
    
    print(f"Selected directory: {selected_dir}")
    cohort_dir = os.path.join(tcga_root_dir, selected_dir)
    
    # Step 2: Get file paths for clinical and genetic data
    clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)
    
    # Step 3: Load the files
    clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)
    genetic_df = pd.read_csv(genetic_file_path, sep='\t', index_col=0)
    
    # Step 4: Print column names of clinical data
    print("Clinical data columns:")
    print(clinical_df.columns.tolist())


Found matching directories: ['TCGA_Kidney_Chromophobe_(KICH)', 'TCGA_Kidney_Clear_Cell_Carcinoma_(KIRC)', 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)']
Selected directory: TCGA_Kidney_Chromophobe_(KICH)


Clinical data columns:
['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'additional_surgery_locoregional_procedure', 'additional_surgery_metastatic_procedure', 'age_at_initial_pathologic_diagnosis', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'clinical_M', 'days_to_additional_surgery_metastatic_procedure', 'days_to_birth', 'days_to_death', 'days_to_initial_pathologic_diagnosis', 'days_to_last_followup', 'days_to_new_tumor_event_after_initial_treatment', 'eastern_cancer_oncology_group', 'followup_case_report_form_submission_reason', 'followup_treatment_success', 'form_completion_date', 'gender', 'hemoglobin_result', 'histological_type', 'history_of_neoadjuvant_treatment', 'icd_10', 'icd_o_3_histology', 'icd_o_3_site', 'informed_consent_verified', 'intermediate_dimension', 'is_ffpe', 'karnofsky_performance_score', 'lactate_dehydrogenase_result', 'laterality', 'longest

### Step 2: Find Candidate Demographic Features

In [3]:
# Step 1: Identify candidate columns for age and gender
candidate_age_cols = ['age_at_initial_pathologic_diagnosis', 'days_to_birth']
candidate_gender_cols = ['gender']

# Step 2: Extract and preview these columns from the clinical data
# First, load clinical data
if 'clinical_df' not in locals():  # Check if clinical_df is already defined
    # Get the file paths
    cohort_dir = os.path.join(tcga_root_dir, 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)')
    clinical_file_path, _ = tcga_get_relevant_filepaths(cohort_dir)
    
    # Load the clinical data
    clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)

# Extract and preview age columns
if candidate_age_cols:
    age_preview_data = {}
    for col in candidate_age_cols:
        if col in clinical_df.columns:
            age_preview_data[col] = clinical_df[col].head(5).tolist()
    print("Age columns preview:")
    print(age_preview_data)

# Extract and preview gender columns
if candidate_gender_cols:
    gender_preview_data = {}
    for col in candidate_gender_cols:
        if col in clinical_df.columns:
            gender_preview_data[col] = clinical_df[col].head(5).tolist()
    print("Gender columns preview:")
    print(gender_preview_data)


Age columns preview:
{'age_at_initial_pathologic_diagnosis': [57, 67, 67, 56, 69], 'days_to_birth': [-20849, -24650, -24650, -20768, -25267]}
Gender columns preview:
{'gender': ['FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'MALE']}


### Step 3: Select Demographic Features

In [4]:
# Inspecting the available columns for age and gender information

# Selecting the age column
age_candidate_cols = {
    'age_at_initial_pathologic_diagnosis': [53.0, 71.0, 71.0, 67.0, 80.0], 
    'days_to_birth': [-19603.0, -26244.0, -26134.0, -24626.0, -29275.0]
}

# Selecting the gender column
gender_candidate_cols = {
    'gender': ['MALE', 'MALE', 'FEMALE', 'MALE', 'MALE']
}

# For age, both columns have values, but 'age_at_initial_pathologic_diagnosis' is more directly interpretable
# as it gives age in years rather than days since birth (negative values)
age_col = 'age_at_initial_pathologic_diagnosis' if age_candidate_cols else None

# For gender, we have one column with clear male/female values
gender_col = 'gender' if gender_candidate_cols and 'gender' in gender_candidate_cols else None

# Print the chosen columns
print(f"Chosen age column: {age_col}")
print(f"Chosen gender column: {gender_col}")


Chosen age column: age_at_initial_pathologic_diagnosis
Chosen gender column: gender


### Step 4: Feature Engineering and Validation

In [5]:
# Step 1: Extract and standardize clinical features
# Use the Kidney Papillary Cell Carcinoma directory already identified in Step 1
selected_dir = 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)'
cohort_dir = os.path.join(tcga_root_dir, selected_dir)

# Get the file paths for clinical and genetic data
clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)

# Load the data
clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)
genetic_df = pd.read_csv(genetic_file_path, sep='\t', index_col=0)

# Extract standardized clinical features using the provided trait variable
clinical_features = tcga_select_clinical_features(
    clinical_df, 
    trait=trait,  # Using the provided trait variable
    age_col=age_col, 
    gender_col=gender_col
)

# Save the clinical data to out_clinical_data_file
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
clinical_features.to_csv(out_clinical_data_file)
print(f"Saved clinical data with {len(clinical_features)} samples")

# Step 2: Normalize gene symbols in gene expression data
# Transpose to get genes as rows
gene_df = genetic_df

# Normalize gene symbols using NCBI Gene database synonyms
normalized_gene_df = normalize_gene_symbols_in_index(gene_df)
print(f"After normalization: {len(normalized_gene_df)} genes remaining")

# Save the normalized gene expression data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_df.to_csv(out_gene_data_file)
print(f"Saved normalized gene expression data")

# Step 3: Link clinical and genetic data
# Merge clinical features with genetic expression data
linked_data = clinical_features.join(normalized_gene_df.T, how='inner')
print(f"Linked data shape: {linked_data.shape} (samples x features)")

# Step 4: Handle missing values
cleaned_data = handle_missing_values(linked_data, trait_col=trait)
print(f"After handling missing values, data shape: {cleaned_data.shape}")

# Step 5: Determine if trait or demographics are severely biased
trait_biased, cleaned_data = judge_and_remove_biased_features(cleaned_data, trait=trait)

# Step 6: Validate data quality and save cohort information
note = "The dataset contains gene expression data along with clinical information for kidney papillary cell carcinoma patients, which is relevant for studying polycystic kidney disease."
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort="TCGA",
    info_path=json_path,
    is_gene_available=True,
    is_trait_available=True,
    is_biased=trait_biased,
    df=cleaned_data,
    note=note
)

# Step 7: Save the linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    cleaned_data.to_csv(out_data_file)
    print(f"Saved usable linked data to {out_data_file}")
else:
    print("Dataset was determined to be unusable and was not saved.")

Saved clinical data with 352 samples
After normalization: 19848 genes remaining


Saved normalized gene expression data
Linked data shape: (323, 19851) (samples x features)


After handling missing values, data shape: (323, 19851)
For the feature 'Polycystic_Kidney_Disease', the least common label is '0' with 32 occurrences. This represents 9.91% of the dataset.
The distribution of the feature 'Polycystic_Kidney_Disease' in this dataset is fine.

Quartiles for 'Age':
  25%: 54.0
  50% (Median): 61.459375
  75%: 71.0
Min: 28.0
Max: 88.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0' with 87 occurrences. This represents 26.93% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



Saved usable linked data to ../../output/preprocess/Polycystic_Kidney_Disease/TCGA.csv
