In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Prostate_Cancer"

# Input paths
tcga_root_dir = "../../input/TCGA"

# Output paths
out_data_file = "../../output/preprocess/Prostate_Cancer/TCGA.csv"
out_gene_data_file = "../../output/preprocess/Prostate_Cancer/gene_data/TCGA.csv"
out_clinical_data_file = "../../output/preprocess/Prostate_Cancer/clinical_data/TCGA.csv"
json_path = "../../output/preprocess/Prostate_Cancer/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
import os

# Check if there's a suitable cohort directory for Prostate Cancer
print(f"Looking for a relevant cohort directory for {trait}...")

# Check available cohorts
available_dirs = os.listdir(tcga_root_dir)
print(f"Available cohorts: {available_dirs}")

# Prostate cancer related keywords
prostate_related_keywords = ['prostate', 'prad']

# Look for prostate cancer related directories
prostate_related_dirs = []
for d in available_dirs:
    if any(keyword in d.lower() for keyword in prostate_related_keywords):
        prostate_related_dirs.append(d)

print(f"Prostate cancer related cohorts: {prostate_related_dirs}")

if not prostate_related_dirs:
    print(f"No suitable cohort found for {trait}.")
    # Mark the task as completed by recording the unavailability
    validate_and_save_cohort_info(
        is_final=False,
        cohort="TCGA",
        info_path=json_path,
        is_gene_available=False,
        is_trait_available=False
    )
    # Exit the script early since no suitable cohort was found
    selected_cohort = None
else:
    # For Prostate Cancer, select the most specific match
    selected_cohort = prostate_related_dirs[0]

if selected_cohort:
    print(f"Selected cohort: {selected_cohort}")
    
    # Get the full path to the selected cohort directory
    cohort_dir = os.path.join(tcga_root_dir, selected_cohort)
    
    # Get the clinical and genetic data file paths
    clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)
    
    print(f"Clinical data file: {os.path.basename(clinical_file_path)}")
    print(f"Genetic data file: {os.path.basename(genetic_file_path)}")
    
    # Load the clinical and genetic data
    clinical_df = pd.read_csv(clinical_file_path, index_col=0, sep='\t')
    genetic_df = pd.read_csv(genetic_file_path, index_col=0, sep='\t')
    
    # Print the column names of the clinical data
    print("\nClinical data columns:")
    print(clinical_df.columns.tolist())
    
    # Basic info about the datasets
    print(f"\nClinical data shape: {clinical_df.shape}")
    print(f"Genetic data shape: {genetic_df.shape}")


Looking for a relevant cohort directory for Prostate_Cancer...
Available cohorts: ['TCGA_Liver_Cancer_(LIHC)', 'TCGA_Lower_Grade_Glioma_(LGG)', 'TCGA_lower_grade_glioma_and_glioblastoma_(GBMLGG)', 'TCGA_Lung_Adenocarcinoma_(LUAD)', 'TCGA_Lung_Cancer_(LUNG)', 'TCGA_Lung_Squamous_Cell_Carcinoma_(LUSC)', 'TCGA_Melanoma_(SKCM)', 'TCGA_Mesothelioma_(MESO)', 'TCGA_Ocular_melanomas_(UVM)', 'TCGA_Ovarian_Cancer_(OV)', 'TCGA_Pancreatic_Cancer_(PAAD)', 'TCGA_Pheochromocytoma_Paraganglioma_(PCPG)', 'TCGA_Prostate_Cancer_(PRAD)', 'TCGA_Rectal_Cancer_(READ)', 'TCGA_Sarcoma_(SARC)', 'TCGA_Stomach_Cancer_(STAD)', 'TCGA_Testicular_Cancer_(TGCT)', 'TCGA_Thymoma_(THYM)', 'TCGA_Thyroid_Cancer_(THCA)', 'TCGA_Uterine_Carcinosarcoma_(UCS)', '.DS_Store', 'CrawlData.ipynb', 'TCGA_Acute_Myeloid_Leukemia_(LAML)', 'TCGA_Adrenocortical_Cancer_(ACC)', 'TCGA_Bile_Duct_Cancer_(CHOL)', 'TCGA_Bladder_Cancer_(BLCA)', 'TCGA_Breast_Cancer_(BRCA)', 'TCGA_Cervical_Cancer_(CESC)', 'TCGA_Colon_and_Rectal_Cancer_(COADREAD)', 


Clinical data columns:
['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'age_at_initial_pathologic_diagnosis', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'biochemical_recurrence', 'bone_scan_results', 'clinical_M', 'clinical_T', 'days_to_birth', 'days_to_bone_scan_performed', 'days_to_collection', 'days_to_death', 'days_to_diagnostic_computed_tomography_performed', 'days_to_diagnostic_mri_performed', 'days_to_first_biochemical_recurrence', 'days_to_initial_pathologic_diagnosis', 'days_to_last_followup', 'days_to_new_tumor_event_after_initial_treatment', 'days_to_psa', 'days_to_second_biochemical_recurrence', 'days_to_third_biochemical_recurrence', 'diagnostic_ct_abd_pelvis_performed', 'diagnostic_ct_abd_pelvis_result', 'diagnostic_mri_performed', 'diagnostic_mri_result', 'followup_case_report_form_submission_reason', 'followup_treatment_success', 'form_completion_

### Step 2: Find Candidate Demographic Features

In [3]:
# Identify candidate columns for age and gender
candidate_age_cols = [
    "age_at_initial_pathologic_diagnosis", 
    "days_to_birth"
]
candidate_gender_cols = ["gender"]

# Extract candidate columns for age and gender from clinical data
clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(os.path.join(tcga_root_dir, "TCGA_Prostate_Cancer_(PRAD)"))
clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)

# Preview age columns
age_preview = {}
for col in candidate_age_cols:
    if col in clinical_df.columns:
        age_preview[col] = clinical_df[col].head(5).tolist()
print("Age columns preview:")
print(age_preview)

# Preview gender columns
gender_preview = {}
for col in candidate_gender_cols:
    if col in clinical_df.columns:
        gender_preview[col] = clinical_df[col].head(5).tolist()
print("\nGender columns preview:")
print(gender_preview)


Age columns preview:
{'age_at_initial_pathologic_diagnosis': [51, 57, 47, 52, 70], 'days_to_birth': [-18658.0, -20958.0, -17365.0, -19065.0, -25904.0]}

Gender columns preview:
{'gender': ['MALE', 'MALE', 'MALE', 'MALE', 'MALE']}


### Step 3: Select Demographic Features

In [4]:
# Inspect the age column options
print("Age column options:")
for col, values in {'age_at_initial_pathologic_diagnosis': [51, 57, 47, 52, 70], 'days_to_birth': [-18658.0, -20958.0, -17365.0, -19065.0, -25904.0]}.items():
    print(f"{col}: {values}")
    
# Inspect the gender column options
print("\nGender column options:")
for col, values in {'gender': ['MALE', 'MALE', 'MALE', 'MALE', 'MALE']}.items():
    print(f"{col}: {values}")

# For age, choose 'age_at_initial_pathologic_diagnosis' as it gives the direct age values
# For gender, we only have one option 'gender', but we need to check if values are consistent
age_col = 'age_at_initial_pathologic_diagnosis'
gender_col = 'gender'

print("\nChosen demographic columns:")
print(f"Age column: {age_col}")
print(f"Gender column: {gender_col}")


Age column options:
age_at_initial_pathologic_diagnosis: [51, 57, 47, 52, 70]
days_to_birth: [-18658.0, -20958.0, -17365.0, -19065.0, -25904.0]

Gender column options:
gender: ['MALE', 'MALE', 'MALE', 'MALE', 'MALE']

Chosen demographic columns:
Age column: age_at_initial_pathologic_diagnosis
Gender column: gender


### Step 4: Feature Engineering and Validation

In [5]:
# 1. Extract and standardize clinical features
# Use tcga_select_clinical_features which will automatically create the trait variable and add age/gender if provided
cohort_dir = os.path.join(tcga_root_dir, 'TCGA_Prostate_Cancer_(PRAD)')
clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)

# Load the clinical data if not already loaded
clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)

linked_clinical_df = tcga_select_clinical_features(
    clinical_df, 
    trait=trait, 
    age_col=age_col, 
    gender_col=gender_col
)

# Print preview of clinical features
print("Clinical features (first 5 rows):")
print(linked_clinical_df.head())

# 2. Process gene expression data
print("\nProcessing gene expression data...")
# Load genetic data from the same cohort directory
genetic_df = pd.read_csv(genetic_file_path, sep='\t', index_col=0)

# Check gene data shape
print(f"Original gene data shape: {genetic_df.shape}")

# Save a version of the gene data before normalization (as a backup)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
genetic_df.to_csv(out_gene_data_file.replace('.csv', '_original.csv'))

# We need to transpose genetic data so genes are rows and samples are columns for normalization
gene_df_for_norm = genetic_df.copy()  # Keep original orientation for now

# Try to normalize gene symbols - adding debug output to understand what's happening
print("Attempting to normalize gene symbols...")
try:
    # First check if we need to transpose based on the data format
    # In TCGA data, typically genes are rows and samples are columns
    if gene_df_for_norm.shape[0] > gene_df_for_norm.shape[1]:
        # More rows than columns, likely genes are rows already
        normalized_gene_df = normalize_gene_symbols_in_index(gene_df_for_norm)
    else:
        # Need to transpose first
        normalized_gene_df = normalize_gene_symbols_in_index(gene_df_for_norm.T)
        
    print(f"Gene data shape after normalization: {normalized_gene_df.shape}")
    
    # Check if normalization returned empty DataFrame
    if normalized_gene_df.shape[0] == 0:
        print("WARNING: Gene symbol normalization returned an empty DataFrame.")
        print("Using original gene data instead of normalized data.")
        # Use original data
        normalized_gene_df = genetic_df
    
except Exception as e:
    print(f"Error during gene symbol normalization: {e}")
    print("Using original gene data instead.")
    normalized_gene_df = genetic_df

# Save gene data
normalized_gene_df.to_csv(out_gene_data_file)
print(f"Gene data saved to: {out_gene_data_file}")

# 3. Link clinical and genetic data
# TCGA data uses the same sample IDs in both datasets
print("\nLinking clinical and genetic data...")
print(f"Clinical data shape: {linked_clinical_df.shape}")
print(f"Genetic data shape: {normalized_gene_df.shape}")

# Find common samples between clinical and genetic data
# In TCGA, samples are typically columns in the gene data and index in the clinical data
common_samples = set(linked_clinical_df.index).intersection(set(normalized_gene_df.columns))
print(f"Number of common samples: {len(common_samples)}")

if len(common_samples) == 0:
    print("ERROR: No common samples found between clinical and genetic data.")
    # Try the alternative orientation
    common_samples = set(linked_clinical_df.index).intersection(set(normalized_gene_df.index))
    print(f"Checking alternative orientation: {len(common_samples)} common samples found.")
    
    if len(common_samples) == 0:
        # Use is_final=False mode which doesn't require df and is_biased
        validate_and_save_cohort_info(
            is_final=False,
            cohort="TCGA",
            info_path=json_path,
            is_gene_available=True,
            is_trait_available=True
        )
        print("The dataset was determined to be unusable for this trait due to no common samples. No data files were saved.")
else:
    # Filter clinical data to only include common samples
    linked_clinical_df = linked_clinical_df.loc[list(common_samples)]
    
    # Create linked data by merging
    linked_data = pd.concat([linked_clinical_df, normalized_gene_df[list(common_samples)].T], axis=1)
    
    print(f"\nLinked data shape: {linked_data.shape}")
    print("Linked data preview (first 5 rows, first few columns):")
    display_cols = [trait, 'Age', 'Gender'] + list(linked_data.columns[3:5])
    print(linked_data[display_cols].head())
    
    # 4. Handle missing values
    linked_data = handle_missing_values(linked_data, trait)
    print(f"\nData shape after handling missing values: {linked_data.shape}")
    
    # 5. Check for bias in trait and demographic features
    print("\nChecking for bias in features:")
    is_trait_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
    
    # 6. Validate and save cohort info
    print("\nPerforming final validation...")
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort="TCGA",
        info_path=json_path,
        is_gene_available=len(linked_data.columns) > 3,  # More than just trait/age/gender columns
        is_trait_available=trait in linked_data.columns,
        is_biased=is_trait_biased,
        df=linked_data,
        note="Data from TCGA Prostate Cancer cohort used for Prostate Cancer gene expression analysis."
    )
    
    # 7. Save linked data if usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to: {out_data_file}")
        
        # Also save clinical data separately
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        clinical_columns = [col for col in linked_data.columns if col in [trait, 'Age', 'Gender']]
        linked_data[clinical_columns].to_csv(out_clinical_data_file)
        print(f"Clinical data saved to: {out_clinical_data_file}")
    else:
        print("The dataset was determined to be unusable for this trait. No data files were saved.")

Clinical features (first 5 rows):
                 Prostate_Cancer  Age  Gender
sampleID                                     
TCGA-2A-A8VL-01                1   51       1
TCGA-2A-A8VO-01                1   57       1
TCGA-2A-A8VT-01                1   47       1
TCGA-2A-A8VV-01                1   52       1
TCGA-2A-A8VX-01                1   70       1

Processing gene expression data...


Original gene data shape: (20530, 550)


Attempting to normalize gene symbols...


Gene data shape after normalization: (19848, 550)


Gene data saved to: ../../output/preprocess/Prostate_Cancer/gene_data/TCGA.csv

Linking clinical and genetic data...
Clinical data shape: (566, 3)
Genetic data shape: (19848, 550)
Number of common samples: 550

Linked data shape: (550, 19851)
Linked data preview (first 5 rows, first few columns):
                 Prostate_Cancer  Age  Gender      A1BG  A1BG-AS1
TCGA-KK-A8IB-01                1   65       1 -1.449074 -1.407783
TCGA-KK-A8I5-01                1   55       1 -3.363674 -2.436983
TCGA-EJ-7125-01                1   44       1 -1.923574 -1.619283
TCGA-HC-7233-01                1   73       1 -2.559574 -2.060483
TCGA-EJ-7317-11                0   71       1 -0.162874 -0.135083



Data shape after handling missing values: (550, 19851)

Checking for bias in features:
For the feature 'Prostate_Cancer', the least common label is '0' with 52 occurrences. This represents 9.45% of the dataset.
The distribution of the feature 'Prostate_Cancer' in this dataset is fine.

Quartiles for 'Age':
  25%: 56.0
  50% (Median): 61.0
  75%: 66.0
Min: 41
Max: 78
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1' with 550 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.




Performing final validation...


Linked data saved to: ../../output/preprocess/Prostate_Cancer/TCGA.csv
Clinical data saved to: ../../output/preprocess/Prostate_Cancer/clinical_data/TCGA.csv
