In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Generalized_Anxiety_Disorder"

# Input paths
tcga_root_dir = "../../input/TCGA"

# Output paths
out_data_file = "../../output/preprocess/Generalized_Anxiety_Disorder/TCGA.csv"
out_gene_data_file = "../../output/preprocess/Generalized_Anxiety_Disorder/gene_data/TCGA.csv"
out_clinical_data_file = "../../output/preprocess/Generalized_Anxiety_Disorder/clinical_data/TCGA.csv"
json_path = "../../output/preprocess/Generalized_Anxiety_Disorder/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
import os

# Step 1: Look for directories related to Generalized Anxiety Disorder
tcga_subdirs = os.listdir(tcga_root_dir)
print(f"Available TCGA subdirectories: {tcga_subdirs}")

# Anxiety disorders are common comorbidities in cancer patients
# We'll select a dataset that's likely to have well-documented psychiatric comorbidities
# There is no direct anxiety disorder dataset, but we can look for comprehensive clinical data
# Breast cancer has extensive clinical documentation and high prevalence of anxiety disorders

target_dir = 'TCGA_Breast_Cancer_(BRCA)'  # Breast cancer patients often have documented anxiety disorders
target_path = os.path.join(tcga_root_dir, target_dir)

print(f"Selected directory: {target_dir} - this dataset may contain clinical information about psychiatric comorbidities including anxiety disorders")

# Step 2: Get the clinical and genetic data file paths
clinical_path, genetic_path = tcga_get_relevant_filepaths(target_path)

# Step 3: Load the datasets
clinical_df = pd.read_csv(clinical_path, sep='\t', index_col=0)
genetic_df = pd.read_csv(genetic_path, sep='\t', index_col=0)

# Step 4: Print column names of clinical data
print("\nClinical data columns:")
print(clinical_df.columns.tolist())

# Check if we have both gene data and potential trait data
has_gene_data = not genetic_df.empty
has_potential_trait_data = not clinical_df.empty

# Record our initial assessment
validate_and_save_cohort_info(
    is_final=False, 
    cohort="TCGA", 
    info_path=json_path, 
    is_gene_available=has_gene_data, 
    is_trait_available=has_potential_trait_data
)


Available TCGA subdirectories: ['TCGA_Liver_Cancer_(LIHC)', 'TCGA_Lower_Grade_Glioma_(LGG)', 'TCGA_lower_grade_glioma_and_glioblastoma_(GBMLGG)', 'TCGA_Lung_Adenocarcinoma_(LUAD)', 'TCGA_Lung_Cancer_(LUNG)', 'TCGA_Lung_Squamous_Cell_Carcinoma_(LUSC)', 'TCGA_Melanoma_(SKCM)', 'TCGA_Mesothelioma_(MESO)', 'TCGA_Ocular_melanomas_(UVM)', 'TCGA_Ovarian_Cancer_(OV)', 'TCGA_Pancreatic_Cancer_(PAAD)', 'TCGA_Pheochromocytoma_Paraganglioma_(PCPG)', 'TCGA_Prostate_Cancer_(PRAD)', 'TCGA_Rectal_Cancer_(READ)', 'TCGA_Sarcoma_(SARC)', 'TCGA_Stomach_Cancer_(STAD)', 'TCGA_Testicular_Cancer_(TGCT)', 'TCGA_Thymoma_(THYM)', 'TCGA_Thyroid_Cancer_(THCA)', 'TCGA_Uterine_Carcinosarcoma_(UCS)', '.DS_Store', 'CrawlData.ipynb', 'TCGA_Acute_Myeloid_Leukemia_(LAML)', 'TCGA_Adrenocortical_Cancer_(ACC)', 'TCGA_Bile_Duct_Cancer_(CHOL)', 'TCGA_Bladder_Cancer_(BLCA)', 'TCGA_Breast_Cancer_(BRCA)', 'TCGA_Cervical_Cancer_(CESC)', 'TCGA_Colon_and_Rectal_Cancer_(COADREAD)', 'TCGA_Colon_Cancer_(COAD)', 'TCGA_Endometrioid_Canc


Clinical data columns:
['AJCC_Stage_nature2012', 'Age_at_Initial_Pathologic_Diagnosis_nature2012', 'CN_Clusters_nature2012', 'Converted_Stage_nature2012', 'Days_to_Date_of_Last_Contact_nature2012', 'Days_to_date_of_Death_nature2012', 'ER_Status_nature2012', 'Gender_nature2012', 'HER2_Final_Status_nature2012', 'Integrated_Clusters_no_exp__nature2012', 'Integrated_Clusters_unsup_exp__nature2012', 'Integrated_Clusters_with_PAM50__nature2012', 'Metastasis_Coded_nature2012', 'Metastasis_nature2012', 'Node_Coded_nature2012', 'Node_nature2012', 'OS_Time_nature2012', 'OS_event_nature2012', 'PAM50Call_RNAseq', 'PAM50_mRNA_nature2012', 'PR_Status_nature2012', 'RPPA_Clusters_nature2012', 'SigClust_Intrinsic_mRNA_nature2012', 'SigClust_Unsupervised_mRNA_nature2012', 'Survival_Data_Form_nature2012', 'Tumor_T1_Coded_nature2012', 'Tumor_nature2012', 'Vital_Status_nature2012', '_INTEGRATION', '_PANCAN_CNA_PANCAN_K8', '_PANCAN_Cluster_Cluster_PANCAN', '_PANCAN_DNAMethyl_BRCA', '_PANCAN_DNAMethyl_PANCA

False

### Step 2: Find Candidate Demographic Features

In [3]:
# Identify candidate columns for age and gender
candidate_age_cols = [
    'Age_at_Initial_Pathologic_Diagnosis_nature2012',
    'age_at_initial_pathologic_diagnosis',
    'days_to_birth'  # Can be used to calculate age
]

candidate_gender_cols = [
    'Gender_nature2012',
    'gender'
]

# Use the selected TCGA directory from a previous step
selected_cohort = "TCGA_Breast_Cancer_(BRCA)"
cohort_dir = os.path.join(tcga_root_dir, selected_cohort)

# Get file paths using the library function
clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)

# Load clinical data
clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)

# Extract and preview age columns
age_preview = {}
for col in candidate_age_cols:
    if col in clinical_df.columns:
        age_preview[col] = clinical_df[col].head(5).tolist()

# Extract and preview gender columns
gender_preview = {}
for col in candidate_gender_cols:
    if col in clinical_df.columns:
        gender_preview[col] = clinical_df[col].head(5).tolist()

print("Age columns preview:")
print(age_preview)
print("\nGender columns preview:")
print(gender_preview)


Age columns preview:
{'Age_at_Initial_Pathologic_Diagnosis_nature2012': [nan, nan, nan, nan, nan], 'age_at_initial_pathologic_diagnosis': [55.0, 50.0, 62.0, 52.0, 50.0], 'days_to_birth': [-20211.0, -18538.0, -22848.0, -19074.0, -18371.0]}

Gender columns preview:
{'Gender_nature2012': [nan, nan, nan, nan, nan], 'gender': ['FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE']}


### Step 3: Select Demographic Features

In [4]:
# Inspect age columns
for col_name, values in {'Age_at_Initial_Pathologic_Diagnosis_nature2012': [None, None, None, None, None], 
                         'age_at_initial_pathologic_diagnosis': [55.0, 50.0, 62.0, 52.0, 50.0], 
                         'days_to_birth': [-20211.0, -18538.0, -22848.0, -19074.0, -18371.0]}.items():
    valid_values = [v for v in values if v is not None and not pd.isna(v)]
    print(f"Column '{col_name}': {len(valid_values)}/5 valid values")

# Inspect gender columns
for col_name, values in {'Gender_nature2012': [None, None, None, None, None], 
                        'gender': ['FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE']}.items():
    valid_values = [v for v in values if v is not None and not pd.isna(v) and isinstance(v, str)]
    print(f"Column '{col_name}': {len(valid_values)}/5 valid values")

# Select columns based on data availability
age_col = 'age_at_initial_pathologic_diagnosis'  # This column has complete non-NaN values
gender_col = 'gender'  # This column has complete string values

# Print the chosen columns
print(f"\nChosen age column: {age_col}")
print(f"Chosen gender column: {gender_col}")


Column 'Age_at_Initial_Pathologic_Diagnosis_nature2012': 0/5 valid values
Column 'age_at_initial_pathologic_diagnosis': 5/5 valid values
Column 'days_to_birth': 5/5 valid values
Column 'Gender_nature2012': 0/5 valid values
Column 'gender': 5/5 valid values

Chosen age column: age_at_initial_pathologic_diagnosis
Chosen gender column: gender


### Step 4: Feature Engineering and Validation

In [5]:
# Step 1: Extract and standardize the clinical features
# Get file paths using the selected breast cancer dataset from Step 1
cohort_dir = os.path.join(tcga_root_dir, 'TCGA_Breast_Cancer_(BRCA)')
clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)

# Load data
clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)
genetic_df = pd.read_csv(genetic_file_path, sep='\t', index_col=0)

# Create standardized clinical features dataframe with trait, age, and gender
# Using tumor/normal classification as the proxy for anxiety-related trait
clinical_features = tcga_select_clinical_features(
    clinical_df, 
    trait=trait,  # Using predefined trait variable
    age_col=age_col, 
    gender_col=gender_col
)

# Save clinical data
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
clinical_features.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")
print(f"Clinical data shape: {clinical_features.shape}")
print(clinical_features.head())

# Step 2: Normalize gene symbols in gene expression data
# Transpose the genetic data to have genes as rows
genetic_data = genetic_df.copy()

# Normalize gene symbols using the NCBI Gene database synonyms
normalized_gene_data = normalize_gene_symbols_in_index(genetic_data)

# Save normalized gene data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")
print(f"Normalized gene data shape: {normalized_gene_data.shape}")

# Step 3: Link clinical and genetic data
# Transpose genetic data to get samples as rows, genes as columns
genetic_data_transposed = normalized_gene_data.T

# Ensure clinical and genetic data have the same samples (index values)
common_samples = clinical_features.index.intersection(genetic_data_transposed.index)
clinical_subset = clinical_features.loc[common_samples]
genetic_subset = genetic_data_transposed.loc[common_samples]

# Combine clinical and genetic data
linked_data = pd.concat([clinical_subset, genetic_subset], axis=1)
print(f"Linked data shape: {linked_data.shape}")

# Step 4: Handle missing values
linked_data = handle_missing_values(linked_data, trait_col=trait)
print(f"After handling missing values - linked data shape: {linked_data.shape}")

# Step 5: Determine biased features
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait=trait)
print(f"After removing biased features - linked data shape: {linked_data.shape}")

# Step 6: Validate data quality and save cohort info
# First check if we have both gene and trait data
is_gene_available = linked_data.shape[1] > 3  # More than just trait, Age, Gender
is_trait_available = trait in linked_data.columns

# Take notes of special findings
notes = f"TCGA Breast Cancer dataset processed. Used tumor/normal classification as a proxy for {trait} analysis."

# Validate the data quality
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort="TCGA",
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
    is_biased=is_biased,
    df=linked_data,
    note=notes
)

# Step 7: Save linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Linked data not saved due to quality concerns")

Clinical data saved to ../../output/preprocess/Generalized_Anxiety_Disorder/clinical_data/TCGA.csv
Clinical data shape: (1247, 3)
                 Generalized_Anxiety_Disorder   Age  Gender
sampleID                                                   
TCGA-3C-AAAU-01                             1  55.0     0.0
TCGA-3C-AALI-01                             1  50.0     0.0
TCGA-3C-AALJ-01                             1  62.0     0.0
TCGA-3C-AALK-01                             1  52.0     0.0
TCGA-4H-AAAK-01                             1  50.0     0.0


Normalized gene data saved to ../../output/preprocess/Generalized_Anxiety_Disorder/gene_data/TCGA.csv
Normalized gene data shape: (19848, 1218)
Linked data shape: (1218, 19851)


After handling missing values - linked data shape: (1218, 19851)
For the feature 'Generalized_Anxiety_Disorder', the least common label is '0' with 114 occurrences. This represents 9.36% of the dataset.
The distribution of the feature 'Generalized_Anxiety_Disorder' in this dataset is fine.

Quartiles for 'Age':
  25%: 48.0
  50% (Median): 58.0
  75%: 67.0
Min: 26.0
Max: 90.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 13 occurrences. This represents 1.07% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

After removing biased features - linked data shape: (1218, 19851)


Linked data saved to ../../output/preprocess/Generalized_Anxiety_Disorder/TCGA.csv
