In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
# `trait` is the single source of truth for the trait name: every output path
# below is derived from it, so the directory names cannot drift apart.
trait = "Type_2_Diabetes"

# Input paths
# Root folder holding one sub-directory per TCGA cohort.
tcga_root_dir = "../../input/TCGA"

# Output paths (all rooted under the per-trait preprocess directory)
_out_root = f"../../output/preprocess/{trait}"
out_data_file = f"{_out_root}/TCGA.csv"                         # linked clinical + gene table
out_gene_data_file = f"{_out_root}/gene_data/TCGA.csv"          # normalized gene expression
out_clinical_data_file = f"{_out_root}/clinical_data/TCGA.csv"  # selected clinical features
json_path = f"{_out_root}/cohort_info.json"                     # cohort validation record


### Step 1: Initial Data Loading

In [2]:
import os
import sys

import pandas as pd

# List every entry in tcga_root_dir. The listing is printed unfiltered so the
# log shows exactly what is on disk (it may include stray files, not only
# cohort directories).
subdirs = os.listdir(tcga_root_dir)
print(f"Available TCGA directories: {subdirs}")

# Type 2 Diabetes is a metabolic disorder that can impact various organs,
# particularly the pancreas. The closest match might be pancreatic cancer.
# NOTE(review): this is a proxy cohort, not a diabetes cohort; confirm that the
# trait label produced downstream really reflects diabetes status.
#
# Only real directories are considered, and in sorted order, so the choice does
# not depend on os.listdir()'s arbitrary ordering or on stray files.
cohort_candidates = sorted(
    name
    for name in subdirs
    if 'Pancreatic' in name and os.path.isdir(os.path.join(tcga_root_dir, name))
)
selected_dir = cohort_candidates[0] if cohort_candidates else None

if selected_dir is None:
    # No usable cohort: record that neither gene nor trait data is available,
    # then stop.
    print(f"No suitable directory found for {trait}. Skipping this trait.")
    validate_and_save_cohort_info(is_final=False,
                                  cohort="TCGA",
                                  info_path=json_path,
                                  is_gene_available=False,
                                  is_trait_available=False)
    # sys.exit() raises SystemExit explicitly. The bare exit() helper is
    # injected by the `site` module for interactive shells and is not meant
    # for use in programs.
    sys.exit()

# Construct the full path to the selected directory
cohort_dir = os.path.join(tcga_root_dir, selected_dir)
print(f"Selected directory: {cohort_dir}")

# Get the paths to clinical and genetic data files
clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_dir)
print(f"Clinical data file: {clinical_file_path}")
print(f"Genetic data file: {genetic_file_path}")

# Load the data files (tab-separated, first column used as the index)
clinical_df = pd.read_csv(clinical_file_path, index_col=0, sep='\t')
genetic_df = pd.read_csv(genetic_file_path, index_col=0, sep='\t')

print("Clinical data columns:")
print(clinical_df.columns.tolist())


Available TCGA directories: ['TCGA_Liver_Cancer_(LIHC)', 'TCGA_Lower_Grade_Glioma_(LGG)', 'TCGA_lower_grade_glioma_and_glioblastoma_(GBMLGG)', 'TCGA_Lung_Adenocarcinoma_(LUAD)', 'TCGA_Lung_Cancer_(LUNG)', 'TCGA_Lung_Squamous_Cell_Carcinoma_(LUSC)', 'TCGA_Melanoma_(SKCM)', 'TCGA_Mesothelioma_(MESO)', 'TCGA_Ocular_melanomas_(UVM)', 'TCGA_Ovarian_Cancer_(OV)', 'TCGA_Pancreatic_Cancer_(PAAD)', 'TCGA_Pheochromocytoma_Paraganglioma_(PCPG)', 'TCGA_Prostate_Cancer_(PRAD)', 'TCGA_Rectal_Cancer_(READ)', 'TCGA_Sarcoma_(SARC)', 'TCGA_Stomach_Cancer_(STAD)', 'TCGA_Testicular_Cancer_(TGCT)', 'TCGA_Thymoma_(THYM)', 'TCGA_Thyroid_Cancer_(THCA)', 'TCGA_Uterine_Carcinosarcoma_(UCS)', '.DS_Store', 'CrawlData.ipynb', 'TCGA_Acute_Myeloid_Leukemia_(LAML)', 'TCGA_Adrenocortical_Cancer_(ACC)', 'TCGA_Bile_Duct_Cancer_(CHOL)', 'TCGA_Bladder_Cancer_(BLCA)', 'TCGA_Breast_Cancer_(BRCA)', 'TCGA_Cervical_Cancer_(CESC)', 'TCGA_Colon_and_Rectal_Cancer_(COADREAD)', 'TCGA_Colon_Cancer_(COAD)', 'TCGA_Endometrioid_Cancer_

Clinical data columns:
['CDE_ID_3226963', '_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'adenocarcinoma_invasion', 'age_at_initial_pathologic_diagnosis', 'alcohol_history_documented', 'alcoholic_exposure_category', 'amount_of_alcohol_consumption_per_day', 'anatomic_neoplasm_subdivision', 'anatomic_neoplasm_subdivision_other', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'days_to_birth', 'days_to_collection', 'days_to_death', 'days_to_diabetes_onset', 'days_to_initial_pathologic_diagnosis', 'days_to_last_followup', 'days_to_new_tumor_event_additional_surgery_procedure', 'days_to_new_tumor_event_after_initial_treatment', 'days_to_pancreatitis_onset', 'family_history_of_cancer', 'followup_case_report_form_submission_reason', 'followup_treatment_success', 'form_completion_date', 'frequency_of_alcohol_consumption', 'gender', 'histologic_grading_tier_category', 'histolog

### Step 2: Find Candidate Demographic Features

In [3]:
import pandas as pd
import os

# Identify candidate age and gender columns
candidate_age_cols = ['age_at_initial_pathologic_diagnosis', 'days_to_birth', 'year_of_initial_pathologic_diagnosis']
candidate_gender_cols = ['gender']

# Reuse the clinical table loaded in Step 1 (`clinical_df`) instead of
# re-resolving and re-reading the file from a hard-coded cohort directory.
# This keeps the preview tied to whichever cohort Step 1 actually selected.

# Preview the first five values of every candidate age column that exists
age_preview = {
    col: clinical_df[col].head(5).tolist()
    for col in candidate_age_cols
    if col in clinical_df.columns
}

# Preview the first five values of every candidate gender column that exists
gender_preview = {
    col: clinical_df[col].head(5).tolist()
    for col in candidate_gender_cols
    if col in clinical_df.columns
}

print("Candidate age columns:")
print(candidate_age_cols)
print("\nAge data preview:")
print(age_preview)

print("\nCandidate gender columns:")
print(candidate_gender_cols)
print("\nGender data preview:")
print(gender_preview)


Candidate age columns:
['age_at_initial_pathologic_diagnosis', 'days_to_birth', 'year_of_initial_pathologic_diagnosis']

Age data preview:
{'age_at_initial_pathologic_diagnosis': [65, 48, 75, 71, 70], 'days_to_birth': [-23962, -17794, -27600, -26028, -25920], 'year_of_initial_pathologic_diagnosis': [2012.0, 2012.0, 2012.0, 2012.0, 2012.0]}

Candidate gender columns:
['gender']

Gender data preview:
{'gender': ['MALE', 'MALE', 'MALE', 'MALE', 'FEMALE']}


### Step 3: Select Demographic Features

In [4]:
# Step: Select Demographic Features

# Selecting the age column
# Analysis of the candidate age columns (based on the Step 2 preview):
# 1. 'age_at_initial_pathologic_diagnosis' - explicit age values (preferable)
# 2. 'days_to_birth' - negative day counts; the previewed magnitudes divided by
#    365 roughly match the explicit ages, so this looks like the birth date
#    expressed relative to diagnosis (it would need conversion to be used)
# 3. 'year_of_initial_pathologic_diagnosis' - calendar year of diagnosis, not an age

# Select 'age_at_initial_pathologic_diagnosis' as it contains explicit age values
age_col = 'age_at_initial_pathologic_diagnosis'

# Selecting the gender column
# Only one candidate column 'gender', which contains values like 'MALE' and 'FEMALE'
gender_col = 'gender'

# Print the chosen demographic columns. The sample values are read from the
# loaded clinical table rather than hard-coded, so the log always reflects the
# data actually in use.
print(f"Selected age column: {age_col}")
print(f"Selected gender column: {gender_col}")
print(f"Age sample values: {clinical_df[age_col].head(5).tolist()}")
print(f"Gender sample values: {clinical_df[gender_col].head(5).tolist()}")


Selected age column: age_at_initial_pathologic_diagnosis
Selected gender column: gender
Age sample values: [65, 48, 75, 71, 70]
Gender sample values: ['MALE', 'MALE', 'MALE', 'MALE', 'FEMALE']


### Step 4: Feature Engineering and Validation

In [5]:
# Step: Feature Engineering and Validation
#
# Builds the clinical feature table, normalizes the gene expression index,
# links both tables on shared sample IDs, cleans the result, checks for biased
# features, records the cohort verdict, and saves the linked table only when
# the cohort is judged usable.

# 1. Extract and standardize clinical features
print("\nExtracting clinical features...")
clinical_features = tcga_select_clinical_features(
    clinical_df,
    trait=trait,
    age_col=age_col,
    gender_col=gender_col
)

print(f"Clinical features shape: {clinical_features.shape}")
print(f"Preview of clinical features:\n{clinical_features.head()}")

# Save clinical data
os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
clinical_features.to_csv(out_clinical_data_file)
print(f"Clinical data saved to {out_clinical_data_file}")

# 2. Normalize gene symbols in the gene expression data
print("\nNormalizing gene expression data...")
genetic_df_normalized = normalize_gene_symbols_in_index(genetic_df)
print(f"Original gene expression data shape: {genetic_df.shape}")
print(f"Normalized gene expression data shape: {genetic_df_normalized.shape}")

# Save the normalized gene data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
genetic_df_normalized.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# 3. Link the clinical and genetic data on sample IDs
print("\nLinking clinical and genetic data...")
# Transpose genetic data to have samples as rows and genes as columns
genetic_df_for_linking = genetic_df_normalized.T

# Keep only the sample IDs present in both tables
common_samples = clinical_features.index.intersection(genetic_df_for_linking.index)
print(f"Number of common samples: {len(common_samples)}")

# Filter both dataframes to the common samples (same row order in both)
clinical_features_common = clinical_features.loc[common_samples]
genetic_df_common = genetic_df_for_linking.loc[common_samples]

# Combine clinical and genetic data column-wise
linked_data = pd.concat([clinical_features_common, genetic_df_common], axis=1)
print(f"Linked data shape: {linked_data.shape}")

# 4. Handle missing values systematically
print("\nHandling missing values...")
linked_data_clean = handle_missing_values(linked_data, trait_col=trait)
print(f"Data shape after handling missing values: {linked_data_clean.shape}")

# 5. Determine if trait or demographic features are biased
print("\nChecking for biased features...")
is_trait_biased, linked_data_clean = judge_and_remove_biased_features(linked_data_clean, trait)

# 6. Validate data quality and save cohort information
print("\nValidating final data quality...")
# The trait counts as available only if its column exists and holds at least
# one non-missing value; a non-empty clinical table alone does not prove that.
trait_is_available = (
    trait in clinical_features.columns
    and bool(clinical_features[trait].notna().any())
)
# NOTE(review): the cohort is a pancreatic-cancer proxy. Presumably the trait
# label comes from tcga_select_clinical_features rather than from a
# diabetes-specific clinical column -- confirm what that label encodes before
# relying on this cohort for Type 2 Diabetes.
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort="TCGA",
    info_path=json_path,
    is_gene_available=genetic_df_normalized.shape[0] > 0,
    is_trait_available=trait_is_available,
    is_biased=is_trait_biased,
    df=linked_data_clean,
    note="Pancreatic cancer dataset used as proxy for Type 2 Diabetes due to pancreatic involvement in diabetes."
)

# 7. Save the linked data if usable
if is_usable:
    print(f"\nData is usable. Saving linked data to {out_data_file}...")
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data_clean.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("\nData is not usable. Linked data will not be saved.")


Extracting clinical features...
Clinical features shape: (196, 3)
Preview of clinical features:
                 Type_2_Diabetes  Age  Gender
sampleID                                     
TCGA-2J-AAB1-01                1   65       1
TCGA-2J-AAB4-01                1   48       1
TCGA-2J-AAB6-01                1   75       1
TCGA-2J-AAB8-01                1   71       1
TCGA-2J-AAB9-01                1   70       0
Clinical data saved to ../../output/preprocess/Type_2_Diabetes/clinical_data/TCGA.csv

Normalizing gene expression data...
Original gene expression data shape: (20530, 183)
Normalized gene expression data shape: (19848, 183)


Normalized gene expression data saved to ../../output/preprocess/Type_2_Diabetes/gene_data/TCGA.csv

Linking clinical and genetic data...
Number of common samples: 183
Linked data shape: (183, 19851)

Handling missing values...


Data shape after handling missing values: (183, 19851)

Checking for biased features...
For the feature 'Type_2_Diabetes', the least common label is '0' with 4 occurrences. This represents 2.19% of the dataset.
The distribution of the feature 'Type_2_Diabetes' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 57.0
  50% (Median): 65.0
  75%: 73.0
Min: 35
Max: 88
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0' with 82 occurrences. This represents 44.81% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.


Validating final data quality...

Data is not usable. Linked data will not be saved.
