In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Telomere_Length"

# Input paths
tcga_root_dir = "../../input/TCGA"

# Output paths
out_data_file = "../../output/preprocess/Telomere_Length/TCGA.csv"
out_gene_data_file = "../../output/preprocess/Telomere_Length/gene_data/TCGA.csv"
out_clinical_data_file = "../../output/preprocess/Telomere_Length/clinical_data/TCGA.csv"
json_path = "../../output/preprocess/Telomere_Length/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
# Step 1: Review subdirectories to find one related to telomere length
import os

# List all directories in TCGA root directory
tcga_dirs = os.listdir(tcga_root_dir)
print(f"Available TCGA directories: {tcga_dirs}")

# For telomere length, we need to check if any directory appears relevant
# Telomere length is not specific to a particular cancer type, but is a cellular characteristic
# that might be studied across different cancer types

# Since telomere length is not directly represented by a specific cancer directory,
# we need to make a decision here:
# Option 1: Choose a dataset that might have comprehensive molecular profiling (like PANCAN)
# Option 2: Acknowledge that there's no direct match for this trait

# For this trait, there's no exact match in the directory names
print(f"No directory specifically matches the trait: {trait}")

# Since the trait is not directly represented, we should record this fact
validate_and_save_cohort_info(
    is_final=False,
    cohort="TCGA",
    info_path=json_path,
    is_gene_available=False,
    is_trait_available=False
)
print(f"Task marked as completed. {trait} is not directly represented in the TCGA dataset.")

Available TCGA directories: ['TCGA_Liver_Cancer_(LIHC)', 'TCGA_Lower_Grade_Glioma_(LGG)', 'TCGA_lower_grade_glioma_and_glioblastoma_(GBMLGG)', 'TCGA_Lung_Adenocarcinoma_(LUAD)', 'TCGA_Lung_Cancer_(LUNG)', 'TCGA_Lung_Squamous_Cell_Carcinoma_(LUSC)', 'TCGA_Melanoma_(SKCM)', 'TCGA_Mesothelioma_(MESO)', 'TCGA_Ocular_melanomas_(UVM)', 'TCGA_Ovarian_Cancer_(OV)', 'TCGA_Pancreatic_Cancer_(PAAD)', 'TCGA_Pheochromocytoma_Paraganglioma_(PCPG)', 'TCGA_Prostate_Cancer_(PRAD)', 'TCGA_Rectal_Cancer_(READ)', 'TCGA_Sarcoma_(SARC)', 'TCGA_Stomach_Cancer_(STAD)', 'TCGA_Testicular_Cancer_(TGCT)', 'TCGA_Thymoma_(THYM)', 'TCGA_Thyroid_Cancer_(THCA)', 'TCGA_Uterine_Carcinosarcoma_(UCS)', '.DS_Store', 'CrawlData.ipynb', 'TCGA_Acute_Myeloid_Leukemia_(LAML)', 'TCGA_Adrenocortical_Cancer_(ACC)', 'TCGA_Bile_Duct_Cancer_(CHOL)', 'TCGA_Bladder_Cancer_(BLCA)', 'TCGA_Breast_Cancer_(BRCA)', 'TCGA_Cervical_Cancer_(CESC)', 'TCGA_Colon_and_Rectal_Cancer_(COADREAD)', 'TCGA_Colon_Cancer_(COAD)', 'TCGA_Endometrioid_Cancer_