In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Rheumatoid_Arthritis"

# Input paths
tcga_root_dir = "../../input/TCGA"

# Output paths
out_data_file = "../../output/preprocess/Rheumatoid_Arthritis/TCGA.csv"
out_gene_data_file = "../../output/preprocess/Rheumatoid_Arthritis/gene_data/TCGA.csv"
out_clinical_data_file = "../../output/preprocess/Rheumatoid_Arthritis/clinical_data/TCGA.csv"
json_path = "../../output/preprocess/Rheumatoid_Arthritis/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
import os

# Check if there's a suitable cohort directory for Rheumatoid Arthritis
# Note: Rheumatoid arthritis is an autoimmune disease, not a cancer type
# TCGA dataset is primarily focused on cancer cohorts
# Let's examine the available directories to see if any might be relevant

print(f"Looking for a relevant cohort directory for {trait}...")

# Check available cohorts
available_dirs = os.listdir(tcga_root_dir)
print(f"Available cohorts: {available_dirs}")

# Since rheumatoid arthritis is an autoimmune disease, not a cancer type,
# we need to assess if any cancer cohort might be related to or affected by 
# autoimmune conditions like rheumatoid arthritis

# For the purpose of this task, let's choose TCGA_Sarcoma as a fallback option
# since it involves connective tissue issues, though it's an imperfect match
selected_dir = "TCGA_Sarcoma_(SARC)"

# Validation of the directory's existence
if selected_dir in available_dirs:
    print(f"Selected cohort: {selected_dir} (Note: this is not an ideal match but used as a proxy)")
    
    # Get file paths for clinical and genetic data
    cohort_path = os.path.join(tcga_root_dir, selected_dir)
    clinical_file_path, genetic_file_path = tcga_get_relevant_filepaths(cohort_path)
    
    print(f"Clinical data file: {clinical_file_path}")
    print(f"Genetic data file: {genetic_file_path}")
    
    # Load the data
    clinical_df = pd.read_csv(clinical_file_path, index_col=0, sep='\t')
    genetic_df = pd.read_csv(genetic_file_path, index_col=0, sep='\t')
    
    # Print column names of clinical data for further analysis
    print("\nClinical data columns:")
    print(clinical_df.columns.tolist())
else:
    print(f"No suitable directory found for {trait}. Skipping this trait.")


Looking for a relevant cohort directory for Rheumatoid_Arthritis...
Available cohorts: ['TCGA_Liver_Cancer_(LIHC)', 'TCGA_Lower_Grade_Glioma_(LGG)', 'TCGA_lower_grade_glioma_and_glioblastoma_(GBMLGG)', 'TCGA_Lung_Adenocarcinoma_(LUAD)', 'TCGA_Lung_Cancer_(LUNG)', 'TCGA_Lung_Squamous_Cell_Carcinoma_(LUSC)', 'TCGA_Melanoma_(SKCM)', 'TCGA_Mesothelioma_(MESO)', 'TCGA_Ocular_melanomas_(UVM)', 'TCGA_Ovarian_Cancer_(OV)', 'TCGA_Pancreatic_Cancer_(PAAD)', 'TCGA_Pheochromocytoma_Paraganglioma_(PCPG)', 'TCGA_Prostate_Cancer_(PRAD)', 'TCGA_Rectal_Cancer_(READ)', 'TCGA_Sarcoma_(SARC)', 'TCGA_Stomach_Cancer_(STAD)', 'TCGA_Testicular_Cancer_(TGCT)', 'TCGA_Thymoma_(THYM)', 'TCGA_Thyroid_Cancer_(THCA)', 'TCGA_Uterine_Carcinosarcoma_(UCS)', '.DS_Store', 'CrawlData.ipynb', 'TCGA_Acute_Myeloid_Leukemia_(LAML)', 'TCGA_Adrenocortical_Cancer_(ACC)', 'TCGA_Bile_Duct_Cancer_(CHOL)', 'TCGA_Bladder_Cancer_(BLCA)', 'TCGA_Breast_Cancer_(BRCA)', 'TCGA_Cervical_Cancer_(CESC)', 'TCGA_Colon_and_Rectal_Cancer_(COADREA


Clinical data columns:
['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'additional_treatment_completion_success_outcome', 'age_at_initial_pathologic_diagnosis', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'contiguous_organ_invaded', 'contiguous_organ_resection_site', 'days_to_birth', 'days_to_collection', 'days_to_death', 'days_to_initial_pathologic_diagnosis', 'days_to_last_followup', 'days_to_new_tumor_event_additional_surgery_procedure', 'days_to_new_tumor_event_after_initial_treatment', 'days_to_well_differentiated_liposarcoma_primary_dx', 'days_to_well_differentiated_liposarcoma_resection', 'discontiguous_lesion_count', 'form_completion_date', 'gender', 'histological_type', 'history_of_neoadjuvant_treatment', 'icd_10', 'icd_o_3_histology', 'icd_o_3_site', 'informed_consent_verified', 'initial_weight', 'is_ffpe', 'leiomyosarcoma_histologic_subtype', 'leiomyosa

### Step 2: Find Candidate Demographic Features

In [3]:
# Identify potential age and gender columns
candidate_age_cols = ['age_at_initial_pathologic_diagnosis', 'days_to_birth']
candidate_gender_cols = ['gender']

# Load clinical data to preview candidate columns
clinical_file_path, _ = tcga_get_relevant_filepaths(os.path.join(tcga_root_dir, 'TCGA_Sarcoma_(SARC)'))
clinical_df = pd.read_csv(clinical_file_path, sep='\t', index_col=0)

# Extract and preview age columns
age_preview = {}
for col in candidate_age_cols:
    if col in clinical_df.columns:
        age_preview[col] = clinical_df[col].head(5).tolist()

# Extract and preview gender columns
gender_preview = {}
for col in candidate_gender_cols:
    if col in clinical_df.columns:
        gender_preview[col] = clinical_df[col].head(5).tolist()

print("Candidate age columns:")
print(candidate_age_cols)
print("\nAge column preview:")
print(age_preview)

print("\nCandidate gender columns:")
print(candidate_gender_cols)
print("\nGender column preview:")
print(gender_preview)


Candidate age columns:
['age_at_initial_pathologic_diagnosis', 'days_to_birth']

Age column preview:
{'age_at_initial_pathologic_diagnosis': [68, 68, 67, 75, 57], 'days_to_birth': [-24984.0, -24962.0, -24628.0, -27664.0, -21094.0]}

Candidate gender columns:
['gender']

Gender column preview:
{'gender': ['MALE', 'MALE', 'MALE', 'MALE', 'FEMALE']}


### Step 3: Select Demographic Features

In [4]:
# Analyze candidate age columns
age_col = None
# Check if there are candidate age columns
if 'age_at_initial_pathologic_diagnosis' in ['age_at_initial_pathologic_diagnosis', 'days_to_birth']:
    # 'age_at_initial_pathologic_diagnosis' directly gives the age in years, which is more interpretable
    # than 'days_to_birth' which would need conversion
    age_col = 'age_at_initial_pathologic_diagnosis'

# Analyze candidate gender columns
gender_col = None
# Check if there are candidate gender columns
if 'gender' in ['gender']:
    # The 'gender' column contains valid values (MALE, FEMALE)
    gender_col = 'gender'

# Print the chosen columns
print(f"Chosen age column: {age_col}")
print(f"Chosen gender column: {gender_col}")


Chosen age column: age_at_initial_pathologic_diagnosis
Chosen gender column: gender


### Step 4: Feature Engineering and Validation

In [5]:
# 1. Extract and standardize clinical features
# Use tcga_select_clinical_features which will automatically create the trait variable and add age/gender if provided
linked_clinical_df = tcga_select_clinical_features(
    clinical_df, 
    trait=trait, 
    age_col=age_col, 
    gender_col=gender_col
)

# Print preview of clinical features
print("Clinical features (first 5 rows):")
print(linked_clinical_df.head())

# 2. Process gene expression data
print("\nProcessing gene expression data...")
# Load genetic data if not already loaded
_, genetic_file_path = tcga_get_relevant_filepaths(os.path.join(tcga_root_dir, 'TCGA_Sarcoma_(SARC)'))
genetic_df = pd.read_csv(genetic_file_path, sep='\t', index_col=0)

# Check gene data shape
print(f"Original gene data shape: {genetic_df.shape}")

# Save a version of the gene data before normalization (as a backup)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
genetic_df.to_csv(out_gene_data_file.replace('.csv', '_original.csv'))

# We need to transpose genetic data so genes are rows and samples are columns for normalization
gene_df_for_norm = genetic_df.copy().T

# Try to normalize gene symbols - adding debug output to understand what's happening
print("Attempting to normalize gene symbols...")
try:
    normalized_gene_df = normalize_gene_symbols_in_index(gene_df_for_norm)
    print(f"Gene data shape after normalization: {normalized_gene_df.shape}")
    
    # Check if normalization returned empty DataFrame
    if normalized_gene_df.shape[0] == 0:
        print("WARNING: Gene symbol normalization returned an empty DataFrame.")
        print("Using original gene data instead of normalized data.")
        # Use original data instead - samples as rows, genes as columns
        normalized_gene_df = genetic_df
    else:
        # If normalization worked, transpose back to original orientation
        normalized_gene_df = normalized_gene_df.T
except Exception as e:
    print(f"Error during gene symbol normalization: {e}")
    print("Using original gene data instead.")
    normalized_gene_df = genetic_df

# Save gene data
normalized_gene_df.to_csv(out_gene_data_file)
print(f"Gene data saved to: {out_gene_data_file}")

# 3. Link clinical and genetic data
# TCGA data uses the same sample IDs in both datasets
print("\nLinking clinical and genetic data...")
print(f"Clinical data shape: {linked_clinical_df.shape}")
print(f"Genetic data shape: {normalized_gene_df.shape}")

# Find common samples between clinical and genetic data
common_samples = set(linked_clinical_df.index).intersection(set(normalized_gene_df.columns))
print(f"Number of common samples: {len(common_samples)}")

if len(common_samples) == 0:
    print("ERROR: No common samples found between clinical and genetic data.")
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort="TCGA",
        info_path=json_path,
        is_gene_available=False,
        is_trait_available=True,
        is_biased=None,
        df=linked_clinical_df,
        note="No common samples between clinical and genetic data. TCGA Sarcoma cohort is not suitable for Rheumatoid Arthritis."
    )
    print("The dataset was determined to be unusable for this trait. No data files were saved.")
else:
    # Filter clinical data to only include common samples
    linked_clinical_df = linked_clinical_df.loc[list(common_samples)]
    
    # Create linked data by merging
    linked_data = pd.concat([linked_clinical_df, normalized_gene_df[list(common_samples)].T], axis=1)
    
    print(f"\nLinked data shape: {linked_data.shape}")
    print("Linked data preview (first 5 rows, first few columns):")
    display_cols = [trait, 'Age', 'Gender'] + list(linked_data.columns[3:5])
    print(linked_data[display_cols].head())
    
    # 4. Handle missing values
    linked_data = handle_missing_values(linked_data, trait)
    print(f"\nData shape after handling missing values: {linked_data.shape}")
    
    # 5. Check for bias in trait and demographic features
    print("\nChecking for bias in features:")
    is_trait_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
    
    # 6. Validate and save cohort info
    print("\nPerforming final validation...")
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort="TCGA",
        info_path=json_path,
        is_gene_available=len(linked_data.columns) > 3,  # More than just trait/age/gender columns
        is_trait_available=trait in linked_data.columns,
        is_biased=is_trait_biased,
        df=linked_data,
        note="Data from TCGA Sarcoma cohort used as proxy for Rheumatoid Arthritis. Not an ideal match, but used for demonstration."
    )
    
    # 7. Save linked data if usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to: {out_data_file}")
        
        # Also save clinical data separately
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        clinical_columns = [col for col in linked_data.columns if col in [trait, 'Age', 'Gender']]
        linked_data[clinical_columns].to_csv(out_clinical_data_file)
        print(f"Clinical data saved to: {out_clinical_data_file}")
    else:
        print("The dataset was determined to be unusable for this trait. No data files were saved.")

Clinical features (first 5 rows):
                 Rheumatoid_Arthritis  Age  Gender
sampleID                                          
TCGA-3B-A9HI-01                     1   68       1
TCGA-3B-A9HJ-01                     1   68       1
TCGA-3B-A9HL-01                     1   67       1
TCGA-3B-A9HO-01                     1   75       1
TCGA-3B-A9HP-01                     1   57       0

Processing gene expression data...


Original gene data shape: (20530, 265)


Attempting to normalize gene symbols...
Gene data shape after normalization: (0, 20530)
Using original gene data instead of normalized data.


Gene data saved to: ../../output/preprocess/Rheumatoid_Arthritis/gene_data/TCGA.csv

Linking clinical and genetic data...
Clinical data shape: (271, 3)
Genetic data shape: (20530, 265)
Number of common samples: 265

Linked data shape: (265, 20533)
Linked data preview (first 5 rows, first few columns):
                 Rheumatoid_Arthritis  Age  Gender  ARHGEF10L     HIF3A
TCGA-QQ-A8VH-01                     1   31       0  -0.168592  5.557674
TCGA-WP-A9GB-01                     1   59       0   1.058708  4.966474
TCGA-MJ-A68J-01                     1   55       0   1.004208  2.661674
TCGA-HS-A5N7-01                     1   67       0   0.047108  3.487474
TCGA-QC-A6FX-01                     1   68       1   0.127408 -4.231626



Data shape after handling missing values: (265, 20533)

Checking for bias in features:
For the feature 'Rheumatoid_Arthritis', the least common label is '0' with 2 occurrences. This represents 0.75% of the dataset.
The distribution of the feature 'Rheumatoid_Arthritis' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 52.0
  50% (Median): 61.0
  75%: 70.0
Min: 20
Max: 90
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1' with 120 occurrences. This represents 45.28% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.


Performing final validation...
The dataset was determined to be unusable for this trait. No data files were saved.
