In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import glob
import xmltodict
from IPython.display import display, HTML


# Function to drop columns with all same values or all NaN values
def drop_constant_columns(df):
    # Drop columns with all NaN values
    df = df.dropna(axis=1, how='all')

    # Find columns where all values are the same (including columns with a single unique non-NaN value)
    nunique = df.apply(pd.Series.nunique, dropna=False)
    cols_to_drop = nunique[nunique <= 1].index

    # Drop these columns
    df = df.drop(cols_to_drop, axis=1)

    return df

# Display the DataFrame in a scrollable format
def display_scrollable_dataframe(df, max_rows=20):
    display(HTML(df.to_html(max_rows=max_rows, classes='table table-striped table-bordered table-hover')))

# Function to recursively extract tags and values
def extract_tags_and_values(elem, parent_tag="", tag_count=None):
    if tag_count is None:
        tag_count = {}

    data = {}
    for child in elem:
        # Get the base tag without any namespaces
        base_tag = child.tag.split('}')[-1]

        # If this tag has a sequence attribute, append it to the tag
        sequence = child.attrib.get('sequence')

        # Construct full tag name, including parent if necessary
        if parent_tag:
            full_tag = f"{parent_tag}.{base_tag}"
        else:
            full_tag = base_tag

        # Handle repeated tags: append index or sequence to make the tag unique
        if sequence:
            full_tag += f"_seq_{sequence}"
        elif full_tag in tag_count:
            tag_count[full_tag] += 1
            full_tag += f"_{tag_count[full_tag]}"
        else:
            tag_count[full_tag] = 1

        # If the child has text and it's not just whitespace, store the value
        if child.text and child.text.strip():
            data[full_tag] = child.text.strip()

        # Recursively call the function to process the child elements
        data.update(extract_tags_and_values(child, full_tag, tag_count))

    return data

# Function to process all XML files in subfolders and consolidate them into a DataFrame
def process_xml_files(root_folder):
    all_data = []

    # Recursively find all XML files in the root folder and subfolders
    xml_files = glob.glob(os.path.join(root_folder, '**/*.xml'), recursive=True)

    for xml_file in xml_files:
        # Load and parse each XML file
        tree = ET.parse(xml_file)
        root = tree.getroot()

        # Extract the patient data using the extract_tags_and_values function
        patient_data = extract_tags_and_values(root)

        # Add the patient data to the list
        all_data.append(patient_data)

    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(all_data)

    # Optionally: Drop columns that have all NaN values or are constant
    df_cleaned = drop_constant_columns(df)

    return df_cleaned

# Function to extract and organize follow-up columns dynamically
def extract_and_organize_followup(df, prefix="follow_up_seq"):
    # Extract columns that match the follow-up sequence prefix
    follow_up_columns = [col for col in df.columns if prefix in col]
    
    # Extract unique sequence numbers dynamically (e.g., _1, _2, etc.)
    seq_numbers = sorted(set([col.split(f"{prefix}_")[1].split('_')[0] for col in follow_up_columns if f"{prefix}_" in col]))
    
    # Organize columns based on the sequence numbers
    organized_columns = []
    for seq in seq_numbers:
        seq_columns = [col for col in follow_up_columns if f"{prefix}_{seq}" in col]
        organized_columns.extend(seq_columns)
    
    return organized_columns

# Function to extract and organize drug columns, accounting for drug_1 being just 'patient.drugs.drug'
def extract_and_organize_drugs(df, base_prefix="patient.drugs.drug"):
    # Extract all drug-related columns (both drug_1 and drug_x)
    drug_columns = [col for col in df.columns if base_prefix in col]
    
    # Special handling for 'drug_1' columns without a suffix
    drug_1_columns = [col for col in drug_columns if base_prefix + "_" not in col]
    
    # Extract unique drug sequence numbers dynamically, handling possible extra parts in the column names
    drug_seq_numbers = sorted(set([int(col.split(f"{base_prefix}_")[1].split('.')[0])
                                   for col in drug_columns if f"{base_prefix}_" in col]))
    
    # Organize columns based on the sequence numbers
    organized_columns = drug_1_columns  # Start with drug_1 columns
    for seq in drug_seq_numbers:
        seq_columns = [col for col in drug_columns if f"{base_prefix}_{seq}." in col]
        organized_columns.extend(seq_columns)
    
    return organized_columns

Edition standardizations scripts here

In [2]:
def map_icd_to_site(icd_code):
    """Maps ICD-10 code to the corresponding AJCC site."""
    
    if icd_code in ['C02.9', 'C04.9', 'C06.9', 'C06.0', 'C03.9', 'C00.9', 'C05.0', 'C03.1', 'C04.0', 'C06.2', 'C02.1', 'C05.9', 'C03.0', 'C02.2']:
        return 'Section 3'
    
    elif icd_code in ['C32.9','C32.1']:
        return 'Section 5'
    
    elif icd_code == 'C14.8':
        return 'Section 9'
    
    elif icd_code in ['C09.9', 'C01', 'C10.9', 'C10.3','C13.9']:
        return 'Section 4'
    
    elif icd_code == 'C41.1':
        return 'Section 27'
    
    else:
        return 'Unknown Site'
    
def map_section_3_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th):
    """Maps AJCC 6th edition to 7th edition for Section 3: Lip and Oral Cavity."""
    # T4 lesions have been divided into T4a (moderately advanced local disease) 
    # and T4b (very advanced local disease), leading to the stratification of 
    # Stage IV into Stage IVA, IVB, and IVC.
    if t_stage_6th == 'T4':
        return 'T4a', n_stage_6th, m_stage_6th  # T4 has been split into T4a and T4b
    else:
        return t_stage_6th, n_stage_6th, m_stage_6th


def map_section_4_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th, tissue_or_organ):
    """Maps AJCC 6th edition to 7th edition for Section 4 using the provided mappings."""
    
    if tissue_or_organ in ['Oral Tongue', 'Oral Cavity', 'Floor of mouth', 'Tonsil', 'Base of tongue', 'Buccal Mucosa', 
                           'Alveolar Ridge', 'Hard Palate', 'Lip', 'Oropharynx', 'Hypopharynx', 'Larynx']:
        # The conditions here should match your exact categories
        if t_stage_6th == 'T4':
            return 'T4a', n_stage_6th, m_stage_6th  # Modify based on specific stratification
        else:
            return t_stage_6th, n_stage_6th, m_stage_6th
    else:
        return t_stage_6th, n_stage_6th, m_stage_6th

def map_section_5_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th):
    """Maps AJCC 6th edition to 7th edition for Section 5: Larynx."""
    # T4 lesions have been divided into T4a (moderately advanced local disease) 
    # and T4b (very advanced local disease), leading to the stratification of 
    # Stage IV into Stage IVA, IVB, and IVC.
    if t_stage_6th == 'T4':
        return 'T4a', n_stage_6th, m_stage_6th  # T4 has been split into T4a and T4b
    else:
        return t_stage_6th, n_stage_6th, m_stage_6th

def map_section_9_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th):
    """Maps AJCC 6th edition to 7th edition for Section 9: Mucosal Melanoma of the Head and Neck."""
    # No changes needed for Section 9 as it didn't exist in the 6th edition.
    return t_stage_6th, n_stage_6th, m_stage_6th

def map_section_27_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th):
    """Maps AJCC 6th edition to 7th edition for Section 27: Bone (Mandible)."""
    # Stage III is reserved for G3 and G4 grades in the 7th edition.
    return t_stage_6th, n_stage_6th, m_stage_6th


    
def map_ajcc_6th_to_7th(icd_code, t_stage_6th, n_stage_6th, m_stage_6th, tissue_or_organ, grade=None):
    """Maps AJCC 6th edition to 7th edition based on the section derived from the ICD-10 code."""
    section = map_icd_to_site(icd_code)
    
    if section == 'Section 3':
        return map_section_3_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th)
    elif section == 'Section 4':
        return map_section_4_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th, tissue_or_organ)
    elif section == 'Section 5':
        return map_section_5_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th)
    elif section == 'Section 9':
        return map_section_9_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th)
    elif section == 'Section 27':
        return map_section_27_6th_to_7th(t_stage_6th, n_stage_6th, m_stage_6th)
    else:
        return 'Unknown Section', t_stage_6th, n_stage_6th, m_stage_6th

def map_section_3_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th):
    """Maps AJCC 5th edition to 6th edition for Section 3: Lip and Oral Cavity."""
    # T4 lesions have been divided into T4a (resectable) and T4b (unresectable), 
    # leading to the division of Stage IV into Stage IVA, IVB, and IVC.
    if t_stage_5th == 'T4':
        return 'T4a', n_stage_5th, m_stage_5th  # Assuming resectable; adjust if needed
    else:
        return t_stage_5th, n_stage_5th, m_stage_5th

def map_section_4_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th, tissue_or_organ):
    """Maps AJCC 5th edition to 6th edition for Section 4: Pharynx, considering the specific subregions."""

    # For oropharynx and hypopharynx subdivisions:
    # T4 lesions are divided into T4a and T4b in the 6th edition.

    if tissue_or_organ in ['Oral Tongue', 'Oral Cavity', 'Floor of mouth', 'Tonsil', 'Base of tongue',
                           'Buccal Mucosa', 'Alveolar Ridge', 'Hard Palate', 'Lip', 'Oropharynx', 'Hypopharynx', 'Larynx']:
        if t_stage_5th == 'T4':
            return 'T4a', n_stage_5th, m_stage_5th  # Assuming resectable
        else:
            return t_stage_5th, n_stage_5th, m_stage_5th
    else:
        return t_stage_5th, n_stage_5th, m_stage_5th


def map_section_5_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th):
    """Maps AJCC 5th edition to 6th edition for Section 5: Larynx."""
    # T4 lesions have been divided into T4a (resectable) and T4b (unresectable),
    # leading to the division of Stage IV into Stage IVA, IVB, and IVC.
    if t_stage_5th == 'T4':
        return 'T4a', n_stage_5th, m_stage_5th  # Assuming resectable; adjust if needed
    else:
        return t_stage_5th, n_stage_5th, m_stage_5th

def map_section_9_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th):
    """Maps AJCC 5th edition to 6th edition for Section 9: Mucosal Melanoma of the Head and Neck."""
    # No changes needed for Section 9 as it did not exist in the 6th edition.
    return t_stage_5th, n_stage_5th, m_stage_5th

def map_section_27_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th):
    """Maps AJCC 5th edition to 6th edition for Section 27: Bone (Mandible)."""
    # M1 lesions have been divided into M1a and M1b:
    # - M1a is lung-only metastases (Stage IVA)
    # - M1b is metastases to other distant sites, including lymph nodes (Stage IVB).
    if m_stage_5th == 'M1':
        return t_stage_5th, n_stage_5th, 'M1a'  
    else:
        return t_stage_5th, n_stage_5th, m_stage_5th

def map_ajcc_5th_to_6th(icd_code, t_stage_5th, n_stage_5th, m_stage_5th, tissue_or_organ):
    """Maps AJCC 5th edition to 6th edition based on the section derived from the ICD-10 code."""
    section = map_icd_to_site(icd_code)
    
    if section == 'Section 3':
        return map_section_3_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th)
    elif section == 'Section 4':
        return map_section_4_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th, tissue_or_organ)
    elif section == 'Section 5':
        return map_section_5_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th)
    elif section == 'Section 9':
        return map_section_9_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th)
    elif section == 'Section 27':
        return map_section_27_5th_to_6th(t_stage_5th, n_stage_5th, m_stage_5th)
    else:
        return t_stage_5th, n_stage_5th, m_stage_5th

def map_clinical_5th_to_6th(row):
    # Clinical mapping
    clinical_t, clinical_n, clinical_m = map_ajcc_5th_to_6th(
        icd_code=row['patient.icd_10'],
        t_stage_5th=row['patient.stage_event.tnm_categories.clinical_categories.clinical_T'],
        n_stage_5th=row['patient.stage_event.tnm_categories.clinical_categories.clinical_N'],
        m_stage_5th=row['patient.stage_event.tnm_categories.clinical_categories.clinical_M'],
        tissue_or_organ=row['patient.anatomic_neoplasm_subdivision']
    )
    
    return pd.Series({
        'ajcc_clinical_t': clinical_t,
        'ajcc_clinical_n': clinical_n,
        'ajcc_clinical_m': clinical_m

    })

def map_clinical_6th_to_7th(row):
    # Clinical mapping
    clinical_t, clinical_n, clinical_m = map_ajcc_6th_to_7th(
        icd_code=row['patient.icd_10'],
        t_stage_6th=row['patient.stage_event.tnm_categories.clinical_categories.clinical_T'],
        n_stage_6th=row['patient.stage_event.tnm_categories.clinical_categories.clinical_N'],
        m_stage_6th=row['patient.stage_event.tnm_categories.clinical_categories.clinical_M'],
        tissue_or_organ=row['patient.anatomic_neoplasm_subdivision']
    )
    
    return pd.Series({
        'ajcc_clinical_t': clinical_t,
        'ajcc_clinical_n': clinical_n,
        'ajcc_clinical_m': clinical_m
    })

In [3]:
# Example usage
root_folder = './tcga_hnsc_xml_clinical'  # Specify your root folder here

# Process all XML files and get the consolidated DataFrame
df = process_xml_files(root_folder)

In [4]:
radiation_columns = extract_and_organize_drugs(df,"patient.radiations.radiation")
reordered_columns_df_1 = pd.concat([df.drop(columns=radiation_columns), df[radiation_columns]], axis=1)

drug_columns = extract_and_organize_drugs(reordered_columns_df_1)
reordered_columns_df_2 = pd.concat([reordered_columns_df_1.drop(columns=drug_columns), reordered_columns_df_1[drug_columns]], axis=1)

follow_up_columns = extract_and_organize_followup(reordered_columns_df_2)
reordered_columns_df_3 = pd.concat([reordered_columns_df_2.drop(columns=follow_up_columns), reordered_columns_df_2[follow_up_columns]], axis=1)

1. Remove follow_up, radiation and drug information

In [5]:
new_df = reordered_columns_df_3.loc[:, ~reordered_columns_df_3.columns.str.contains("patient.follow_ups|patient.radiations|patient.drugs")]

2. Need to keep patient.bcr_patient_barcode and patient.bcr_patient_uuid for WSI mapping. remove other ID related fields (admin.file_uuid,admin.batch_number,patient.patient_id). 

In [6]:

new_df = new_df.drop(["admin.file_uuid", "admin.batch_number", "patient.patient_id"], axis=1)
#remove: admin.file_uuid, admin.batch_numer, patient.patient_id, 

3. Remove values that are post-treatment

In [7]:
new_df = new_df.drop(columns=[
    "patient.radiation_therapy",
    "patient.postoperative_rx_tx",
    "patient.primary_therapy_outcome_success",
    "patient.stage_event.pathologic_stage",
    "patient.stage_event.tnm_categories.pathologic_categories.pathologic_T",
    "patient.stage_event.tnm_categories.pathologic_categories.pathologic_N",
    "patient.stage_event.tnm_categories.pathologic_categories.pathologic_M",
    "patient.margin_status",
    "patient.presence_of_pathological_nodal_extracapsular_spread",
    "patient.lymphovascular_invasion_present",
    "patient.perineural_invasion_present",
    "patient.new_tumor_events.new_tumor_event_after_initial_treatment",
    "patient.new_tumor_events.new_tumor_event.days_to_new_tumor_event_after_initial_treatment",
    "patient.new_tumor_events.new_tumor_event.new_neoplasm_event_occurrence_anatomic_site",
    "patient.new_tumor_events.new_tumor_event.new_neoplasm_occurrence_anatomic_site_text",
    "patient.new_tumor_events.new_tumor_event.progression_determined_by",
    "patient.new_tumor_events.new_tumor_event.new_tumor_event_additional_surgery_procedure",
    "patient.new_tumor_events.new_tumor_event.additional_radiation_therapy",
    "patient.new_tumor_events.new_tumor_event.additional_pharmaceutical_therapy",
    "patient.new_tumor_events.new_tumor_event.new_neoplasm_event_type",
    "patient.new_tumor_events.new_tumor_event.days_to_new_tumor_event_additional_surgery_procedure",
    "patient.vital_status",
    "patient.days_to_last_followup",
    "patient.days_to_death",
    "patient.days_to_last_known_alive",
    "patient.number_of_lymphnodes_positive_by_ihc",
    "patient.neoplasm_histologic_grade",
    "patient.history_of_neoadjuvant_treatment",
    "patient.lymph_node_examined_count",
    "patient.number_of_lymphnodes_positive_by_he",
    "patient.lymphnode_neck_dissection",
    "patient.lymphnode_dissection_method_left",
    "patient.lymphnode_dissection_method_right",
    "patient.person_neoplasm_cancer_status",
    "patient.primary_lymph_node_presentation_assessment"
])


4. Remove values that are empty or useless

In [8]:
new_df = new_df.drop(columns=[
    "patient.tissue_prospective_collection_indicator",
    "patient.tissue_retrospective_collection_indicator",
    "patient.tissue_source_site",
    "patient.days_to_initial_pathologic_diagnosis",
    "patient.year_of_initial_pathologic_diagnosis",
    "patient.day_of_form_completion",
    "patient.month_of_form_completion",
    "patient.year_of_form_completion",
    "patient.age_at_initial_pathologic_diagnosis" #removed because there is already "days_to_birth" available which is in days. 
])


In [9]:
# Create a copy of clinical_data for processing
clinical_data_filtered = new_df.copy()

# Step 1: Mapping 4th/5th edition to 6th edition
mask_5th = clinical_data_filtered['patient.stage_event.system_version'].isin(['4th', '5th'])
df_filtered_5th = clinical_data_filtered[mask_5th].copy()

# Apply the 5th to 6th mapping
mapped_values_5th = df_filtered_5th.apply(map_clinical_5th_to_6th, axis=1)
df_filtered_5th.update(mapped_values_5th)

# Update clinical_data_filtered to include the new 6th edition values
clinical_data_filtered.update(df_filtered_5th)

# Step 2: Mapping 6th edition (including 5th-to-6th mapped) to 7th edition
mask_6th_combined = clinical_data_filtered['patient.stage_event.system_version'].isin(['4th', '5th', '6th'])
df_filtered_6th = clinical_data_filtered[mask_6th_combined].copy()

# Apply the 6th to 7th mapping
mapped_values_6th_to_7th = df_filtered_6th.apply(map_clinical_6th_to_7th, axis=1)
df_filtered_6th.update(mapped_values_6th_to_7th)

# Directly update clinical_data_filtered with 6th-to-7th mapped values
clinical_data_filtered.update(df_filtered_6th)

# Final processed DataFrame
clinical_data_updated = clinical_data_filtered

In [10]:
clinical_data_final = clinical_data_updated.drop('patient.stage_event.system_version', axis=1)


for column in clinical_data_final.columns:
    print(f"Column: {column}")
    value_counts = clinical_data_final[column].value_counts(normalize=True,dropna=False) * 100
    for value, percentage in value_counts.items():
        print(f"  Value: {value}, Percentage: {percentage:.2f}%")
    print("\n" + "="*50 + "\n")

Column: patient.histological_type
  Value: Head & Neck Squamous Cell Carcinoma, Percentage: 97.92%
  Value: Head & Neck Squamous Cell Carcinoma Basaloid Type, Percentage: 1.89%
  Value: Head & Neck Squamous Cell Carcinoma, Spindle Cell Variant, Percentage: 0.19%


Column: patient.gender
  Value: MALE, Percentage: 73.11%
  Value: FEMALE, Percentage: 26.89%


Column: patient.days_to_birth
  Value: -17125, Percentage: 0.38%
  Value: -18812, Percentage: 0.38%
  Value: -18718, Percentage: 0.38%
  Value: -28221, Percentage: 0.38%
  Value: -25020, Percentage: 0.38%
  Value: -24558, Percentage: 0.38%
  Value: -22182, Percentage: 0.38%
  Value: -18536, Percentage: 0.38%
  Value: -22836, Percentage: 0.19%
  Value: -20464, Percentage: 0.19%
  Value: -20386, Percentage: 0.19%
  Value: -18272, Percentage: 0.19%
  Value: -27261, Percentage: 0.19%
  Value: -22828, Percentage: 0.19%
  Value: -22455, Percentage: 0.19%
  Value: -20175, Percentage: 0.19%
  Value: -29530, Percentage: 0.19%
  Value: -17471

Note: a patient does not have age at diagnosis which is weird but not a big deal for now I guess. At this point the data is ready for processing

In [11]:
import pandas as pd

# Load the Excel file with endpoints
file_path = "/home/sorkwos/1b5f413e-a8d1-4d10-92eb-7c4ae739ed81/TCGA-CDR-SupplementalTableS1.xlsx"
df = pd.read_excel(file_path)

df_hnsc = df[df['type'].str.contains("HNSC", na=False)]

display_scrollable_dataframe(df_hnsc)

Unnamed: 0.1,Unnamed: 0,bcr_patient_barcode,type,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,initial_pathologic_dx_year,menopause_status,birth_days_to,vital_status,tumor_status,last_contact_days_to,death_days_to,cause_of_death,new_tumor_event_type,new_tumor_event_site,new_tumor_event_site_other,new_tumor_event_dx_days_to,treatment_outcome_first_course,margin_status,residual_tumor,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
3241,3242,TCGA-4P-AA8J,HNSC,66.0,MALE,BLACK OR AFRICAN AMERICAN,Stage IVA,Stage IVA,Head & Neck Squamous Cell Carcinoma,G2,2013.0,[Not Available],-24222.0,Alive,TUMOR FREE,102.0,,[Not Available],,,,,Complete Remission/Response,,,0.0,102.0,0.0,102.0,,,0.0,102.0,
3242,3243,TCGA-BA-4074,HNSC,69.0,MALE,WHITE,Stage IVA,Stage IVA,Head & Neck Squamous Cell Carcinoma,G3,2003.0,[Not Available],-25282.0,Dead,WITH TUMOR,,462.0,[Not Available],,Distant Metastasis,bone (humerus),396.0,Complete Remission/Response,,,1.0,462.0,1.0,462.0,,,1.0,396.0,
3243,3244,TCGA-BA-4075,HNSC,49.0,MALE,BLACK OR AFRICAN AMERICAN,Stage III,Stage IVA,Head & Neck Squamous Cell Carcinoma,G2,2004.0,[Not Available],-17951.0,Dead,WITH TUMOR,,283.0,[Not Available],,Oral Cavity,,236.0,Progressive Disease,,,1.0,283.0,1.0,283.0,,,1.0,236.0,
3244,3245,TCGA-BA-4076,HNSC,39.0,MALE,WHITE,[Not Available],Stage IVA,Head & Neck Squamous Cell Carcinoma,G2,2003.0,[Not Available],-14405.0,Dead,WITH TUMOR,,415.0,[Not Available],,Larynx,,286.0,Complete Remission/Response,,,1.0,415.0,1.0,415.0,,,1.0,286.0,
3245,3246,TCGA-BA-4077,HNSC,45.0,FEMALE,WHITE,Stage IVA,Stage IVB,Head & Neck Squamous Cell Carcinoma,G2,2003.0,[Not Available],-16536.0,Dead,WITH TUMOR,,1134.0,[Not Available],,,,,Persistent Disease,,,1.0,1134.0,1.0,1134.0,,,1.0,1134.0,
3246,3247,TCGA-BA-4078,HNSC,83.0,MALE,WHITE,[Not Available],Stage IVA,Head & Neck Squamous Cell Carcinoma,G2,2003.0,[Not Available],-30480.0,Dead,WITH TUMOR,,276.0,[Not Available],,,,,[Not Available],,,1.0,276.0,1.0,276.0,,,1.0,276.0,
3247,3248,TCGA-BA-5149,HNSC,47.0,MALE,WHITE,Stage IVA,Stage IVA,Head & Neck Squamous Cell Carcinoma,G2,2010.0,[Not Available],-17177.0,Dead,WITH TUMOR,,806.0,[Not Available],,Distant Metastasis,Lung,389.0,Complete Remission/Response,,,1.0,806.0,1.0,806.0,,,1.0,389.0,
3248,3249,TCGA-BA-5151,HNSC,72.0,MALE,WHITE,Stage IVA,Stage IVA,Head & Neck Squamous Cell Carcinoma,G1,2010.0,[Not Available],-26569.0,Alive,TUMOR FREE,722.0,,[Not Available],,,,517.0,Complete Remission/Response,,,0.0,722.0,0.0,722.0,,,1.0,517.0,
3249,3250,TCGA-BA-5152,HNSC,56.0,MALE,WHITE,Stage IVA,Stage IVA,Head & Neck Squamous Cell Carcinoma,G2,2009.0,[Not Available],-20783.0,Alive,TUMOR FREE,1288.0,,[Not Available],,,,,Complete Remission/Response,,,0.0,1288.0,0.0,1288.0,,,0.0,1288.0,
3250,3251,TCGA-BA-5153,HNSC,51.0,MALE,WHITE,[Not Available],Stage III,Head & Neck Squamous Cell Carcinoma,G2,2005.0,[Not Available],-18743.0,Dead,WITH TUMOR,,1762.0,[Not Available],,Distant Metastasis,bone (hip),1522.0,Complete Remission/Response,,,1.0,1762.0,1.0,1762.0,1.0,1522.0,1.0,1522.0,


In [12]:
# Step 1: Select relevant columns from df_hnsc
df_hnsc_filtered = df_hnsc[['bcr_patient_barcode', 'PFI', 'PFI.time']]

# Step 2: Merge the two DataFrames based on the patient barcode
clinical_data_final_updated = clinical_data_final.merge(
    df_hnsc_filtered,
    left_on='patient.bcr_patient_barcode',
    right_on='bcr_patient_barcode',
    how='left'  # Use 'left' to keep all rows from clinical_data_final and add matches from df_hnsc
)

# Step 3: Drop the redundant 'bar_patient_barcode' column after merging
clinical_data_final_updated = clinical_data_final_updated.drop(columns=['bcr_patient_barcode'])

# Display the updated DataFrame
#display_scrollable_dataframe(clinical_data_final_updated)

clinical_data_yes = clinical_data_final_updated.drop(columns=['patient.bcr_patient_barcode','patient.bcr_patient_uuid'])

In [13]:
print(clinical_data_yes.iloc[7])
#numerical columns are: patient.days_to_birth, patient.tobacco_smoking_history, patient.year_of_tobacco_smoking_onset, patient.number_pack_years_smoked, 

patient.histological_type                                            Head & Neck Squamous Cell Carcinoma
patient.gender                                                                                    FEMALE
patient.days_to_birth                                                                             -22299
patient.race_list.race                                                                             WHITE
patient.icd_o_3_site                                                                               C32.9
patient.icd_o_3_histology                                                                         8070/3
patient.icd_10                                                                                     C32.9
patient.anatomic_neoplasm_subdivision                                                             Larynx
patient.laterality                                                                                   NaN
patient.tobacco_smoking_history                        

In [14]:
display_scrollable_dataframe(clinical_data_yes)

Unnamed: 0,patient.histological_type,patient.gender,patient.days_to_birth,patient.race_list.race,patient.icd_o_3_site,patient.icd_o_3_histology,patient.icd_10,patient.anatomic_neoplasm_subdivision,patient.laterality,patient.tobacco_smoking_history,patient.year_of_tobacco_smoking_onset,patient.number_pack_years_smoked,patient.alcohol_history_documented,patient.other_dx,patient.stage_event.clinical_stage,patient.stage_event.tnm_categories.clinical_categories.clinical_T,patient.stage_event.tnm_categories.clinical_categories.clinical_N,patient.stage_event.tnm_categories.clinical_categories.clinical_M,patient.ethnicity,patient.stopped_smoking_year,patient.frequency_of_alcohol_consumption,patient.amount_of_alcohol_consumption_per_day,patient.hpv_status_by_p16_testing,patient.hpv_status_by_ish_testing,patient.egfr_amplication_status,PFI,PFI.time
0,Head & Neck Squamous Cell Carcinoma,MALE,-22836,WHITE,C32.9,8070/3,C32.9,Larynx,Midline,2,1959,50,NO,No,Stage IVA,T4a,N0,M0,,,,,,,,0.0,1971.0
1,Head & Neck Squamous Cell Carcinoma,MALE,-21659,WHITE,C02.9,8071/3,C02.9,Oral Tongue,Left,3,1973,39,NO,No,Stage IVA,T2,N2,M0,NOT HISPANIC OR LATINO,,,,,,,0.0,539.0
2,Head & Neck Squamous Cell Carcinoma,MALE,-19956,BLACK OR AFRICAN AMERICAN,C32.9,8070/3,C32.9,Larynx,Right,4,,62,YES,No,Stage IVA,T2,N2c,M0,NOT HISPANIC OR LATINO,2010,0,0,,,,1.0,248.0
3,Head & Neck Squamous Cell Carcinoma,MALE,-23185,ASIAN,C06.9,8070/3,C06.9,Oral Tongue,Right,1,,,YES,No,Stage IVA,T2,N2c,M0,NOT HISPANIC OR LATINO,,,,Positive,Negative,,0.0,1168.0
4,Head & Neck Squamous Cell Carcinoma,MALE,-14133,WHITE,C09.9,8070/3,C09.9,Tonsil,Right,1,,,YES,No,Stage IVA,T1,N2c,M0,NOT HISPANIC OR LATINO,,5,2.5,,,,0.0,1521.0
5,Head & Neck Squamous Cell Carcinoma,FEMALE,-28250,WHITE,C02.9,8070/3,C02.9,Oral Tongue,Left,,,,NO,No,Stage IVA,T2,N2c,M0,NOT HISPANIC OR LATINO,,,,,,,0.0,144.0
6,Head & Neck Squamous Cell Carcinoma,MALE,-21443,WHITE,C32.9,8070/3,C32.9,Larynx,,2,,40,YES,No,Stage IVA,T3,N2b,M0,NOT HISPANIC OR LATINO,,7,6,,,,0.0,2016.0
7,Head & Neck Squamous Cell Carcinoma,FEMALE,-22299,WHITE,C32.9,8070/3,C32.9,Larynx,,2,,,NO,No,Stage II,T2,N0,M0,NOT HISPANIC OR LATINO,,,,,Negative,Amplified,0.0,1483.0
8,Head & Neck Squamous Cell Carcinoma,MALE,-25020,BLACK OR AFRICAN AMERICAN,C32.9,8070/3,C32.9,Larynx,,2,,60,YES,No,Stage IVA,T4a,N2c,M0,NOT HISPANIC OR LATINO,,7,10,,,,0.0,187.0
9,Head & Neck Squamous Cell Carcinoma,FEMALE,-24180,WHITE,C02.9,8070/3,C02.9,Oral Tongue,,2,1948,75,NO,No,Stage IVA,T3,N2b,M0,NOT HISPANIC OR LATINO,,,,,,,1.0,65.0


In [15]:
import pandas as pd
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler

# Convert string columns to numeric columns
clinical_data_test = clinical_data_yes.copy()  # Avoid modifying the original dataframe

# Define columns for specific treatments
num_cols = ['patient.days_to_birth', 'patient.tobacco_smoking_history', 'patient.year_of_tobacco_smoking_onset',
            'patient.number_pack_years_smoked', 'patient.stopped_smoking_year',
            'patient.frequency_of_alcohol_consumption', 'patient.amount_of_alcohol_consumption_per_day']

tobacco_alcohol_cols = [
    'patient.tobacco_smoking_history', 'patient.year_of_tobacco_smoking_onset',
    'patient.number_pack_years_smoked', 'patient.stopped_smoking_year',
    'patient.frequency_of_alcohol_consumption', 'patient.amount_of_alcohol_consumption_per_day'
]

days_to_birth_col = ['patient.days_to_birth']

# Convert all numerical columns to float and handle conversion errors
for col in tobacco_alcohol_cols + days_to_birth_col:
    clinical_data_test[col] = pd.to_numeric(clinical_data_test[col], errors='coerce')

# Replace NaNs in tobacco and alcohol-related columns with -1
for col in tobacco_alcohol_cols:
    clinical_data_test[col].fillna(-1, inplace=True)

# Replace NaNs in 'days_to_birth' with the mean
clinical_data_test['patient.days_to_birth'].fillna(clinical_data_test['patient.days_to_birth'].mean(), inplace=True)

# Replace NaNs in categorical columns with the string "missing"
categorical_cols = clinical_data_test.select_dtypes(include=['object']).columns
clinical_data_test[categorical_cols] = clinical_data_test[categorical_cols].fillna("missing")

# Extract time-to-event and event data
clinical_data_test['PFI.time'].fillna(clinical_data_test['PFI.time'].mean(), inplace=True)
time = clinical_data_test['PFI.time'].values.astype(float)
event = clinical_data_test['PFI'].values.astype(bool)

# Define the structured array for scikit-survival
survival_data = np.array([(e, t) for e, t in zip(event, time)], dtype=[('event', bool), ('time', float)])

# Select features
X = clinical_data_test[num_cols].values

# Scale numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize empty list to store C-index scores for each fold
c_index_scores = []

# Cross-validation loop
for fold_idx, (train_index, test_index) in enumerate(skf.split(X, event)):
    # Split the data into training and test sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = survival_data[train_index], survival_data[test_index]

    # Extract the 'event' part for balancing
    event_labels = y_train['event'].astype(int)  # Convert to integer for compatibility with RandomOverSampler

    # Print training and test details before balancing
    n_samples_train_before = len(y_train)
    n_events_before = np.sum(event_labels)
    n_non_events_before = n_samples_train_before - n_events_before
    n_samples_test = len(y_test)
    n_events_test = np.sum(y_test['event'])
    n_non_events_test = n_samples_test - n_events_test
    print(f"Fold {fold_idx + 1} - Before Balancing: Training Samples: {n_samples_train_before} | Events: {n_events_before} | Non-Events: {n_non_events_before}")
    print(f"Fold {fold_idx + 1} - Test Samples: {n_samples_test} | Events: {n_events_test} | Non-Events: {n_non_events_test}")

    # Balance training set using RandomOverSampler (oversampling minority events)
    ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_train_balanced, event_labels_balanced = ros.fit_resample(X_train, event_labels)

    # Recreate the structured array for y_train_balanced using balanced labels and original time values
    y_train_balanced = np.array([(e, t) for e, t in zip(event_labels_balanced, y_train['time'][ros.sample_indices_])], 
                                dtype=[('event', bool), ('time', float)])

    # Print sample count and event details after oversampling
    n_samples_train_after = len(y_train_balanced)
    n_events_after = np.sum(y_train_balanced['event'])
    n_non_events_after = n_samples_train_after - n_events_after
    print(f"Fold {fold_idx + 1} - After Balancing: Training Samples: {n_samples_train_after} | Events: {n_events_after} | Non-Events: {n_non_events_after}")

    # Train the Random Survival Forest model
    rsf = RandomSurvivalForest(n_estimators=100, random_state=42)
    rsf.fit(X_train_balanced, y_train_balanced)

    # Evaluate on the test set (no balancing here)
    c_index = rsf.score(X_test, y_test)
    c_index_scores.append(c_index)

# Average C-index across folds
average_c_index = np.mean(c_index_scores)
print(f"Average Concordance index across folds: {average_c_index:.4f}")



Fold 1 - Before Balancing: Training Samples: 422 | Events: 158 | Non-Events: 264
Fold 1 - Test Samples: 106 | Events: 40 | Non-Events: 66
Fold 1 - After Balancing: Training Samples: 528 | Events: 264 | Non-Events: 264
Fold 2 - Before Balancing: Training Samples: 422 | Events: 158 | Non-Events: 264
Fold 2 - Test Samples: 106 | Events: 40 | Non-Events: 66
Fold 2 - After Balancing: Training Samples: 528 | Events: 264 | Non-Events: 264
Fold 3 - Before Balancing: Training Samples: 422 | Events: 158 | Non-Events: 264
Fold 3 - Test Samples: 106 | Events: 40 | Non-Events: 66
Fold 3 - After Balancing: Training Samples: 528 | Events: 264 | Non-Events: 264
Fold 4 - Before Balancing: Training Samples: 423 | Events: 159 | Non-Events: 264
Fold 4 - Test Samples: 105 | Events: 39 | Non-Events: 66
Fold 4 - After Balancing: Training Samples: 528 | Events: 264 | Non-Events: 264
Fold 5 - Before Balancing: Training Samples: 423 | Events: 159 | Non-Events: 264
Fold 5 - Test Samples: 105 | Events: 39 | Non-E