# Import

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Load Files

In [2]:
# Root directory of all input files, relative to this notebook
DATA_PATH = Path('../../Data/')

# Expression table: tab-separated, zip-compressed.
# Missing cells are replaced by the *string* "NaN" (not a float NaN).
data = (
    pd.read_csv(
        DATA_PATH / 'processed/TCGA.HNSC.expression_log_all.txt.zip',
        sep='\t',
        compression='zip',
    )
    .fillna("NaN")
)

# Metadata table: tab-separated plain text, same missing-value treatment.
metadata = (
    pd.read_csv(
        DATA_PATH / 'original/TCGA.HNSC.metadata.txt',
        sep='\t',
    )
    .fillna("NaN")
)

# Sample-level metadata

Original information taken from the patient metadata DataFrame (`metadata`), joined to each sample via `patient_id`.

In [3]:
# Start the sample-level table: one row per row of `data`, indexed by sample_id,
# with patient_id as its only column.
# NOTE(review): the source column is spelled ' patient_id' (leading space) here —
# confirm this matches the header of the expression file.
metadata_sample = (
    data[['sample_id', ' patient_id']]
    .rename(columns={' patient_id': 'patient_id'})
    .set_index('sample_id')
)

In [4]:
# Attach the columns of `metadata` to every sample through a left join on patient_id.
# sample_id is turned into a regular column for the join and restored as the index afterwards.
metadata_sample = pd.merge(
    metadata_sample.reset_index(),
    metadata,
    how='left',
    left_on='patient_id',
    right_on='patient_id',
).set_index('sample_id')

Information derived from the `sample_id` barcode (tissue source site and sample type).

In [5]:
# tissue source site
# Lookup table of tissue-source-site codes; empty cells become the string "NaN".
tss = pd.read_csv(
    DATA_PATH / 'tcga_code_tables/tissueSourceSite.tsv',
    sep='\t',
).fillna("NaN")

# 1. take the second '-'-separated field of the sample_id as a temporary join key,
# 2. left-join the lookup table on that key ('TSS Code' on the lookup side),
# 3. restore sample_id as the index,
# 4. drop the join keys and the study name (names that are absent are skipped).
metadata_sample = (
    metadata_sample
    .assign(TSS_code=metadata_sample.index.str.split('-').str[1])
    .reset_index()
    .merge(tss, how='left', left_on='TSS_code', right_on='TSS Code')
    .set_index('sample_id')
    .drop(columns=['TSS_code', 'TSS Code', 'Study Name'], errors='ignore')
)

In [6]:
# sample type
# Lookup table of sample-type codes; empty cells become the string "NaN".
st = pd.read_csv(
    DATA_PATH / 'tcga_code_tables/sampleType.tsv',
    sep='\t',
).fillna("NaN")

# 1. take the first two characters of the fourth '-'-separated field of the
#    sample_id and cast them to int as a temporary join key,
# 2. left-join the lookup table on that key ('Code' on the lookup side),
# 3. restore sample_id as the index,
# 4. drop the join keys and the short letter code (absent names are skipped),
# 5. expose the lookup's 'Definition' column as 'sample_type'.
metadata_sample = (
    metadata_sample
    .assign(ST=metadata_sample.index.str.split('-').str[3].str[:2].astype(str).astype(int))
    .reset_index()
    .merge(st, how='left', left_on='ST', right_on='Code')
    .set_index('sample_id')
    .drop(columns=['ST', 'Code', 'Short Letter Code'], errors='ignore')
    .rename(columns={'Definition': 'sample_type'})
)

Additional clinical metadata from the original paper's patient tables.

In [7]:
# Two patient tables shipped with the paper; empty cells become the string "NaN".
HPDF = pd.read_csv(
    DATA_PATH / 'Original_paper_files/1.1_HNSC_Patient_Data_Freeze.tsv',
    sep='\t',
).fillna("NaN")
HP = pd.read_csv(
    DATA_PATH / 'Original_paper_files/1.1_HNSC_Patient.tsv',
    sep='\t',
).fillna("NaN")

# Stack both tables and keep one row per patient barcode. HPDF is listed first,
# so with keep='first' its row wins whenever a barcode occurs in both tables.
HP_all = pd.concat([HPDF, HP]).drop_duplicates(
    subset='bcr_patient_barcode',
    keep='first',
)

# Columns copied onto the sample table; the first entry is the join key.
cols_to_add = [
    # join key
    'bcr_patient_barcode',
    # staging
    'ajcc_clinical_group_stage',
    'ajcc_clinical_primary_tumor_t_stage',
    'ajcc_clinical_primary_tumor_n_stage',
    'ajcc_clinical_primary_tumor_m_stage',
    'tumor_stage',
    # therapy
    'primary_therapy_outcome_success',
    'radiation_therapy',
    'postoperative_rx_tx',
    'history_of_neoadjuvant_treatment',
    # HPV
    'hpv_status_by_ish_testing',
    'hpv_status_by_p16_testing',
    # tobacco / alcohol
    'number_pack_years_smoked',
    'tobacco_smoking_history_indicator',
    'alcohol_history_documented',
    'frequency_of_alcohol_consumption',
    'amount_of_alcohol_consumption_per_day',
    # molecular
    'p53_gene_analysis',
    'egfr_amplication_status',
]

# Left join: sample patient_id against the paper's bcr_patient_barcode.
# sample_id is restored as the index afterwards.
metadata_sample = pd.merge(
    metadata_sample.reset_index(),
    HP_all.loc[:, cols_to_add],
    how='left',
    left_on='patient_id',
    right_on='bcr_patient_barcode',
).set_index('sample_id')

# Output enriched metadata

In [11]:
# Rebuild the combined paper table: one row per patient barcode. HPDF is listed
# first, so with keep='first' its row wins when a barcode occurs in both tables.
HP_all = pd.concat([HPDF, HP]).drop_duplicates(subset='bcr_patient_barcode', keep='first')

# This cell repeats a merge that an earlier cell already performs. Merging a
# second time onto a frame that already holds these columns would make pandas
# emit '_x' / '_y' suffixed duplicates (including 'bcr_patient_barcode_x/_y').
# Dropping any previously merged columns first makes the cell idempotent: the
# result is the same whether or not the paper columns were attached before.
already_merged = [col for col in cols_to_add if col in metadata_sample.columns]

# Left join: sample patient_id against the paper's bcr_patient_barcode;
# sample_id is restored as the index afterwards.
metadata_sample = (
    metadata_sample
    .drop(columns=already_merged)
    .reset_index()
    .merge(
        HP_all[cols_to_add],
        left_on='patient_id',
        right_on='bcr_patient_barcode',
        how='left',
    )
    .set_index('sample_id')
)

In [12]:
# Display the combined paper patient table for inspection (the rendered output is truncated by pandas)
HP_all

Unnamed: 0,bcr_patient_barcode,age_at_initial_pathologic_diagnosis,ajcc_cancer_staging_handbook_edition,ajcc_clinical_group_stage,ajcc_clinical_primary_tumor_m_stage,ajcc_clinical_primary_tumor_n_stage,ajcc_clinical_primary_tumor_t_stage,alcohol_history_documented,amount_of_alcohol_consumption_per_day,anatomic_organ_subdivision,...,race,radiation_therapy,stopped_smoking_year,tissue_prospective_collection_indicator,tissue_retrospective_collection_indicator,tissue_source_site,tobacco_smoking_history_indicator,tumor_stage,tumor_tissue_site,vital_status
0,TCGA-BA-4074,69,6th,Stage IVA,M0,N2c,T3,YES,,,...,WHITE,,,NO,YES,BA,Current smoker,Stage IVA,HEAD/NECK,DECEASED
1,TCGA-BA-4076,39,6th,Stage IVA,M0,N2c,T3,YES,,,...,WHITE,,,NO,YES,BA,Current smoker,,HEAD/NECK,DECEASED
2,TCGA-BA-4077,45,6th,Stage IVB,M0,N3,T4b,YES,0.0,,...,WHITE,,2005.0,NO,YES,BA,Current reformed smoker for < or = 15 years,Stage IVA,HEAD/NECK,DECEASED
3,TCGA-BA-4078,83,6th,Stage IVA,M0,N2a,T2,NO,,,...,WHITE,,1994.0,NO,YES,BA,Current reformed smoker for < or = 15 years,,HEAD/NECK,DECEASED
4,TCGA-BA-5149,47,7th,Stage IVA,M0,N2b,T3,YES,7.0,,...,WHITE,,,YES,NO,BA,Current smoker,Stage IVA,HEAD/NECK,LIVING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TCGA-H7-8502,[Completed],7th,Stage IVA,MX,N2b,T4a,YES,8,[Not Available],...,WHITE,YES,[Not Available],YES,NO,H7,Current smoker,Stage IVA,HEAD/NECK,LIVING
311,TCGA-HD-7917,62,7th,Stage II,M0,N0,T2,YES,12,[Not Available],...,WHITE,[Not Available],[Not Available],YES,NO,HD,Current smoker,Stage II,HEAD/NECK,LIVING
312,TCGA-HD-8224,63,7th,Stage III,M0,N0,T3,YES,5,[Not Available],...,WHITE,[Not Available],[Not Available],YES,NO,HD,Lifelong Non-smoker,Stage IVA,HEAD/NECK,LIVING
313,TCGA-HD-8314,58,7th,Stage III,M0,N1,T1,NO,[Not Available],[Not Available],...,WHITE,NO,[Not Available],YES,NO,HD,Lifelong Non-smoker,Stage III,HEAD/NECK,LIVING
