In [None]:
import pandas as pd
import urllib.request
import zipfile
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
    
    
def fst_to_pandas(file_path):
    """Read an R ``fst`` file and return it converted with rpy2's pandas converter.

    The file is read inside the embedded R session with ``read_fst`` from the
    ``fst`` R package (attached via ``library(fst)``), and the resulting R
    object is converted to its Python counterpart.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the ``.fst`` file.

    Returns
    -------
    The converted object (presumably a ``pandas.DataFrame`` -- confirm against
    the rpy2 version in use).
    """
    # Hand the path to R as a function argument instead of pasting it into the
    # R source text: a path containing a quote or a backslash would otherwise
    # break (or change the meaning of) the generated R code. Evaluating inside
    # an R function also keeps the temporary result out of R's global
    # environment.
    read_fst = robjects.r("""
                function(path) {
                    library(fst)
                    read_fst(path)
                }
                """)
    r_df = read_fst(str(file_path))
    with (robjects.default_converter + pandas2ri.converter).context():
        return robjects.conversion.get_conversion().rpy2py(r_df)


def rda_to_pandas(file_path, object_name='mutTable'):
    """Load one object from an R ``.rda`` file and convert it with rpy2's pandas converter.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the ``.rda`` / ``.RData`` file.
    object_name : str, optional
        Name of the R object to extract from the file. Defaults to
        ``'mutTable'``, the name this function originally hard-coded.

    Returns
    -------
    The converted object (presumably a ``pandas.DataFrame`` -- confirm against
    the rpy2 version in use).

    Notes
    -----
    If the file does not contain ``object_name`` the R side calls ``stop()``;
    rpy2 is expected to surface that as its R runtime error -- TODO confirm
    the exact exception class for the installed rpy2 version.
    """
    # The file is loaded into a fresh R environment rather than the global
    # one, so a same-named object left over from an earlier load() can never
    # be returned by mistake, and nothing leaks into the R session.
    # The path and object name travel as function arguments, not as spliced
    # source text, so quotes/backslashes in the path are harmless.
    load_object = robjects.r("""
                function(path, name) {
                    env <- new.env()
                    load(path, envir = env)
                    if (!exists(name, envir = env, inherits = FALSE)) {
                        stop("object '", name, "' not found in ", path)
                    }
                    get(name, envir = env, inherits = FALSE)
                }
                """)
    r_df = load_object(str(file_path), object_name)
    with (robjects.default_converter + pandas2ri.converter).context():
        return robjects.conversion.get_conversion().rpy2py(r_df)


def rds_to_pandas(file_path):
    """Read an R ``.rds`` file and return it converted with rpy2's pandas converter.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the ``.rds`` file.

    Returns
    -------
    The converted object (presumably a ``pandas.DataFrame`` when the file
    holds a data frame -- confirm against the rpy2 version in use).
    """
    # Look up R's readRDS and call it with the path as an argument. Building
    # the R source by string substitution would break on paths containing a
    # quote or a backslash, and would leave a stray `df` in R's global
    # environment.
    read_rds = robjects.r("readRDS")
    r_df = read_rds(str(file_path))
    with (robjects.default_converter + pandas2ri.converter).context():
        return robjects.conversion.get_conversion().rpy2py(r_df)

In [None]:
# Zenodo records holding the data and figure code for the three papers.

# primary paper: https://pubmed.ncbi.nlm.nih.gov/37046096/
zenodo_primary_url = 'https://zenodo.org/records/7822002'

# mets paper: https://pubmed.ncbi.nlm.nih.gov/37046095/
zenodo_mets_url = 'https://zenodo.org/records/7649257'

# transcriptomics paper: https://pubmed.ncbi.nlm.nih.gov/37046093/
zenodo_transcriptomics_url = 'https://zenodo.org/records/7603386'

'''
Nomenclature:
'Sample' (sometimes 'region') refers to a single biopsy or resection sample from a tumour. 
In TRACERx we typically have multiple samples from a single tumour and often multiple tumours from a single patient
i.e. multiple samples from the primary site and then multiple samples from metastatic sites if patient has metastatic disease and we followed them up

TRACERx naming convention:

CRUK0001_SU_T1-R1
<CRUKID>_<timepoint>_<site>_<region(sample)>
BS	Before surgery	
SU	Surgery	primary	
FU After Surgery/Adjuvant Chemo	
FR	First Recurrence
BR	Biopsy After Recurrence
BP	Biopsy After Progression	
MR	Metastasectomy After Recurrence
BP	Biopsy After First Progression
TH	Tumour Harvest		


Region Codes
Code	TissueType
T1-R1	Tumour 1 region 1
T2-R1	Tumour 2 region 1
T2-R2	Tumour 2 region 2
T<A>-R<B>	Tumour <A> region <B>
T1-N1	Tumour 1 normal region 1
T2-N2	Tumour 2 normal region 2
LN1	Lymph node 1
LN2	Lymph node 2
LN<x>	Lymph node <x>
FT1	FFPE Tumour 1
FT2	FFPE Tumour 2
FT<x>	FFPE Tumour <x>
FLN1	FFPE Lymph node 1
FLN2	FFPE Lymph node 2
FLN<x>	FFPE Lymph node <x>
T1-FR1	Tumour 1 FFPE region 1
T<A>-FR<B>	Tumour <A> FFPE region <B>
'''

In [None]:
# download data:
def _download_and_extract(url, zip_path, extract_dir):
    """Fetch the zip archive at ``url`` (unless already on disk) and unpack it.

    Parameters
    ----------
    url : str
        Address of the zip archive.
    zip_path : str
        Local file name the archive is stored under.
    extract_dir : str
        Directory the archive contents are extracted into.
    """
    # Skip the download when a readable zip is already present, so that
    # re-running the notebook does not fetch the archives again.
    # zipfile.is_zipfile is expected to return False for a missing or
    # incomplete file, in which case the archive is downloaded afresh.
    if not zipfile.is_zipfile(zip_path):
        urllib.request.urlretrieve(url, zip_path)
    # Extraction is repeated on every run; it overwrites files in place.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)


# primary data
primary_code_url = 'https://zenodo.org/records/7822002/files/figurecode.zip'
_download_and_extract(primary_code_url, 'primary_figurecode.zip', 'primary')
# mets data
mets_code_url = 'https://zenodo.org/records/7649257/files/metsFigures.zip'
_download_and_extract(mets_code_url, 'mets_figurecode.zip', 'mets')
# transcriptomics data
transcriptomics_code_url = 'https://zenodo.org/records/7603386/files/transcriptomics_scripts_data_20230330.zip'
_download_and_extract(transcriptomics_code_url, 'transcriptomics_figurecode.zip', 'transcriptomics')
    


In [None]:
# load mutation tables:
mut_table_path_primary = 'primary/figurecode/data/20221109_TRACERx421_mutation_table.fst'
mut_table_primary = fst_to_pandas(mut_table_path_primary)

mut_table_path_mets = 'mets/metsFigures/data/patientMutTable.20220726.rda'
mut_table_mets = rda_to_pandas(mut_table_path_mets)

# These tables list each detected mutation.
# - 'Is.present' can be used to filter out mutations that were detected only in
#   some samples (subclonal mutations).
# - 'func' and 'exonic.func' give the type of mutation (e.g. missense, nonsense, etc).
# - 'DriverMut' tells you whether the mutation is a known driver mutation.
# Using DriverMut we keep only the rows for driver mutations in KRAS:
mut_table_KRAS_driver_primary = mut_table_primary.loc[
    (mut_table_primary['Hugo_Symbol'] == 'KRAS') & (mut_table_primary['DriverMut'])
]
mut_table_KRAS_driver_mets = mut_table_mets.loc[
    (mut_table_mets['Hugo_Symbol'] == 'KRAS') & (mut_table_mets['DriverMut'])
]


In [None]:
# get clinical and histological data:
#
# A few rare patients had multiple primary tumours (e.g. CRUK0881). That is why
# this table has both tumour_id_muttable_cruk and cruk_id columns, with
# corresponding tumour_id and patient_id columns in the mutation tables.
# For most patients the two identifiers are the same.
clinical_data_path_primary = 'primary/figurecode/data/20221109_TRACERx421_all_tumour_df.rds'
clinical_data_primary = rds_to_pandas(clinical_data_path_primary)

In [None]:
# keep only relevant columns; histology_3 is the simplified histology classification
histology = clinical_data_primary.loc[
    :, ['tumour_id_muttable_cruk', 'histology_3', 'Histology_per_tumour_id_muttable']
]
histology

In [None]:

# open expression matrix (genes as rows, indexed by gene_id):
# read counts
em_path = 'transcriptomics/transcriptomics_scripts_data_updated/20221014_transcriptomic_DATA/2022-10-17_rsem_counts_mat.fst'
# or tpm:
# em_path = 'transcriptomics/transcriptomics_scripts_data_updated/20221014_transcriptomic_DATA/2022-10-17_rsem_tpm_mat.fst'
em = fst_to_pandas(em_path).set_index('gene_id')
em

In [None]:
def _columns_for_patients(columns, patient_ids):
    """Return the entries of ``columns`` whose name contains any of ``patient_ids``.

    Matching is by substring (``patient_id in column_name``) and the order of
    ``columns`` is preserved.

    Parameters
    ----------
    columns : iterable of str
        Column names to filter.
    patient_ids : iterable
        Patient identifiers; duplicates and non-string entries are ignored.

    Returns
    -------
    list of str
    """
    # The id columns repeat each patient once per mutation row, so de-duplicate
    # before scanning. Non-string entries (e.g. missing values) are dropped
    # because `x in some_string` raises TypeError for a non-string x.
    unique_ids = {pid for pid in patient_ids if isinstance(pid, str)}
    # any() stops at the first matching id instead of building a full list.
    return [col for col in columns if any(pid in col for pid in unique_ids)]


# keep only patients with KRAS mutation
primary_KRAS_mut = mut_table_KRAS_driver_primary['patient_id']
primary_KRAS_mut_em = em[_columns_for_patients(em.columns, primary_KRAS_mut)]
mets_KRAS_mut = mut_table_KRAS_driver_mets['patient_id']
mets_KRAS_mut_em = em[_columns_for_patients(em.columns, mets_KRAS_mut)]
# keep only mets LUAD patients:
LUAD_patients = clinical_data_primary[clinical_data_primary['histology_3']=='LUAD']['cruk_id']
LUAD_mets_KRAS_mut_em = mets_KRAS_mut_em[_columns_for_patients(mets_KRAS_mut_em.columns, LUAD_patients)]

In [None]:
# display the expression matrix restricted to the selected KRAS-mutant LUAD columns
LUAD_mets_KRAS_mut_em

In [None]:
# count patients: the patient id is the part of each column name before the first '_'
len({column.split('_')[0] for column in LUAD_mets_KRAS_mut_em.columns})
