In [12]:
from bravado.client import SwaggerClient
import pandas as pd

In [13]:
# Build the cBioPortal API client from its published Swagger spec.
# Request, response and spec validation are all switched off.
cbioportal = SwaggerClient.from_url(
    'https://www.cbioportal.org/api/v2/api-docs',
    config=dict(
        validate_requests=False,
        validate_responses=False,
        validate_swagger_spec=False,
    ),
)

In [14]:
# read in the genes from the genes.txt file (one gene identifier per line)
with open("genes.txt", 'r') as f:
    # skip blank lines so an empty string is never sent to the API as a geneId
    genes = [line.strip() for line in f if line.strip()]
# map each gene's Entrez ID (looked up through the portal) back to the
# identifier exactly as it was written in genes.txt
entrezGeneIds = {cbioportal.Genes.getGeneUsingGET(geneId=gene).result().entrezGeneId: gene for gene in genes}

In [15]:
# read in the cancers from the cancers.txt file (one study prefix per line)
with open("cancers.txt", 'r') as f:
    # skip blank lines so no study id is later built from an empty prefix
    cancers = [line.strip() for line in f if line.strip()]

In [16]:
# One-off query: fetch every sample of the BRCA pan-cancer-atlas study.
# Note that `samples` is reassigned for each study by the loop in the next cell.
samples = cbioportal.Samples.getAllSamplesInStudyUsingGET(
    studyId='BRCA_tcga_pan_can_atlas_2018'
).response().result

In [18]:
# For every cancer study, build a binary sample-by-gene mutation matrix
# (1 = at least one mutation reported for that gene in that sample) and save it as CSV.
for cancer in cancers:
    study_id = f'{cancer}_tcga_pan_can_atlas_2018'

    # get the entire list of sampleIDs in the tcga cancer study
    samples = cbioportal.Samples.getAllSamplesInStudyUsingGET(studyId=study_id).response().result
    sample_patient = {sample['sampleId']: sample['patientId'] for sample in samples}

    # create a dataframe with the sampleId as the index and one column per gene, all values set to 0
    df = pd.DataFrame(0, index=list(sample_patient), columns=list(entrezGeneIds.values()))

    # put patient_id in front of the gene columns; the values follow the same
    # order as the index because both come from the same dict
    df.insert(0, 'patient_id', list(sample_patient.values()))

    # query the portal for the mutations for the cancer study
    muts = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
        molecularProfileId=f'{study_id}_mutations', # {study_id}_mutations gives default mutations profile for study
        sampleListId=f'{study_id}_all', # {study_id}_all includes all samples
        projection="DETAILED" # include gene info
        ).result()

    # load the mutations into the dataframe
    for m in muts:
        gene = entrezGeneIds.get(m["entrezGeneId"])
        # only add the mutation if the gene is one we track and the sample exists in the dataframe
        if gene is not None and m['sampleId'] in sample_patient:
            # single-step .at assignment: the chained form df.loc[row][col] = 1 writes to a
            # temporary copy of the row on this mixed-dtype frame, leaving df unchanged
            df.at[m['sampleId'], gene] = 1

    # save the dataframe to a csv file
    df.to_csv(f"queried_data/mutation_matrices/{cancer}_mut_matrix.csv")