In [26]:
# Project (cancer type) names offered as choices, kept in alphabetical order.
# sorted() guarantees the ordering even if an entry is later added out of place.
choices = sorted([
    'Acute Myeloid Leukemia',
    'Adrenocortical Carcinoma',
    'Bladder Urothelial Carcinoma',
    'Brain Lower Grade Glioma',
    'Breast Invasive Carcinoma',
    'Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma',
    'Cholangiocarcinoma',
    'Colon Adenocarcinoma',
    'Esophageal Carcinoma',
    'Glioblastoma Multiforme',
    'Head and Neck Squamous Cell Carcinoma',
    'Kidney Chromophobe',
    'Kidney Renal Clear Cell Carcinoma',
    'Kidney Renal Papillary Cell Carcinoma',
    'Liver Hepatocellular Carcinoma',
    'Lung Adenocarcinoma',
    'Lung Squamous Cell Carcinoma',
    'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma',
    'Mesothelioma',
    'Ovarian Serous Cystadenocarcinoma',
    'Pancreatic Adenocarcinoma',
    'Pheochromocytoma and Paraganglioma',
    'Prostate Adenocarcinoma',
    'Rectum Adenocarcinoma',
    'Sarcoma',
    'Skin Cutaneous Melanoma',
    'Stomach Adenocarcinoma',
    'Testicular Germ Cell Tumors',
    'Thymoma',
    'Thyroid Carcinoma',
    'Uterine Carcinosarcoma',
    'Uterine Corpus Endometrial Carcinoma',
    'Uveal Melanoma',
])
print(choices)

['Acute Myeloid Leukemia', 'Adrenocortical Carcinoma', 'Bladder Urothelial Carcinoma', 'Brain Lower Grade Glioma', 'Breast Invasive Carcinoma', 'Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma', 'Cholangiocarcinoma', 'Colon Adenocarcinoma', 'Esophageal Carcinoma', 'Glioblastoma Multiforme', 'Head and Neck Squamous Cell Carcinoma', 'Kidney Chromophobe', 'Kidney Renal Clear Cell Carcinoma', 'Kidney Renal Papillary Cell Carcinoma', 'Liver Hepatocellular Carcinoma', 'Lung Adenocarcinoma', 'Lung Squamous Cell Carcinoma', 'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma', 'Mesothelioma', 'Ovarian Serous Cystadenocarcinoma', 'Pancreatic Adenocarcinoma', 'Pheochromocytoma and Paraganglioma', 'Prostate Adenocarcinoma', 'Rectum Adenocarcinoma', 'Sarcoma', 'Skin Cutaneous Melanoma', 'Stomach Adenocarcinoma', 'Testicular Germ Cell Tumors', 'Thymoma', 'Thyroid Carcinoma', 'Uterine Carcinosarcoma', 'Uterine Corpus Endometrial Carcinoma', 'Uveal Melanoma']


In [1]:
# appyter init
# Hands appyter's `magic` module a zero-argument callable that returns this
# notebook's global namespace: `globals` is bound as the lambda's default
# argument and is invoked when the lambda is called without arguments.
# NOTE(review): what `magic.init` does with that namespace is defined by appyter
# and is not visible here; presumably it registers the appyter cell magics. Confirm.
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import requests
import json
import re
import pandas as pd
import gzip
import numpy as np
from tqdm import tqdm
import io

In [3]:
# GDC API root; every resource endpoint is the root plus '<resource>/'
base_url = 'https://api.gdc.cancer.gov/'
files_endpt, genes_endpt, cases_endpt, data_endpt = (
    base_url + resource + '/' for resource in ('files', 'genes', 'cases', 'data')
)

# Kind of quantification file this notebook is interested in
data_type = "htseq.counts"

# Cancer type label
# NOTE(review): the file query further down hard-codes its own project name and
# file-name pattern rather than reading `cancer_type` / `data_type`. Confirm
# whether these two settings are meant to drive that query.
cancer_type = 'Mixed germ cell tumor'

In [4]:
# Query filter: an "and" of a single clause that keeps only records whose
# project.program.name is in ["TCGA"].
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": "project.program.name", "value": ["TCGA"]}},
    ],
}

In [5]:
# Query the GDC cases endpoint for a facet over project.name, restricted by
# `filters`. "size" is 0 because only the facet aggregation is read afterwards
# (data['data']['aggregations']), not individual case hits.
params = {
    'filters': json.dumps(filters),
    "size": 0,
    "facets": "project.name",
}
response = requests.get('https://api.gdc.cancer.gov/cases', params=params)
# Stop here on an HTTP error instead of passing an error payload on to the
# cell that indexes into the aggregation buckets.
response.raise_for_status()
data = response.json()

In [6]:
# Write every project name from the facet buckets to programs.txt, one per line
with open('programs.txt', 'w') as out_file:
    project_buckets = data['data']['aggregations']['project.name']['buckets']
    out_file.writelines(str(bucket['key']) + '\n' for bucket in project_buckets)

## File Name and IDs

In [7]:
# Only the owning case id is requested for each file hit
fields = ['cases.case_id']

# File query: every clause is an "in" test on one field against a single
# allowed value, and all clauses must hold ("and").
filters = {
    'op': 'and',
    'content': [
        {'op': 'in', 'content': {'field': field_name, 'value': [allowed]}}
        for field_name, allowed in (
            ('experimental_strategy', 'RNA-Seq'),
            ('access', 'open'),
            ('file_name', '*htseq.counts.gz'),
            ('cases.project.name', 'Cholangiocarcinoma'),
        )
    ],
}

In [8]:
# Ask the GDC files endpoint for all files matching `filters`, returning only
# the requested `fields`. The large "size" is there so the hits come back in a
# single response rather than a default-sized page.
params = {
    'fields': ','.join(fields),
    'filters': json.dumps(filters),
    'size': 100000,
    'facets': 'cases.case_id',
}
response = requests.get('https://api.gdc.cancer.gov/files', params=params)
# Surface HTTP failures directly rather than as a KeyError on ['data'] below
response.raise_for_status()
data = response.json()['data']['hits']

In [9]:
# One row per file hit: the file id paired with the id of the first case
# attached to that file; the file id becomes the index.
files = pd.DataFrame(
    [(hit['id'], hit['cases'][0]['case_id']) for hit in data],
    columns=['file_id', 'case_id'],
)
files = files.set_index('file_id')
print(files.shape)
files.head()

(3, 1)


Unnamed: 0_level_0,case_id
file_id,Unnamed: 1_level_1
7845948f-701e-49c5-8b76-2f0e2f0d5a76,b10c64c2-7fd2-4210-b975-034affb14b57
afecdda2-735c-4304-a087-ef917ad9cd5a,20bf79af-3b0f-477d-b619-5597d42f5d5e
fd0ea67b-5b75-471f-be3c-a92142b91cf3,adc9a685-5b4a-4c29-881a-da1895c47520


In [10]:
# Download every count file and assemble a matrix with one column per file,
# labelled with the file's case id and indexed by the first column of the file.
count_columns = []

for file_id, case_id in tqdm(files['case_id'].items(), total=len(files), unit='samples'):
    response = requests.get(data_endpt + file_id, headers = {"Content-Type": "application/json"})
    # Without this check a failed download only shows up as an opaque gzip
    # error when the error body is decompressed.
    response.raise_for_status()
    string_data = io.StringIO(gzip.decompress(response.content).decode('utf-8'))
    # The files are header-less, tab-separated two-column tables.
    # ('ENSMBL ID' spelling is kept as-is: it becomes the index name.)
    count_columns.append(
        pd.read_csv(string_data, sep='\t', header=None, names=['ENSMBL ID', case_id], index_col=0)
    )

# Concatenate once at the end: concatenating inside the loop re-copies the
# growing frame on every iteration. An empty file list still produces an
# empty frame, as before.
matrix = pd.concat(count_columns, axis=1) if count_columns else pd.DataFrame()
matrix.head()

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0_level_0,b10c64c2-7fd2-4210-b975-034affb14b57,20bf79af-3b0f-477d-b619-5597d42f5d5e,adc9a685-5b4a-4c29-881a-da1895c47520
ENSMBL ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000003.13,7542,4802,8150
ENSG00000000005.5,0,1,0
ENSG00000000419.11,1121,1198,1770
ENSG00000000457.12,403,1099,1202
ENSG00000000460.15,127,290,378


In [21]:
# Keep only the text before the first '.' of every index label
# (e.g. 'ENSG00000000003.13' -> 'ENSG00000000003'); labels without a '.'
# are left unchanged.
matrix.index = matrix.index.map(lambda gene_id: gene_id.partition('.')[0])
matrix.head()

Unnamed: 0_level_0,b10c64c2-7fd2-4210-b975-034affb14b57,20bf79af-3b0f-477d-b619-5597d42f5d5e,adc9a685-5b4a-4c29-881a-da1895c47520
ENSMBL ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000003,7542,4802,8150
ENSG00000000005,0,1,0
ENSG00000000419,1121,1198,1770
ENSG00000000457,403,1099,1202
ENSG00000000460,127,290,378


In [11]:
# Fetch the list of field names exposed by the cases endpoint's `_mapping`
# resource, failing fast on an HTTP error instead of a KeyError on 'fields'.
mapping_response = requests.get(cases_endpt + '_mapping')
mapping_response.raise_for_status()
cases_fields = mapping_response.json()['fields']

# Keep the fields whose name mentions 'demographic' or 'diagnoses', leaving
# out anything that mentions 'treatment'.
keyfields = [
    field for field in cases_fields
    if ('demographic' in field or 'diagnoses' in field) and 'treatment' not in field
]

In [12]:
# Collect one metadata record per distinct case: the case id, its demographic
# fields, and the fields of its first diagnosis entry.
sample_records = []
for case_id in tqdm(files['case_id'].drop_duplicates(), unit='cases'):
    response = requests.get(cases_endpt + case_id, params={'fields': ','.join(keyfields)})
    # Fail on HTTP errors here rather than on a missing 'data' key below
    response.raise_for_status()
    data = response.json()['data']
    # A case without demographic or diagnosis information still gets a row
    # (with missing values) instead of aborting the whole loop.
    demographic = data.get('demographic') or {}
    diagnoses = data.get('diagnoses') or [{}]
    sample_records.append({'case_id': case_id, **demographic, **diagnoses[0]})

# Build the frame once from all records instead of concatenating row by row,
# which re-copies the accumulated frame on every iteration.
if sample_records:
    sample_meta = pd.DataFrame(sample_records)
else:
    # No cases at all: keep a 'case_id' column so set_index below still works
    sample_meta = pd.DataFrame(columns=['case_id'])
sample_meta = sample_meta.set_index('case_id')
sample_meta.head()

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0_level_0,gender,demographic_id,vital_status,updated_datetime,race,ethnicity,submitter_id,year_of_death,year_of_birth,created_datetime,...,ajcc_pathologic_stage,morphology,ajcc_pathologic_t,prior_malignancy,days_to_diagnosis,year_of_diagnosis,days_to_last_follow_up,ajcc_staging_system_edition,primary_diagnosis,days_to_death
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b10c64c2-7fd2-4210-b975-034affb14b57,male,f0bb0161-a0c0-5d1b-b18b-930781523195,Alive,2019-08-08T16:33:18.272224-05:00,white,not hispanic or latino,TCGA-4G-AAZT_diagnosis,,1951,,...,Stage I,8160/3,T1,no,0,2013,420,7th,Cholangiocarcinoma,
20bf79af-3b0f-477d-b619-5597d42f5d5e,female,f1b3e939-c2b1-5652-bc5b-db298895f2be,Alive,2019-08-08T16:33:18.272224-05:00,white,not hispanic or latino,TCGA-W5-AA2R_diagnosis,,1929,,...,Stage I,8160/3,T1,no,0,2006,1542,6th,Cholangiocarcinoma,
adc9a685-5b4a-4c29-881a-da1895c47520,female,42a98d55-df9c-511a-9467-5da4039dda49,Dead,2019-08-08T16:33:18.272224-05:00,white,not hispanic or latino,TCGA-W5-AA34_diagnosis,,1937,,...,Stage I,8160/3,T1,no,0,2012,168,7th,Cholangiocarcinoma,555.0


In [15]:
import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [17]:
# Request lookups from the harmonizome helper with mapfrom='ensembl'; the call
# returns a pair, of which only the first element is kept as `sym_lookup`.
# NOTE(review): presumably `sym_lookup` maps Ensembl gene IDs to gene symbols;
# confirm against harmonizome.lookup.get_lookups.
# NOTE(review): binding the discarded value to `_` rebinds the name IPython
# otherwise uses for the most recent cell output.
sym_lookup, _ = lookup.get_lookups(mapfrom='ensembl')

Gathering sources: 100%|██████████| 3/3 [00:09<00:00,  3.17s/it]


In [22]:
# Apply the symbol lookup to `matrix`; the call's return value is displayed as
# this cell's output.
# NOTE(review): the return value is not assigned. If `uf.map_symbols` returns a
# new frame rather than modifying `matrix` in place, `matrix` keeps its original
# index labels after this cell. Confirm against
# harmonizome.utility_functions.map_symbols.
uf.map_symbols(matrix, sym_lookup)

100%|██████████| 60488/60488 [00:00<00:00, 492238.35it/s]


Unnamed: 0_level_0,b10c64c2-7fd2-4210-b975-034affb14b57,20bf79af-3b0f-477d-b619-5597d42f5d5e,adc9a685-5b4a-4c29-881a-da1895c47520
ENSMBL ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TSPAN6,7542,4802,8150
TNMD,0,1,0
DPM1,1121,1198,1770
SCYL3,403,1099,1202
C1orf112,127,290,378
...,...,...,...
SNORD38B,0,0,0
PAUPAR,0,0,0
GIMAP1-GIMAP5,0,0,0
SNORA50A,0,0,0
