In [5]:
import requests
import json
import os
import sys
import re
import gzip
import shutil
import tarfile
import pathlib
import pandas as pd
import numpy as np
from maayanlab_bioinformatics.harmonization import ncbi_genes
import math

sys.path.append("..") # moved to notebooks directory

<module 'posixpath' from '/Users/nicolemoiseyev/opt/anaconda3/envs/TCGA/lib/python3.8/posixpath.py'>

In [3]:
# Endpoints
base_url = 'https://api.gdc.cancer.gov/'
files_endpt = base_url + 'files/'
genes_endpt = base_url + 'genes/'
cases_endpt = base_url + 'cases/'
data_endpt = base_url + "data/"

In [None]:
def upload_to_aws(local_file, object_name=None):

    s3 = boto3.client('s3', aws_access_key_id=config.ACCESS_KEY,
                      aws_secret_access_key=config.SECRET_KEY)
    
    if (object_name == None):
        object_name = local_file

    s3.upload_file(
        local_file, config.BUCKET, object_name
    )
    
# upload_to_aws("./data.csv", "data.csv")

In [5]:
# Get info on properties of the returned objects for an endpoint

# files:

# data = requests.get(files_endpt + "_mapping").json()["fields"]
# print(json.dumps(data, indent=2))

In [8]:
def download_from_aws(object_name, location = None):
    
    s3 = boto3.client('s3', aws_access_key_id=config.ACCESS_KEY,
                      aws_secret_access_key=config.SECRET_KEY)
    
    if (location == None):
        location = "./" + object_name
    
    s3.download_file(
        config.BUCKET, object_name, location
    )

# os.makedirs("AWS_test", exist_ok=True)
# download_from_aws("data.csv", "AWS_test/data.csv")

# RNA-Seq Data
---

For now, we'll collect data for one cancer type to save space.

In [8]:
'''
{
            "op": "in",
            "content":
             {
                 "field": "cases.diagnoses.primary_diagnosis", 
                 "value": [],
             }
         }
'''
# Retrieve data from the TCGA API

# data type of files we want
data_type = "htseq.counts"

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = "file_id,file_name,cases.case_id,cases.diagnoses.primary_diagnosis"

# filter files for only RNA-Seq results
filters = {
    "op": "and",
     "content":[
         {
            "op": "in",
            "content":
             {
                 "field": "files.experimental_strategy", 
                 "value": ["RNA-Seq"],
             }
         },
         {
            "op": "in",
            "content":
             {
                 "field": "access", 
                 "value": "open",
                 
             }
         },
         {
            "op": "in",
            "content":
             {
                 "field": "files.file_name", 
                 "value": ["*htseq.counts.gz"],
             }
         },
     ],
}

# build parameters object
params = {
    "fields": fields,
    "filters": json.dumps(filters),
    "size": 100000
}

# get list of all files with RNA-seq results
response = requests.get(files_endpt, params = params) # optionally also provide params argument
data = json.loads(response.content.decode("utf-8"))

# get list of results
results = data["data"]["hits"]
#print(results)
#results = list(filter(lambda x: data_type in x["file_name"], results))

file_uuid_list = [ entry["file_id"] for entry in results]
case_uuid_list = [ entry["cases"][0]["case_id"] for entry in results]
print(f'{len(file_uuid_list)} unique files')
print(f'{len(set(case_uuid_list))} unique cases')


df_files_cases=pd.DataFrame({"case": case_uuid_list },  index=file_uuid_list)
file_to_case = df_files_cases.to_dict()["case"] # dict mapping file id to case id
# df_files_cases.head()

11280 unique files
10393 unique cases


In [17]:
df_files_cases.to_csv('files_to_cases.csv', encoding='utf-8')

In [29]:
cancer_types = []

for entry in results:
    if "diagnoses" in entry["cases"][0]:  
        cancer_types.append(entry["cases"][0]["diagnoses"][0]["primary_diagnosis"])
    

cancer_types = list(set(cancer_types))
print(cancer_types)

['Hepatocellular carcinoma, spindle cell variant', 'Pleomorphic carcinoma', 'Adrenal cortical carcinoma', 'Amelanotic melanoma', 'Basaloid squamous cell carcinoma', 'Adenosquamous carcinoma', 'Squamous cell carcinoma, small cell, nonkeratinizing', 'Papillary adenocarcinoma, NOS', 'Superficial spreading melanoma', 'Pheochromocytoma, malignant', 'Mucinous adenocarcinoma, endocervical type', 'Thymoma, type B1, NOS', 'Neuroendocrine carcinoma, NOS', 'Epithelioid mesothelioma, malignant', 'Thymoma, type B2, malignant', 'Medullary carcinoma, NOS', 'Paget disease and infiltrating duct carcinoma of breast', 'Oligodendroglioma, anaplastic', 'Squamous cell carcinoma, keratinizing, NOS', 'Phyllodes tumor, malignant', 'Intraductal micropapillary carcinoma', 'Leiomyosarcoma, NOS', 'Apocrine adenocarcinoma', 'Large cell neuroendocrine carcinoma', 'Papillary serous cystadenocarcinoma', 'Thymoma, type B3, malignant', 'Solid carcinoma, NOS', 'Oligodendroglioma, NOS', 'Mixed epithelioid and spindle cell

In [13]:
# generate the manifest file
manifest_endpt = "https://api.gdc.cancer.gov/v0/manifest"

params = {"ids": file_uuid_list, "return_type": "manifest"}


response = requests.post(manifest_endpt, data=json.dumps(params), headers={
                            "Content-Type": "application/json"})
                        

# optionally also provide params argument


In [16]:
# filename is found in the Content-Disposition header of response
response_head_cd = response.headers["Content-Disposition"]
file_name = re.findall("filename=(.+)", response_head_cd)[0]

downloads_folder = "manifest_downloads/"
os.makedirs(downloads_folder, exist_ok = True)


# Save manifest file to manifest_downloads folder
with open(downloads_folder + file_name, "wb") as f_out:
    f_out.write(response.content)

In [9]:
params = {"ids": file_uuid_list, "return_type": "manifest"}

# A POST is used, so the filter parameters can be passed directly as a Dict object.
response = requests.post(data_endpt,
                        data = json.dumps(params),
                        headers={
                            "Content-Type": "application/json"})

# filename is found in the Content-Disposition header of response
response_head_cd = response.headers["Content-Disposition"]
file_name = re.findall("filename=(.+)", response_head_cd)[0]

downloads_folder = "TCGA_downloads/"

# Save .tar.gz zipped file to TCGA_downloads folder
with open(downloads_folder + file_name, "wb") as f_out:
    f_out.write(response.content)

KeyboardInterrupt: 

In [6]:
'''
# extract the root tar archive
tar = tarfile.open(downloads_folder + file_name, "r:gz")
tar.extractall("./{}".format(downloads_folder))
folder = file_name.split(".tar.gz")[0]

for tarinfo in tar:
    if (tarinfo.name == "MANIFEST.txt"): continue
    file_id = tarinfo.name.split("/")[0]
    
    # unzip inner .gz files
    with gzip.open(downloads_folder + tarinfo.name, "rb") as f_in:
        with open("data/{}.txt".format(file_to_case[file_id]), "wb") as f_out:
            f_out.write(f_in.read())

tar.close()

'''

# initialize empty df
df = pd.DataFrame({"gene": []})
df = df.set_index("gene")

size = 0
# loop over directories
for file_id in os.listdir("TCGA_downloads"):
    with gzip.open(f"TCGA_downloads/{file_id}", "rb") as f_in:
        new_df = pd.read_csv(f_in, sep = "\t", header = None)
        file_id = re.findall("data/(.+).txt", f_in.name)[0]
        new_df.columns = ["gene", file_id]
        new_df.gene.replace(to_replace = r'\..*$', value = "", regex=True, 
           inplace=True) # collapse all versions of same gene to one gene
        new_df = new_df.set_index("gene")
        df = pd.DataFrame.merge(df, new_df, how="outer", left_on = "gene", right_on = "gene")

# drop rows not corresponding to genes        
non_genes = list(filter(lambda val: not "ENSG" in val,np.array(df.index.values))) 
df = df.drop(non_genes)


In [7]:
# initialize empty df
df = pd.DataFrame({"gene": []})
df = df.set_index("gene")

size = 0
# loop over files, merging with pre-existing data
for file in pathlib.Path('data').glob('*.txt'):
    with open(file, "rb") as f_in:
        new_df = pd.read_csv(f_in, sep = "\t", header = None)
        file_id = re.findall("data/(.+).txt", f_in.name)[0]
        new_df.columns = ["gene", file_id]
        new_df.gene.replace(to_replace = r'\..*$', value = "", regex=True, 
           inplace=True) # collapse all versions of same gene to one gene
        new_df = new_df.set_index("gene")
        df = pd.DataFrame.merge(df, new_df, how="outer", left_on = "gene", right_on = "gene")

# drop rows not corresponding to genes        
non_genes = list(filter(lambda val: not "ENSG" in val,np.array(df.index.values))) 
df = df.drop(non_genes)

In [8]:
df.head(10)

Unnamed: 0_level_0,54ac877a-52ff-450c-9df9-b7cd3fc8e2e2,16e69011-c295-479f-b521-86e66fba498d,b60f22ac-a659-4f33-b01d-820e86a9a5c9,3bce9803-e74c-462e-bd40-8ec337a66f3a,1a094c2b-9fba-4bb1-929d-bb2cdf874bee,2b14123b-8fcd-402c-9399-4e7c47f20252,f58e49fc-9641-4b46-be4c-766445ec70a3,6a1be87b-c4e0-4fd4-b050-50a245b22038,24118b8f-59fe-482a-aed3-7b1bde1571b7,7413c891-f74e-456f-bc70-69c83fb53c70,...,0d23fa5b-95f8-4626-a4d2-c72ad3cc553b,4160e048-f0b0-40f5-805b-e277a5893a3b,0a2d29de-869a-4dc8-ad11-6ee0d0a3a895,b8023162-5e82-40e6-ad8c-8acf81821f01,eec69ea9-ceb8-456a-aff4-41fb8a261e36,5503b3a1-7d03-41b2-9ec8-e478c5414d67,79fd602b-3e8e-4353-aa78-4f5f170b607d,07a859fc-6f78-4905-9035-90e2403dbe8d,f9c835db-2ab6-4bf5-826f-48723493c0ec,a782352e-c4bb-4c3e-ba82-456a47c3689a
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,1005,1234,11822,2648,3139,1928,2221,13405,2025,7594,...,3071,4105,3007,6136,8602,4884,2830,5286,3635,4420
ENSG00000000005,2,0,10,7,4,3,1,2,2,6,...,2,3,0,8,32,2,3,4,3,0
ENSG00000000419,232,449,2021,2047,1090,1863,2849,6176,663,3590,...,1399,3706,2297,2954,3094,5054,1045,3846,5822,4038
ENSG00000000457,115,356,1157,453,404,651,695,1060,315,1410,...,759,653,975,1194,1407,890,718,981,1033,978
ENSG00000000460,10,209,1421,412,209,537,448,693,226,634,...,201,419,777,740,1525,626,198,753,915,724
ENSG00000000938,17,994,93,1030,32,489,386,298,149,1055,...,94,224,684,127,1226,1093,332,527,287,477
ENSG00000000971,491,5369,788,1797,166,2699,1604,8482,393,727,...,7588,878,2083,405,3430,9918,5980,4381,2322,2536
ENSG00000001036,1105,2574,3808,4219,804,3926,2715,4154,1374,3310,...,1880,3021,2682,5720,6115,4445,2319,4660,2768,3301
ENSG00000001084,229,893,1914,1436,384,1665,1782,2435,805,2761,...,1649,1909,1200,4457,3229,2070,1446,2462,2159,2292
ENSG00000001167,433,622,4450,1963,1165,1882,2162,1938,868,2954,...,570,1573,1391,3964,4094,2381,1023,2272,1498,2319


In [9]:
# API only accepts 1000 ids per call

# Convert Ensembl gene IDs to Entrez gene symbols
# mygene_endpt = "http://mygene.info/v3/gene/"
# headers = {'content-type': 'application/x-www-form-urlencoded'}

# ensembl_ids = df.index.to_list()
# build parameters object

# ids = ",".join(ensembl_ids)
# species = "human"
# fields = ",".join(["symbol","name","taxid","entrezgene"])

# params = "ids={}&species={}&fields={}".format(ids,species,fields)
# response = requests.post(mygene_endpt, data=params, headers=headers)
# data = response.json()

# create map of ensembl ID -> entrez symbol
# ensembl_to_symbol = {gene["query"]:gene["symbol"] for gene in data}

In [10]:
# Lots of missing values
# id_chart = pd.read_csv("gene_id_conversion.txt", sep="\t")
# id_chart = id_chart[["Ensembl gene ID", "Approved symbol"]]
# id_chart = id_chart.set_index("Ensembl gene ID")
# ensembl_to_symbol = id_chart.to_dict()["Approved symbol"]

In [11]:
ncbi = pd.DataFrame(ncbi_genes.ncbi_genes_fetch())

def get_ensembl_id(ids):
    ids = "".join(ids)
    ensembl = re.findall("Ensembl:(.*)", ids)
    if (len(ensembl) == 1):
        return ensembl[0]
    else:
        return None

    
all_ids = ncbi.dbXrefs.values
ensembl_ids = [ get_ensembl_id(ids) for ids in all_ids] 

ncbi = ncbi[["dbXrefs", "Symbol", "type_of_gene"]]
ncbi["ensembl"] = ensembl_ids
ncbi = ncbi.drop(columns=["dbXrefs"])
ncbi = ncbi.set_index("ensembl")

ensembl_to_symbol = ncbi.to_dict()["Symbol"]
ensembl_to_gene_type = ncbi.to_dict()["type_of_gene"]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
data_ensembl_ids = df.index.to_list()

# if the key is present, return it; otherwise, set the index for the corresponding row as its ensembl id
def id_to_symbol(key):
    if (key in ensembl_to_symbol): 
        return ensembl_to_symbol[key]
    else:
        return key # can return key here to maintain some gene identity

def id_to_type(key):
    if (key in ensembl_to_gene_type): 
        return ensembl_to_gene_type[key]
    else:
        return None
    
data_symbols = [ id_to_symbol(key) for key in data_ensembl_ids ]
data_types = [ id_to_type(key) for key in data_ensembl_ids ]

df["symbol"] = data_symbols
df["type_of_gene"] = data_types
df_symbol = df.set_index("symbol")

# drop non protein-coding genes
df_symbol = df_symbol[df_symbol["type_of_gene"] == "protein-coding"]
df_symbol = df_symbol.drop(columns=["type_of_gene"])
df_symbol.head()

# write the final csv, ready for normalization and further preprocessing
df_symbol.to_csv('data.csv', encoding='utf-8')



In [13]:
df_symbol = pd.read_csv("data.csv")
df_symbol.head()



Unnamed: 0,symbol,54ac877a-52ff-450c-9df9-b7cd3fc8e2e2,16e69011-c295-479f-b521-86e66fba498d,b60f22ac-a659-4f33-b01d-820e86a9a5c9,3bce9803-e74c-462e-bd40-8ec337a66f3a,1a094c2b-9fba-4bb1-929d-bb2cdf874bee,2b14123b-8fcd-402c-9399-4e7c47f20252,f58e49fc-9641-4b46-be4c-766445ec70a3,6a1be87b-c4e0-4fd4-b050-50a245b22038,24118b8f-59fe-482a-aed3-7b1bde1571b7,...,0d23fa5b-95f8-4626-a4d2-c72ad3cc553b,4160e048-f0b0-40f5-805b-e277a5893a3b,0a2d29de-869a-4dc8-ad11-6ee0d0a3a895,b8023162-5e82-40e6-ad8c-8acf81821f01,eec69ea9-ceb8-456a-aff4-41fb8a261e36,5503b3a1-7d03-41b2-9ec8-e478c5414d67,79fd602b-3e8e-4353-aa78-4f5f170b607d,07a859fc-6f78-4905-9035-90e2403dbe8d,f9c835db-2ab6-4bf5-826f-48723493c0ec,a782352e-c4bb-4c3e-ba82-456a47c3689a
0,TSPAN6,1005,1234,11822,2648,3139,1928,2221,13405,2025,...,3071,4105,3007,6136,8602,4884,2830,5286,3635,4420
1,TNMD,2,0,10,7,4,3,1,2,2,...,2,3,0,8,32,2,3,4,3,0
2,DPM1,232,449,2021,2047,1090,1863,2849,6176,663,...,1399,3706,2297,2954,3094,5054,1045,3846,5822,4038
3,SCYL3,115,356,1157,453,404,651,695,1060,315,...,759,653,975,1194,1407,890,718,981,1033,978
4,C1orf112,10,209,1421,412,209,537,448,693,226,...,201,419,777,740,1525,626,198,753,915,724


# Clinical Data
---

In [14]:
# get all demographic fields

cases_fields = requests.get(cases_endpt + "_mapping").json()["fields"]

diagnoses_fields = list(filter(lambda x: "diagnoses" in x, cases_fields))
print(diagnoses_fields)

demographic_fields = list(filter(lambda x: "demographic" in x, cases_fields))
print(demographic_fields)

['diagnoses.gleason_patterns_percent', 'diagnoses.synchronous_malignancy', 'diagnoses.child_pugh_classification', 'diagnoses.submitter_id', 'diagnoses.cog_neuroblastoma_risk_group', 'diagnoses.peritoneal_fluid_cytological_status', 'diagnoses.cog_renal_stage', 'diagnoses.esophageal_columnar_metaplasia_present', 'diagnoses.figo_staging_edition_year', 'diagnoses.primary_diagnosis', 'diagnoses.circumferential_resection_margin', 'diagnoses.diagnosis_id', 'diagnoses.tumor_largest_dimension_diameter', 'diagnoses.papillary_renal_cell_type', 'diagnoses.primary_gleason_grade', 'diagnoses.pregnant_at_diagnosis', 'diagnoses.ajcc_clinical_n', 'diagnoses.site_of_resection_or_biopsy', 'diagnoses.lymph_nodes_positive', 'diagnoses.days_to_recurrence', 'diagnoses.laterality', 'diagnoses.non_nodal_regional_disease', 'diagnoses.irs_group', 'diagnoses.medulloblastoma_molecular_classification', 'diagnoses.masaoka_stage', 'diagnoses.micropapillary_features', 'diagnoses.ann_arbor_b_symptoms', 'diagnoses.best_

In [58]:
# initialize dataframe
demographic_column_names = [ field.split(".")[1] for field in demographic_fields ]
diagnoses_column_names = [ field.split(".")[1] for field in diagnoses_fields ]

columns = list(set([*demographic_column_names,*diagnoses_column_names]))
df_clinical = pd.DataFrame({}, columns=columns)

df_clinical["case_id"] = []

# get demographics and diagnosis data for each case, 
# merging with pre-exisiting dataframe

for case in case_uuid_list:
    fields=",".join([*demographic_fields, *diagnoses_fields])
    params={
        "fields": fields
    }
    response = requests.get(cases_endpt + case, params=params).json()["data"]
    demographic_data = response["demographic"]
    diagnoses_data = response["diagnoses"]
    diagnoses_data = diagnoses_data[0]
    del diagnoses_data["treatments"] # do not load treatment data
    df_case = pd.DataFrame({**demographic_data,**diagnoses_data}, index=[case])
    df_case.head()
    df_case["case_id"] = case
    df_clinical = pd.concat([df_clinical, df_case], join="outer")

df_clinical = df_clinical.set_index("case_id")


In [59]:
# make first column "primary_diagnosis"
cols = ['primary_diagnosis']  + [col for col in df_clinical.columns.values if col != 'primary_diagnosis']
df_clinical = df_clinical[cols]
df_clinical.head()

Unnamed: 0_level_0,primary_diagnosis,primary_gleason_grade,figo_staging_edition_year,updated_datetime,occupation_duration_years,ann_arbor_clinical_stage,best_overall_response,largest_extrapelvic_peritoneal_focus,esophageal_columnar_dysplasia_degree,perineural_invasion_present,...,lymph_node_involved_site,year_of_diagnosis,ann_arbor_b_symptoms,gastric_esophageal_junction_involvement,days_to_birth,ajcc_clinical_stage,treatments,ajcc_pathologic_stage,progression_or_recurrence,ajcc_pathologic_t
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d79e692c-5053-4484-a180-01a094c5ff45,"Serous cystadenocarcinoma, NOS",,,2019-08-27T13:09:24.298212-05:00,,,,,,,...,,2010,,,-25612,,,,not reported,
32b5d0e4-a242-44d1-ac20-6e5ca2f35b71,"Serous cystadenocarcinoma, NOS",,,2019-08-27T13:11:28.180392-05:00,,,,,,,...,,2007,,,-23913,,,,not reported,
16e69011-c295-479f-b521-86e66fba498d,"Serous cystadenocarcinoma, NOS",,,2019-08-27T13:10:55.555572-05:00,,,,,,,...,,2008,,,-27065,,,,not reported,
211b2c52-1ad8-4d33-af0b-69ec1a56c821,"Serous cystadenocarcinoma, NOS",,,2019-08-27T13:08:19.563638-05:00,,,,,,,...,,2010,,,-20594,,,,not reported,
37d5c1d9-3ff5-44fe-afdc-ff430a0f3510,"Serous cystadenocarcinoma, NOS",,,2019-08-27T13:10:55.555572-05:00,,,,,,,...,,2010,,,-28037,,,,not reported,


In [60]:
# save final .csv
df_clinical.to_csv("clinical_data.csv", encoding='utf-8')


In [61]:
# save a df containing all entries for one cancer type for each type
cancer_types = list(set(df_clinical.primary_diagnosis))

for cancer in cancer_types:
    df_cancer = df_clinical[df_clinical.primary_diagnosis == cancer]
    df_cancer.to_csv("data_cancer/{}.csv".format(cancer), encoding='utf-8')