In [1]:
import requests
import json
from io import StringIO
import pandas as pd
import numpy as np
import re
import os
from pandas.errors import EmptyDataError 

In [2]:
organs1 = ['Tongue', 'Adrenal gland', 'Bladder',\
       'Bone', 'Brain',\
       'Breast', 'Lung', 'Bronchus and lung', 'Cervix', 'Colorectal',\
       'Corpus uteri', 'Esophagus', 'Floor of mouth', 'Kidney', 'Larynx',\
       'Liver',\
       'Other and ill-defined sites in lip, oral cavity and pharynx',\
       'Other and unspecified parts of biliary tract',\
       'Other and unspecified parts of tongue', 'Palate', 'Pancreas',\
       'Prostate', 'Rectosigmoid junction', 'Skin',\
       'Stomach', 'Thymus', 'Thyroid', 'Uterus']

In [3]:
organs2 = ['Adrenal gland','Bile Duct','Bladder','Bone','Bone Marrow','Brain','Breast','Cervix','Colorectal', \
          'Esophagus','Eye','Head and Neck','Kidney','Liver','Lung','Lymph Nodes','Nervous System','Ovary','Pancreas','Pleura','Prostate','Skin','Soft Tissue','Stomach','Testis','Thymus','Thyroid','Uterus']

In [4]:
organs=list(set(organs1+organs2))
print(len(organs))
organs


38


['Testis',
 'Cervix',
 'Other and ill-defined sites in lip, oral cavity and pharynx',
 'Palate',
 'Kidney',
 'Brain',
 'Bone',
 'Pleura',
 'Nervous System',
 'Rectosigmoid junction',
 'Bronchus and lung',
 'Ovary',
 'Pancreas',
 'Breast',
 'Other and unspecified parts of biliary tract',
 'Thymus',
 'Lung',
 'Eye',
 'Skin',
 'Bone Marrow',
 'Thyroid',
 'Esophagus',
 'Stomach',
 'Floor of mouth',
 'Lymph Nodes',
 'Other and unspecified parts of tongue',
 'Bladder',
 'Liver',
 'Tongue',
 'Larynx',
 'Adrenal gland',
 'Soft Tissue',
 'Uterus',
 'Colorectal',
 'Head and Neck',
 'Prostate',
 'Bile Duct',
 'Corpus uteri']

In [5]:
def save_files(organs):
    idx = 0
    for j, organ in enumerate(organs):
        organ_name = '_'.join(organ.split(' '))
        '''
        try:
            os.mkdir(organ_name)
        except FileExistsError:
            pass
        '''
        fields = [
            "file_name",
            "cases.submitter_id",
            "cases.samples.sample_type",
            "cases.disease_type",
            "cases.project.project_id",
            "cases.primary_site"
            ]

        fields = ",".join(fields)

        files_endpt = "https://api.gdc.cancer.gov/files"

        filters = {
            "op": "and",
            "content":[
                {
                "op": "in",
                "content":{
                    "field": "files.data_format",
                    "value": ["TXT"]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "cases.project.primary_site",
                    "value": [organ]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.data_category",
                    "value": ["DNA Methylation"]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.platform",
                    "value": ["Illumina Human Methylation 450"]
                    }
                },
            ]
        }

        # A POST is used, so the filter parameters can be passed directly as a Dict object.
        params = {
            "filters": filters,
            "fields": fields,
            "format": "TSV",
            "size": "2000"
            }

        # The parameters are passed to 'json' rather than 'params' in this case
        response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
        resp = response.content.decode("utf-8")
        try:
            resp_df = pd.read_csv(StringIO(resp), sep='\t')
        except EmptyDataError:
            print('No tissue for', organ)
            continue
        u, c = np.unique(np.array(resp_df['cases.0.submitter_id']), return_counts=True)
        dup = u[c > 1]
        if idx == 0:
            df_new = pd.DataFrame(columns=resp_df.columns)
        for sub in dup:
            df_sub = resp_df[resp_df['cases.0.submitter_id'] == sub]
            tissues = np.unique(df_sub['cases.0.samples.0.sample_type'])
            df_new = pd.concat([df_new, df_sub])
            '''
            for i, ID in enumerate(np.array(df_sub['id'])):
                file_id = ID
                project_id = np.array(df_sub['cases.0.project.project_id'])[i]
                tissue = np.array(df_sub['cases.0.samples.0.sample_type'])[i]

                data_endpt = "https://api.gdc.cancer.gov/data/{}".format(file_id)

                response = requests.get(data_endpt, headers = {"Content-Type": "application/json"})

                # The file name can be found in the header within the Content-Disposition key.
                response_head_cd = response.headers["Content-Disposition"]

                file_name = re.findall("filename=(.+)", response_head_cd)[0]

                tissue_name = '_'.join(tissue.split(' '))
                full_name = organ_name + '/' + organ_name + '_' + project_id + '_' + sub + '_' + tissue_name + '_' + file_name
                with open(full_name, "wb") as output_file:
                    output_file.write(response.content)
            '''
        idx += 1
    df_new.to_csv('/Users/irffanalahi/Research/Research_update/opensource_data/TCGA/TCGA_masterfile.txt', sep='\t', index=None)
    return df_new

In [6]:
res = save_files(organs)

No tissue for Other and ill-defined sites in lip, oral cavity and pharynx
No tissue for Palate
No tissue for Rectosigmoid junction
No tissue for Bronchus and lung
No tissue for Other and unspecified parts of biliary tract
No tissue for Floor of mouth
No tissue for Other and unspecified parts of tongue
No tissue for Tongue
No tissue for Larynx
No tissue for Corpus uteri


In [7]:
res

Unnamed: 0,cases.0.disease_type,cases.0.primary_site,cases.0.project.project_id,cases.0.samples.0.sample_type,cases.0.submitter_id,file_name,id
62,Germ Cell Neoplasms,Testis,TCGA-TGCT,Additional - New Primary,TCGA-2G-AAFG,60e9d95e-7db0-4bac-850b-ae03c3306a25.methylati...,806211af-0fe2-436e-82ea-57ba239e2925
83,Germ Cell Neoplasms,Testis,TCGA-TGCT,Primary Tumor,TCGA-2G-AAFG,a425f595-96af-444a-9da7-4c27a00a2b55.methylati...,0a476f89-0b80-4867-b5ae-4375f4858944
100,Germ Cell Neoplasms,Testis,TCGA-TGCT,Additional - New Primary,TCGA-2G-AAGI,5d3ca8e2-23ef-47c2-a4f2-82b036d16e23.methylati...,7d59218d-5fe6-49c5-bc79-91475df6ce11
127,Germ Cell Neoplasms,Testis,TCGA-TGCT,Primary Tumor,TCGA-2G-AAGI,628ef380-f556-424e-b027-b1a6c01fafab.methylati...,180c3c01-6c72-4a24-8e1c-7f0dedc4d06a
27,Germ Cell Neoplasms,Testis,TCGA-TGCT,Additional - New Primary,TCGA-2G-AAGY,39755aae-1cab-4ba1-9a7a-8557f899d7bc.methylati...,f0f6ecc9-f05b-4780-88f2-65e503bea36d
...,...,...,...,...,...,...,...
24,Adenomas and Adenocarcinomas,Liver and intrahepatic bile ducts,TCGA-CHOL,Primary Tumor,TCGA-W5-AA31,52908124-d10a-4870-aebe-eab762dde34f.methylati...,76616b92-e967-43b1-bbf2-877803fce7cf
36,Adenomas and Adenocarcinomas,Liver and intrahepatic bile ducts,TCGA-CHOL,Solid Tissue Normal,TCGA-W5-AA34,476c9698-16e9-493d-ad31-34e326510db7.methylati...,deefbcb6-72c6-4bbf-b1cf-f89508deb077
40,Adenomas and Adenocarcinomas,Liver and intrahepatic bile ducts,TCGA-CHOL,Primary Tumor,TCGA-W5-AA34,243ffab3-f1d3-4aa3-a63b-dcaf7edb3ff1.methylati...,ad529cad-9251-4521-b341-cc59c419c57b
14,Adenomas and Adenocarcinomas,Liver and intrahepatic bile ducts,TCGA-CHOL,Solid Tissue Normal,TCGA-ZU-A8S4,17e646f0-4dc3-46fe-893f-58b5a41e2939.methylati...,08afb4f0-dff3-4666-a22b-df08579852fb
