In [49]:
import requests
import json
from io import StringIO
import pandas as pd
import numpy as np
import re
import os
from pandas.errors import EmptyDataError 

In [50]:
# Primary-site names passed, one at a time, to save_files() as the value of
# the "cases.project.primary_site" filter.
organs = [
    "Adrenal gland",
    "Tongue",
    "Bladder",
    "Bone",
    "Brain",
    "Breast",
    "Lung",
    "Bronchus and lung",
    "Cervix",
    "Colorectal",
    "Corpus uteri",
    "Esophagus",
    "Floor of mouth",
    "Kidney",
    "Larynx",
    "Liver",
    "Other and ill-defined sites in lip, oral cavity and pharynx",
    "Other and unspecified parts of biliary tract",
    "Other and unspecified parts of tongue",
    "Palate",
    "Pancreas",
    "Prostate",
    "Rectosigmoid junction",
    "Skin",
    "Stomach",
    "Thymus",
    "Thyroid",
    "Uterus",
]

In [51]:
# Column name, as it appears in the flattened TSV returned by the GDC API,
# that holds the submitter ID of the case a file belongs to.
_SUBMITTER_COL = 'cases.0.submitter_id'


def _build_gdc_params(organ, size=2000):
    """Return the JSON payload for a GDC ``/files`` query restricted to ``organ``.

    The filters select TXT-format, "DNA Methylation" files from the
    "Illumina Human Methylation 450" platform whose project primary site
    equals ``organ``. ``size`` is the maximum number of records requested.
    """
    fields = [
        "file_name",
        "cases.submitter_id",
        "cases.samples.sample_type",
        "cases.disease_type",
        "cases.project.project_id",
        "cases.primary_site",
    ]

    def _in(field, value):
        # Every clause of the query is an "in" test on a single value.
        return {"op": "in", "content": {"field": field, "value": [value]}}

    filters = {
        "op": "and",
        "content": [
            _in("files.data_format", "TXT"),
            _in("cases.project.primary_site", organ),
            _in("files.data_category", "DNA Methylation"),
            _in("files.platform", "Illumina Human Methylation 450"),
        ],
    }

    return {
        "filters": filters,
        "fields": ",".join(fields),
        "format": "TSV",
        "size": str(size),
    }


def _fetch_file_table(organ, size=2000, timeout=60):
    """POST one query to the GDC files endpoint and parse the TSV reply.

    Returns a DataFrame, or ``None`` when the response body is empty.
    Raises ``requests.HTTPError`` on a non-success HTTP status, so an error
    payload is never parsed as if it were data.
    """
    # A POST is used, so the filter dict is sent as the JSON body
    # ('json=') rather than as URL parameters ('params=').
    response = requests.post(
        "https://api.gdc.cancer.gov/files",
        headers={"Content-Type": "application/json"},
        json=_build_gdc_params(organ, size=size),
        timeout=timeout,
    )
    response.raise_for_status()
    try:
        return pd.read_csv(StringIO(response.content.decode("utf-8")), sep='\t')
    except EmptyDataError:
        return None


def save_files(organs, output_path='illumina_masterfile.txt', size=2000, timeout=60):
    """Build a manifest of GDC methylation files for cases with several files.

    For each primary site in ``organs`` the GDC ``/files`` endpoint is queried
    once. Only rows whose case submitter ID occurs more than once in that
    organ's result are kept. The kept rows of all organs are concatenated,
    written to ``output_path`` as a tab-separated file (no index column) and
    returned. This function only builds the manifest; it does not download
    the data files themselves.

    Parameters
    ----------
    organs : iterable of str
        Primary-site names to query.
    output_path : str, optional
        Destination of the tab-separated manifest.
    size : int, optional
        Maximum number of records requested per organ. Only one request is
        sent per organ, so records beyond this limit are not retrieved.
    timeout : float, optional
        Timeout in seconds for each HTTP request.

    Returns
    -------
    pandas.DataFrame
        The combined manifest. It is an empty frame when ``organs`` is empty
        or no organ returned any data.

    Raises
    ------
    requests.HTTPError
        If the GDC API answers with an error status.
    """
    kept = []
    for organ in organs:
        resp_df = _fetch_file_table(organ, size=size, timeout=timeout)
        if resp_df is None:
            # Message text kept as-is for continuity with earlier runs. Note
            # that the query does not filter on sample type: an empty reply
            # means no file at all matched the filters for this organ.
            print('No normal tissue for', organ)
            continue

        # Keep every row of each case that has more than one file, grouped by
        # submitter ID in sorted order. The stable sort preserves the
        # original row order within each case.
        has_several = resp_df[_SUBMITTER_COL].duplicated(keep=False)
        kept.append(resp_df[has_several].sort_values(_SUBMITTER_COL, kind='mergesort'))

    # Concatenate once at the end. This also covers the cases where the first
    # organ (or every organ) returned nothing, or ``organs`` is empty.
    df_new = pd.concat(kept) if kept else pd.DataFrame()
    df_new.to_csv(output_path, sep='\t', index=False)
    return df_new

In [52]:
# Query GDC once per organ; save_files() writes the combined table to disk
# and returns it, and the returned frame is kept in `res` for inspection.
res = save_files(organs)

No normal tissue for Tongue
No normal tissue for Bronchus and lung
No normal tissue for Corpus uteri
No normal tissue for Floor of mouth
No normal tissue for Larynx
No normal tissue for Other and ill-defined sites in lip, oral cavity and pharynx
No normal tissue for Other and unspecified parts of biliary tract
No normal tissue for Other and unspecified parts of tongue
No normal tissue for Palate
No normal tissue for Rectosigmoid junction


In [36]:
# Display the table returned by save_files().
res

Unnamed: 0,cases.0.disease_type,cases.0.primary_site,cases.0.project.project_id,cases.0.samples.0.sample_type,cases.0.submitter_id,file_name,id
