In [30]:
import os  # imported here too: this cell comes before the notebook's main import cell

# Working directory that every relative path in the notebook resolves against.
# Set the TCGA_WORKDIR environment variable to run on another machine; the
# original author's location stays the default.
os.chdir(os.environ.get('TCGA_WORKDIR', '/home/ruidong/Documents/Research/Git/TCGA/'))

In [184]:
# Show the current working directory; relative paths used below (e.g. 'gdc-client',
# the project folder) are resolved against it.
os.getcwd()

'/home/ruidong/Documents/Research/Git/TCGA'

In [133]:
###########################################################################
###                        Step 0. Setup Environment                    ###
###########################################################################

import re, os, scipy, json
from os import system, path
import zipfile
import gzip
import shutil
import io
import requests
import json

import numpy as np
import pandas as pd
from scipy import stats

from collections import OrderedDict, defaultdict
from operator import itemgetter
from pprint import pprint

# GDC project id used throughout the notebook; it doubles as the local folder name.
project = 'TCGA-CHOL'

# Make sure the project folder exists (nothing happens when it is already there).
os.makedirs(project, exist_ok=True)


In [200]:
###########################################################################
###                  Step 1. Download data by gdc-client                ###
###########################################################################

def gdcDownload(project, data_category, data_type, workflow_type, dest_dir, manifest=None):
    """Download GDC files with gdc-client and move them into ``dest_dir``.

    Parameters
    ----------
    project, data_category, data_type, workflow_type : str
        Query values forwarded to ``getManifest`` (defined elsewhere in this
        notebook) to build a manifest. They are only used when ``manifest``
        is None.
    dest_dir : str
        Directory that receives the downloaded entries; created if missing.
    manifest : str, optional
        Path to an existing tab-separated manifest file whose first column is
        the file id and whose header row starts with ``id``. When given, it is
        used as is and is not deleted afterwards.

    Notes
    -----
    * The gdc-client binary is fetched with ``wget`` and unpacked into the
      current directory when no ``gdc-client`` is present there.
    * Each id listed in the manifest is moved from the current directory into
      ``dest_dir`` -- presumably gdc-client writes one folder per file id into
      the working directory; confirm against the gdc-client version in use.
    * A manifest generated by this function is temporary and is removed even
      when the download or the move fails.
    """
    import subprocess  # local import: not part of the notebook's import cell

    client_zip = 'gdc-client_v1.3.0_Ubuntu14.04_x64.zip'

    if not os.path.exists('gdc-client'):
        # Argument lists instead of shell strings: no quoting problems with odd paths
        subprocess.call(['wget',
                         'https://gdc.cancer.gov/system/files/authenticated%20user/0/' + client_zip])

        with zipfile.ZipFile(client_zip, 'r') as zip_ref:
            zip_ref.extractall('.')

    # zipfile does not restore Unix permission bits, so the freshly extracted
    # binary is not executable until the execute bits are added back.
    if not os.access('gdc-client', os.X_OK):
        os.chmod('gdc-client', os.stat('gdc-client').st_mode | 0o111)

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    print ('Start Downloading ...')

    # Only a manifest written by this function is temporary. `rm` is always
    # bound now (it used to be undefined when a manifest file was supplied).
    rm = manifest is None

    if rm:
        manifest = getManifest(project, data_category, data_type, workflow_type)
        manifile = '.'.join([data_category, data_type, workflow_type, project, 'txt'])
        manifile = manifile.replace(' ', '_')
        manifest.to_csv(manifile, header=True, index=False, sep='\t')
    else:
        manifile = manifest

    try:
        status = subprocess.call(['./gdc-client', 'download', '-m', manifile])
        if status != 0:
            # Keep going: the move below fails loudly if files are really missing
            print ('gdc-client exited with status %d' % status)

        fls = []
        with open(manifile) as f:
            for line in f:
                lst = line.rstrip().split('\t')

                # Skip the header row and blank lines
                if lst[0] in ('id', ''):
                    continue
                fls.append(lst[0])

        for fl in fls:
            shutil.move(fl, dest_dir)
    finally:
        if rm and os.path.exists(manifile):
            os.remove(manifile)

    print ('Download completed !')
    

In [39]:
### Download data by providing manifest file

data_type = 'miRNAs'
dest_dir = os.path.join(project, data_type)

manifest = 'gdc_manifest.Isoform.TCGA-CHOL.txt'

# gdcDownload takes the four query fields first. They are only used to build a
# manifest when none is supplied, so pass None for them and hand over the
# manifest file and destination by keyword.
gdcDownload(project=None, data_category=None, data_type=None, workflow_type=None,
            dest_dir=dest_dir, manifest=manifest)

Start Downloading ...
Download completed !


In [201]:
### Download data by specifying project, data category, etc.
# No manifest is passed, so gdcDownload builds one through getManifest. That
# function is defined further down (under "Other Functions"), so its cell has
# to be run before this one.

# Local folder label; not the GDC data_type passed to gdcDownload below.
data_type = 'RNAseq'
dest_dir = os.path.join(project, data_type)

gdcDownload(project = 'TCGA-CHOL', 
            data_category = 'Transcriptome Profiling', 
            data_type = 'Gene Expression Quantification', 
            workflow_type = 'HTSeq - Counts',
            dest_dir = dest_dir)

Start Downloading ...
Download completed !


In [204]:
# Download the miRNA expression quantification files into <project>/preMIR.
# data_type here is the local folder label, not the GDC data_type passed below.
data_type = 'preMIR'
dest_dir = os.path.join(project, data_type)

gdcDownload(project = 'TCGA-CHOL', 
            data_category = 'Transcriptome Profiling', 
            data_type = 'miRNA Expression Quantification', 
            workflow_type = 'BCGSC miRNA Profiling',
            dest_dir = dest_dir)

Start Downloading ...
Download completed !


In [None]:
# Download the methylation beta-value files into <project>/Methylation.
# data_type here is the local folder label, not the GDC data_type passed below.
data_type = 'Methylation'
dest_dir = os.path.join(project, data_type)

gdcDownload(project = 'TCGA-CHOL', 
            data_category = 'DNA Methylation', 
            data_type = 'Methylation Beta Value', 
            workflow_type = 'Liftover',
            dest_dir = dest_dir)

In [109]:
###########################################################################
###                 Step 2. Parse metadata via GDC API                  ###
###########################################################################


def _gdcShortColumn(header, wanted):
    """Translate one TSV header of the GDC response into a short column name.

    ``id`` becomes ``file_id``. Otherwise the header is matched against the
    names in ``wanted`` either exactly or as a suffix following ``.`` or ``_``
    (e.g. a header ending in ``.sample_type``). Returns None when no name fits.
    """
    if header == 'id':
        return 'file_id'

    # Longest name first, so 'entity_submitter_id' is never taken for 'submitter_id'
    for short in sorted(wanted, key=len, reverse=True):
        if header == short or header.endswith('.' + short) or header.endswith('_' + short):
            return short

    return None


def gdcParseMeta(project, data_category, data_type, workflow_type):
    """Fetch file-level metadata from the GDC ``files`` endpoint as a DataFrame.

    Parameters
    ----------
    project, data_category, data_type, workflow_type : str
        Values for the four ``in`` filters (project id, data category, data
        type, analysis workflow type) combined with ``and``.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns file_id, file_name, submitter_id,
        entity_submitter_id, sample_type, age_at_diagnosis, gender,
        tumor_stage, tumor_grade, days_to_death, days_to_last_follow_up,
        vital_status, disease_type and project_id. The index holds the
        submitter_id values. A requested field missing from the response
        yields an all-NaN column.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.
    ValueError
        When the response columns cannot be recognised.

    Notes
    -----
    Columns are identified by header name rather than by position, so the
    result no longer depends on the order in which the API returns them.
    NOTE(review): headers are assumed to end with the requested field's last
    component (for instance ``...samples.0.sample_type``) -- confirm against a
    live response. If the key columns cannot be found and the table has
    exactly 14 columns, the previous positional naming is used as a fallback.
    """
    files_endpt = "https://api.gdc.cancer.gov/files"

    fields = [
        "file_name",
        #"file_id",
        "cases.samples.submitter_id",
        "associated_entities.entity_submitter_id",
        "cases.samples.sample_type",
        "cases.disease_type",
        "cases.demographic.gender",
        "cases.diagnoses.tumor_stage",
        "cases.diagnoses.tumor_grade",
        "cases.diagnoses.age_at_diagnosis",
        "cases.diagnoses.days_to_death",
        "cases.diagnoses.days_to_last_follow_up",
        "cases.diagnoses.vital_status",
        "cases.project.project_id"
    ]

    # Column order of the returned table
    out_columns = ['file_id', 'file_name', 'submitter_id', 'entity_submitter_id',
                   'sample_type', 'age_at_diagnosis', 'gender', 'tumor_stage',
                   'tumor_grade', 'days_to_death', 'days_to_last_follow_up',
                   'vital_status', 'disease_type', 'project_id']

    # Positional naming used previously; kept only as a last-resort fallback
    legacy_order = ['days_to_death', 'age_at_diagnosis', 'tumor_grade',
                    'sample_type', 'tumor_stage', 'entity_submitter_id',
                    'submitter_id', 'project_id', 'vital_status', 'disease_type',
                    'file_name', 'days_to_last_follow_up', 'gender', 'file_id']

    # This set of filters is nested under an 'and' operator.
    selections = [
        ("cases.project.project_id", project),
        ("files.data_category", data_category),
        ("files.data_type", data_type),
        ("files.analysis.workflow_type", workflow_type),
    ]
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": field, "value": [value]}}
            for field, value in selections
        ]
    }

    # A POST is used, so the filter parameters can be passed directly as a Dict object.
    params = {
        "filters": filters,
        "fields": ",".join(fields),
        "format": "TSV",
        "size": "10000"
        }

    # The parameters are passed to 'json' rather than 'params' in this case
    response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
    # Fail on HTTP errors instead of parsing an error body as if it were data
    response.raise_for_status()

    df = pd.read_csv(io.StringIO(response.content.decode("utf-8")), sep='\t')

    # Resolve headers by name. Sorted so that, when a field is repeated
    # (e.g. several samples per case), the lowest-numbered column is kept.
    resolved = {}
    for header in sorted(df.columns):
        short = _gdcShortColumn(header, out_columns)
        if short is not None and short not in resolved.values():
            resolved[header] = short

    if {'file_id', 'file_name', 'submitter_id'} <= set(resolved.values()):
        df = df.rename(columns=resolved)
    elif len(df.columns) == len(legacy_order):
        df.columns = legacy_order
    else:
        raise ValueError('Unrecognised columns in GDC response: '
                         + ', '.join(str(c) for c in df.columns))

    df = df.reindex(columns=out_columns)

    # Label rows by sample submitter id (rename_axis no longer accepts a mapping)
    df.index = df['submitter_id'].values
    #print(response.content.decode("utf-8"))
    return (df)

In [219]:
# Metadata for the isoform-level miRNA quantification files of TCGA-CHOL.
meta = gdcParseMeta(project = 'TCGA-CHOL', 
                    data_category = 'Transcriptome Profiling', 
                    data_type = 'Isoform Expression Quantification', 
                    workflow_type = 'BCGSC miRNA Profiling')

In [211]:
# Preview the first rows of the metadata table (indexed by sample submitter id).
meta.head()

Unnamed: 0,file_id,file_name,submitter_id,entity_submitter_id,sample_type,age_at_diagnosis,gender,tumor_stage,tumor_grade,days_to_death,days_to_last_follow_up,vital_status,disease_type,project_id
TCGA-W5-AA36-01A,40a40b43-142a-4041-beac-e4c55dfab09d,91c07e4a-3521-4f9c-a1ad-ba8c677a7c98.mirbase21...,TCGA-W5-AA36-01A,TCGA-W5-AA36-01A-11R-A41D-13,Primary Tumor,18882,female,stage iv,not reported,1402.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-YR-A95A-01A,2370512a-bdd1-4554-b5e2-421e4a44c5a1,07a10aa5-fb68-4d78-b02c-f1456e297f4f.mirbase21...,TCGA-YR-A95A-01A,TCGA-YR-A95A-01A-12R-A41D-13,Primary Tumor,19292,male,stage iv,not reported,26.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-3X-AAV9-01A,85f0797e-1d52-461e-a9f0-519c26908a04,32100d21-e578-43a9-9e7e-0a59069fd6bf.mirbase21...,TCGA-3X-AAV9-01A,TCGA-3X-AAV9-01A-72R-A41D-13,Primary Tumor,26349,male,stage i,not reported,339.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2G-01A,30cf8b89-708a-40df-9d4e-72eab593ac13,44ad6046-e17a-4655-bb58-3c778930e938.mirbase21...,TCGA-W5-AA2G-01A,TCGA-W5-AA2G-01A-11R-A41D-13,Primary Tumor,22933,female,stage i,not reported,,1976.0,alive,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2Q-01A,aeecfc69-fdf7-450c-b374-0f7a21c74da5,caad916f-d439-4849-a3a5-e29dcd54cd43.mirbase21...,TCGA-W5-AA2Q-01A,TCGA-W5-AA2Q-01A-11R-A41D-13,Primary Tumor,25069,male,stage ii,not reported,,50.0,alive,Cholangiocarcinoma,TCGA-CHOL


In [215]:
###########################################################################
###               Step 3. Merge data for downstream analysis            ###
###########################################################################

def gdcMerge(path, meta, data_type, mirbase_file='mature.release21.id.txt'):
    """Merge per-sample GDC files into one feature-by-sample matrix.

    Parameters
    ----------
    path : str
        Directory holding one sub-folder per file id, as laid out by
        gdcDownload.
    meta : pandas.DataFrame
        Table with ``file_id`` and ``file_name`` columns; its index supplies
        the column labels of the result. Only these two columns and the index
        are read.
    data_type : str
        One of ``'miRNAs'``, ``'RNAseq'``, ``'Methylation'`` or ``'preMIR'``.
        Any other value yields an empty DataFrame.
    mirbase_file : str, optional
        Two-column tab-separated file whose first column is the key found after
        ``mature,`` in the isoform files and whose second column is the name to
        report. Only read for ``'miRNAs'``.

    Returns
    -------
    pandas.DataFrame
        Rows are features, columns are samples. Values are numeric for every
        data type; entries that cannot be parsed as numbers become NaN.

    Notes
    -----
    * miRNAs: the third column is summed over all rows whose last column starts
      with ``mature``; every name from ``mirbase_file`` starts at 0.
    * RNAseq: gzip-compressed files; only rows whose id starts with ``ENSG``
      are kept, the part after the first ``.`` of the id is dropped, and the
      last column is taken as the value.
    * Methylation / preMIR: first column is the feature id, second the value.
      Header rows starting with ``Composite`` or ``miRNA_ID`` are skipped (the
      previous literal test ``'Composite|miRNA_ID'`` never matched, so the
      header ended up in the matrix).
    """
    # Sample label -> '<file_id>/<file_name>' relative to `path`
    fls = meta['file_id'] + '/' + meta['file_name']
    sams = fls.to_dict()

    expr = {}

    ### mature miRNAs
    if data_type == 'miRNAs':

        mir21 = OrderedDict()

        with open(mirbase_file) as f:
            for line in f:
                lst = line.rstrip().split('\t')
                if len(lst) < 2:
                    continue  # blank or incomplete line
                mir21[lst[0]] = lst[1]

        for sam, fl in sams.items():
            counts = OrderedDict((mir, 0) for mir in mir21.values())

            with open(os.path.join(path, fl)) as f:
                for line in f:
                    lst = line.rstrip().split('\t')

                    if lst[-1].startswith('mature'):
                        mir = lst[-1].split(',')[-1]
                        counts[mir21[mir]] += int(lst[2])

            expr[sam] = counts

    ### RNAseq
    elif data_type == 'RNAseq':
        for sam, fl in sams.items():
            counts = OrderedDict()

            with gzip.open(os.path.join(path, fl), 'rt') as f:
                for line in f:
                    lst = line.rstrip().split('\t')

                    if not lst[0].startswith('ENSG'):
                        continue

                    gene = lst[0].split('.')[0]
                    counts[gene] = lst[-1]

            expr[sam] = counts

    ### Methylation and precursor miRNAs
    elif data_type in ['Methylation', 'preMIR']:
        header_prefixes = ('Composite', 'miRNA_ID')

        for sam, fl in sams.items():
            values = OrderedDict()

            with open(os.path.join(path, fl)) as f:
                for line in f:
                    lst = line.rstrip().split('\t')

                    # Skip header rows and blank or incomplete lines
                    if len(lst) < 2 or lst[0].startswith(header_prefixes):
                        continue

                    values[lst[0]] = lst[1]

            expr[sam] = values

    mergeDa = pd.DataFrame.from_dict(expr)

    # Values read from text are strings; make every data type numeric so the
    # matrix is ready for arithmetic ('NA'-style entries become NaN).
    if not mergeDa.empty:
        mergeDa = mergeDa.apply(pd.to_numeric, errors='coerce')

    return (mergeDa)
    

In [220]:
# Merge the isoform files into a mature-miRNA count matrix. gdcMerge reads
# 'mature.release21.id.txt' from the working directory for this data type.
mirExpr = gdcMerge(path='TCGA-CHOL/miRNAs', meta=meta, data_type='miRNAs')

In [221]:
# Top-left 6 x 6 corner of the merged matrix (rows: miRNAs, columns: samples).
mirExpr.iloc[:6, :6]

Unnamed: 0,TCGA-3X-AAV9-01A,TCGA-3X-AAVA-01A,TCGA-3X-AAVB-01A,TCGA-3X-AAVC-01A,TCGA-3X-AAVE-01A,TCGA-4G-AAZO-01A
hsa-let-7a-2-3p,29,25,46,1,38,30
hsa-let-7a-3p,188,155,282,120,141,210
hsa-let-7a-5p,153811,119809,190562,79303,110684,107689
hsa-let-7b-3p,45,58,129,60,115,95
hsa-let-7b-5p,58883,34952,123675,51800,94827,104602
hsa-let-7c-3p,40,34,48,9,34,31


In [222]:
# Metadata for the HTSeq count files; note that this rebinds `meta`.
meta = gdcParseMeta(project = 'TCGA-CHOL', 
                    data_category = 'Transcriptome Profiling', 
                    data_type = 'Gene Expression Quantification', 
                    workflow_type = 'HTSeq - Counts')

In [146]:
# Preview the first rows of the RNA-seq metadata.
meta.head()

Unnamed: 0,file_id,file_name,submitter_id,entity_submitter_id,sample_type,age_at_diagnosis,gender,tumor_stage,tumor_grade,days_to_death,days_to_last_follow_up,vital_status,disease_type,project_id
TCGA-W5-AA2R-01A,afecdda2-735c-4304-a087-ef917ad9cd5a,fc06a930-e3a1-4775-a126-610a531c655b.htseq.cou...,TCGA-W5-AA2R-01A,TCGA-W5-AA2R-01A-11R-A41I-07,Primary Tumor,28367,female,stage i,not reported,,1542.0,alive,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2U-01A,4963b9af-7d7b-42cd-b057-2f894639e59f,46fd54e8-5ab7-43f1-bb88-01a163ff121f.htseq.cou...,TCGA-W5-AA2U-01A,TCGA-W5-AA2U-01A-11R-A41I-07,Primary Tumor,28552,female,stage i,not reported,627.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA34-11A,d9249d25-d852-4008-a71e-dd9be6cb68e9,fbdb830c-2f70-45f5-a2d6-2d8317301af3.htseq.cou...,TCGA-W5-AA34-11A,TCGA-W5-AA34-11A-11R-A41I-07,Solid Tissue Normal,27535,female,stage i,not reported,555.0,168.0,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-ZH-A8Y6-01A,be2d52c8-eb14-4b45-a65b-c1bdd6f5c4c2,4696ce44-29bf-41ea-b866-ddb17c376e94.htseq.cou...,TCGA-ZH-A8Y6-01A,TCGA-ZH-A8Y6-01A-11R-A41I-07,Primary Tumor,14976,female,stage i,not reported,,519.0,alive,Cholangiocarcinoma,TCGA-CHOL
TCGA-3X-AAVA-01A,42b8d463-6209-4ea0-bb01-8023a1302fa0,b6a2c03a-c8ad-41e9-8a19-8f5ac53cae9f.htseq.cou...,TCGA-3X-AAVA-01A,TCGA-3X-AAVA-01A-11R-A41I-07,Primary Tumor,18303,female,stage ii,not reported,445.0,,dead,Cholangiocarcinoma,TCGA-CHOL


In [223]:
# Merge the gzip-compressed count files into a gene-by-sample matrix.
rnaExpr = gdcMerge(path='TCGA-CHOL/RNAseq', meta=meta, data_type='RNAseq')

In [224]:
# Top-left 6 x 6 corner of the merged matrix (rows: Ensembl gene ids, columns: samples).
rnaExpr.iloc[:6, :6]

Unnamed: 0,TCGA-3X-AAV9-01A,TCGA-3X-AAVA-01A,TCGA-3X-AAVB-01A,TCGA-3X-AAVC-01A,TCGA-3X-AAVE-01A,TCGA-4G-AAZO-01A
ENSG00000000003,3363,4262,2001,3960,4798,1631
ENSG00000000005,1,1,2,1,0,0
ENSG00000000419,1804,1254,1365,1250,1492,757
ENSG00000000457,601,699,584,1127,1110,1535
ENSG00000000460,242,239,238,217,368,162
ENSG00000000938,670,334,484,421,696,137


In [225]:
# Metadata for the miRNA expression quantification files; rebinds `meta` again
# and previews the first rows.
meta = gdcParseMeta(project = 'TCGA-CHOL', 
                    data_category = 'Transcriptome Profiling', 
                    data_type = 'miRNA Expression Quantification', 
                    workflow_type = 'BCGSC miRNA Profiling')
meta.head()

Unnamed: 0,file_id,file_name,submitter_id,entity_submitter_id,sample_type,age_at_diagnosis,gender,tumor_stage,tumor_grade,days_to_death,days_to_last_follow_up,vital_status,disease_type,project_id
TCGA-ZU-A8S4-11A,4d7b448e-496d-474d-856e-ee578a6559a5,78b783b1-c2eb-4b94-9765-b4b8bf3776e0.mirbase21...,TCGA-ZU-A8S4-11A,TCGA-ZU-A8S4-11A-11R-A41D-13,Solid Tissue Normal,19264,male,stage i,not reported,98.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA34-11A,6c78a05b-c1e8-4c20-adfc-262a95b4ea49,f2809fd4-3e86-4ee2-ae5a-6a56256b95e7.mirbase21...,TCGA-W5-AA34-11A,TCGA-W5-AA34-11A-11R-A41D-13,Solid Tissue Normal,27535,female,stage i,not reported,555.0,168.0,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2T-01A,15bfe93a-9236-4ff0-9145-78a7bbf22cb6,2ab22567-023f-4e2b-80a5-00c7d58f4716.mirbase21...,TCGA-W5-AA2T-01A,TCGA-W5-AA2T-01A-12R-A41D-13,Primary Tumor,23560,female,stage ii,not reported,1220.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2Q-01A,74d60a5e-cd0b-4291-90df-35a42f476d78,caad916f-d439-4849-a3a5-e29dcd54cd43.mirbase21...,TCGA-W5-AA2Q-01A,TCGA-W5-AA2Q-01A-11R-A41D-13,Primary Tumor,25069,male,stage ii,not reported,,50.0,alive,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2O-01A,455ee4cb-72f9-41f8-908e-ba15b92fafd9,88ef0233-d4f6-43ee-aa5f-2bb5089072d5.mirbase21...,TCGA-W5-AA2O-01A,TCGA-W5-AA2O-01A-11R-A41D-13,Primary Tumor,21118,male,stage i,not reported,640.0,,dead,Cholangiocarcinoma,TCGA-CHOL


In [226]:
# Merge the per-sample quantification files (first two columns of each file).
preMIR = gdcMerge(path='TCGA-CHOL/preMIR', meta=meta, data_type='preMIR')

In [227]:
# Top-left 6 x 6 corner of the merged matrix (rows: miRNA ids, columns: samples).
preMIR.iloc[:6, :6]

Unnamed: 0,TCGA-3X-AAV9-01A,TCGA-3X-AAVA-01A,TCGA-3X-AAVB-01A,TCGA-3X-AAVC-01A,TCGA-3X-AAVE-01A,TCGA-4G-AAZO-01A
hsa-let-7a-1,51254,39980,63775,26495,36932,36191
hsa-let-7a-2,51237,39889,63280,26414,36851,35811
hsa-let-7a-3,51547,40122,63843,26516,37080,35932
hsa-let-7b,58928,35010,123804,51860,94942,104700
hsa-let-7c,13027,3246,9751,760,4719,2551
hsa-let-7d,1229,2536,2294,2718,2356,1317


In [149]:
# Metadata for the methylation beta-value files; rebinds `meta`.
meta = gdcParseMeta(project = 'TCGA-CHOL', 
                    data_category = 'DNA Methylation', 
                    data_type = 'Methylation Beta Value', 
                    workflow_type = 'Liftover')

In [150]:
# Preview the first rows of the methylation metadata.
meta.head()

Unnamed: 0,file_id,file_name,submitter_id,entity_submitter_id,sample_type,age_at_diagnosis,gender,tumor_stage,tumor_grade,days_to_death,days_to_last_follow_up,vital_status,disease_type,project_id
TCGA-ZH-A8Y1-01A,9e76b112-9c90-4310-9e0b-643519d1b8c8,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,TCGA-ZH-A8Y1-01A,TCGA-ZH-A8Y1-01A-11D-A418-05,Primary Tumor,27279,female,stage iva,not reported,385.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA36-01A,8c977e5c-a9a8-4abb-b9f8-61f74dddbb16,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,TCGA-W5-AA36-01A,TCGA-W5-AA36-01A-11D-A418-05,Primary Tumor,18882,female,stage iv,not reported,1402.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2I-11A,77dd33d4-b399-43d0-9ac8-b4798c481294,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,TCGA-W5-AA2I-11A,TCGA-W5-AA2I-11A-11D-A418-05,Solid Tissue Normal,24388,male,stage i,not reported,1939.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2W-01A,05d8aac6-d53a-485b-9d06-674501e96274,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,TCGA-W5-AA2W-01A,TCGA-W5-AA2W-01A-11D-A418-05,Primary Tumor,11438,female,stage iva,not reported,924.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2O-01A,bb8db31e-d4a8-48db-9f20-c39a9e24de92,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,TCGA-W5-AA2O-01A,TCGA-W5-AA2O-01A-11D-A418-05,Primary Tumor,21118,male,stage i,not reported,640.0,,dead,Cholangiocarcinoma,TCGA-CHOL


In [164]:
###########################################################################
###                           Other Functions                           ###
###########################################################################

### get manifest file

def getManifest(project, data_category, data_type, workflow_type):
    """Request a download manifest from the GDC ``files`` endpoint.

    Parameters
    ----------
    project, data_category, data_type, workflow_type : str
        Values for the four ``in`` filters (project id, data category, data
        type, analysis workflow type) combined with ``and``.

    Returns
    -------
    pandas.DataFrame
        The tab-separated manifest returned by the API, parsed as is.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status. Previously an error body
        would have been parsed as if it were a manifest.
    """
    files_endpt = "https://api.gdc.cancer.gov/files"

    # This set of filters is nested under an 'and' operator.
    selections = [
        ("cases.project.project_id", project),
        ("files.data_category", data_category),
        ("files.data_type", data_type),
        ("files.analysis.workflow_type", workflow_type),
    ]
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": field, "value": [value]}}
            for field, value in selections
        ]
    }

    # A POST is used, so the filter parameters can be passed directly as a Dict object.
    params = {
        "filters": filters,
        #"fields": fields,
        #"format": "TSV",
        "size": "10000",
        "return_type": 'manifest'
        }

    # The parameters are passed to 'json' rather than 'params' in this case
    response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
    response.raise_for_status()

    df = pd.read_csv(io.StringIO(response.content.decode("utf-8")), sep='\t')

    return (df)

In [202]:
# Build the manifest for the methylation beta-value files of TCGA-CHOL.
manifest = getManifest(project = 'TCGA-CHOL', 
                    data_category = 'DNA Methylation', 
                    data_type = 'Methylation Beta Value', 
                    workflow_type = 'Liftover')

In [203]:
# Preview the first manifest rows.
manifest.head()

Unnamed: 0,id,filename,md5,size,state
0,9e76b112-9c90-4310-9e0b-643519d1b8c8,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,1c3969ae227bb2571bd638610fa113ed,141181949,live
1,8c977e5c-a9a8-4abb-b9f8-61f74dddbb16,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,a59a932df97a3972a34a5db8089c6444,141230779,live
2,77dd33d4-b399-43d0-9ac8-b4798c481294,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,3fbfa356cb84d9aba5fb87d10231fd15,141222861,live
3,05d8aac6-d53a-485b-9d06-674501e96274,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,22761b47e8dcdc4a1de79c9a75c2900e,141281134,live
4,bb8db31e-d4a8-48db-9f20-c39a9e24de92,jhu-usc.edu_CHOL.HumanMethylation450.1.lvl-3.T...,0c5bb44a7c780ace5825ab6da0fe9ef1,141287936,live


In [None]:
# Save the manifest as a tab-separated file (header kept, index dropped) -- the
# layout gdcDownload reads when given a manifest file.
manifest.to_csv('manifest.txt', header=True, index=False, sep='\t')