In [30]:
# This is the first cell of the notebook, so import `os` here: the main import
# cell comes later and a fresh kernel would otherwise raise NameError.
import os

# Working directory for every relative path used below (manifests, the
# gdc-client binary, the per-project output folders). Set the TCGA_WORK_DIR
# environment variable to run the notebook on another machine; the default
# keeps the original location.
work_dir = os.environ.get('TCGA_WORK_DIR',
                          '/home/ruidong/Documents/Research/Git/TCGA/')
os.chdir(work_dir)

In [134]:
# Display the current working directory to confirm where relative paths resolve.
os.getcwd()

'/home/ruidong/Documents/Research/Git/TCGA'

In [133]:
###########################################################################
###                        Step 0. Setup Environment                    ###
###########################################################################

import re, os, scipy, json
from os import system, path
import zipfile
import gzip
import shutil
import io
import requests
import json

import numpy as np
import pandas as pd
from scipy import stats

from collections import OrderedDict, defaultdict
from operator import itemgetter
from pprint import pprint

# GDC project identifier; also used as the name of the local output folder.
project = 'TCGA-CHOL'

# Make sure the per-project folder exists (no-op when it is already there).
os.makedirs(project, exist_ok=True)


In [125]:
###########################################################################
###                  Step 1. Download data by gdc-client                ###
###########################################################################

def gdcDownload(manifest, dest_dir):
    """Download every file listed in a GDC manifest into ``dest_dir``.

    The GDC Data Transfer Tool (``gdc-client``) is expected in the current
    working directory; when it is absent the v1.3.0 Ubuntu build is fetched
    with ``wget`` and unpacked there. The client creates one sub-directory per
    file id, so after a successful run ``dest_dir`` contains
    ``<file id>/<file name>`` for each manifest row.

    Parameters
    ----------
    manifest : str
        Path to a tab-separated GDC manifest whose first column is the file
        id (header value ``id``).
    dest_dir : str
        Directory that receives the downloads; created (including parents)
        when missing.

    Raises
    ------
    subprocess.CalledProcessError
        If ``wget`` or ``gdc-client`` exits with a non-zero status.
    RuntimeError
        If the client reported success but a manifest entry is missing from
        ``dest_dir``.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import subprocess

    client_zip = 'gdc-client_v1.3.0_Ubuntu14.04_x64.zip'
    client_url = 'https://gdc.cancer.gov/system/files/authenticated%20user/0/gdc-client_v1.3.0_Ubuntu14.04_x64.zip'

    if not os.path.exists('gdc-client'):
        if not os.path.exists(client_zip):
            # Argument list (no shell) + check=True: a failed download stops
            # here instead of surfacing later as a confusing zip error.
            subprocess.run(['wget', client_url], check=True)

        with zipfile.ZipFile(client_zip, 'r') as zip_ref:
            zip_ref.extractall('.')

        # zipfile does not restore Unix permission bits, so the extracted
        # binary must be marked executable explicitly.
        os.chmod('gdc-client', os.stat('gdc-client').st_mode | 0o111)

    os.makedirs(dest_dir, exist_ok=True)

    # Collect the file ids (first column), skipping the header and blank lines.
    file_ids = []
    with open(manifest) as f:
        for line in f:
            file_id = line.rstrip().split('\t')[0]
            if not file_id or file_id == 'id':
                continue
            file_ids.append(file_id)

    print('Start Downloading ...')

    # Download straight into dest_dir (-d) rather than into the working
    # directory followed by a move: a second run then no longer fails because
    # the move target already exists.
    subprocess.run(['./gdc-client', 'download', '-m', manifest, '-d', dest_dir],
                   check=True)

    missing = [file_id for file_id in file_ids
               if not os.path.isdir(os.path.join(dest_dir, file_id))]
    if missing:
        raise RuntimeError('gdc-client did not download: ' + ', '.join(missing))

    print('Download completed !')
    

In [39]:
# Download the files listed in the isoform manifest into <project>/miRNAs.
manifest = 'gdc_manifest.Isoform.TCGA-CHOL.txt'

data_type = 'miRNAs'
dest_dir = os.path.join(project, data_type)

gdcDownload(manifest=manifest, dest_dir=dest_dir)

Start Downloading ...
Download completed !


In [126]:
# Download the files listed in the HTSeq-Counts manifest into <project>/RNAseq.
manifest = 'gdc_manifest.HTSeq-Counts.TCGA-CHOL.txt'

data_type = 'RNAseq'
dest_dir = os.path.join(project, data_type)

gdcDownload(manifest=manifest, dest_dir=dest_dir)

Start Downloading ...
Download completed !


In [109]:
###########################################################################
###                 Step 2. Parse metadata via GDC API                  ###
###########################################################################


def gdcParseMeta(project, data_category, data_type, workflow_type):
    """Query the GDC ``files`` endpoint and return per-file sample metadata.

    Parameters
    ----------
    project : str
        GDC project id used in the filter, e.g. ``'TCGA-CHOL'``.
    data_category : str
        Value matched against ``files.data_category``.
    data_type : str
        Value matched against ``files.data_type``.
    workflow_type : str
        Value matched against ``files.analysis.workflow_type``.

    Returns
    -------
    pandas.DataFrame
        One row per file, indexed by the sample ``submitter_id``, with the
        columns ``file_id, file_name, submitter_id, entity_submitter_id,
        sample_type, age_at_diagnosis, gender, tumor_stage, tumor_grade,
        days_to_death, days_to_last_follow_up, vital_status, disease_type,
        project_id``. A field absent from the response yields an all-NaN
        column.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If the response lacks the file id, file name or sample submitter id.

    Notes
    -----
    Response columns are matched to output columns by *name*, not by
    position, so a change in the server's column order cannot mislabel data.
    The TSV header is assumed to carry the requested field paths, optionally
    with list indices inserted (e.g. ``cases.0.samples.0.submitter_id``), and
    the file id under ``id`` -- NOTE(review): confirm against a live response.
    """
    files_endpt = "https://api.gdc.cancer.gov/files"

    fields = [
        "file_name",
        "cases.samples.submitter_id",
        "associated_entities.entity_submitter_id",
        "cases.samples.sample_type",
        "cases.disease_type",
        "cases.demographic.gender",
        "cases.diagnoses.tumor_stage",
        "cases.diagnoses.tumor_grade",
        "cases.diagnoses.age_at_diagnosis",
        "cases.diagnoses.days_to_death",
        "cases.diagnoses.days_to_last_follow_up",
        "cases.diagnoses.vital_status",
        "cases.project.project_id"
    ]

    # Column order of the returned table.
    out_columns = ['file_id', 'file_name', 'submitter_id', 'entity_submitter_id',
                   'sample_type', 'age_at_diagnosis', 'gender', 'tumor_stage',
                   'tumor_grade', 'days_to_death', 'days_to_last_follow_up',
                   'vital_status', 'disease_type', 'project_id']

    def _in_filter(field, value):
        # One "field in [value]" clause of the GDC filter grammar.
        return {"op": "in", "content": {"field": field, "value": [value]}}

    # All clauses are nested under a single 'and' operator.
    filters = {
        "op": "and",
        "content": [
            _in_filter("cases.project.project_id", project),
            _in_filter("files.data_category", data_category),
            _in_filter("files.data_type", data_type),
            _in_filter("files.analysis.workflow_type", workflow_type),
        ]
    }

    # A POST is used, so the filter can be passed directly as a dict.
    params = {
        "filters": filters,
        "fields": ",".join(fields),
        "format": "TSV",
        "size": "10000"
    }

    # The parameters go to 'json' rather than 'params' in this case.
    response = requests.post(files_endpt,
                             headers={"Content-Type": "application/json"},
                             json=params)
    # Fail loudly on an HTTP error instead of parsing an error page as TSV.
    response.raise_for_status()

    raw = pd.read_csv(io.StringIO(response.content.decode("utf-8")), sep='\t')

    def _normalize(column):
        # Drop list indices and unify separators:
        # 'cases.0.samples.0.submitter_id' -> 'cases_samples_submitter_id'.
        return re.sub(r'[._]\d+(?=[._])', '', str(column)).replace('.', '_')

    # Normalized response column -> output column (leaf of the field path).
    wanted = {field.replace('.', '_'): field.split('.')[-1] for field in fields}
    wanted['id'] = 'file_id'
    wanted['file_id'] = 'file_id'

    # Sorted iteration makes index 0 win when a list field (several samples or
    # diagnoses) is expanded into more than one column.
    resolved = OrderedDict()
    for column in sorted(raw.columns, key=str):
        target = wanted.get(_normalize(column))
        if target is not None and target not in resolved:
            resolved[target] = column

    missing = [name for name in ('file_id', 'file_name', 'submitter_id')
               if name not in resolved]
    if missing:
        raise ValueError('GDC response is missing required column(s): '
                         + ', '.join(missing))

    df = raw.loc[:, list(resolved.values())].copy()
    df.columns = list(resolved.keys())
    df = df.reindex(columns=out_columns)

    # Label rows by sample submitter id. Assigning the index directly replaces
    # rename_axis(<Series>), which only relabelled rows in old pandas releases.
    df.index = df['submitter_id'].values
    return df

In [147]:
# Metadata for the miRNA isoform quantification files of TCGA-CHOL.
meta = gdcParseMeta(
    project='TCGA-CHOL',
    data_category='Transcriptome Profiling',
    data_type='Isoform Expression Quantification',
    workflow_type='BCGSC miRNA Profiling',
)

In [148]:
# Show the first rows of the metadata table returned by gdcParseMeta.
meta.head()

Unnamed: 0,file_id,file_name,submitter_id,entity_submitter_id,sample_type,age_at_diagnosis,gender,tumor_stage,tumor_grade,days_to_death,days_to_last_follow_up,vital_status,disease_type,project_id
TCGA-W5-AA36-01A,40a40b43-142a-4041-beac-e4c55dfab09d,91c07e4a-3521-4f9c-a1ad-ba8c677a7c98.mirbase21...,TCGA-W5-AA36-01A,TCGA-W5-AA36-01A-11R-A41D-13,Primary Tumor,18882,female,stage iv,not reported,1402.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-YR-A95A-01A,2370512a-bdd1-4554-b5e2-421e4a44c5a1,07a10aa5-fb68-4d78-b02c-f1456e297f4f.mirbase21...,TCGA-YR-A95A-01A,TCGA-YR-A95A-01A-12R-A41D-13,Primary Tumor,19292,male,stage iv,not reported,26.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-3X-AAV9-01A,85f0797e-1d52-461e-a9f0-519c26908a04,32100d21-e578-43a9-9e7e-0a59069fd6bf.mirbase21...,TCGA-3X-AAV9-01A,TCGA-3X-AAV9-01A-72R-A41D-13,Primary Tumor,26349,male,stage i,not reported,339.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2G-01A,30cf8b89-708a-40df-9d4e-72eab593ac13,44ad6046-e17a-4655-bb58-3c778930e938.mirbase21...,TCGA-W5-AA2G-01A,TCGA-W5-AA2G-01A-11R-A41D-13,Primary Tumor,22933,female,stage i,not reported,,1976.0,alive,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2Q-01A,aeecfc69-fdf7-450c-b374-0f7a21c74da5,caad916f-d439-4849-a3a5-e29dcd54cd43.mirbase21...,TCGA-W5-AA2Q-01A,TCGA-W5-AA2Q-01A-11R-A41D-13,Primary Tumor,25069,male,stage ii,not reported,,50.0,alive,Cholangiocarcinoma,TCGA-CHOL


In [139]:
###########################################################################
###               Step 3. Merge data for downstream analysis            ###
###########################################################################

def gdcMerge(path, meta, data_type, mir_id_file='mature.release21.id.txt'):
    """Merge per-sample GDC quantification files into one count matrix.

    Parameters
    ----------
    path : str
        Directory holding the downloads, laid out as
        ``<path>/<file_id>/<file_name>``.
    meta : pandas.DataFrame
        Metadata with ``file_id`` and ``file_name`` columns; its index
        supplies the column labels of the result. When an index label occurs
        more than once only the last file with that label is kept.
    data_type : {'miRNAs', 'RNAseq'}
        ``'miRNAs'`` reads plain-text isoform tables and sums the read count
        (third column) of every row whose last column starts with ``mature``,
        per mature miRNA. ``'RNAseq'`` reads gzip-compressed count tables and
        keeps rows whose id starts with ``ENSG``, with the version suffix
        after the first ``.`` removed.
    mir_id_file : str, optional
        Two-column, tab-separated table mapping the accession found in the
        isoform files to the mature miRNA name. Only used for ``'miRNAs'``.

    Returns
    -------
    pandas.DataFrame
        Integer counts with features (miRNA names or gene ids) as rows and
        samples as columns. For miRNAs every name in ``mir_id_file`` is
        present, with 0 where nothing was observed.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``'miRNAs'`` nor ``'RNAseq'``.
    KeyError
        If an isoform file references an accession absent from
        ``mir_id_file``.
    """
    if data_type not in ('miRNAs', 'RNAseq'):
        # Previously an unknown type silently produced an empty table.
        raise ValueError("data_type must be 'miRNAs' or 'RNAseq', got %r"
                         % (data_type,))

    # Sample label -> relative location "<file_id>/<file_name>".
    fls = meta['file_id'] + '/' + meta['file_name']
    sams = fls.to_dict()

    expr = {}

    if data_type == 'miRNAs':
        # accession -> mature miRNA name, in file order.
        mir21 = OrderedDict()
        with open(mir_id_file) as f:
            for line in f:
                lst = line.rstrip().split('\t')
                if len(lst) < 2:
                    # Blank or malformed line: nothing to map.
                    continue
                mir21[lst[0]] = lst[1]

        for sam, fl in sams.items():
            # Start every known miRNA at zero so all samples share one row set.
            counts = OrderedDict((mir, 0) for mir in mir21.values())

            with open(os.path.join(path, fl)) as f:
                for line in f:
                    lst = line.rstrip().split('\t')

                    # The last column looks like "mature,<accession>"; other
                    # region types are ignored.
                    if lst[-1].startswith('mature'):
                        mir = lst[-1].split(',')[-1]
                        counts[mir21[mir]] += int(lst[2])

            expr[sam] = counts

    else:
        for sam, fl in sams.items():
            counts = OrderedDict()

            with gzip.open(os.path.join(path, fl), 'rt') as f:
                for line in f:
                    lst = line.rstrip().split('\t')

                    # Skip summary rows that are not gene ids.
                    if not lst[0].startswith('ENSG'):
                        continue

                    gene = lst[0].split('.')[0]

                    # Store a number, not the raw text, so the matrix is
                    # numeric like the miRNA one.
                    counts[gene] = int(lst[-1])

            expr[sam] = counts

    mergeDa = pd.DataFrame.from_dict(expr)

    return mergeDa
    

In [122]:
# Build the miRNA count matrix from the files under TCGA-CHOL/miRNAs,
# using whatever table `meta` currently holds.
mirExpr = gdcMerge(path='TCGA-CHOL/miRNAs', meta=meta, data_type='miRNAs')

In [145]:
# Preview a 5 x 5 corner of the matrix (positional rows 1-5, columns 1-5).
mirExpr.iloc[1:6, 1:6]

Unnamed: 0,TCGA-3X-AAVA-01A,TCGA-3X-AAVB-01A,TCGA-3X-AAVC-01A,TCGA-3X-AAVE-01A,TCGA-4G-AAZO-01A
hsa-let-7a-3p,155,282,120,141,210
hsa-let-7a-5p,119809,190562,79303,110684,107689
hsa-let-7b-3p,58,129,60,115,95
hsa-let-7b-5p,34952,123675,51800,94827,104602
hsa-let-7c-3p,34,48,9,34,31


In [130]:
# Metadata for the HTSeq count files of TCGA-CHOL.
# Note: this rebinds `meta`, replacing the table fetched for the miRNA files.
meta = gdcParseMeta(
    project='TCGA-CHOL',
    data_category='Transcriptome Profiling',
    data_type='Gene Expression Quantification',
    workflow_type='HTSeq - Counts',
)

In [146]:
# Show the first rows of the metadata table returned by gdcParseMeta.
meta.head()

Unnamed: 0,file_id,file_name,submitter_id,entity_submitter_id,sample_type,age_at_diagnosis,gender,tumor_stage,tumor_grade,days_to_death,days_to_last_follow_up,vital_status,disease_type,project_id
TCGA-W5-AA2R-01A,afecdda2-735c-4304-a087-ef917ad9cd5a,fc06a930-e3a1-4775-a126-610a531c655b.htseq.cou...,TCGA-W5-AA2R-01A,TCGA-W5-AA2R-01A-11R-A41I-07,Primary Tumor,28367,female,stage i,not reported,,1542.0,alive,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA2U-01A,4963b9af-7d7b-42cd-b057-2f894639e59f,46fd54e8-5ab7-43f1-bb88-01a163ff121f.htseq.cou...,TCGA-W5-AA2U-01A,TCGA-W5-AA2U-01A-11R-A41I-07,Primary Tumor,28552,female,stage i,not reported,627.0,,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-W5-AA34-11A,d9249d25-d852-4008-a71e-dd9be6cb68e9,fbdb830c-2f70-45f5-a2d6-2d8317301af3.htseq.cou...,TCGA-W5-AA34-11A,TCGA-W5-AA34-11A-11R-A41I-07,Solid Tissue Normal,27535,female,stage i,not reported,555.0,168.0,dead,Cholangiocarcinoma,TCGA-CHOL
TCGA-ZH-A8Y6-01A,be2d52c8-eb14-4b45-a65b-c1bdd6f5c4c2,4696ce44-29bf-41ea-b866-ddb17c376e94.htseq.cou...,TCGA-ZH-A8Y6-01A,TCGA-ZH-A8Y6-01A-11R-A41I-07,Primary Tumor,14976,female,stage i,not reported,,519.0,alive,Cholangiocarcinoma,TCGA-CHOL
TCGA-3X-AAVA-01A,42b8d463-6209-4ea0-bb01-8023a1302fa0,b6a2c03a-c8ad-41e9-8a19-8f5ac53cae9f.htseq.cou...,TCGA-3X-AAVA-01A,TCGA-3X-AAVA-01A-11R-A41I-07,Primary Tumor,18303,female,stage ii,not reported,445.0,,dead,Cholangiocarcinoma,TCGA-CHOL


In [140]:
# Build the gene-level count matrix from the files under TCGA-CHOL/RNAseq,
# using whatever table `meta` currently holds.
rnaExpr = gdcMerge(path='TCGA-CHOL/RNAseq', meta=meta, data_type='RNAseq')

In [144]:
# Preview a 5 x 5 corner of the matrix (positional rows 1-5, columns 1-5).
rnaExpr.iloc[1:6, 1:6]

Unnamed: 0,TCGA-3X-AAVA-01A,TCGA-3X-AAVB-01A,TCGA-3X-AAVC-01A,TCGA-3X-AAVE-01A,TCGA-4G-AAZO-01A
ENSG00000000005,1,2,1,0,0
ENSG00000000419,1254,1365,1250,1492,757
ENSG00000000457,699,584,1127,1110,1535
ENSG00000000460,239,238,217,368,162
ENSG00000000938,334,484,421,696,137
