In [1]:
import json
import os

import pandas as pd
import requests

try:  # Python 3
    from urllib.parse import quote
except ImportError:  # Python 2
    from urllib import quote

In [2]:
import sys
sys.path.insert(0, '/home/rparulkar/libs/data-collection/api')
import api as five3api

In [3]:
import time
from datetime import timedelta

In [4]:
# Directory the notebook reads its raw inputs from (relative to the working directory).
SOURCEDIR = 'source-data/'
# Directory the notebook writes its derived outputs to (relative to the working directory).
OUTDIR = 'processed-data/'

In [5]:
# Tissue-source-site (TSS) lookup table: maps the 'TSS Code' embedded in a
# patient barcode to its 'Study Name'. Every column is read as a string so that
# zero-padded codes such as '01' are preserved.
project_codes_inf = os.path.join(SOURCEDIR, 'tissueSourceSite.tsv')
# keep_default_na=False: by default pandas converts the literal strings
# 'NA', 'N/A', 'NULL', ... to NaN even when dtype=str is given. A TSS code
# spelled "NA" would then never match a barcode segment and the patient's
# project would silently fall back to 'NA'.
# NOTE(review): confirm that no downstream code relies on NaN for empty cells;
# with this flag they are read as '' instead.
project_codes_df = pd.read_csv(project_codes_inf, dtype=str, sep='\t',
                               keep_default_na=False)
project_codes_df

Unnamed: 0,TSS Code,Source Site,Study Name,BCR
0,01,International Genomics Consortium,Ovarian serous cystadenocarcinoma,IGC
1,02,MD Anderson Cancer Center,Glioblastoma multiforme,IGC
2,04,Gynecologic Oncology Group,Ovarian serous cystadenocarcinoma,IGC
3,05,Indivumed,Lung adenocarcinoma,IGC
4,06,Henry Ford Hospital,Glioblastoma multiforme,IGC
5,07,TGen,Cell Line Control,IGC
6,08,UCSF,Glioblastoma multiforme,IGC
7,09,UCSF,Ovarian serous cystadenocarcinoma,IGC
8,10,MD Anderson Cancer Center,Ovarian serous cystadenocarcinoma,IGC
9,11,MD Anderson Cancer Center,Lung squamous cell carcinoma,IGC


In [6]:
# Load the cleaned clinical release. `data` is a dict keyed by patient UUID
# whose values are the per-patient clinical records.
data_inf = os.path.join(SOURCEDIR, '20180823_release_12.0.clean.json')
with open(data_inf) as data_fp:  # context manager closes the handle (it was leaked before)
    data = json.load(data_fp)
print('# of patients: {0}'.format(str(len(data.keys()))))
# NOTE: dict keys are unique by construction (json.load keeps the last value for
# a repeated key), so this count always equals the one above; it cannot reveal
# duplicated patients in the raw file.
print('# of unique patients?: {0}'.format(str(len(list(set(data.keys()))))))

# of patients: 11167
# of unique patients?: 11167


# Nant Drug DB

In [None]:
# The API token is read from the environment so that no credential is stored in
# the notebook or its version history. Export NANT_DRUG_AUTH (a JWT) before
# running this cell.
# SECURITY: a token was previously hardcoded here; treat it as leaked and make
# sure it has been revoked.
DRUG_AUTH = os.environ.get('NANT_DRUG_AUTH')
if not DRUG_AUTH:
    raise RuntimeError('Set the NANT_DRUG_AUTH environment variable to a valid JWT '
                       'before querying the drug database')
DRUG_URL = 'https://drugs.nantomics.com/api/drugs'
# First page of results; `code` is returned by the helper but not used here.
r, code = five3api.query(DRUG_URL, auth=DRUG_AUTH, authtype='JWT')
r_json = r.json()
# Collect the entries across pages, starting from the first page's payload and
# an empty accumulator.
# NOTE(review): pagination behaviour is inferred from the helper's name — confirm.
drug_results = five3api.reports_paginated(r_json, [], auth=DRUG_AUTH, authtype='JWT')

In [None]:
# Number of entries collected from the drug API.
len(drug_results)

In [None]:
# Canonical drug name -> list of aliases. The aliases are the '|'-separated
# entries of the record's 'synonyms' string with surrounding spaces removed,
# followed by the canonical name itself.
nant_drugs_aliases = dict(
    (
        record['name'],
        [synonym.strip(' ') for synonym in record['synonyms'].split('|')] + [record['name']],
    )
    for record in drug_results
)

# Pubchem Drug DB

In [None]:
# Ask PubChem for the synonym list of every canonical Nant drug name.
#   pubchem_drugs_aliases: drug name -> list of PubChem synonyms
#   aliases_not_found:     drug names PubChem returned no synonyms for
pubchem_drugs_aliases = {}
aliases_not_found = []

PUBCHEM_SYNONYMS_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{0}/synonyms/JSON'

start_time = time.time()
ngroups = len(nant_drugs_aliases.keys())

for nparsed, alias in enumerate(nant_drugs_aliases.keys(), 1):
    # Percent-encode the name before placing it in the URL path: characters such
    # as '/', '?', '#' or '%' would otherwise change which resource is requested.
    # Encoding to UTF-8 bytes first keeps quote() working for non-ASCII names on
    # both Python 2 and Python 3.
    encoded_alias = quote(alias.encode('utf-8'), safe='')
    response = requests.get(PUBCHEM_SYNONYMS_URL.format(encoded_alias))

    synonyms = None
    if response.status_code == 200:
        # Walk the payload defensively: a 200 reply without a 'Synonym' list is
        # recorded as "not found" instead of aborting the whole run.
        information = response.json().get('InformationList', {}).get('Information', [])
        if information:
            synonyms = information[0].get('Synonym')

    if synonyms:
        pubchem_drugs_aliases[alias] = synonyms
    else:
        aliases_not_found.append(alias)

    # Progress is reported for every item, including the last one, so the
    # counter reaches 100%.
    sys.stdout.write("\r%d/%d parsed [%.2f%% complete]" % (nparsed, ngroups, 100*float(nparsed)/ngroups))
    sys.stdout.flush()

# Leading newline keeps the summary off the '\r'-rewritten progress line.
sys.stdout.write("\nElapsed time: %s" % str(timedelta(seconds=(time.time() - start_time))))

In [None]:
# How many Nant drug names PubChem could not resolve, plus one example.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(aliases_not_found))
# Guarded: indexing [0] raises IndexError when every name was resolved.
if aliases_not_found:
    print(aliases_not_found[0])

## Write out aliases

In [None]:
# Merge the Nant and PubChem alias lists for every canonical drug name.
combined_aliases = {}
for tx in nant_drugs_aliases.keys():
    aliases = nant_drugs_aliases.get(tx, []) + pubchem_drugs_aliases.get(tx, [])
    # De-duplicate, then sort: a bare list(set(...)) has arbitrary order, which
    # made aliases.json differ between otherwise identical runs.
    combined_aliases[tx] = sorted(set(aliases))

# Create the output directory on a fresh checkout instead of failing in open().
if not os.path.isdir(OUTDIR):
    os.makedirs(OUTDIR)

aliases_out = os.path.join(OUTDIR, 'aliases.json')
with open(aliases_out, 'w') as fp:
    # The mapping stays wrapped in a one-element list to keep the existing file
    # layout; sort_keys makes the key order reproducible as well.
    json.dump([combined_aliases], fp, indent=4, sort_keys=True)

# TCGA Patient Class

In [28]:
class TCGAPatient:
    """Flattened view of one patient's clinical record.

    Parameters
    ----------
    uuid : the patient's key in the release dictionary.
    d : dict of clinical fields for that patient.

    ``__init__`` derives barcode, age, gender, tissue, race, project and
    collection type. The survival attributes are NOT set by ``__init__``: the
    caller must assign ``os_times`` (built from ``get_os_times``) before calling
    ``get_vital`` / ``get_os_censored``, and ``vital`` before calling
    ``get_os_event_time``.

    ``get_project`` reads the module-level ``project_codes_df`` table.
    """

    # Clinical field names that carry overall-survival (OS) times.
    LCDT = 'last_contact_days_to'   # Alive + Dead
    DTLF = 'days_to_last_followup'  # Alive + Dead
    DDT = 'death_days_to'           # Dead only
    DTD = 'days_to_death'           # Dead only
    OS_KEYS = [LCDT, DTLF, DDT, DTD]
    CONTACT_KEYS = [LCDT, DTLF]
    DEATH_KEYS = [DDT, DTD]

    # Placeholder values skipped when OS times are converted to integers.
    BLACKLIST = ["[Discrepancy]", "[Completed]"]

    # Human-readable names of the fields returned by get_nlp_values(); the record
    # key is the same text with spaces replaced by underscores.
    NLP_KEYS = ['age at diagnosis', 'age at initial pathologic diagnosis',
                'ajcc clinical tumor stage', 'ajcc pathologic tumor stage',
                'bilateral diagnosis timing type',
                'breslow thickness at diagnosis', 'clark level at diagnosis',
                'clinical stage', 'days to initial pathologic diagnosis',
                'diabetes diagnosis days to', 'diabetes diagnosis indicator',
                'grade tier system', 'height cm at diagnosis',
                'histologic diagnosis', 'histologic diagnosis other',
                'histologic diagnosis percent', 'hypertension diagnosis',
                'igcccg stage', 'initial diagnosis esophageal ca type',
                'initial pathologic diagnosis method', 'masaoka stage',
                'neoplasm histologic grade', 'nuclear grade III IV',
                'pregnant at diagnosis', 'surgical approach at diagnosis',
                'tumor grade', 'weight kg at diagnosis',
                'year of initial pathologic diagnosis']

    def __init__(self, uuid, d):
        self.info = d
        self.uuid = uuid
        self.barcode = self.get_field('bcr_patient_barcode', DEFAULT='NA')
        # A missing or falsy age is reported as 'NA'.
        self.age = self.get_age() or 'NA'
        self.gender = self.get_field('gender', DEFAULT='NA')
        self.tissue = self.get_field('tumor_tissue_site', DEFAULT='NA')
        self.race = self.get_field('race', DEFAULT='NA')
        self.project = self.get_project()
        self.collection = self.get_collection_type()

    def get_field(self, key, DEFAULT=None):
        """Return ``self.info[key]``, or ``DEFAULT`` when the key is absent."""
        return self.info.get(key, DEFAULT)

    def get_age(self):
        """Return 'age_at_diagnosis' if truthy, else 'age_at_initial_pathologic_diagnosis'."""
        return (self.get_field('age_at_diagnosis', None) or
                self.get_field('age_at_initial_pathologic_diagnosis', None))

    def get_project(self):
        """Return the study name for the barcode's TSS code, or 'NA'.

        The TSS code is the second '-'-separated segment of the barcode
        (e.g. 'CC' in 'TCGA-CC-A1HT'). 'NA' is returned when the barcode has no
        such segment (for instance the default barcode 'NA', which used to
        raise IndexError) or when the code is not in ``project_codes_df``.
        """
        segments = self.barcode.split('-')
        if len(segments) < 2:
            return 'NA'
        project_code = segments[1]
        studies = project_codes_df.loc[project_codes_df['TSS Code'] == project_code,
                                       'Study Name'].tolist()
        # The first matching row wins when a code is listed more than once.
        return studies[0] if studies else 'NA'

    def get_collection_type(self):
        """Return 'retrospective' or 'prospective' when exactly one flag is truthy, else 'NA'."""
        retrospective = self.get_field('retrospective_collection')
        prospective = self.get_field('prospective_collection')
        if retrospective and not prospective:
            return 'retrospective'
        if prospective and not retrospective:
            return 'prospective'
        return 'NA'

    def get_vital(self):
        """Derive vital status from ``self.os_times``.

        Returns 'NA' when there is no non-empty time value and no
        'vital_status' field, 'Dead' when any death-time key has a non-empty
        value, and 'Alive' otherwise. Blacklisted placeholder values are not
        filtered here, so they count as non-empty.
        """
        present_keys = [key for val, key in self.os_times if val != '']

        if (len(present_keys) == 0 and
                self.get_field('vital_status', '') == ''):
            return 'NA'
        if any(key in self.DEATH_KEYS for key in present_keys):
            return 'Dead'
        return 'Alive'

    def get_nlp_values(self):
        """Return the value of every NLP_KEYS field, in order ('None' when absent)."""
        return [self.get_field('_'.join(key.split(' ')), 'None')
                for key in self.NLP_KEYS]

    ######## OS ########

    def _numeric_os_times(self):
        """Return ``os_times`` as ``(int, key)`` pairs, skipping empty and blacklisted values."""
        return [(int(val), key) for val, key in self.os_times
                if val not in self.BLACKLIST and val != '']

    def get_os_event_time(self): # OS ONLY?
        """Return the overall-survival time, 'NA', or None.

        'NA' when ``self.vital`` is 'NA' or no usable time exists. Otherwise
        the largest time is returned if it is recorded under a key consistent
        with the vital status (a contact key for 'Alive', a death key for
        'Dead'). When the largest time is only recorded under an inconsistent
        key, None is returned (this was previously an implicit fall-through).
        """
        if self.vital == 'NA':
            return 'NA'

        times = self._numeric_os_times()
        if len(times) == 0:
            return 'NA'

        max_time = max(days for days, key in times)
        for days, key in times:
            if days != max_time:
                continue
            if self.vital == 'Alive' and key in self.CONTACT_KEYS:
                return max_time
            if self.vital == 'Dead' and key in self.DEATH_KEYS:
                return max_time
        return None

    def get_os_censored(self): # OS ONLY?
        """Return 'False' when the largest usable time is a death time, else 'True'.

        The result is the string 'True' / 'False', not a bool. A patient with
        no usable time is reported as censored ('True').
        """
        times = self._numeric_os_times()
        if len(times) == 0:
            return 'True'

        max_time = max(days for days, key in times)
        died_at_max = any(days == max_time and key in self.DEATH_KEYS
                          for days, key in times)
        return 'False' if died_at_max else 'True'

    def get_os_times(self, field='patient'):
        """Collect ``(str(value), key)`` pairs for the four OS keys.

        field='patient' reads the top-level record; field='followups' reads
        every entry of ``self.info['followups']``. Only records with a truthy
        'vital_status' contribute, and a missing key contributes ''. Returns
        [] when the requested source is unavailable.
        """
        if (field == 'patient' and
                self.info.get('vital_status', None)):
            records = [self.info]
        elif (field == 'followups' and
                  field in self.info):
            records = [followup for followup in self.get_field(field)
                       if followup.get('vital_status', None)]
        else:
            return []

        return [(str(record.get(key, '')), key)
                for record in records
                for key in self.OS_KEYS]

######## END OS ########

In [29]:
# Column labels for the values returned by TCGAPatient.get_nlp_values().
# That method keeps its key list in a local variable, so `nlp_keys` did not exist
# at notebook scope and the DataFrame construction below raised NameError on a
# fresh kernel. The list is defined here and must stay in the same order as the
# one inside get_nlp_values().
nlp_keys = ['age at diagnosis', 'age at initial pathologic diagnosis',
            'ajcc clinical tumor stage', 'ajcc pathologic tumor stage',
            'bilateral diagnosis timing type',
            'breslow thickness at diagnosis', 'clark level at diagnosis',
            'clinical stage', 'days to initial pathologic diagnosis',
            'diabetes diagnosis days to', 'diabetes diagnosis indicator',
            'grade tier system', 'height cm at diagnosis',
            'histologic diagnosis', 'histologic diagnosis other',
            'histologic diagnosis percent', 'hypertension diagnosis',
            'igcccg stage', 'initial diagnosis esophageal ca type',
            'initial pathologic diagnosis method', 'masaoka stage',
            'neoplasm histologic grade', 'nuclear grade III IV',
            'pregnant at diagnosis', 'surgical approach at diagnosis',
            'tumor grade', 'weight kg at diagnosis',
            'year of initial pathologic diagnosis']

# Build one row per patient.
patients = []
for uuid in data:
    d = data[uuid]
    p = TCGAPatient(uuid, d)
    # Survival attributes are attached in dependency order: os_times feeds
    # get_vital / get_os_censored, and vital feeds get_os_event_time.
    # The set() removes identical (value, key) pairs reported by both the
    # patient record and its follow-ups.
    p.os_times = list(set(p.get_os_times() + p.get_os_times(field='followups')))
    p.vital = p.get_vital()
    p.os_censored = p.get_os_censored()
    p.os_time = p.get_os_event_time()
    
    ### NLP Fields ###
    nlp_values = p.get_nlp_values()
    
    patients.append(tuple([p.uuid, p.barcode, p.age, p.gender,
                           p.tissue, p.project, p.race, p.collection,
                           p.vital, p.os_censored, p.os_time] + nlp_values))
patients_df = pd.DataFrame(patients, columns=['uuid', 'barcode', 'age', 'gender', 'tissue',
                                              'project', 'race', 'collection_type',
                                              'vital_status', 'os_censored', 'os_time'] + nlp_keys)

In [30]:
# Display the assembled per-patient table.
patients_df

Unnamed: 0,uuid,barcode,age,gender,tissue,project,race,collection_type,vital_status,os_censored,...,initial diagnosis esophageal ca type,initial pathologic diagnosis method,masaoka stage,neoplasm histologic grade,nuclear grade III IV,pregnant at diagnosis,surgical approach at diagnosis,tumor grade,weight kg at diagnosis,year of initial pathologic diagnosis
0,670c1cbc-4494-45f1-bb8b-18db82d4f7e0,TCGA-CC-A1HT,50,MALE,Liver,Liver hepatocellular carcinoma,ASIAN,prospective,Dead,False,...,,,,,,,,G3,54,2010
1,5b8f05fa-0145-4142-8d41-5980e4d36d81,TCGA-02-0046,61,MALE,Brain,Glioblastoma multiforme,BLACK OR AFRICAN AMERICAN,,Dead,False,...,,,,,,,,,,
2,7CE45134-5B1F-4EA7-ACE9-7E463B034A43,TCGA-KK-A8I5,55,MALE,Prostate,Prostate adenocarcinoma,BLACK OR AFRICAN AMERICAN,retrospective,Alive,True,...,,Core needle biopsy,,,,,,,,
3,A648D9BF-CF37-41FC-9515-E8F5AC85FCD4,TCGA-XF-A9SX,63,FEMALE,Bladder,Bladder Urothelial Carcinoma,WHITE,retrospective,Dead,False,...,,,,High Grade,,,,,56,
4,554a7381-46fd-4301-b91b-5af7f173c350,TCGA-EJ-5525,67,MALE,Prostate,Prostate adenocarcinoma,WHITE,prospective,Alive,True,...,,Core needle biopsy,,,,,,,,
5,8FABDAF2-00B1-4D41-A27B-F4AA13F7B451,TCGA-DX-A3M2,59,MALE,Retroperitoneum/Upper abdominal - Retroperitoneum,Sarcoma,WHITE,retrospective,Alive,True,...,,,,,,,,,,
6,1101faa1-a713-489b-8593-960a9d6bda42,TCGA-16-1060,70,FEMALE,Brain,Glioblastoma multiforme,WHITE,,Dead,False,...,,,,,,,,,,
7,36470b30-a3b3-4d28-851b-918a7ff88011,TCGA-55-8096,67,FEMALE,,Lung adenocarcinoma,WHITE,prospective,Dead,False,...,,,,,,,,,,
8,1d276b62-5e64-48fb-b2ce-5192b511fe37,TCGA-75-5146,,MALE,,Lung adenocarcinoma,,retrospective,Alive,True,...,,,,,,,,,,
9,f03aacb8-5565-425f-a8d5-be1056f882e0,TCGA-60-2725,74,MALE,Lung,Lung squamous cell carcinoma,WHITE,retrospective,Alive,True,...,,,,,,,,,,


In [35]:
# Number of patients per derived vital status ('Alive', 'Dead' or 'NA').
patients_df['vital_status'].value_counts()

Alive    7549
Dead     3610
NA          8
Name: vital_status, dtype: int64

In [36]:
# Write the patient table as a tab-separated file without the index column.
# The path is built from OUTDIR, like the other outputs of this notebook,
# instead of repeating the directory name as a literal.
patients_out = os.path.join(OUTDIR, '20180823_release_12.0.tsv')
patients_df.to_csv(patients_out, sep='\t', index=False)

# Drug Mapping

## First, let's map based on drug entries as seen in GDC
#### No string manipulation or best-guessing: drug aliases are matched as documented in NANT or PubChem

In [None]:
# Map every treatment name recorded for a patient onto a canonical Nant drug.
#   drug_mapping: patient uuid -> {'drugs': [canonical drug names]}
#   not_found:    (uuid, treatment name) pairs that matched no alias
drug_mapping = {uuid: {'drugs': []} for uuid in data}
not_found = []

# Built once, outside the patient loop: the alias collection of each drug does
# not depend on the patient, and a set gives constant-time membership tests.
# Iteration order over the drugs is unchanged, so the first matching drug still wins.
drug_alias_sets = [
    (drug, set(nant_drugs_aliases.get(drug, []) + pubchem_drugs_aliases.get(drug, [])))
    for drug in nant_drugs_aliases.keys()
]

start_time = time.time()
ngroups = len(drug_mapping.keys())

for nparsed, uuid in enumerate(data, 1):
    for tx in data[uuid].get('drugs', []):
        tx_name = tx.get('pharmaceutical_therapy_drug_name', '')
        flag = False
        if tx_name != '':
            # Also try the name with only its first letter capitalised.
            tx_new = str(tx_name)
            tx_new = tx_new[0].upper() + tx_new[1:].lower()
            for drug, aliases in drug_alias_sets:
                if tx_name in aliases or tx_new in aliases:
                    drug_mapping[uuid]['drugs'].append(drug)
                    flag = True
                    break
        # Treatments with an empty name are recorded as not found as well.
        if not flag:
            not_found.append((uuid, tx_name))

    # Progress is reported for every patient, including the last one, so the
    # counter reaches 100%.
    sys.stdout.write("\r%d/%d parsed [%.2f%% complete]" % (nparsed, ngroups, 100*float(nparsed)/ngroups))
    sys.stdout.flush()

# Leading newline keeps the summary off the '\r'-rewritten progress line.
sys.stdout.write("\nElapsed time: %s" % str(timedelta(seconds=(time.time() - start_time))))

In [None]:
# Number of (uuid, treatment name) pairs that matched no alias.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(not_found))

In [None]:
# Distinct unmatched (uuid, treatment name) pairs, ordered by treatment name.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(sorted(set(not_found), key=lambda pair: pair[1]))

In [None]:
# Alphabetical list of the canonical Nant drug names.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(sorted(nant_drugs_aliases.keys()))

In [None]:
# Spot check: the aliases known for one drug in each source.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(nant_drugs_aliases['Doxorubicin'])
print(pubchem_drugs_aliases['Doxorubicin'])


In [None]:
# Create the output directory on a fresh checkout instead of failing in open().
if not os.path.isdir(OUTDIR):
    os.makedirs(OUTDIR)

drug_mapping_out = os.path.join(OUTDIR, 'drugs.json')
with open(drug_mapping_out, 'w') as fp:
    # The mapping stays wrapped in a one-element list to keep the existing file
    # layout; sort_keys makes the key order reproducible between runs.
    json.dump([drug_mapping], fp, indent=4, sort_keys=True)

In [None]:
# Number of patients with at least one mapped drug.
count = sum(1 for uuid in drug_mapping.keys()
            if len(drug_mapping[uuid]['drugs']) > 0)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(count)
        

In [None]:
# Show the full patient -> mapped-drugs dictionary (one entry per patient uuid).
drug_mapping