In [129]:
import os
import json
import requests
import pandas as pd

In [2]:
import sys
sys.path.insert(0, '/home/rparulkar/libs/data-collection/api')
import api as five3api

In [18]:
import time
from datetime import timedelta

In [153]:
# Directory the notebook reads its input files from.
SOURCEDIR = 'source-data/'
# Directory the notebook writes its derived files to.
OUTDIR = 'processed-data/'

In [358]:
# Tissue-source-site (TSS) lookup table; TCGAPatient.get_project() uses it to
# translate the second barcode segment into a study name.
project_codes_inf = os.path.join(SOURCEDIR, 'tissueSourceSite.tsv')
project_codes_df = pd.read_csv(
    project_codes_inf,
    sep='\t',
    dtype=str,  # keep zero-padded codes such as "01" as text
    # dtype=str alone does not stop pandas from turning tokens such as "NA",
    # "NULL" or "nan" into NaN, which would make a literal "NA" code
    # unmatchable. Only genuinely empty cells are treated as missing.
    keep_default_na=False,
    na_values=[''],
)
project_codes_df

Unnamed: 0,TSS Code,Source Site,Study Name,BCR
0,01,International Genomics Consortium,Ovarian serous cystadenocarcinoma,IGC
1,02,MD Anderson Cancer Center,Glioblastoma multiforme,IGC
2,04,Gynecologic Oncology Group,Ovarian serous cystadenocarcinoma,IGC
3,05,Indivumed,Lung adenocarcinoma,IGC
4,06,Henry Ford Hospital,Glioblastoma multiforme,IGC
5,07,TGen,Cell Line Control,IGC
6,08,UCSF,Glioblastoma multiforme,IGC
7,09,UCSF,Ovarian serous cystadenocarcinoma,IGC
8,10,MD Anderson Cancer Center,Ovarian serous cystadenocarcinoma,IGC
9,11,MD Anderson Cancer Center,Lung squamous cell carcinoma,IGC


In [3]:
# Load the cleaned clinical release: a dict keyed by patient identifier.
# The context manager guarantees the file handle is closed after parsing.
with open(os.path.join(SOURCEDIR, '20180823_release_12.0.clean.json')) as release_fp:
    data = json.load(release_fp)

# Single-argument print(...) produces the same text under Python 2 and 3.
print('# of patients: {0}'.format(str(len(data))))
# NOTE: dict keys are unique by construction, so this always equals the count
# above; duplicate keys in the JSON file would already have been collapsed by
# json.load.
print('# of unique patients?: {0}'.format(str(len(set(data)))))

# of patients: 11167
# of unique patients?: 11167


# Nant Drug DB

In [5]:
# The API token is a credential and must not be stored in the notebook; it is
# read from the environment instead (export NANT_DRUG_API_TOKEN=<jwt>).
DRUG_AUTH = os.environ.get('NANT_DRUG_API_TOKEN')
if not DRUG_AUTH:
    raise RuntimeError('Set the NANT_DRUG_API_TOKEN environment variable '
                       'to a valid JWT before running this cell.')

DRUG_URL = 'https://drugs.nantomics.com/api/drugs'
# First page of results; `code` is the second value returned by query().
r, code = five3api.query(DRUG_URL, auth=DRUG_AUTH, authtype='JWT')
r_json = r.json()
# presumably walks the remaining pages and accumulates every record into the
# list passed as the second argument - TODO confirm against five3api.
drug_results = five3api.reports_paginated(r_json, [], auth=DRUG_AUTH, authtype='JWT')

In [6]:
# Size of the collection returned by five3api.reports_paginated().
len(drug_results)

1340

In [42]:
# Drug name -> its '|'-separated synonyms (surrounding spaces trimmed),
# followed by the drug name itself.
nant_drugs_aliases = {
    entry['name']: [synonym.strip(' ') for synonym in entry['synonyms'].split('|')] + [entry['name']]
    for entry in drug_results
}

# Pubchem Drug DB

In [44]:
# Ask the PubChem PUG REST service for the synonym list of every Nant drug
# name. Names that yield no synonyms (or whose request fails) are collected
# in `aliases_not_found`.
pubchem_drugs_aliases = {}
aliases_not_found = []

start_time = time.time()
ngroups = len(nant_drugs_aliases.keys())
nparsed = 1

for alias in nant_drugs_aliases.keys():
    # Percent-encode the name: reserved characters such as '/', '?' or '#'
    # would otherwise change the request path instead of being part of the name.
    raw_alias = alias if isinstance(alias, bytes) else alias.encode('utf-8')
    encoded_alias = requests.utils.quote(raw_alias, safe='')
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{0}/synonyms/JSON'.format(encoded_alias)

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException:
        # Best effort: a transient network failure must not discard the
        # results gathered so far; the name is reported as not found.
        response = None

    synonyms = None
    if response is not None and response.status_code == 200:
        information = response.json().get('InformationList', {}).get('Information', [])
        if information:
            synonyms = information[0].get('Synonym')

    if synonyms:
        pubchem_drugs_aliases[alias] = synonyms
    else:
        aliases_not_found.append(alias)

    # Report every iteration, including the last one, so the final line
    # reads N/N instead of stopping at N-1/N.
    sys.stdout.write("\r%d/%d parsed [%.2f%% complete]" % (nparsed, ngroups, 100*float(nparsed)/ngroups))
    sys.stdout.flush()
    nparsed += 1

# Start a new line so the elapsed time is not glued to the progress text.
sys.stdout.write("\nElapsed time: %s" % str(timedelta(seconds=(time.time() - start_time))))

1339/1340 parsed [99.93% complete]Elapsed time: 1:26:46.380226

In [46]:
# How many names had no PubChem match, plus the first such name as an example.
print(len(aliases_not_found))
print(aliases_not_found[0])

586
Milatuzumab


## Write out aliases

In [133]:
# Merge the Nant and PubChem alias lists for every Nant drug name.
combined_aliases = {}
for tx in nant_drugs_aliases.keys():
    aliases = nant_drugs_aliases.get(tx, []) + pubchem_drugs_aliases.get(tx, [])
    # De-duplicate and sort: set iteration order is arbitrary, so sorting
    # keeps the written file identical from run to run.
    combined_aliases[tx] = sorted(set(aliases))

# Create the output directory on a fresh checkout instead of failing in open().
if not os.path.isdir(OUTDIR):
    os.makedirs(OUTDIR)

aliases_out = os.path.join(OUTDIR, 'aliases.json')
with open(aliases_out, 'w') as fp:
    # The mapping stays wrapped in a one-element list, as before, so the
    # file's top-level JSON type is unchanged for existing readers.
    json.dump([combined_aliases], fp, indent=4, sort_keys=True)

# TCGA Patient Class

In [732]:
class TCGAPatient(object):
    """Convenience wrapper around one patient's clinical record.

    Parameters
    ----------
    uuid : identifier the record is keyed by.
    d : dict of clinical fields for that patient.

    ``__init__`` resolves the demographic attributes; a missing field is
    stored as the string 'NA'.

    The survival helpers read two attributes that ``__init__`` does NOT set
    and that the caller must assign first, in this order:

    * ``os_times`` - list of ``(value, key)`` pairs from ``get_os_times``;
    * ``vital``    - the result of ``get_vital``.

    ``get_project`` reads the module-level ``project_codes_df`` table.
    """

    # Clinical keys that hold a day count used for overall survival (OS).
    LCDT = 'last_contact_days_to'  # Alive + Dead
    DTLF = 'days_to_last_followup'  # Alive + Dead
    DDT = 'death_days_to'  # Dead only
    DTD = 'days_to_death'  # Dead only
    FOLLOWUP_KEYS = (LCDT, DTLF)
    DEATH_KEYS = (DDT, DTD)
    OS_KEYS = FOLLOWUP_KEYS + DEATH_KEYS

    # Placeholder values that are not day counts and must not reach int().
    BLACKLIST = ("[Discrepancy]", "[Completed]")

    def __init__(self, uuid, d):
        self.info = d
        self.uuid = uuid
        self.barcode = self.get_field('bcr_patient_barcode', DEFAULT='NA')
        # Any falsy age (missing, empty, 0) is reported as 'NA'.
        self.age = self.get_age() or 'NA'
        self.gender = self.get_field('gender', DEFAULT='NA')
        self.tissue = self.get_field('tumor_tissue_site', DEFAULT='NA')
        self.race = self.get_field('race', DEFAULT='NA')
        self.project = self.get_project()
        self.collection = self.get_collection_type()

    def get_field(self, key, DEFAULT=None):
        """Return ``self.info[key]``, or ``DEFAULT`` when the key is absent."""
        return self.info.get(key, DEFAULT)

    def get_age(self):
        """Return the first truthy value of the two age-at-diagnosis fields."""
        return (self.get_field('age_at_diagnosis', None) or
                self.get_field('age_at_initial_pathologic_diagnosis', None))

    def get_project(self):
        """Return the study name for the barcode's second segment, else 'NA'.

        The second '-'-separated segment of the barcode is looked up in the
        'TSS Code' column of the module-level ``project_codes_df``.
        """
        segments = self.barcode.split('-')
        # The 'NA' default barcode (or any barcode without a '-') has no
        # second segment; indexing it used to raise IndexError.
        if len(segments) < 2:
            return 'NA'
        project_code = segments[1]
        study_names = project_codes_df.loc[
            project_codes_df['TSS Code'] == project_code, 'Study Name']
        if len(study_names) == 0:
            return 'NA'
        return study_names.tolist()[0]

    def get_collection_type(self):
        """Return 'retrospective', 'prospective', or 'NA' when ambiguous.

        Exactly one of the two collection flags must be truthy; both set or
        both unset yields 'NA'.
        """
        # NOTE(review): the test is plain truthiness, so a non-empty string
        # such as 'NO' counts as set - confirm the flags are booleans/empty.
        retrospective = self.get_field('retrospective_collection')
        prospective = self.get_field('prospective_collection')
        if retrospective and not prospective:
            return 'retrospective'
        if prospective and not retrospective:
            return 'prospective'
        return 'NA'

    def get_vital(self):
        """Derive 'Dead', 'Alive' or 'NA' from ``self.os_times``.

        'NA' when no time value is recorded and ``vital_status`` is missing
        or empty; 'Dead' when any death-only key carries a non-empty value;
        'Alive' otherwise.
        """
        # NOTE(review): the value of 'vital_status' itself is never consulted
        # and BLACKLIST placeholders count as recorded values here - confirm
        # this is intended.
        recorded = [(val, key) for val, key in self.os_times if val != '']

        if not recorded and self.get_field('vital_status', '') == '':
            return 'NA'
        if any(key in self.DEATH_KEYS for _, key in recorded):
            return 'Dead'
        return 'Alive'

######## OS ########

    def _parsed_os_times(self):
        """Return ``(int(days), key)`` for every usable entry of ``self.os_times``."""
        return [(int(val), key) for val, key in self.os_times
                if val != '' and val not in self.BLACKLIST]

    def get_os_event_time(self): # OS ONLY?
        """Return the overall-survival time in days, or 'NA' when unknown.

        * vital 'NA', or no usable time at all -> 'NA'
        * 'Dead'  -> the largest recorded death time ('NA' when none parses)
        * 'Alive' -> the largest recorded time

        Previously a dead patient whose largest time came from a follow-up
        key fell through every branch and the method returned None.
        """
        if self.vital == 'NA':
            return 'NA'

        times = self._parsed_os_times()
        if not times:
            return 'NA'

        if self.vital == 'Dead':
            death_days = [days for days, key in times if key in self.DEATH_KEYS]
            return max(death_days) if death_days else 'NA'
        return max(days for days, _ in times)

    def get_os_censored(self): # OS ONLY?
        """Return 'False' when a death time is recorded, otherwise 'True'.

        Kept in step with ``get_os_event_time``: whenever that method reports
        a death time, the observation is flagged as uncensored, even if a
        follow-up entry carries a larger day count.
        """
        has_death_time = any(key in self.DEATH_KEYS
                             for _, key in self._parsed_os_times())
        return 'False' if has_death_time else 'True'

    def get_os_times(self, field='patient'):
        """Collect ``(str(value), key)`` pairs for the four OS keys.

        field='patient' reads the top-level record; field='followups' reads
        each entry of ``info['followups']``. Records without a truthy
        'vital_status' are skipped; a missing key contributes an empty
        string. Any other ``field`` value returns an empty list.
        """
        if field == 'patient' and self.info.get('vital_status', None):
            records = [self.info]
        elif field == 'followups' and field in self.info:
            # `or []` tolerates a 'followups' key that is present but None.
            records = [followup for followup in (self.get_field(field) or [])
                       if followup.get('vital_status', None)]
        else:
            return []

        return [(str(record.get(key, '')), key)
                for record in records
                for key in self.OS_KEYS]

######## END OS ########

In [736]:
# Build one row per patient: demographics from the constructor, survival
# fields derived from the patient-level and follow-up-level time records.
patients = []
for uuid, d in data.items():
    p = TCGAPatient(uuid, d)
    # The survival helpers read os_times and vital, so assign them first.
    p.os_times = list(set(p.get_os_times() + p.get_os_times(field='followups')))
    p.vital = p.get_vital()
    p.os_censored = p.get_os_censored()
    p.os_time = p.get_os_event_time()
    patients.append((
        p.uuid, p.barcode, p.age, p.gender,
        p.tissue, p.project, p.race, p.collection,
        p.vital, p.os_censored, p.os_time,
    ))

patients_df = pd.DataFrame(
    patients,
    columns=[
        'uuid', 'barcode', 'age', 'gender', 'tissue',
        'project', 'race', 'collection_type',
        'vital_status', 'os_censored', 'os_time',
    ],
)

In [737]:
# Display the assembled per-patient table.
patients_df

Unnamed: 0,uuid,barcode,age,gender,tissue,project,race,collection_type,vital_status,os_censored,os_time
0,5f10f8e4-a6f9-4643-b210-390bc8fd32bd,TCGA-AF-4110,77,MALE,,Rectum adenocarcinoma,WHITE,prospective,Alive,True,912
1,7b982d5e-3a7d-40ac-bd25-6044c62879b6,TCGA-68-8250,66,MALE,Lung,Lung squamous cell carcinoma,BLACK OR AFRICAN AMERICAN,retrospective,Alive,True,244
2,38dd826d-06dc-4a73-b852-c3d78c9c3ff4,TCGA-14-1821,31,MALE,Brain,Glioblastoma multiforme,WHITE,,Dead,False,541
3,FF5B9238-38A7-42B4-9D44-3B8EC9913F6D,TCGA-GU-A42P,72,MALE,Bladder,Bladder Urothelial Carcinoma,WHITE,prospective,Dead,False,332
4,eff7f13b-ed32-4e85-a8d0-831b655d773f,TCGA-14-0812,65,MALE,Brain,Glioblastoma multiforme,WHITE,,Dead,False,99
5,32128246-1258-43d1-b1e5-30ae63822c32,TCGA-CV-6950,64,MALE,Head and Neck,Head and Neck squamous cell carcinoma,WHITE,retrospective,Dead,False,459
6,33C09CD7-2CCC-4A69-B738-79DFD9AA149D,TCGA-5P-A9KE,70,MALE,Kidney,Kidney renal papillary cell carcinoma,WHITE,retrospective,Alive,True,824
7,79fd602b-3e8e-4353-aa78-4f5f170b607d,TCGA-25-1328,38,FEMALE,Ovary,Ovarian serous cystadenocarcinoma,WHITE,,Dead,False,2009
8,5AF0E222-3DC8-400B-BA61-2225921F2FD3,TCGA-L5-A8NH,54,MALE,Esophagus,Esophageal carcinoma,WHITE,retrospective,Dead,False,393
9,c75c915f-ef4b-4c19-8ace-995e6c6015fd,TCGA-04-1638,57,FEMALE,Ovary,Ovarian serous cystadenocarcinoma,BLACK OR AFRICAN AMERICAN,,Dead,False,1686


In [734]:
# Number of patients per derived vital status ('Alive', 'Dead' or 'NA').
patients_df['vital_status'].value_counts()

Alive    7549
Dead     3610
NA          8
Name: vital_status, dtype: int64

In [739]:
# Write the per-patient table as TSV. The path is built from OUTDIR, like the
# other output files, instead of repeating the directory name.
patients_out = os.path.join(OUTDIR, '20180823_release_12.0.tsv')
patients_df.to_csv(patients_out, sep='\t', index=False)

TypeError: to_csv() got an unexpected keyword argument 'dtype'

# Drug Mapping

## First, let's map based on drug entries as seen in GDC
#### No string manipulation (beyond normalising capitalisation) or best-guessing of drug aliases: names are matched only against aliases documented in NANT or PubChem

In [528]:
# Map each patient's recorded drug names onto canonical Nant drug names.
# A recorded name matches when it, or its "Capitalised" form, appears in the
# Nant or PubChem alias list of a drug. Entries without a match (including
# empty names) are collected in `not_found`.
drug_mapping = {uuid: {'drugs': []} for uuid in data}
not_found = []

start_time = time.time()
ngroups = len(drug_mapping.keys())
nparsed = 1

# Reverse index alias -> (position, drug), built once instead of rebuilding
# and scanning every alias list for every treatment record. `position` is the
# drug's place in the dict's iteration order, so an alias shared by several
# drugs still resolves to the first drug in that order, as the nested scan did.
alias_index = {}
for position, drug in enumerate(nant_drugs_aliases.keys()):
    for alias in nant_drugs_aliases.get(drug, []) + pubchem_drugs_aliases.get(drug, []):
        alias_index.setdefault(alias, (position, drug))

for uuid in data:
    d = data[uuid]
    drugs = d.get('drugs', []) or []  # tolerate an explicit null
    for tx in drugs:
        tx_name = tx.get('pharmaceutical_therapy_drug_name', '')
        hits = []
        if tx_name != '':
            tx_new = str(tx_name)
            tx_new = tx_new[0].upper() + tx_new[1:].lower()
            for candidate in (tx_name, tx_new):
                try:
                    hit = alias_index.get(candidate)
                except TypeError:
                    # An unhashable (non-string) name cannot equal an alias.
                    hit = None
                if hit is not None:
                    hits.append(hit)
        if hits:
            # The lowest position wins when both spellings match.
            drug_mapping[uuid]['drugs'].append(min(hits)[1])
        else:
            not_found.append((uuid, tx_name))

    # Report every iteration, including the last one, so the final line
    # reads N/N instead of stopping at N-1/N.
    sys.stdout.write("\r%d/%d parsed [%.2f%% complete]" % (nparsed, ngroups, 100*float(nparsed)/ngroups))
    sys.stdout.flush()
    nparsed += 1

# Start a new line so the elapsed time is not glued to the progress text.
sys.stdout.write("\nElapsed time: %s" % str(timedelta(seconds=(time.time() - start_time))))

11166/11167 parsed [99.99% complete]Elapsed time: 0:01:44.346829

In [751]:
# Number of (uuid, drug name) entries that matched no alias.
print(len(not_found))

2380


In [750]:
# Distinct unmatched (uuid, drug name) pairs, ordered by drug name.
print(sorted(set(not_found), key=lambda pair: pair[1]))

[(u'77d8c2cf-6b27-417b-9d43-14c95e6f9b85', inf), (u'9817EC15-605A-40DB-B848-2199E5CCBB7B', ''), (u'c7c34fd9-3021-45b7-b6ff-30b4196585a8', ''), (u'8bb3ed57-6e48-4c90-bcf2-baf9a843398f', ''), (u'fd5728e0-e495-45f5-aac8-c0856510c8aa', ''), (u'12b032ed-ff92-4bfe-b980-eb273ae8ae17', ''), (u'3429df57-5aae-41aa-8247-6a2ea935a934', ''), (u'e6dee2d7-ca05-44e2-bf25-1068a416bd14', ''), (u'a9bb8159-32f0-454c-a946-b3286a52b9d5', ''), (u'A0D35A25-3500-46F3-874F-1A0820200473', ''), (u'e1670628-632d-4305-be07-9b9b6bea3350', ''), (u'a2338b30-f511-4163-af3b-1e4a40ff00e1', ''), (u'0bf5bbd4-d9e8-42a6-9ab5-f2c174dec12c', ''), (u'7dcc809b-e33a-4453-b92a-c00786f48cb0', ''), (u'cf9db1af-17f0-490e-8139-142bd704763a', ''), (u'520a709a-87f6-4f3d-b2a2-9192e3b65c5b', ''), (u'BA207D79-91C0-45DC-A784-4BCAEC580EB4', ''), (u'CE2BFD9E-6382-47BB-9FD1-B77AFE57E948', ''), (u'5B10234B-6189-449F-8886-D0D44A729CED', ''), (u'9f89510c-ed07-471f-b35e-7c87c237b9fe', ''), (u'913F21A9-F727-4983-AB3A-61B5B3FC7F5E', ''), (u'a2b6246d

In [743]:
# Canonical Nant drug names in sorted order.
print(sorted(nant_drugs_aliases))

[u'17B-estradiol', u'AAT-007', u'ABBV-075', u'ABBV-085', u'ABBV-221', u'ABBV-399', u'ABBV-838', u'ABC294640', u'ABI-009', u'ABI-011', u'ABP 798', u'ABT-165', u'ABT-263', u'ABT-700', u'ABT-806', u'AC0010MA', u'AC410', u'ACP-319', u'ACTB-1003', u'ACY-1215', u'ACY-241', u'ADCT-301', u'ADCT-402', u'ADI-PEG20', u'ADU-S100', u'ADXS11-001', u'ADXS31-142', u'ADXS31-164', u'AE37', u'AEB071', u'AEB1102', u'AEE788', u'AFM11', u'AFM13', u'AG-881', u'AGEN1884', u'AGI-5198', u'AGI-6780', u'AGS-003-LNG', u'AGS-16C3F', u'AGS67E', u'ALRN-6924', u'ALT-801', u'ALT-802', u'AM0010', u'AMG 172', u'AMG 208', u'AMG 224', u'AMG 228', u'AMG 232', u'AMG 319', u'AMG 330', u'AMG 337', u'AMG 420', u'AMG 595', u'AMG 780', u'AMG 820', u'AMG 900', u'AMG-510', u'AMP-224', u'ANG1005', u'APC-100', u'APR-246', u'APS001F', u'APX005M', u'AR-42', u'AR160', u'ARQ 092', u'ARQ 736', u'ARQ 751', u'ARQ 761', u'ARS-1620', u'ASG-15ME', u'ASN001', u'ASN002', u'ASP3026', u'ASP4132', u'ASP5878', u'ASTX660', u'ASTX727', u'AT-101', u'AT

In [760]:
# Spot check: compare the two alias sources for a single drug.
spot_check_drug = 'Doxorubicin'
print(nant_drugs_aliases[spot_check_drug])
print(pubchem_drugs_aliases[spot_check_drug])


[u'Doxorubicin', u'Adriamycin', u'Adriablastin', u'Adriblastin', u'Adriblastina', u'Caelyx', u'Doxil', u'Doxorubicin', u'Doxorubicinum', u'NCI-C01514', u'Myocet', u'pegylated liposomal doxorubicin', u'doxorubicin hydrochloride liposome', u'Doxorubicin']
[u'doxorubicin', u'Adriamycin', u'23214-92-8', u'Doxil', u'Adriablastin', u'Doxorubicine', u'Adriblastina', u'Doxorubicinum', u'14-Hydroxydaunomycin', u'14-Hydroxydaunorubicine', u'Adriamycin semiquinone', u'Doxorubicine [INN-French]', u'Doxorubicinum [INN-Latin]', u'Doxorubicina [INN-Spanish]', u'FI 106', u'Doxorubicin [USAN:INN:BAN]', u'Hydroxydaunorubicin', u'CCRIS 739', u'HSDB 3070', u'NDC 38242-874', u'NCI-C01514', u'UNII-80168379AG', u'EINECS 245-495-6', u'CHEMBL53463', u'Doxorubicina', u'CHEBI:28748', u'NSC 123127', u'5,12-Naphthacenedione, 10-((3-amino-2,3,6-trideoxy-alpha-L-lyxo-hexopyranosyl)oxy)-7,8,9,10-tetrahydro-6,8,11-trihydroxy-8-(hydroxyacetyl)-1-methoxy-, (8S-cis)-', u'80168379AG', u'ADM', u'(1S,3S)-3,5,12-trihydroxy-3

In [534]:
# Persist the uuid -> {'drugs': [...]} mapping as JSON under OUTDIR.
drug_mapping_out = os.path.join(OUTDIR, 'drugs.json')
with open(drug_mapping_out, 'w') as fp:
    # The mapping is wrapped in a one-element list, so the file's top-level
    # JSON value is an array.
    json.dump([drug_mapping], fp, indent=4)

In [761]:
# Number of patients with at least one mapped drug.
count = sum(1 for mapped in drug_mapping.values() if len(mapped['drugs']) > 0)
print(count)
        

3923


In [749]:
# Display the per-patient mapping (uuid -> {'drugs': [...]}).
drug_mapping

{u'5f10f8e4-a6f9-4643-b210-390bc8fd32bd': {'drugs': [u'Oxaliplatin',
   u'Fluorouracil',
   u'Fluorouracil']},
 u'7b982d5e-3a7d-40ac-bd25-6044c62879b6': {'drugs': []},
 u'38dd826d-06dc-4a73-b852-c3d78c9c3ff4': {'drugs': [u'Temozolomide']},
 u'FF5B9238-38A7-42B4-9D44-3B8EC9913F6D': {'drugs': []},
 u'eff7f13b-ed32-4e85-a8d0-831b655d773f': {'drugs': []},
 u'32128246-1258-43d1-b1e5-30ae63822c32': {'drugs': []},
 u'33C09CD7-2CCC-4A69-B738-79DFD9AA149D': {'drugs': []},
 u'F693EEC9-CD6E-4696-882F-A6DFFE481DF9': {'drugs': []},
 u'5AF0E222-3DC8-400B-BA61-2225921F2FD3': {'drugs': []},
 u'c75c915f-ef4b-4c19-8ace-995e6c6015fd': {'drugs': [u'Carboplatin',
   u'Carboplatin',
   u'Doxorubicin']},
 u'03565036-b6d3-432f-8d9a-7099818e0788': {'drugs': [u'Fluorouracil',
   u'Cyclophosphamide',
   u'Methotrexate']},
 u'597c81a3-5820-48b0-9a9c-c45257c7ad70': {'drugs': []},
 u'BFB973EB-D42D-4900-A606-A381D2317555': {'drugs': []},
 u'9DCDBD1B-2E03-4314-AEF2-1F6B6FE18523': {'drugs': []},
 u'9BBD057C-5A3B-4E04-