In [1]:
import requests
import pandas as pd
from io import StringIO

## Xrefs for DO-Slim have been provided by dhimmel

Download the files from GitHub and use them to convert the disorders to their DO-Slim equivalents.

r = requests.get('https://raw.githubusercontent.com/dhimmel/disease-ontology/gh-pages/data/xrefs-prop-slim.tsv')

file = StringIO(r.text)
xref = pd.read_csv(file, sep='\t')

In [2]:
# Load the local copy of the DO-Slim cross-reference table (tab-separated).
xref = pd.read_csv('../../disease-ontology/data/xrefs-prop-slim.tsv', sep='\t')

In [3]:
# Preview the first two cross-reference rows.
xref.head(n=2)

Unnamed: 0,doid_code,doid_name,resource,resource_id
0,DOID:2531,hematologic cancer,CSP,2004-1600
1,DOID:2531,hematologic cancer,CSP,2004-1803


In [4]:
# Build a plain dict that maps each DOID (column 'doid') to its UMLS CUI (column 'umlscui').
doid_to_umls = (
    pd.read_csv('../doid-to-umls.csv')
      .set_index('doid')
      .loc[:, 'umlscui']
      .to_dict()
)

In [5]:
# Keep only the UMLS cross-references. `.copy()` detaches the filtered rows from the
# original frame so that adding a column below does not raise SettingWithCopyWarning.
xref = xref.query('resource == "UMLS"').copy()

# Count distinct CUIs (not rows), since that is what the message reports.
print('{} unique CUIs can be mapped to DO Slim'.format(xref['resource_id'].nunique()))

# Translate each slim DOID to its UMLS CUI; DOIDs missing from the mapper become NaN.
xref['parent_cui'] = xref['doid_code'].map(doid_to_umls)

2420 unique CUIs can be mapped to DO Slim


In [6]:
# Boolean mask of rows whose slim DOID has no UMLS CUI; show each such DOID once.
idx = xref['parent_cui'].isna()
xref[idx].drop_duplicates(subset='doid_code')

Unnamed: 0,doid_code,doid_name,resource,resource_id,parent_cui
2105,DOID:0060119,pharynx cancer,UMLS,C0031347,
4660,DOID:0060073,lymphatic system cancer,UMLS,C0024224,
9056,DOID:0050777,Joubert syndrome,UMLS,C2745997,
9966,DOID:0050581,brachydactyly,UMLS,C1300268,
10451,DOID:0060668,anencephaly,UMLS,C0020225,
10835,DOID:0050545,visceral heterotaxy,UMLS,C0037221,
10941,DOID:0050591,tooth agenesis,UMLS,C0399352,
11688,DOID:0050674,congenital bile acid synthesis defect,UMLS,C1843116,
11882,DOID:12859,choreatic disease,UMLS,C0264746,


9 DOIDs could not be mapped directly to UMLS.  We will use the DOID itself as the identifier in these nine cases.

In [7]:
# For the rows flagged by `idx`, fall back to the DOID as the parent identifier.
# The right-hand Series is aligned on the index, so only the flagged rows are written.
xref.loc[idx, 'parent_cui'] = xref['doid_code']
xref[idx].drop_duplicates(subset='doid_code')

Unnamed: 0,doid_code,doid_name,resource,resource_id,parent_cui
2105,DOID:0060119,pharynx cancer,UMLS,C0031347,DOID:0060119
4660,DOID:0060073,lymphatic system cancer,UMLS,C0024224,DOID:0060073
9056,DOID:0050777,Joubert syndrome,UMLS,C2745997,DOID:0050777
9966,DOID:0050581,brachydactyly,UMLS,C1300268,DOID:0050581
10451,DOID:0060668,anencephaly,UMLS,C0020225,DOID:0060668
10835,DOID:0050545,visceral heterotaxy,UMLS,C0037221,DOID:0050545
10941,DOID:0050591,tooth agenesis,UMLS,C0399352,DOID:0050591
11688,DOID:0050674,congenital bile acid synthesis defect,UMLS,C1843116,DOID:0050674
11882,DOID:12859,choreatic disease,UMLS,C0264746,DOID:12859


## Find more relations with slim mapping

Daniel also has a DOID to DO-Slim mapping file.  Let's see if we can use it, along with our DOID-to-UMLS mapper, to pick up any new relations.


r = requests.get('https://raw.githubusercontent.com/dhimmel/disease-ontology/gh-pages/data/slim-terms-prop.tsv')

file = StringIO(r.text)
do_slim = pd.read_csv(file, sep='\t')

In [8]:
# Load the local copy of the DOID -> DO-Slim term table (tab-separated).
do_slim = pd.read_csv('../../disease-ontology/data/slim-terms-prop.tsv', sep='\t')

In [9]:
# Preview the first two slim-term rows.
do_slim.head(n=2)

Unnamed: 0,slim_id,slim_name,subsumed_id,subsumed_name,min_distance
0,DOID:0050144,Kartagener syndrome,DOID:0050144,Kartagener syndrome,0
1,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156,idiopathic pulmonary fibrosis,0


In [10]:
# Look up the UMLS CUI for both the slim term and the term it subsumes.
# A DOID that is not a key of `doid_to_umls` yields NaN.
do_slim['slim_cui'] = do_slim['slim_id'].map(doid_to_umls)
do_slim['subsumed_cui'] = do_slim['subsumed_id'].map(doid_to_umls)

In [11]:
# Boolean mask of rows whose slim term has no UMLS CUI; show each such slim term once.
idx = do_slim['slim_cui'].isna()
do_slim[idx].drop_duplicates(subset='slim_id')

Unnamed: 0,slim_id,slim_name,subsumed_id,subsumed_name,min_distance,slim_cui,subsumed_cui
47,DOID:0050489,multinodular goiter,DOID:0050489,multinodular goiter,0,,
91,DOID:0050541,Charcot-Marie-Tooth disease type 4,DOID:0050541,Charcot-Marie-Tooth disease type 4,0,,
104,DOID:0050542,Charcot-Marie-Tooth disease type X,DOID:0050542,Charcot-Marie-Tooth disease type X,0,,
111,DOID:0050544,hypermethioninemia,DOID:0050544,hypermethioninemia,0,,
115,DOID:0050545,visceral heterotaxy,DOID:0050545,visceral heterotaxy,0,,
135,DOID:0050564,autosomal dominant nonsyndromic deafness,DOID:0050564,autosomal dominant nonsyndromic deafness,0,,
190,DOID:0050565,autosomal recessive nonsyndromic deafness,DOID:0050565,autosomal recessive nonsyndromic deafness,0,,
270,DOID:0050566,X-linked nonsyndromic deafness,DOID:0050566,X-linked nonsyndromic deafness,0,,
271,DOID:0050569,Seckel syndrome,DOID:0050569,Seckel syndrome,0,,
272,DOID:0050572,cone-rod dystrophy,DOID:0050572,cone-rod dystrophy,0,,


More slim terms cannot be mapped to UMLS here; as before, we will fall back to using the DOID as the parent identifier.

In [12]:
# For the rows flagged by `idx`, use the slim DOID in place of the missing CUI.
# The right-hand Series is aligned on the index, so only the flagged rows are written.
do_slim.loc[idx, 'slim_cui'] = do_slim['slim_id']

In [13]:
# Rename the slim-table columns to match `xref` so the two frames can be stacked,
# and keep only the three columns needed for the mapping.
do_slim = do_slim.rename(columns={'slim_name': 'doid_name', 'subsumed_cui':'resource_id', 
                                  'slim_cui': 'parent_cui'})[['doid_name', 'resource_id', 'parent_cui']]


# Combined mapping: one row per distinct (resource_id, parent_cui) pair, with rows that
# lack either identifier removed. `xref` rows come first, so they win on duplicates.
final = (pd.concat([xref, do_slim])
           .dropna(subset=['resource_id', 'parent_cui'])
           .drop_duplicates(subset=['resource_id', 'parent_cui']))

# Baseline for the report: apply the same clean-up to `xref` alone, so that duplicate or
# incomplete rows in `xref` do not understate the number of mappings gained from do-slim.
xref_pairs = (xref.dropna(subset=['resource_id', 'parent_cui'])
                  .drop_duplicates(subset=['resource_id', 'parent_cui']))

print('{} new mappings found by incorporating do-slim'.format(len(final) - len(xref_pairs)) )

116 new mappings found by incorporating do-slim


In [21]:
# Persist the combined CUI -> DO-Slim mapping table (row index omitted).
final.to_csv(path_or_buf='../data/DOSlim_mapping_info.csv', index=False)

## Start mapping network to slim

In [14]:
# Lookup dicts keyed by the original identifier (resource_id):
#   to_slim      -> identifier of the slim parent
#   to_slim_name -> name of the slim parent
# If a resource_id occurs more than once, the last row wins.
to_slim = dict(zip(final['resource_id'], final['parent_cui']))
to_slim_name = dict(zip(final['resource_id'], final['doid_name']))

In [15]:
# Load the node and edge tables of the network that will be collapsed to DO-Slim.
nodes = pd.read_csv('../data/nodes_7_metanode.csv')
# NOTE(review): `eval` runs whatever Python expression is stored in each 'pmids' cell, so
# this is only safe while the CSV is a trusted, locally generated file. The column is
# presumably a set literal (the values are later combined with set.union) - confirm, and
# consider a restricted parser if the file can ever come from an external source.
edges = pd.read_csv('../data/edges_7_metanode.csv', converters={'pmids':eval})

In [16]:
# Remember every node's current name, keyed by node ID (last row wins on duplicate IDs).
nodeid_to_name = dict(zip(nodes[':ID'], nodes['name']))

In [17]:
# Rename first (the name lookup is keyed by the *original* ID), then rewrite the ID.
# Nodes without a slim mapping keep their existing name and ID.
nodes['name'] = nodes[':ID'].map(lambda node_id: to_slim_name.get(node_id, nodeid_to_name[node_id]))
nodes[':ID'] = nodes[':ID'].map(lambda node_id: to_slim.get(node_id, node_id))

In [18]:
# Point both edge endpoints at their slim parent; unmapped endpoints are left unchanged.
edges[':START_ID'] = edges[':START_ID'].map(lambda node_id: to_slim.get(node_id, node_id))
edges[':END_ID'] = edges[':END_ID'].map(lambda node_id: to_slim.get(node_id, node_id))

## Map indications to slim

In [15]:
# Load the compound -> disease indication table.
ind = pd.read_csv(filepath_or_buffer='../data/indications.csv')

In [16]:
# Preview the first two indication rows.
ind.head(n=2)

Unnamed: 0,dc_struct_id,compound_umlscui,compound_name,relationship,dc_disease_id,disease_umlscui,disease_name,disease_umls_semantic_type,date_approved
0,5203,C3661315,rucaparib,indication,21000533,C1140680,Malignant tumor of ovary,T191,2016-12-19
1,5202,C4044947,baricitinib,indication,21002805,C0003873,Rheumatoid arthritis,T047,2017-02-13


In [17]:
# Current disease names keyed by disease CUI (last row wins on duplicate CUIs).
did_to_name = dict(zip(ind['disease_umlscui'], ind['disease_name']))

# Rename first (the name lookup is keyed by the *original* CUI), then rewrite the CUI.
# Diseases without a slim mapping keep their existing name and CUI.
ind['disease_name'] = ind['disease_umlscui'].map(lambda cui: to_slim_name.get(cui, did_to_name[cui]))
ind['disease_umlscui'] = ind['disease_umlscui'].map(lambda cui: to_slim.get(cui, cui))

In [19]:

# Different source diseases can collapse onto the same slim term, so a compound-disease
# pair may now occur more than once; keep only the first occurrence of each pair.
ind = ind.drop_duplicates(subset=['compound_umlscui', 'disease_umlscui'])

# Report only the count this section computes; the previous message printed
# hard-coded zeros for nodes and edges, which are not processed here.
print("{} indications".format(len(ind)))

0 nodes, 0 edges, 8182 indications


In [20]:
# Persist the slim-mapped indications (row index omitted).
ind.to_csv(path_or_buf='../data/indications_slim.csv', index=False)

## De-duplicate and Save Files

In [22]:
%%time

# Sizes before de-duplication.
print("{} nodes, {} edges, {} indications".format(*map(len, (nodes, edges, ind))))

# Several original nodes can now share one slim ID; keep the first row for each ID.
nodes = nodes.drop_duplicates(subset=[':ID'])

# Edges that have become identical in (start, end, type) are merged into a single row
# whose 'pmids' is the union of the merged rows' pmid sets. Only the three key columns
# and 'pmids' survive this step.
grpd = edges.groupby(by=[':START_ID', ':END_ID', ':TYPE'])
edges = (
    grpd['pmids']
    .apply(lambda pmid_sets: set.union(*pmid_sets.values))
    .reset_index()
)
# Recompute the per-edge pmid count from the merged sets.
edges['n_pmids'] = edges['pmids'].map(len)

# Keep one row per compound-disease pair.
ind = ind.drop_duplicates(subset=['compound_umlscui', 'disease_umlscui'])

# Sizes after de-duplication.
print("{} nodes, {} edges, {} indications".format(*map(len, (nodes, edges, ind))))

215817 nodes, 10344754 edges, 8264 indications
214590 nodes, 10230594 edges, 8176 indications
CPU times: user 14min 39s, sys: 3.21 s, total: 14min 43s
Wall time: 14min 43s


In [23]:
# Write the de-duplicated slim network and indications to disk (row index omitted).
nodes.to_csv(path_or_buf='../data/nodes_7_metanode_slim.csv', index=False)
edges.to_csv(path_or_buf='../data/edges_7_metanode_slim.csv', index=False)
ind.to_csv(path_or_buf='../data/indications_slim.csv', index=False)

In [24]:
# Distinct slim parent identifiers, in order of first appearance.
slim_ids = final['parent_cui'].unique().tolist()
# Number of indications whose disease is itself one of the slim parents.
len(ind[ind['disease_umlscui'].isin(slim_ids)])

1096

In [25]:
import pickle

# Save the list of slim parent identifiers. The context manager guarantees the file is
# flushed and closed even if pickling fails (the bare `open(...)` left the handle open).
with open('../data/slim_cuis.pkl', 'wb') as slim_file:
    pickle.dump(slim_ids, slim_file)