In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import requests
import time
from tqdm import tqdm

### Fetch all unique small molecules from Drugmonizome by InChI Key

In [None]:
# Download every entity record from the Drugmonizome metadata API.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a JSON decoding problem or a missing key.
entities_response = requests.get(
    "https://maayanlab.cloud/drugmonizome/metadata-api/entities",
    timeout=60,
)
entities_response.raise_for_status()
uploaded_entities = entities_response.json()
len(uploaded_entities)

In [None]:
# Build an InChI key -> name lookup from the 'meta' section of each entity.
# If several entities share an InChI key, the last one in the list wins.
synonym_dict = {
    record['meta']['InChI_key']: record['meta']['Name']
    for record in uploaded_entities
}

### Fetch all unique small molecules from SEP-L1000 by InChI Key

In [None]:
def _strip_inchikey_prefix(key):
    """Remove the 'InChIKey=' marker from string keys; return any other value unchanged."""
    if isinstance(key, str):
        return key.replace('InChIKey=', '')
    return key


# Expression signatures for the L1000 drugs, indexed by the 'InChI Key' column.
# NOTE(review): the spelling "Experssion" is part of the URL as written -- confirm
# against the hosted file before changing it.
L1000_DATA = 'https://appyters.maayanlab.cloud/storage/Drugmonizome_ML/SEP-L1000/LINCS_Gene_Experssion_signatures_CD.csv.gz'
l1000_signatures = pd.read_csv(L1000_DATA)
l1000data_df = l1000_signatures.set_index('InChI Key')

# Drug metadata, indexed by the sixth column of the file (index_col=5);
# presumably that column holds the InChI key -- verify against the file header.
L1000_METADATA = 'https://maayanlab.cloud/L1000FWD/download/Drugs_metadata.csv'
l1000meta_df = pd.read_csv(L1000_METADATA, index_col=5)
l1000meta_df.index = l1000meta_df.index.map(_strip_inchikey_prefix)

# Keep only the first row for every repeated index value.
first_occurrence = ~l1000meta_df.index.duplicated()
l1000meta_df = l1000meta_df.iloc[first_occurrence]

In [None]:
# Look up the L1000 `pert_iname` for every InChI key that has an expression signature.
l1000_names = l1000meta_df['pert_iname'].reindex(l1000data_df.index)

# reindex() yields NaN for keys that are missing from the metadata table. Leave
# those keys out: a NaN entry would overwrite a valid name when this dict is
# merged into synonym_dict, and the affected rows would then be removed by the
# final dropna() on the output table.
l1000_dict = {
    inchikey: pert_iname
    for inchikey, pert_iname in zip(l1000_names.index, l1000_names)
    if pd.notna(pert_iname)
}

In [None]:
# Combine both sources; for an InChI key present in both, the L1000 name
# replaces the Drugmonizome name.
synonym_dict = {**synonym_dict, **l1000_dict}

In [None]:
# Number of unique InChI keys collected so far (displayed as the cell output).
len(synonym_dict)

In [None]:
# Import list of compounds to filter out... these are not considered small molecules
with open('compounds-to-filter.txt', 'r') as f:
    filter_list = [x.strip().lower() for x in f.read().strip().split('\n')]

# Set for constant-time membership tests while filtering.
filter_names = set(filter_list)


def _is_filtered(name):
    """Return True when `name` is a string whose lower-cased form is in the filter list.

    The filter entries are lower-cased when read, so the name has to be
    lower-cased as well; otherwise any name containing a capital letter would
    never match. Non-string names (e.g. NaN) are never filtered.
    """
    return isinstance(name, str) and name.lower() in filter_names


# Filter unwanted compounds from synonym_dict
synonym_dict = {k: v for k, v in synonym_dict.items() if not _is_filtered(v)}

### PubChem Search for PubMed IDs associated with each InChI Key

In [None]:
xref_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/%s/xrefs/PubMedID/JSON'

REQUEST_TIMEOUT = 30  # seconds to wait for a single PubChem response
# Pause between consecutive requests.
# NOTE(review): presumably chosen to respect PubChem's request-rate policy -- confirm.
REQUEST_DELAY = 0.20

pmid_dict = {}  # not populated by this cell; name kept for backward compatibility
output = []     # (inchikey, name, PMID) tuples, one per cross-reference
failed = []     # InChI keys for which no cross-reference list could be obtained


def _extract_pmids(payload):
    """Collect the PubMed IDs found in a decoded xrefs response.

    Returns None when `payload` is not a dict or has no 'InformationList'
    section (the lookup is then treated as failed); otherwise returns the
    concatenated 'PubMedID' lists of all entries, which may be empty.
    """
    if not isinstance(payload, dict) or 'InformationList' not in payload:
        return None
    pmids = []
    for entry in payload['InformationList']['Information']:
        if 'PubMedID' in entry:
            pmids.extend(entry['PubMedID'])
    return pmids


for inchikey, name in tqdm(synonym_dict.items()):
    try:
        response = requests.get(xref_url % inchikey, timeout=REQUEST_TIMEOUT)
        # Decode the body once and reuse the result.
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network error, timeout, or a body that is not valid JSON: record the
        # key as failed below instead of skipping it silently or aborting the run.
        payload = None

    pmids = _extract_pmids(payload)
    if pmids is None:
        failed.append(inchikey)
    elif len(set(pmids)) >= 2:
        # Only keep compounds with at least two distinct PubMed IDs.
        output.extend((inchikey, name, pmid) for pmid in pmids)

    # Always pause, including after a failed request, so errors do not
    # shorten the gap between consecutive requests.
    time.sleep(REQUEST_DELAY)

In [None]:
# Report how many of the queried InChI keys ended up in `failed`.
n_failed = len(failed)
n_total = len(synonym_dict)
print("{} / {} records could not be retrieved ({} percent)".format(n_failed, n_total, n_failed/n_total*100))

In [None]:
# Create tsv file of all chemical-PMID associations
drugrif_records = pd.DataFrame(output, columns=['inchikey', 'name', 'PMID'])
DrugRIF = (
    drugrif_records
    .dropna()            # discard rows with a missing inchikey, name or PMID
    .set_index('name')
    .drop_duplicates()   # compares the inchikey and PMID columns; the index is not considered
)
DrugRIF.to_csv('DrugRIF.tsv.gz', sep='\t')