# Getting the list of compounds used in JUMP CP effort

In [1]:
import pandas as pd

InChI keys of compounds from jump-cell painting dataset (https://github.com/jump-cellpainting/datasets)

In [2]:
jump_cp_inchlkey = pd.read_csv('./compound.csv')
jump_cp_inchlkey.shape

(116753, 3)

In [3]:
jump_cp_inchlkey.head()

Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI
0,JCP2022_000001,AAAHWCWPZPSPIW-UHFFFAOYSA-N,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...
1,JCP2022_000002,AAAJHRMBUHXWLD-UHFFFAOYSA-N,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...
2,JCP2022_000003,AAALVYBICLMAMA-UHFFFAOYSA-N,InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-...
3,JCP2022_000004,AAANUZMCJQUYNX-UHFFFAOYSA-N,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...
4,JCP2022_000005,AAAQFGUYHFJNHI-UHFFFAOYSA-N,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...


Converting InChI keys to PubChem Compound IDs (https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi)

In [4]:
cids = pd.read_csv('./jump_cell_painting_cids.csv', index_col = 0)

In [5]:
cids.shape

(107332, 1)

In [6]:
cids.head()

Unnamed: 0,cid
0,110094592
1,5076487
2,1697
3,52934829
4,72019641


# Collecting compound descriptions

In [5]:
import json
import os
import requests
import urllib
from urllib.request import urlopen
from tqdm import tqdm_notebook
from tqdm import tqdm
#from fake_useragent import UserAgent
#from bs4 import BeautifulSoup
import time
import shutup; shutup.please()

In [17]:
data = []
for i in tqdm_notebook(range(cids.shape[0])):
    try:
        CID = str(cids.iloc[i, 0])
        page_link = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + CID + '/description/JSON'
        json_url = urlopen(page_link)
        json_data = json.loads(json_url.read())
        if 'InformationList' in json_data and 'Information' in json_data['InformationList']:
            for info in json_data['InformationList']['Information']:
                data.append({
                    'CID': CID,
                    'Description': info.get('Description', None)
                })
        #time.sleep(1)
    except:
        continue

  0%|          | 0/107332 [00:00<?, ?it/s]

In [18]:
descriptions = pd.DataFrame(data).dropna(subset = 'Description').reset_index(drop=True)
descriptions.shape

(4094, 2)

In [19]:
descriptions.head()

Unnamed: 0,CID,Description
0,1697,"4,5-dianilinophthalimide is phthalimide substi..."
1,755673,"7-hydroxy-2,3,4,5-tetrahydrobenzofuro[2,3-c]az..."
2,15160711,Ovalitenin B is a butanone.
3,176870,Erlotinib is a quinazoline compound having a (...
4,656344,"2-[[5-(4-methylphenyl)-1,3,4-oxadiazol-2-yl]th..."


In [11]:
descriptions.to_csv('./cmpd_descriptions.csv')

# Collecting pharmacology data

In [34]:
data = []
for i in tqdm_notebook(range(cids.shape[0])):
    try:
        CID = str(cids.iloc[i, 0])
        page_link = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/' + CID + '/JSON/'
        request = urllib.request.Request(page_link)
        response = urllib.request.urlopen(request)
        json_data = json.load(response, strict = False)
        cmpd_section = json_data['Record']['Section']
        for i in cmpd_section:
            if i['TOCHeading'].startswith('Pharmacology and Biochemistry'):
                pharm_data = i['Section']
                string = json.dumps(pharm_data)
                data.append({
                    'CID': CID,
                    'Pharmacology information': string
                })
    except:
        continue

  0%|          | 0/107332 [00:00<?, ?it/s]

In [35]:
pharmacology = pd.DataFrame(data)
pharmacology.head()

Unnamed: 0,CID,Pharmacology information
0,1697,"[{""TOCHeading"": ""MeSH Pharmacological Classifi..."
1,31789,"[{""TOCHeading"": ""MeSH Pharmacological Classifi..."
2,176870,"[{""TOCHeading"": ""MeSH Pharmacological Classifi..."
3,226036,"[{""TOCHeading"": ""Human Metabolite Information""..."
4,2431,"[{""TOCHeading"": ""Pharmacodynamics"", ""Descripti..."


In [36]:
pharmacology.shape

(2868, 2)

In [37]:
pharmacology.to_csv('./cmpd_pharmacology_info.csv')

# Collecting scientific articles on compounds

In [6]:
data = []
for i in tqdm_notebook(range(cids.shape[0])):
    try:
        CID = str(cids.iloc[i, 0])
        page_link = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + CID + '/property/LiteratureCount/CSV'
        cid_lit_count = pd.read_csv(page_link)
        data.append(cid_lit_count)
    except:
        continue
lit_count = pd.concat(data, ignore_index=True)

  0%|          | 0/107332 [00:00<?, ?it/s]

In [7]:
lit_count.head()

Unnamed: 0,CID,LiteratureCount
0,110094592,0
1,5076487,0
2,1697,101
3,52934829,0
4,72019641,0


In [8]:
lit_count = lit_count.loc[lit_count.LiteratureCount != 0]
lit_count.shape

(9347, 2)

In [9]:
lit_count.to_csv('./cmpd_lit_count.csv')

In [47]:
articles = pd.DataFrame(columns = ['cid', 'title', 'abstract'])
for i in tqdm_notebook(range(lit_count.shape[0])):
    try:
        CID = str(lit_count.iloc[i, 0])
        url = 'https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=json&query={%22download%22:%22*%22,%22collection%22:%22literature%22,%22order%22:[%22articlepubdate,desc%22],%22start%22:1,%22limit%22:10000000,%22downloadfilename%22:%22pubchem_cid_' + CID + '_literature%22,%22nullatbottom%22:1,%22where%22:{%22ands%22:[{%22cid%22:%22' + CID + '%22}]}}'
        json_url = urlopen(url)
        json_data = json.loads(json_url.read())
        for index, paper in enumerate(json_data):
            cid = CID
            title = paper.get('articletitle')
            abstract = paper.get('articleabstract')
            articles.loc[index] = [cid, title, abstract]      
    except:
        continue
articles.shape

  0%|          | 0/9347 [00:00<?, ?it/s]

(10000, 3)

In [48]:
articles.head()

Unnamed: 0,cid,title,abstract
0,196968,H23 Antigen,
1,196968,Serotonin receptors 5-HTR2A and 5-HTR2B are in...,BACKGROUND: Cigarette smoke plays an important...
2,196968,Serotonin-2B receptor antagonism increases the...,Previous research has implicated the serotonin...
3,196968,Roles of 5-HT2B Receptor in Pain,
4,196968,"Gene Structure, Expression, and 5-HT2B Recepto...",


In [49]:
articles.to_csv('./cmpd_articles.csv')