# Download the PubChem IDs and Names of Molecules
Get the SMILES strings then add the names they are known by

In [1]:
import gzip
from multiprocessing.pool import Pool
from pathlib import Path
from shutil import copyfileobj
from tempfile import TemporaryDirectory
from typing import Iterator, List, Optional, Tuple

import requests
from mongoengine import connect
from more_itertools import peekable
from rdkit import RDLogger
from tqdm import tqdm

from cfree.store import MoleculeRecord

Configuration

In [2]:
# Number of parsed records to buffer in memory before each bulk write to MongoDB
write_chunk_size = 100_000

Suppress complaints from RDKit

In [3]:
# Set RDKit's logger to the CRITICAL level to quiet the messages it emits
# while parsing molecules (assumed to be warnings/errors — TODO confirm)
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

Connect to Mongo

In [4]:
# Open a mongoengine connection to the 'cfree' database
# (the recorded output shows a MongoClient pointed at localhost:27017)
client = connect('cfree')
# Bare expression: shows the client's repr as the cell output
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary(), uuidrepresentation=3)

In [5]:
# Grab the database and collection handles straight from the client
db = client['cfree']
coll = db['molecule_record']  # collection that the save loop later queries and inserts into
# Report how many documents are already stored before adding more
# (presumably an estimate from collection metadata, not an exact count — TODO confirm)
print(f'Database already has {coll.estimated_document_count()} molecules')

Database already has 61435078 molecules


## Make functions to iterate from PubChem Data Files
PubChem supplies a mapping of their "Compound ID" to a SMILES string and synonyms as separate files. 

The data files are hosted on an [FTP server](https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/). We can access them via HTTP requests.

In [6]:
def get_smiles_strings() -> Iterator[Tuple[int, str]]:
    """Iterate over all of the SMILES strings in PubChem

    Downloads ``CID-SMILES.gz`` into a temporary directory, then streams the
    decompressed file one line at a time. The temporary directory is removed
    when the generator is exhausted or closed.

    Yields:
        - Compound ID of a molecule in PubChem
        - SMILES string of that molecule
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    with TemporaryDirectory(prefix='smiles') as tmp:
        file_path = Path(tmp) / 'smiles.gz'
        with requests.get('https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-SMILES.gz', stream=True) as req, file_path.open('wb') as fo:
            # Fail now rather than saving an error page as if it were gzip data
            req.raise_for_status()
            copyfileobj(req.raw, fo)

        with gzip.open(file_path, 'rt') as fp:
            for line in fp:
                # Strip only the line terminator: slicing off the last character
                # would truncate the SMILES of a final line that lacks a newline
                line = line.rstrip('\n')
                if not line:
                    continue

                # Split on the first tab only, as each line is "<CID>\t<SMILES>"
                id_str, smiles = line.split("\t", 1)
                yield int(id_str), smiles
# Wrap the generator so that its next entry can be inspected without consuming it
smiles_iter = peekable(get_smiles_strings())
# Sanity check on the first entry. Peeking advances the underlying generator to
# its first yield, so the download happens here rather than in the save loop.
assert smiles_iter.peek() == (1, 'CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C')

In [7]:
def get_synonyms() -> Iterator[Tuple[int, List[str]]]:
    """Iterate over all of the synonyms in PubChem

    Downloads ``CID-Synonym-filtered.gz`` into a temporary directory and groups
    consecutive lines that share a compound ID into a single entry.

    Yields:
        - Compound ID of a molecule in PubChem
        - List of the names given for that compound, in file order
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    # Store the names of the molecules
    curr_id = None
    names = []

    with TemporaryDirectory(prefix='names') as tmp:
        file_path = Path(tmp) / 'names.gz'
        with requests.get('https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-Synonym-filtered.gz', stream=True) as req, file_path.open('wb') as fo:
            # Fail now rather than saving an error page as if it were gzip data
            req.raise_for_status()
            copyfileobj(req.raw, fo)

        # Use a context manager so the file is closed before the directory is removed
        with gzip.open(file_path, 'rt') as fp:
            for line in fp:
                # Strip only the line terminator, then skip blank lines
                line = line.rstrip('\n')
                if not line:
                    continue

                # Read in the CID and name; split once so a tab inside a name is kept
                id_str, name = line.split("\t", 1)
                my_id = int(id_str)

                # If the CID has changed, we can emit the previous CID's names
                if curr_id != my_id:
                    # If not the first, emit the names
                    if curr_id is not None:
                        yield curr_id, names

                    # Prepare to store new names
                    curr_id = my_id
                    names = []

                # Append and continue
                names.append(name)

    # Emit the final group. This must be a `yield`: a value passed to `return`
    # inside a generator is attached to StopIteration and never reaches a
    # `for` loop or `peekable`, so the last compound would be silently dropped.
    if curr_id is not None:
        yield curr_id, names
# Wrap the generator so that its next entry can be inspected without consuming it
name_iter = peekable(get_synonyms())
# Sanity check on the first entry. Peeking advances the underlying generator to
# its first yield, so the synonym file is downloaded here.
# Note: `cid` and `names` are module-level names that the save loop reassigns later.
cid, names = name_iter.peek()
assert cid == 1
assert 'acetylcarnitine' in names

## Download and Save Everything
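We have two iterators, both ordered by compound ID, so the idea is to walk them in lockstep: parse each SMILES string into a record, attach the synonyms whose compound ID matches, and store everything in MongoDB in chunks.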

In [8]:
def make_record(x: (int, str)) -> Optional[MoleculeRecord]:
    """Build a molecule record for a single PubChem entry

    Takes one packed argument so that it can be mapped
    directly over a pool of worker processes

    Args:
        x: Pair of (PubChem compound ID, SMILES string)
    Returns:
        Record tagged with its PubChem ID, or ``None`` if building
        it from the SMILES raised ``ValueError`` or ``RuntimeError``
    """
    pubchem_id, smiles_string = x

    try:
        parsed = MoleculeRecord.from_identifier(smiles=smiles_string)
    except (ValueError, RuntimeError):
        # Unparseable entry: signal the failure to the caller
        return None
    else:
        # Remember which PubChem entry this record came from
        parsed.identifier.pubchem_id = pubchem_id
        return parsed

In [None]:
def _flush_records(records: dict) -> None:
    """Insert the buffered records that are not yet in the collection

    Args:
        records: Buffer mapping record key to record. It is emptied in place.
    """
    if not records:
        return

    # Determine which records are already present
    existing_keys = coll.find({'_id': {'$in': list(records.keys())}}, return_key=True)
    for key in existing_keys:
        records.pop(key['_id'])

    # If there are any to add, insert them
    if len(records) > 0:
        coll.insert_many([x.to_mongo() for x in records.values()])
    records.clear()


failures = 0
total = 0
to_save = {}
with Pool(6) as p:
    # `imap` preserves input order, so records arrive in the order of `smiles_iter`
    for record in tqdm(p.imap(make_record, smiles_iter, chunksize=1000)):
        # Get the record ID if it parsed
        total += 1
        if record is None:
            failures += 1
            continue

        cid = record.identifier.pubchem_id

        # Advance the synonyms until they are no longer behind our present position.
        # `peek(None)` is used because the synonym stream can run out before the
        # SMILES stream does; a bare `peek()` would then raise StopIteration.
        head = name_iter.peek(None)
        while head is not None and head[0] < cid:
            next(name_iter)
            head = name_iter.peek(None)
        if head is not None and head[0] == cid:
            record.names = head[1]

        # Save it
        to_save[record.key] = record
        if len(to_save) > write_chunk_size:
            _flush_records(to_save)

# Write the final, partially-filled chunk, which never reaches the size threshold
_flush_records(to_save)
print(f'Stored {total-failures} of {total}')

36768001it [2:38:40, 5643.34it/s] 