# Description
### (April 26 2020)
* In this notebook, we use [PubChemPy](https://pubchempy.readthedocs.io/en/latest/guide/gettingstarted.html) to obtain the SMILES and InChI strings for all compounds found in KEGG (see `nb0_parseKEGGcompunds.ipynb`).

In [2]:
# Imports
import pubchempy as pcp
import pandas as pd
import time

## Get SMILES and InChI strings from Pubchem IDs in `../tables/kegg_compounds.tsv`

In [3]:
# Load the tab-separated KEGG compound table (read_table defaults to a tab delimiter).
kegg_compounds = pd.read_table('../tables/kegg_compounds.tsv')
# Preview the first three rows.
kegg_compounds.head(n=3)

Unnamed: 0,Kegg-id,Pubchem-id,Chembl-id,CAS-number
0,cpd:C00001,3303.0,CHEMBL1098659,7732-18-5
1,cpd:C00002,3304.0,CHEMBL14249,56-65-5
2,cpd:C00003,3305.0,CHEMBL1234613 CHEMBL1454168,53-84-9


In [20]:
# Get strings
#
# For every PubChem ID in `kegg_compounds`, look up its isomeric SMILES,
# canonical SMILES and InChI string via `getStringRepresentations` and collect
# them in three lists that stay index-aligned with `pubchem_numbers`.
# Missing IDs and failed lookups are recorded as empty strings.

#NOTE: This will take a while
start_time = time.time()

# Resolve the helper once, outside the try/except below. If it has not been
# defined in this kernel, this raises NameError immediately instead of being
# swallowed for every compound and silently producing an all-empty result.
fetch_strings = getStringRepresentations

pubchem_numbers = list(kegg_compounds['Pubchem-id'])
isomeric_smiles = []
canonical_smiles = []
inchi_strings = []
failed_ids = []  # IDs whose lookup raised; kept so failures are visible afterwards

for i, pubchem_id in enumerate(pubchem_numbers):

    # Log progress
    if i%1000 == 0 and i > 0:
        print('Processed {} compounds'.format(i))
        print('Elapsed time so far: {} seconds'.format(time.time() - start_time))

    # Empty strings are the placeholder for "no ID" and for "lookup failed".
    iso_smiles, cano_smiles, inchi = '', '', ''

    # Skip the lookup when the ID is missing (empty string or NaN).
    if not (pubchem_id == '' or str(pubchem_id) == 'nan'):
        try:
            iso_smiles, cano_smiles, inchi = fetch_strings(pubchem_id)
        except Exception:
            # Best-effort: one failed lookup must not abort the whole run.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so the long-running cell can still be interrupted.
            iso_smiles, cano_smiles, inchi = '', '', ''
            failed_ids.append(pubchem_id)

    # Single append path keeps the three lists aligned with `pubchem_numbers`.
    isomeric_smiles.append(iso_smiles)
    canonical_smiles.append(cano_smiles)
    inchi_strings.append(inchi)

print('Total time elapsed: {} seconds'.format(time.time() - start_time))
print('Lookups that failed: {}'.format(len(failed_ids)))

Processed 14000 compounds
Elapsed time so far: 68.2527129650116 seconds
Processed 15000 compounds
Elapsed time so far: 545.9440810680389 seconds
Processed 16000 compounds
Elapsed time so far: 1036.1503551006317 seconds
Processed 17000 compounds
Elapsed time so far: 1470.3248751163483 seconds
Processed 18000 compounds
Elapsed time so far: 1832.9193739891052 seconds
Total time elapsed: 2009.9433789253235 seconds


In [21]:
# Sanity check: print the length of the ID list and of each result list
# (one number per line, in the order IDs, isomeric, canonical, InChI).
for collected in (pubchem_numbers, isomeric_smiles, canonical_smiles, inchi_strings):
    print(len(collected))

18700
18700
18700
18700


In [28]:
# Save results to .tsv file
#
# Writes one header line followed by one row per entry of `pubchem_numbers`:
# the integer PubChem ID (or -1 when the ID is missing) and the three strings
# collected at the same index, separated by tabs.
# The `with` block guarantees the file is closed even if a write raises.
with open('../data/kegg_compound_strings.tsv', 'w') as f:
    f.write('\t'.join(['Pubchem-id','Isomeric-SMILES','Canonical-SMILES','InChI']) + '\n')
    for i, pubchem_id in enumerate(pubchem_numbers):
        # Treat an empty-string ID like NaN: the fetch step considers both
        # "missing", and int('') would otherwise raise ValueError here.
        if pubchem_id == '' or str(pubchem_id) == 'nan':
            id_field = '-1'
        else:
            # IDs are read as floats (e.g. 3303.0); write them as plain integers.
            id_field = str(int(pubchem_id))
        row = [id_field, str(isomeric_smiles[i]), str(canonical_smiles[i]), str(inchi_strings[i])]
        f.write('\t'.join(row) + '\n')