# Load Data

In [1]:
import pandas as pd
# Read the 'Chemical Data' sheet. Only the first 2322 rows are read because the
# workbook carries a free-text comment below the data; fully empty rows are then dropped.
APEXBio_Library_Details = (
    pd.read_excel(
        'L1021-DiscoveryProbe-FDA-approved-Drug-Library(1).xlsx',
        sheet_name='Chemical Data',
        nrows=2322,
    )
    .dropna(how='all')
    .copy(deep=True)
)

# Checking the validity of provided SMILES notation

In [2]:
# We will use rdkit to check validity of provided SMILES notation for all compounds

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

def smiles_check(SMILES_code):
    """Report whether a SMILES string can be parsed by RDKit.

    Parameters
    ----------
    SMILES_code : str
        Candidate SMILES notation. Any non-string value (for example the
        NaN/None that an empty spreadsheet cell produces) is treated as invalid.

    Returns
    -------
    int
        1 when ``Chem.MolFromSmiles`` returns a molecule, 0 when it returns
        ``None`` or when the input is not a string.
    """
    # Guard first: RDKit's parser only accepts text, so a NaN/None cell would
    # raise inside .apply() instead of simply being flagged as defective.
    if not isinstance(SMILES_code, str):
        return 0
    return 0 if Chem.MolFromSmiles(SMILES_code) is None else 1
# Flag every row: 1 when the supplied SMILES parses, 0 otherwise.
APEXBio_Library_Details['SMILES_check'] = APEXBio_Library_Details['SMILES'].map(smiles_check)

# Get entries with defective SMILES 
## smiles_to_parse dictionary pairs compound name to respective APEXbio URL

In [3]:
# Keep only the rows whose supplied SMILES failed validation (SMILES_check == 0).
Entries_with_defective_SMILES = APEXBio_Library_Details[
    APEXBio_Library_Details['SMILES_check'] == 0
].copy(deep=True)
# Pair each affected compound name with its APExBIO product URL.
smiles_to_parse_dict = {
    item_name: product_url
    for item_name, product_url in zip(
        Entries_with_defective_SMILES['Item Name'],
        Entries_with_defective_SMILES['URL'],
    )
}
# len(smiles_to_parse_dict) = 290, i.e. there are 290 compounds with defective SMILES

# Get SMILES notation from PubChem

In [4]:
import pubchempy as pcp
# Look every defective compound up in PubChem by name. Each lookup is a separate
# round-trip to the PubChem server, so this cell takes a while to finish.
name_to_SMILES_pcp = dict(
    (compound_name, pcp.get_compounds(compound_name, 'name'))
    for compound_name in smiles_to_parse_dict
)

# Get PubChem results with multiple hits

In [5]:
# Compound names for which PubChem returned more than one candidate record.
name_to_SMILES_pcp_entries_with_multiple_hits = {
    compound_name: hits
    for compound_name, hits in name_to_SMILES_pcp.items()
    if len(hits) > 1
}
len(name_to_SMILES_pcp_entries_with_multiple_hits)

97

# Get PubChem results with single or zero hits

In [43]:
# Everything that is not a multiple-hit entry, i.e. names with exactly one hit or none.
name_to_SMILES_pcp_single_or_zero_hit = {
    compound_name: name_to_SMILES_pcp[compound_name]
    for compound_name in (
        name_to_SMILES_pcp.keys() - name_to_SMILES_pcp_entries_with_multiple_hits.keys()
    )
}
#name_to_SMILES_pcp_single_or_zero_hit

 # Get PubChem results with zero hits

In [7]:
# Names whose PubChem lookup came back with an empty result list.
name_to_SMILES_pcp_zero_hit = [
    compound_name
    for compound_name, hits in name_to_SMILES_pcp_single_or_zero_hit.items()
    if len(hits) == 0
]

# Get PubChem results with single hits

In [44]:
# Names with exactly one PubChem record: the single-or-zero group minus the zero-hit names.
name_to_SMILES_pcp_single_hit = set(name_to_SMILES_pcp_single_or_zero_hit).difference(
    name_to_SMILES_pcp_zero_hit
)
#name_to_SMILES_pcp_single_hit
# With a single record there is nothing to disambiguate, so take its canonical SMILES directly.
compound_to_SMILES_pcp_single_hit = dict(
    (compound_name, name_to_SMILES_pcp_single_or_zero_hit[compound_name][0].canonical_smiles)
    for compound_name in name_to_SMILES_pcp_single_hit
)
#compound_to_SMILES_pcp_single_hit

# For PubChem results with multiple hits, get molecular weight for all hits

In [45]:
# One single-key {compound name: molecular weight} dict per PubChem hit, for every
# compound that has multiple hits.
name_to_SMILES_pcp_entries_with_multiple_hits_molWt = [
    {compound_name: hit.molecular_weight}
    for compound_name in name_to_SMILES_pcp_entries_with_multiple_hits
    for hit in name_to_SMILES_pcp[compound_name]
]


# For PubChem results with multiple hits - if the molecular weights of all hits are the same, keep only the first hit and discard the rest

In [10]:
# Per-compound mean molecular weight across all hits. Kept for inspection only;
# it is no longer used to decide whether the hits agree (see below).
mean_molWt_multipleHits = pd.DataFrame(name_to_SMILES_pcp_entries_with_multiple_hits_molWt).astype(float).mean()

# Keep the first PubChem hit only when EVERY hit for that compound has the same
# molecular weight. The earlier test (first weight == mean weight) was not
# equivalent: hits weighing 10, 5 and 15 average to 10 and would wrongly pass,
# and a floating-point mean of identical values is not guaranteed to compare
# equal to the value itself.
name_to_SMILES_pcp_entries_with_multiple_hits_first_enrty_subset = dict()
for i in name_to_SMILES_pcp_entries_with_multiple_hits:
    hit_weights = [hit.molecular_weight for hit in name_to_SMILES_pcp_entries_with_multiple_hits[i]]
    # A hit without a molecular weight cannot be confirmed as identical, so treat it as ambiguous.
    all_hits_same_weight = (
        None not in hit_weights
        and len({float(weight) for weight in hit_weights}) == 1
    )
    if all_hits_same_weight:
        name_to_SMILES_pcp_entries_with_multiple_hits_first_enrty_subset[i] = name_to_SMILES_pcp_entries_with_multiple_hits[i][0]
    else:
        # Message text kept unchanged so the output stays comparable with earlier runs.
        print(f'Molecular weight did not match with the mean for {i}')
        

Molecular weight did not match with the mean for Galanthamine HBr
Molecular weight did not match with the mean for Leucovorin Calcium
Molecular weight did not match with the mean for Alarelin Acetate
Molecular weight did not match with the mean for Bivalirudin Trifluoroacetate
Molecular weight did not match with the mean for MK-5172
Molecular weight did not match with the mean for Doxycycline hyclate
Molecular weight did not match with the mean for Varenicline Tartrate
Molecular weight did not match with the mean for Staurosporine
Molecular weight did not match with the mean for Tivantinib (ARQ 197)
Molecular weight did not match with the mean for Bleomycin Sulfate
Molecular weight did not match with the mean for Ridaforolimus (Deforolimus, MK-8669)
Molecular weight did not match with the mean for Ceftriaxone Sodium Trihydrate
Molecular weight did not match with the mean for Azlocillin sodium salt
Molecular weight did not match with the mean for AHU-377 hemicalcium salt
Molecular weigh

In [11]:
# Canonical SMILES of the retained first hit for each unambiguous multiple-hit compound.
compound_to_SMILES_pcp_multiple_hit_first_entry = {
    compound_name: first_hit.canonical_smiles
    for compound_name, first_hit in name_to_SMILES_pcp_entries_with_multiple_hits_first_enrty_subset.items()
}
# 59 such compounds

# For PubChem results with multiple hits - remaining compounds with an unresolved compound-to-SMILES map

In [12]:
# Multiple-hit compounds that were NOT resolved to a single first hit.
remaining_ambigous_compound_set = {
    compound_name
    for compound_name in name_to_SMILES_pcp_entries_with_multiple_hits
    if compound_name not in name_to_SMILES_pcp_entries_with_multiple_hits_first_enrty_subset
}
# 38 such compounds

# Combine compounds with missing SMILES - 
# (compounds with no hits) + (compounds with multiple hits that cannot be unambiguously assigned)

In [19]:
# Union of the zero-hit names and the still-ambiguous multiple-hit names.
combined_unmapped_set = set(name_to_SMILES_pcp_zero_hit).union(remaining_ambigous_compound_set)

# Fetch SMILES from APExBio website 
# (using URLs provided in the loaded xlsx file)

In [22]:
# The 'Compound name':'APExBio URL' pairs were already retrieved earlier and saved as smiles_to_parse_dict.
# We are going to use smiles_to_parse_dict to get the URL addresses and use selenium to grab the respective APExBio webpages.
# The APExBio webpage for each of these compounds has a table that lists many compound properties, including the compound's SMILES code.
# I have manually checked a couple of compounds that are in our combined_unmapped_set. The SMILES codes for these compounds
# came out as valid. So, for now, I'm assuming that the SMILES codes listed on the individual compound webpages are valid.


### Codeblock to dynamically parse APEXbio website

In [35]:
# This codeblock defines functions to grab small-molecule compound properties table from 
# respective compound web-page. Briefly, the URL is loaded, parsed using pandas and finally
# SMILES codes are extracted.

import time
import random
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import pandas as pd
#url = 'https://www.apexbt.com/search.php?catalog=A1402'

import os  # imported here so this cell stays self-contained

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to run on another machine; the original author's Windows path is
# kept as the fallback so existing setups keep working unchanged.
s = Service(os.environ.get('CHROMEDRIVER_PATH', r'C:\Users\Akash\Downloads\chromedriver_win32(1)\chromedriver'))

def fetch_selenium(URL):
    """Load a page in a fresh Chrome session and return its rendered HTML.

    Parameters
    ----------
    URL : str
        Address of the page to open.

    Returns
    -------
    str
        ``driver.page_source`` captured after the wait, i.e. the HTML as it
        stands once the browser has had time to run the page's scripts.

    Notes
    -----
    A new browser is started on every call, using the module-level
    chromedriver ``Service`` object ``s``. Whatever Selenium raises while
    starting the browser or loading the page propagates to the caller.
    """
    # initiating the webdriver. Parameter includes the path of the webdriver.
    driver = webdriver.Chrome(service=s)
    try:
        driver.get(URL)
        # There is no explicit readiness check: just pause for a duration chosen
        # at random between 5 s and 10 s so the page has time to finish loading.
        time.sleep(random.uniform(5, 10))
        return driver.page_source
    finally:
        # Always end the session, even when get() fails. quit() shuts down the
        # browser and the chromedriver process; close() only closed the window.
        driver.quit()

def get_SMILES_from_URL(url):
    """Scrape the 'Canonical SMILES' value from an APExBIO compound page.

    Parameters
    ----------
    url : str
        APExBIO product page address.

    Returns
    -------
    str
        The value found in the row labelled 'Canonical SMILES' of the second
        HTML table on the page, or the sentinel string ``'FAILED'`` when the
        page could not be fetched or parsed (a message is printed in that case).
    """
    from io import StringIO  # local import keeps this function self-contained

    try:
        page_html = fetch_selenium(url)
        # Recent pandas versions deprecate passing literal HTML text to
        # read_html; a file-like wrapper is accepted by old and new versions.
        # Index [1] assumes the properties table is the second table on the page.
        apexbio_info_df = pd.read_html(StringIO(str(page_html)))[1]
        smiles_code = apexbio_info_df.set_index(0).loc['Canonical SMILES'].iloc[0]
    except Exception:
        # Best-effort scrape: any ordinary failure is reported via the sentinel.
        # KeyboardInterrupt/SystemExit are deliberately NOT caught, so a long
        # scraping run can still be interrupted by the user.
        smiles_code = 'FAILED'
        print(f'{url} fetch FAILED')
    return smiles_code

#get_SMILES_from_URL('https://www.apexbt.com/search.php?catalog=C6447')

### Grab unmapped compounds from APExBio website

In [36]:
# Scrape the APExBIO page of every still-unmapped compound (one browser session per compound).
compound_to_SMILES_APExBio = dict(
    (compound_name, get_SMILES_from_URL(smiles_to_parse_dict[compound_name]))
    for compound_name in combined_unmapped_set
)

https://www.apexbt.com/search.php?catalog=B1639 fetch successful
https://www.apexbt.com/search.php?catalog=C6447 fetch successful
https://www.apexbt.com/search.php?catalog=C5503 fetch successful
https://www.apexbt.com/search.php?catalog=N1315 fetch successful
https://www.apexbt.com/search.php?catalog=B1689 fetch successful
https://www.apexbt.com/search.php?catalog=B8718 fetch successful
https://www.apexbt.com/search.php?catalog=B3342 fetch successful
https://www.apexbt.com/search.php?catalog=B6586 fetch successful
https://www.apexbt.com/search.php?catalog=C5378 fetch successful
https://www.apexbt.com/search.php?catalog=C5593 fetch successful
https://www.apexbt.com/search.php?catalog=A5391 fetch successful
https://www.apexbt.com/search.php?catalog=N1900 fetch successful
https://www.apexbt.com/search.php?catalog=B7201 fetch successful
https://www.apexbt.com/search.php?catalog=C6434 fetch successful
https://www.apexbt.com/search.php?catalog=A8331 fetch successful
https://www.apexbt.com/se

# Combining all fetched smiles

In [84]:
# Merge the three sources into one name -> SMILES map. On a duplicate key the
# later source wins: single-hit, then multiple-hit first entry, then APExBIO.
fetched_molecule_to_smile_map_compiled = {
    **compound_to_SMILES_pcp_single_hit,
    **compound_to_SMILES_pcp_multiple_hit_first_entry,
    **compound_to_SMILES_APExBio,
}
len(fetched_molecule_to_smile_map_compiled) - len(Entries_with_defective_SMILES) # This should be zero.

0

# Check final dictionary for failed SMILES

In [85]:
# Re-validate every compiled SMILES. For an invalid one, fall back to the first
# PubChem hit; if PubChem has no hit for that name, drop the compound from the
# map and record it in removed_compound_list.
failed_count = 0
removed_compound_list = list()
# Iterate over a snapshot of the keys because entries may be deleted inside the loop.
for i in list(fetched_molecule_to_smile_map_compiled):
    if smiles_check(fetched_molecule_to_smile_map_compiled[i]) == 0:
        print(f'SMILE map failed for {i}')
        try:
            # For entries whose SMILES is still invalid after all these steps, we assume
            # that the first hit from PubChem is the best possible match.
            fetched_molecule_to_smile_map_compiled[i] = name_to_SMILES_pcp[i][0].canonical_smiles
        except LookupError:
            # IndexError: PubChem returned zero hits; KeyError: the name was never queried.
            # Only these "no fallback available" cases are handled here; other
            # errors (including a user interrupt) are no longer silently swallowed.
            del fetched_molecule_to_smile_map_compiled[i]
            print(f'{i} has been removed from the compilation')
            removed_compound_list.append(i)
        failed_count += 1
print(failed_count)

SMILE map failed for Bleomycin Sulfate
SMILE map failed for Gentamycin Sulfate
Gentamycin Sulfate has been removed from the compilation
SMILE map failed for Doxycycline hyclate
SMILE map failed for Calcitonin (salmon)
Calcitonin (salmon) has been removed from the compilation
SMILE map failed for Bivalirudin Trifluoroacetate
SMILE map failed for Colistin Sulfate
Colistin Sulfate has been removed from the compilation
SMILE map failed for Alarelin Acetate
SMILE map failed for AHU-377 hemicalcium salt
SMILE map failed for LCZ696
LCZ696 has been removed from the compilation
9


In [91]:
removed_compound_list
# It seems that zero hits were retrieved by PubChemPy for these four compounds.
# I have checked them manually in PubChem and, surprisingly, they do show hits,
# especially when searched via the graphical user interface. Perhaps the web-interface
# search is broader in nature, somewhat like a Google search, looking for any place
# where a match can be found.
# I'm going to write web-parsing code for PubChem.

['Gentamycin Sulfate', 'Calcitonin (salmon)', 'Colistin Sulfate', 'LCZ696']

# Using the PubChem graphical search: even though PubChemPy returns no result for these queries, a direct web search does give some results.
## Fetching first hits from PubChem web search (direct)

In [138]:
from bs4 import BeautifulSoup

# Example of the PubChem web-search URL format that fetch_from_PubChem_query builds.
# NOTE(review): this variable does not appear to be referenced by the code visible here.
url = 'https://pubchem.ncbi.nlm.nih.gov/#query=LCZ696'

def fetch_from_PubChem_query(compound_name):
    """Return the canonical SMILES of the first PubChem web-search hit for a name.

    The PubChem search page is rendered through ``fetch_selenium``, the first
    ``<a class="f-medium">`` element is taken as the CID of the top hit, and
    that CID is then resolved through PubChemPy.

    Parameters
    ----------
    compound_name : str
        Free-text compound name to search for.

    Returns
    -------
    str
        ``canonical_smiles`` of the compound whose CID was scraped.

    Raises
    ------
    IndexError
        When the rendered page contains no matching link, i.e. no hit was found.
    """
    from urllib.parse import quote  # local import keeps this function self-contained

    base_PubChem_url = 'https://pubchem.ncbi.nlm.nih.gov/#query='

    # Percent-encode the name: library names contain spaces and parentheses, and
    # characters such as '&' or '#' would otherwise cut the query short.
    PubChem_web_search = fetch_selenium(base_PubChem_url + quote(compound_name, safe=''))
    soup_PubChem = BeautifulSoup(PubChem_web_search, 'html.parser')
    # NOTE(review): assumes the first 'f-medium' link's text is the CID of the top hit.
    hit_links = soup_PubChem.find_all('a', class_='f-medium')
    if not hit_links:
        # Same exception type as the former bare [0] lookup, with an explanatory message.
        raise IndexError(f'No PubChem web-search hit found for {compound_name!r}')
    cid_of_first_hit = hit_links[0].text.strip()
    print(f'{cid_of_first_hit} has been retrieved.')
    canonical_smiles_from_cid = pcp.Compound.from_cid(cid_of_first_hit).canonical_smiles
    return canonical_smiles_from_cid

# Web-search PubChem for each compound that was removed from the compilation.
PubChem_WebFetch_for_failed = {
    compound_name: fetch_from_PubChem_query(compound_name)
    for compound_name in removed_compound_list
}

25134238 has been retrieved.
16133812 has been retrieved.
1117 has been retrieved.
19427706 has been retrieved.


In [141]:
# Validate each web-fetched SMILES; 1 means RDKit could parse it.
for smiles_code in PubChem_WebFetch_for_failed.values():
    print(smiles_check(smiles_code))
# Seems like all SMILES are valid now!

1
1
1
1


In [142]:
# Fold the web-fetched SMILES back into the compiled map, reporting its size before and after.
print(f'Initial dictionary length was {len(fetched_molecule_to_smile_map_compiled)}')
fetched_molecule_to_smile_map_compiled.update(PubChem_WebFetch_for_failed)
print(f'Final dictionary length is {len(fetched_molecule_to_smile_map_compiled)}')

Initial dictionary length was 286
Final dictionary length is 290


In [143]:
# Back to 290 entries!

# Updating the dataframe with retrieved SMILES entries

In [144]:
# Overwrite the SMILES of every row whose 'Item Name' matches a repaired compound.
for compound_name, smiles_code in fetched_molecule_to_smile_map_compiled.items():
    name_matches = APEXBio_Library_Details['Item Name'] == compound_name
    APEXBio_Library_Details.loc[name_matches, 'SMILES'] = smiles_code

In [145]:
# Number of rows whose (updated) SMILES now parses: smiles_check yields 1 per valid entry.
APEXBio_Library_Details['SMILES'].apply(smiles_check).sum()

2321

In [146]:
# Yay! 2321 valid SMILES!

In [147]:
# Persist the library, now carrying the repaired SMILES column, as a CSV file.
APEXBio_Library_Details.to_csv(
    path_or_buf='L1021-DiscoveryProbe-FDA-approved-Drug-Library_Updated.csv'
)

In [149]:
#APEXBio_Library_Details