# 0. Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import requests
import os
import json
import tqdm

from rdkit import Chem
from tqdm import tqdm
from thermo import functional_groups
from Bio import Entrez
from chembl_structure_pipeline import checker

from rdkit.Chem import rdMolDescriptors, Descriptors, Lipinski, Crippen, inchi
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from concurrent.futures import ThreadPoolExecutor, as_completed

[21:22:05] Initializing Normalizer


In [2]:
data_folder = 'data/AID1843' # Name your data folder
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# 1. Data gathering

Before importing data, need to identify which AIDs will be included. Data will be imported from https://pubchem.ncbi.nlm.nih.gov/assay/pcget. For more information on PubChem's programmatic access, refer to: https://pubchem.ncbi.nlm.nih.gov/docs/bioassays. Some other programmatic access options available such as PUG-REST. However, these might not be optimal for bulk retrieval or handling of large dataset due to the limitation of request volume. 

Data for individual assays include 7 required columns (CIDs, isomeric SMILES, etc.) and optional test results. Refer to https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/README for further details. For datasets intended for regression model, additional columns could be extracted accordingly.

In [29]:
# Desired AIDs:
AIDs = [1672, 2032, 2105, 463252, 2236, 2329, 2345]

#Keep unique values in list AIDs (since there could be overlapping AIDs from different targets or project)
AIDs = list(set(AIDs))
AIDs = [str(AID) for AID in AIDs]
print('Number of datasets retrieving: ', len(AIDs))

Number of datasets retrieving:  7


In [4]:
#Data to be extracted from the assay:
col_list = ['PUBCHEM_CID','PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME']

count = 0
for AID in AIDs:
    url = f'https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?query=download&record_type=datatable&actvty=all&response_type=save&aid={AID}'
    assay = pd.read_csv(url, usecols=col_list)

    #convert SMILES to string
    assay['PUBCHEM_EXT_DATASOURCE_SMILES'] = assay['PUBCHEM_EXT_DATASOURCE_SMILES'].astype(str)

    #delete rows with nan values
    assay = assay.dropna(subset=['PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CID'])

    #convert cids to int: 
    assay['PUBCHEM_CID'] = assay['PUBCHEM_CID'].astype(int)

    #reindex assay dataframe from 0:
    assay.reset_index(drop=True, inplace=True)

    #Create a new folder to save the data (with relative path):
    if not os.path.exists(f'{data_folder}/before_finished/step_1'):
        os.makedirs(f'{data_folder}/before_finished/step_1')
    assay.to_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv', index=False)
    count += 1
    print(f'{count} out of {len(AIDs)} complete')

1 out of 7 complete
2 out of 7 complete
3 out of 7 complete
4 out of 7 complete
5 out of 7 complete
6 out of 7 complete
7 out of 7 complete


# 2. Isomeric SMILES

For the purpose of our project, we would like to include isomeric form of SMILES representation in our final dataset. Although PubChem claimed that their datatable should include isomeric SMILES (https://pubchem.ncbi.nlm.nih.gov/docs/bioassays), some dataset might include non-isomeric SMILES. This step is to import isomeric SMILES based on CIDs. 

Several packages such as RDkit have modules to return isomeric SMILES from a given input SMILES. However, for consistency, we decided to use the PubChem Identifier Exchange Service, which take an input identifier (CIDs, SMILES, InChI, etc.)  and return the corresponding identifier (CIDs, isomeric SMILES, InChIs, etc.). Here, we export the list of CIDs for compounds in our dataset and use this server to retrieve their isomeric SMILES. For more information, refer to: https://pubchem.ncbi.nlm.nih.gov/docs/identifier-exchange-service

In [5]:
#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_2'):
    os.makedirs(f'{data_folder}/before_finished/step_2')

#Export list of CIDs to csv with one column without the column name:
for AID in AIDs:
    assay = pd.read_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv')
    assay['PUBCHEM_CID'].to_csv(f'{data_folder}/before_finished/step_2/CID{AID}.csv', index=False, header=False)
"""
After this step, submit the lists of CIDs at https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi 
    Operator type: "same CID" 
    Output IDs "SMILES" (isomeric SMILES by default) 
    Output method: "Two column file showing each input output-correspondence"
    Compression: "No compression"
Refer to https://pubchem.ncbi.nlm.nih.gov/docs/identifier-exchange-service for more details.
The output files (converted isomeric SMILES) should be named as "isomeric_smi_{AID}.txt" and saved to the "step_2" folder.
"""

'\nAfter this step, submit the lists of CIDs at https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi \n    Operator type: "same CID" \n    Output IDs "SMILES" (isomeric SMILES by default) \n    Output method: "Two column file showing each input output-correspondence"\n    Compression: "No compression"\nRefer to https://pubchem.ncbi.nlm.nih.gov/docs/identifier-exchange-service for more details.\nThe output files (converted isomeric SMILES) should be named as "isomeric_smi_{AID}.txt" and saved to the "step_2" folder.\n'

In [6]:
def check_isomeric_smiles(AIDs):
    """
    Check if the SMILES in the assay are isomeric or not.
    Input: AIDs (list of strings)
    Output: non_isomeric_smi_cids (dictionary with AID as key and list of non-isomeric CIDs as values for the datasets in AIDs
    """
    non_isomeric_smi_cids = {}
    for AID in AIDs:    
        non_isomeric_smi_cids[AID] = []
        #import SMILES.txt file as a table:
        correct_isomeric_smiles = pd.read_csv(f'{data_folder}/before_finished/step_2/isomeric_smi_{AID}.txt', sep='\t', header=None)
        assay = pd.read_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv')

        #compare smiles in assay with smiles in correct_smiles:
        for cid in assay['PUBCHEM_CID']:
            if assay.loc[assay['PUBCHEM_CID'] == cid, 'PUBCHEM_EXT_DATASOURCE_SMILES'].values[0] != correct_isomeric_smiles.loc[correct_isomeric_smiles[0] == cid, 1].values[0]:
                non_isomeric_smi_cids[AID].append(cid)

        if len(non_isomeric_smi_cids[AID]) == 0:
            print(f'All SMILES in AID {AID} are isomeric')
        else:
            print(f'There are some non-isomeric SMILES in AID {AID}:')
            print(non_isomeric_smi_cids[AID])

    return non_isomeric_smi_cids

def update_isomeric(AIDs, non_isomeric_smi_cids):
    """
    Update the SMILES in the assay to isomeric SMILES.
    Input: AIDs (list of strings), non_isomeric_smi_cids (dictionary with AID as key and list of non-isomeric CIDs as values)
    """
    with open(f'{data_folder}/before_finished/step_2/non_isomeric_smi_cids.txt', 'w') as f:
        # record the non-isomeric SMILES 
        for AID in AIDs:
            assay = pd.read_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv')
            correct_isomeric_smiles = pd.read_csv(f'{data_folder}/before_finished/step_2/isomeric_smi_{AID}.txt', sep='\t', header=None)
            f.write(f'AID {AID}: {non_isomeric_smi_cids[AID]}\n')

            for cid in non_isomeric_smi_cids[AID]:
                f.write(f'CID {cid}: {assay.loc[assay["PUBCHEM_CID"] == cid, "PUBCHEM_EXT_DATASOURCE_SMILES"].values[0]} -> {correct_isomeric_smiles.loc[correct_isomeric_smiles[0] == cid, 1].values[0]}\n')
                assay.loc[assay['PUBCHEM_CID'] == cid, 'PUBCHEM_EXT_DATASOURCE_SMILES'] = correct_isomeric_smiles.loc[correct_isomeric_smiles[0] == cid, 1].values[0]

            f.write(f'===\n')
            assay.to_csv(f'{data_folder}/before_finished/step_2/AID{AID}.csv', index=False)       

In [7]:
non_isomeric_smi_cids = check_isomeric_smiles(AIDs)

All SMILES in AID 2329 are isomeric
There are some non-isomeric SMILES in AID 1672:
[1952060, 1979247, 2997957, 2999888, 3593962, 3730758, 1829082, 3330059, 3640026, 2997662]
All SMILES in AID 2345 are isomeric
All SMILES in AID 2032 are isomeric
All SMILES in AID 463252 are isomeric
All SMILES in AID 2105 are isomeric
All SMILES in AID 2236 are isomeric


Note: Here they returned that three smiles in AID628 were not isomeric. This shows that the SMILES representation of some compounds in the given datasets might not be isomeric.

In [8]:
update_isomeric(AIDs, non_isomeric_smi_cids)

# 3. InChI

We would like to include standard InChI to diversify users' choice of which data they would like to use for their own benchmark. 

In [9]:
# Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_3'):
    os.makedirs(f'{data_folder}/before_finished/step_3')

Again, it is convenient to use the PubChem Identifier Exchange Service (https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi) with operator type "same CID" and Output IDs "InChI" to retrieve InChI from a given list of input CIDs. The same CID lists from STEP 2 could be used here. The resulted InChIs could be checked if being standard by indentifying the presence of 'InChI=1S' at the begining of each InChI string.

In [11]:
"""
CID lists (in "step_2" folder should be submitted to PubChem Identifier Exchange Service)
    Operator type: "same CID" 
    Output IDs "InChI"
    Output method: "Two column file showing each input output-correspondence"
    Compression: "No compression"
InChI list should be saved into "step_3" folder, named as "std_inchi_{AID}.txt" 
"""
# Import dataframes:
for AID in AIDs: 
    exec(f'AID{AID} = pd.read_csv("{data_folder}/before_finished/step_2/AID{AID}.csv")')
    exec(f'AID{AID}_InChI = pd.read_csv("{data_folder}/before_finished/step_3/std_inchi_{AID}.txt", sep="\\t", header=None)')

In [12]:
#Check if they are all standard InChI:
for AID in AIDs:
    check_inchi = f"""
non_standard_InChI = []
for i in range(len(AID{AID}_InChI[1])):
    if not AID{AID}_InChI[1][i].startswith('InChI=1S'):
        non_standard_InChI.append(AID{AID}_InChI[1][i])
if not non_standard_InChI:
    print('All InChI in AID{AID} are standard')
else:
    print('There are some non-standard InChI in AID{AID}')
    print(non_standard_InChI)
    print('===')
"""
    exec(check_inchi)

All InChI in AID2329 are standard
All InChI in AID1672 are standard
All InChI in AID2345 are standard
All InChI in AID2032 are standard
All InChI in AID463252 are standard
All InChI in AID2105 are standard
All InChI in AID2236 are standard


Now we concatenate the InChIs in our tables:

In [13]:
#Convert AID626_InChI to a dictionary and save the files
for AID in AIDs: 
    update_inchi = f"""
AID{AID}_InChI_dict = dict(zip(AID{AID}_InChI[0], AID{AID}_InChI[1]))
AID{AID}['InChI'] = AID{AID}['PUBCHEM_CID'].map(AID{AID}_InChI_dict)
AID{AID}['InChI'] = AID{AID}['InChI'].astype(str)
AID{AID}.to_csv(r"{data_folder}/before_finished/step_3/AID{AID}.csv", index=False)
"""
    exec(update_inchi)

# 4. Check duplicates

When checking duplicates in the datasets, we would like to know if there are
1) Multiple identical molecules
2) Molecules with identical CID but different InChIs or SMILES
3) Molecules with identical InChI but with different CIDs or SMILES

In [14]:
#import:
for AID in AIDs:
    exec(f"AID{AID} = pd.read_csv(r'{data_folder}/before_finished/step_3/AID{AID}.csv', sep=',', header=0)")

In [15]:
#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_4'):
    os.makedirs(f'{data_folder}/before_finished/step_4')

## 4.1. Checking identical molecules

In [16]:
#Return all duplicates by comparing InChI, SMILES, and CIDs:
for AID in AIDs:
    check_duplicate = f"""
AID{AID}_duplicates_InChI = AID{AID}[AID{AID}.duplicated(subset=['InChI'], keep=False)]
AID{AID}_duplicates_SMILES = AID{AID}[AID{AID}.duplicated(subset=['PUBCHEM_EXT_DATASOURCE_SMILES'], keep=False)]
AID{AID}_duplicates_CIDs = AID{AID}[AID{AID}.duplicated(subset=['PUBCHEM_CID'], keep=False)]
print('Number of AID{AID} InChI duplicates: ', len(AID{AID}_duplicates_InChI))
print('Number of AID{AID} SMILES duplicates: ', len(AID{AID}_duplicates_SMILES))
print('Number of AID{AID} CID duplicates: ', len(AID{AID}_duplicates_CIDs))
"""
    exec(check_duplicate)

Number of AID2329 InChI duplicates:  0
Number of AID2329 SMILES duplicates:  0
Number of AID2329 CID duplicates:  0
Number of AID1672 InChI duplicates:  187
Number of AID1672 SMILES duplicates:  139
Number of AID1672 CID duplicates:  137
Number of AID2345 InChI duplicates:  0
Number of AID2345 SMILES duplicates:  0
Number of AID2345 CID duplicates:  0
Number of AID2032 InChI duplicates:  6
Number of AID2032 SMILES duplicates:  4
Number of AID2032 CID duplicates:  4
Number of AID463252 InChI duplicates:  0
Number of AID463252 SMILES duplicates:  0
Number of AID463252 CID duplicates:  0
Number of AID2105 InChI duplicates:  6
Number of AID2105 SMILES duplicates:  4
Number of AID2105 CID duplicates:  4
Number of AID2236 InChI duplicates:  0
Number of AID2236 SMILES duplicates:  0
Number of AID2236 CID duplicates:  0


In [17]:
#write duplicates to a txt file: 
with open(f'{data_folder}/before_finished/step_4/duplicates.txt', 'w') as f:
    for AID in AIDs: 
        duplicates_InChI = eval(f'AID{AID}_duplicates_InChI')
        duplicates_SMILES = eval(f'AID{AID}_duplicates_SMILES')
        duplicates_CIDs = eval(f'AID{AID}_duplicates_CIDs')
        f.write(f'\n\nAID{AID} InChI duplicates:\n')
        f.write(duplicates_InChI.to_string())
        f.write(f'\nAID{AID} SMILES duplicates:\n')
        f.write(duplicates_SMILES.to_string())
        f.write(f'\nAID{AID} CID duplicates:\n')
        f.write(duplicates_CIDs.to_string())

## 4.2. Same CIDs but different chemical representations

In [18]:
#reindex
for AID in AIDs: 
    exec(f"AID{AID}_duplicates_CIDs.reset_index(drop=True, inplace=True)")

In [19]:
with open(f'{data_folder}/before_finished/step_4/sameCID_different_others.txt', 'w') as f:
    for AID in AIDs: 
        sameCID_differentInChI = []
        sameCID_differentSMILES = []
        duplicates_CIDs = eval(f'AID{AID}_duplicates_CIDs')
        for i in range(len(duplicates_CIDs['PUBCHEM_CID'])):
            for j in range(i+1, len(duplicates_CIDs['PUBCHEM_CID'])):
                if duplicates_CIDs['PUBCHEM_CID'][i] == duplicates_CIDs['PUBCHEM_CID'][j]:
                    if duplicates_CIDs['InChI'][i] != duplicates_CIDs['InChI'][j]:
                        sameCID_differentInChI.append((duplicates_CIDs['PUBCHEM_CID'][i], duplicates_CIDs['PUBCHEM_CID'][j]))
                    if duplicates_CIDs['PUBCHEM_EXT_DATASOURCE_SMILES'][i] != duplicates_CIDs['PUBCHEM_EXT_DATASOURCE_SMILES'][j]:
                        sameCID_differentSMILES.append((duplicates_CIDs['PUBCHEM_CID'][i], duplicates_CIDs['PUBCHEM_CID'][j]))

        if sameCID_differentInChI == []:
            f.write(f'No duplicate CIDs with different InChIs in AID{AID}\n')
        else:
            f.write('Found duplicate CIDs with different InChIs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameCID_differentInChI))
            f.write("\n")
        
        if sameCID_differentSMILES == []:
            f.write(f'No duplicate CIDs with different SMILES in AID{AID}\n')
        else:
            f.write(f'Found duplicate CIDs with different SMILES in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameCID_differentSMILES))
            f.write("\n")
        f.write("===\n")

## 4.3. Same InChI but with different CIDs or SMILES

In [20]:
#reindex
for AID in AIDs: 
    exec(f"AID{AID}_duplicates_InChI.reset_index(drop=True, inplace=True)")

In [21]:
with open(f'{data_folder}/before_finished/step_4/sameInChI_different_others.txt', 'w') as f:
    for AID in AIDs: 
        sameInChI_differentCID = []
        sameInChI_differentSMILES = []
        duplicates_InChI = eval(f'AID{AID}_duplicates_InChI')
        for i in range(len(duplicates_InChI['InChI'])):
            for j in range(i+1, len(duplicates_InChI['InChI'])):
                if duplicates_InChI['InChI'][i] == duplicates_InChI['InChI'][j]:
                    if duplicates_InChI['PUBCHEM_CID'][i] != duplicates_InChI['PUBCHEM_CID'][j]:
                        sameInChI_differentCID.append((duplicates_InChI['PUBCHEM_CID'][i], duplicates_InChI['PUBCHEM_CID'][j]))
                    if duplicates_InChI['PUBCHEM_EXT_DATASOURCE_SMILES'][i] != duplicates_InChI['PUBCHEM_EXT_DATASOURCE_SMILES'][j]:
                        sameInChI_differentSMILES.append((duplicates_InChI['PUBCHEM_CID'][i], duplicates_InChI['PUBCHEM_CID'][j]))
        
        if sameInChI_differentCID == []:
            f.write(f'No duplicate InChIs with different CIDs in AID{AID}\n')
        else:
            f.write('Found duplicate InChIs with different CIDs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameInChI_differentCID))
            f.write("\n")
        
        if sameInChI_differentSMILES == []:
            f.write(f'No duplicate InChIs with different SMILES in AID{AID}\n')
        else:
            f.write(f'Found duplicate InChIs with different SMILES in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameInChI_differentSMILES))
            f.write("\n")
        f.write("===\n")


## 4.5 Same SMILES but with different CIDs or SMILES

In [22]:
#reindex
for AID in AIDs: 
    exec(f"AID{AID}_duplicates_SMILES.reset_index(drop=True, inplace=True)")

In [23]:
with open(f'{data_folder}/before_finished/step_4/sameSMILES_different_others.txt', 'w') as f:
    for AID in AIDs: 
        sameSMILES_differentCID = []
        sameSMILES_differentInChI = []
        duplicates_SMILES = eval(f'AID{AID}_duplicates_SMILES')
        for i in range(len(duplicates_SMILES['PUBCHEM_EXT_DATASOURCE_SMILES'])):
            for j in range(i+1, len(duplicates_SMILES['PUBCHEM_EXT_DATASOURCE_SMILES'])):
                if duplicates_SMILES['PUBCHEM_EXT_DATASOURCE_SMILES'][i] == duplicates_SMILES['PUBCHEM_EXT_DATASOURCE_SMILES'][j]:
                    if duplicates_SMILES['PUBCHEM_CID'][i] != duplicates_SMILES['PUBCHEM_CID'][j]:
                        sameSMILES_differentCID.append((duplicates_SMILES['PUBCHEM_CID'][i], duplicates_SMILES['PUBCHEM_CID'][j]))
                    if duplicates_SMILES['InChI'][i] != duplicates_SMILES['InChI'][j]:
                        sameSMILES_differentInChI.append((duplicates_SMILES['PUBCHEM_CID'][i], duplicates_SMILES['PUBCHEM_CID'][j]))
        
        if sameSMILES_differentCID == []:
            f.write(f'No duplicate SMILES with different CIDs in AID{AID}\n')
        else:
            f.write(f'Found duplicate SMILES with different CIDs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameSMILES_differentCID))
            f.write("\n")
        
        if sameSMILES_differentInChI == []:
            f.write(f'No duplicate SMILES with different InChIs in AID{AID}\n')
        else:
            f.write(f'Found duplicate SMILES with different InChIs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameSMILES_differentInChI))
            f.write("\n")
        f.write("===\n")


## 4.5 Drop duplicates

When dropping duplicates, we will keep the first molecule in a pair or a group of duplicates. For example, here there are 12 duplicates (6 pairs) so we keep 6 of them.

In [24]:
# Keep only the first duplicate in the dataframes:
for AID in AIDs: 
    exec(f"AID{AID}.drop_duplicates(subset=['InChI'], keep='first', inplace=True)")

    last_check = f"""
if len(AID{AID}[AID{AID}.duplicated(subset=['InChI'], keep=False)]) == 0:
    print('No more duplicate InChI in AID{AID}')
else:
    print('There are still duplicate InChI in AID{AID}')   
    """
    exec(last_check)

No more duplicate InChI in AID2329
No more duplicate InChI in AID1672
No more duplicate InChI in AID2345
No more duplicate InChI in AID2032
No more duplicate InChI in AID463252
No more duplicate InChI in AID2105
No more duplicate InChI in AID2236


In [25]:
for AID in AIDs: 
    last_check = f"""
if len(AID{AID}[AID{AID}.duplicated(subset=['PUBCHEM_EXT_DATASOURCE_SMILES'], keep=False)]) == 0:
    print('No more duplicate SMILES in AID{AID}')
else:
    print('There are still duplicate SMILES in AID{AID}')   
    """
    exec(last_check)

No more duplicate SMILES in AID2329
No more duplicate SMILES in AID1672
No more duplicate SMILES in AID2345
No more duplicate SMILES in AID2032
No more duplicate SMILES in AID463252
No more duplicate SMILES in AID2105
No more duplicate SMILES in AID2236


In [26]:
for AID in AIDs: 
    last_check = f"""
if len(AID{AID}[AID{AID}.duplicated(subset=['PUBCHEM_CID'], keep=False)]) == 0:
    print('No more duplicate CID in AID{AID}')
else:
    print('There are still duplicate CID in AID{AID}')   
    """
    exec(last_check)

No more duplicate CID in AID2329
No more duplicate CID in AID1672
No more duplicate CID in AID2345
No more duplicate CID in AID2032
No more duplicate CID in AID463252
No more duplicate CID in AID2105
No more duplicate CID in AID2236


In [27]:
# Save the dataframes to csv:
for AID in AIDs: 
    exec(f"AID{AID}.to_csv(r'{data_folder}/before_finished/step_4/AID{AID}.csv', index=False)")

# 5. Hierarchical Curation

For the hierarchical curation, there are some rules: 

(1) All assays used should be on the same or close species/cell lines. Optimally, they should also be from the same project/laboratory.

(2) Primary actives (PrA) will have a large false-positive rate. Therefore, they should be tested in follow-up confirmatory screens (optimally dose-reponse). 

(3) Actives could be promiscuous. Therefore, it is optimal to have counter-screens on different targets to test specificity.

(4) For some projects, compounds were tested in multiple rounds. Therefore, assays often have hierarchical relations. From a single primary screen (Pr), active compounds (Pr_A) could be tested in multiple rounds of confirmatory screens (Cf_1, Cf_2, ..., Cf_final) or counter screens (Ct_1, Ct_2, etc.). Actives from confirmatory screens (Cf_A) have a higher possibility of being true active. If an active compound is tested active in counter screens (Ct_A), it is likely to be a promiscuous compound and should not be included. 

(4) It is important to know the relationship between assays. Active sets from downstream screens always have a lower false-positive rate than active sets from upstream screens due to better assay technologies on a smaller set of compounds. Therefore, final hits should be taken from the intersection of the very last confirmatory assays, without tested active in any counter-screens: 
Final hits = [Cf_final1_A ∩ Cf_final2_A ∩ ...] \ [Ct_1_A ∪ Ct_2_A ∪ ...]

However, if the confirmatory assays are unrelated (tested on different set of compounds), then we might have to take the union of their active sets instead of the intersections as in this formula.

(5) The hierarchical relations should be inspected carefully to see if follow-up confirmatory screens include extra compounds (Ex) that were not tested in earlier screens or tested inactive in earlier screens. If exist, these compounds require manual inspection. 

(6) Final inactives should be taken from primary inactives (Pr_I) (not inconclusive, unspecified, or probes), plus extra compounds that were tested inactive in conformatory screens (Ex_I), if justified.
Final inactives = PrI ∪ Ex_I



## 5.1. Classify groups of compounds in each assay by activities

In [28]:
path = f'{data_folder}/before_finished/step_4' 
keynumbers = [1672, 2032, 2105, 463252, 2236, 2329, 2345] # specify the keynumbers you want to import

for keynumber in keynumbers:
    filename = os.path.join(path, f'AID{keynumber}.csv')
    if os.path.exists(filename):
        df = pd.read_csv(filename, index_col=None, header=0)
        exec(f'AID{keynumber} = df')
        exec(f'AID{keynumber}_active = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Active"]')
        exec(f'AID{keynumber}_inactive = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Inactive"]')
        exec(f'AID{keynumber}_inconclusive = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Inconclusive"]')
        exec(f'AID{keynumber}_unspecified = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Unspecified"]')
        exec(f'AID{keynumber}_probe = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Probe"]')

In [29]:
#Create a df with first column the variables name, and the second column the number of rows:
df = pd.DataFrame(columns=['AID', 'Tested Compounds', 'Active', 'Inactive', 'Inconclusive', 'Unspecified', 'Probe'])
for keynumber in keynumbers:
    exec(f'df.loc[len(df)] = ["AID{keynumber}", len(AID{keynumber}), len(AID{keynumber}_active), len(AID{keynumber}_inactive), len(AID{keynumber}_inconclusive), len(AID{keynumber}_unspecified), len(AID{keynumber}_probe)]')
df

Unnamed: 0,AID,Tested Compounds,Active,Inactive,Inconclusive,Unspecified,Probe
0,AID1672,305585,2590,302995,0,0,0
1,AID2032,2264,926,1338,0,0,0
2,AID2105,2264,426,1838,0,0,0
3,AID463252,320,227,92,1,0,0
4,AID2236,320,74,246,0,0,0
5,AID2329,320,13,307,0,0,0
6,AID2345,1189,18,1171,0,0,0


## 5.2. Check the hierachical relations

In [30]:
def check_is_in(downstream, upstream): 
    downstream_in_upstream = downstream[downstream['PUBCHEM_CID'].isin(upstream['PUBCHEM_CID'])]
    downstream_notin_upstream = downstream[~downstream['PUBCHEM_CID'].isin(upstream['PUBCHEM_CID'])]
    return downstream_in_upstream, downstream_notin_upstream

### Flow: AID1672 (Pr), AID2032 (Cf_a), AID2105 (Ct_a), AID463252 (Cf_b), AID2236 (Ct_b), AID2329 (Ct_b), AID2345(Ct_c)

In [33]:
# Check between AID2032 and AID1672
a1, a2 = check_is_in(AID2032, AID1672_inactive)
a3, a4 = check_is_in(a1, AID2032_active)
a5, a6 = check_is_in(AID2032, AID1672)
a7, a8 = check_is_in(a6, AID2032_active)
print(f'Among AID2032, {len(a1)} were tested inactive in AID1672. Among these, {len(a3)} became active')
print(f'Among AID2032, {len(a6)} were not tested in the AID1672. Among these, {len(a7)} became active')

b1, b2 = check_is_in(AID463252, AID1672_inactive)
b3, b4 = check_is_in(b1, AID463252_active)
b5, b6 = check_is_in(AID463252, AID1672)
b7, b8 = check_is_in(b6, AID463252_active)
print(f'Among AID463252, {len(b1)} were tested inactive in AID1672. Among these, {len(b3)} became active')
print(f'Among AID463252, {len(b6)} were not tested in the AID1672. Among these, {len(b7)} became active')

Among AID2032, 1163 were tested inactive in AID1672. Among these, 0 became active
Among AID2032, 1 were not tested in the AID1672. Among these, 1 became active
Among AID463252, 33 were tested inactive in AID1672. Among these, 0 became active
Among AID463252, 0 were not tested in the AID1672. Among these, 0 became active


Here, one extra compounds was found in AID2032, which became active. This compound is likely to be a newly synthesized or has similar structure to compounds that were found active in the primary screen AID1672. Therefore, we should keep this compound in final hits. 
There were 1163 compounds were tested inactive in AID1672 but were still retested in AID2032. Similarly, 33 compounds were retested in AID463252. All of these compounds still remain inactive. These compounds should be kept in the final inactives (confirmed inactive twice)

The potential final hits therefore should be the intersection of confirmed actives from AID2032 and AID463252, subtracted by compounds that were active in all the counter screens (AID2105, AID2236, AID2239, AID2345). Inactives should be taken from the primary inactives. 

## 5.3. Export the data

In [37]:
# Combined all actives in counter screens: 
list_of_promiscuous_cids = AID2105_active['PUBCHEM_CID'].tolist() + AID2236_active['PUBCHEM_CID'].tolist() + AID2329_active['PUBCHEM_CID'].tolist() + AID2345_active['PUBCHEM_CID'].tolist()
# Drop duplicates:
list_of_promiscuous_cids = list(set(list_of_promiscuous_cids))
print(f'There are {len(list_of_promiscuous_cids)} compounds that were active in counter screens')

There are 510 compounds that were active in counter screens


In [38]:
confirmed_hits = AID463252_active[AID463252_active['PUBCHEM_CID'].isin(AID2032_active['PUBCHEM_CID'])]
print(f'There are {len(confirmed_hits)} confirmed hits. Now we remove any mols that were active in counter screens')
potential_hits = confirmed_hits[~confirmed_hits['PUBCHEM_CID'].isin(list_of_promiscuous_cids)]
print(f'After removing promiscuous compounds, we end up with {len(potential_hits)} potential hits.')

There are 226 confirmed hits. Now we remove any mols that were active in counter screens
After removing promiscuous compounds, we end up with 172 potential hits.


In [41]:
potential_inactives = AID1672_inactive

In [39]:
if not os.path.exists(f'{data_folder}/before_finished/step_5'):
    os.makedirs(f'{data_folder}/before_finished/step_5')

In [42]:
#export the potential hits and inactives to csv:
potential_hits.to_csv(f'{data_folder}/before_finished/step_5/potential_hits.csv', index=False)
potential_inactives.to_csv(f'{data_folder}/before_finished/step_5/potential_inactives.csv', index=False)

# 6. RDkit check

In [43]:
potential_hits = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_hits.csv', sep=',', header=0)
potential_inactives = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_inactives.csv', sep=',', header=0)

In [45]:
def process_smiles(df):
    """
    This function check if the SMILES strings from a given dataset could be parsed by RDKit or if they returns any problems detected by RDkit
    Input: 
        Pandas dataframe
    Output: 
        mol_list: dictionary with CID as key and RDKit molecule object as value
        problem_list: list of problems detected by RDKit
        cannot_parse: list of CIDs that could not be parsed by RDKit
    """
    mol_list = {}
    problem_list = []
    cannot_parse = []

    for i in df['PUBCHEM_CID']:

        #convert each SMILES to molecule:
        m = Chem.MolFromSmiles(df[df['PUBCHEM_CID'] == i]['PUBCHEM_EXT_DATASOURCE_SMILES'].values[0], sanitize=True)
        mol_list[i] = m

        if m is None:
            cannot_parse.append(i) #save if molecule is non-parsable
            
        elif m is not None:
            problems = Chem.DetectChemistryProblems(m) #identify and capture error messages when creating mol objects.
        if problems != ():
            problem_list.append(problems)
            
    if len(problem_list) > 0: 
        print(problem_list)
    else:
        print("No problems detected")
    return mol_list, problem_list, cannot_parse

In [46]:
mol_hits, problem_list_hits, cannot_parse_hits = process_smiles(potential_hits)
mol_inactives, problem_list_inactives, cannot_parse_inactives = process_smiles(potential_inactives)

No problems detected




No problems detected


In [48]:
#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_6'):
    os.makedirs(f'{data_folder}/before_finished/step_6')

with open(f'{data_folder}/before_finished/step_6/problem_list_hits.txt', 'w') as f:
    f.write("Problems:\n")
    for item in problem_list_hits:
        f.write("%s\n" % item)
    f.write("Cannot parse:\n")
    for item in cannot_parse_hits:
        f.write("%s\n" % item)

with open(f'{data_folder}/before_finished/step_6/problem_list_inactives.txt', 'w') as f:
    f.write("Problems:\n")
    for item in problem_list_inactives:
        f.write("%s\n" % item)
    f.write("Cannot parse:\n")
    for item in cannot_parse_inactives:
        f.write("%s\n" % item)

Our dataset returned no problem or non-parsable molecule.

# 7. Inorganics

In [49]:
# Import data
potential_hits = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_hits.csv', sep=',', header=0)
potential_inactives = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_inactives.csv', sep=',', header=0)

In [50]:
organic_hits = []
inorganic_hits = []

for index, row in potential_hits.iterrows():
    cid = row['PUBCHEM_CID']
    smi = row['PUBCHEM_EXT_DATASOURCE_SMILES']
    mol = Chem.MolFromSmiles(smi, sanitize=True)
    if functional_groups.is_inorganic(mol):
        inorganic_hits.append(cid)
    else:
        organic_hits.append(cid)

In [51]:
organic_inactives = []
inorganic_inactives = []

for index, row in potential_inactives.iterrows():
    cid = row['PUBCHEM_CID']
    smi = row['PUBCHEM_EXT_DATASOURCE_SMILES']
    mol = Chem.MolFromSmiles(smi, sanitize=True)
    if functional_groups.is_inorganic(mol):
        inorganic_inactives.append(cid)
    else: 
        organic_inactives.append(cid)



In [52]:
print(f'In hits, there are {len(organic_hits)} organic molecules and {len(inorganic_hits)} inorganic molecules')
print(f'In inactives, there are {len(organic_inactives)} organic molecules and {len(inorganic_inactives)} inorganic molecules')

In hits, there are 172 organic molecules and 0 inorganic molecules
In inactives, there are 302995 organic molecules and 0 inorganic molecules


In [53]:
#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_7'):
    os.makedirs(f'{data_folder}/before_finished/step_7')

with open(f'{data_folder}/before_finished/step_7/inorganic.txt', 'w') as f:
    f.write("Hits:\n")
    for item in inorganic_hits:
        f.write("%s\n" % item)
    f.write("\n\nInactives:\n")
    for item in inorganic_inactives:
        f.write("%s\n" % item)

In [54]:
# Drop inorganics: 
potential_hits = potential_hits[~potential_hits['PUBCHEM_CID'].isin(inorganic_hits)]
potential_inactives = potential_inactives[~potential_inactives['PUBCHEM_CID'].isin(inorganic_inactives)]


In [55]:
#save: 
potential_hits.to_csv(f'{data_folder}/before_finished/step_7/organic_hits.csv', index=False)
potential_inactives.to_csv(f'{data_folder}/before_finished/step_7/organic_inactives.csv', index=False)

# 8. Mixture

## 8.1. Quick check

In [56]:
#import: 
organic_hits = pd.read_csv(f'{data_folder}/before_finished/step_7/organic_hits.csv', sep=',', header=0)
organic_inactives = pd.read_csv(f'{data_folder}/before_finished/step_7/organic_inactives.csv', sep=',', header=0)

In [57]:
def quick_check_mixtures(name, smiles_list):
    count=0
    for smiles in smiles_list:
        if '.' in smiles:
            count+=1
    print(f"Total number of mixtures in {name} is {count}")

quick_check_mixtures('hits', organic_hits['PUBCHEM_EXT_DATASOURCE_SMILES'])
quick_check_mixtures('inactives', organic_inactives['PUBCHEM_EXT_DATASOURCE_SMILES'])

Total number of mixtures in hits is 44
Total number of mixtures in inactives is 11262


## 8.2. Handling mixture

In [58]:
def process_smiles_dataframe(df):
    """
    From a given dataframe, detect and handle mixtures based on SMILES representation.
    Input: 
        Pandas dataframe
    Output: 
        (dictionary: CID -> SMILES)
            processed: non-mixture forms of every SMILES in the given dataset
            removed: mixture components or mixtures removed from the original mixture SMILES
            small_organic: small organic molecules that are removed from a size-imbalanced mixtures of organic molecules
            small_inorganic: small inorganic molecules that are removed from a mixture of both organic and inorganic molecules
            big_organic_not_lipinski: large organic molecules that are not kept due to not passing the lipinski criteria
            cleaned_from_mixtures: non-mixture forms after handling of the orignal mixtures
    """
    processed = {}
    removed = {}
    small_inorganic = {}
    small_organic = {}
    big_organic_not_lipinski = {}
    cleaned_from_mixtures = {}
    
    for index, row in df.iterrows():
        cid = row['PUBCHEM_CID']
        smiles = row['PUBCHEM_EXT_DATASOURCE_SMILES']
        
        if '.' in smiles: # Check for mixtures
            molecules = smiles.split('.')
            mols = [Chem.MolFromSmiles(mol) for mol in molecules]
            num_atoms = [mol.GetNumAtoms() for mol in mols if mol is not None]
            
            if all(x == num_atoms[0] for x in num_atoms): # Check if all molecules have the same number of atoms. If yes, keep one of them
                processed[cid] = molecules[0]
                cleaned_from_mixtures[cid] = molecules[0]
                removed[cid] = '.'.join(molecules[1:])
            else:
                if max(num_atoms) - min(num_atoms) <= 5:
                    removed[cid] = smiles # Remove the mixture if the difference in number of atoms is less than 5
                    print(f"Cannot decide between {molecules} for CID {cid}")
                else:
                    max_index = num_atoms.index(max(num_atoms))
                    min_index = num_atoms.index(min(num_atoms))
                    if functional_groups.is_inorganic(mols[min_index]) == True:
                        processed[cid] = molecules[max_index]
                        cleaned_from_mixtures[cid] = molecules[max_index]
                        removed[cid] = molecules[min_index] # Keep the organic molecule and remove the inorganic one
                        small_inorganic[cid] = molecules[min_index]
                    else:
                        big_molecule = mols[max_index]
                        
                        # Calculate properties for Lipinski's rule of five
                        mw = Descriptors.MolWt(big_molecule)
                        hbd = rdMolDescriptors.CalcNumHBD(big_molecule)
                        hba = rdMolDescriptors.CalcNumHBA(big_molecule)
                        logp = Crippen.MolLogP(big_molecule)

                        # Check Lipinski's criteria
                        if mw <= 500 and hbd <= 5 and hba <= 10 and logp <= 5:
                            processed[cid] = molecules[max_index] # Keep the big organic molecule if it passes Lipinski's rule of five
                            cleaned_from_mixtures[cid] = molecules[max_index]
                            removed[cid] = '.'.join([molecules[i] for i in range(len(molecules)) if i != max_index])
                            small_organic[cid] = molecules[min_index]
                        else:
                            removed[cid] = smiles # Remove the mixture if the big organic molecule does not pass Lipinski's rule of five
                            big_organic_not_lipinski[cid] = molecules[max_index]
                            print(f"Big organic molecule for CID {cid} does not pass Lipinski's rule of five")

        else:
            processed[cid] = smiles
            
    return processed, removed, small_organic, small_inorganic, big_organic_not_lipinski, cleaned_from_mixtures


In [59]:
processed_hits, removed_hits, small_organic_hits, small_inorganic_hits, not_lipinski_hits, cleaned_hits = process_smiles_dataframe(organic_hits)
processed_inactives, removed_inactives, small_organic_inactives, small_inorganic_inactives, not_lipinski_inactives, cleaned_inactives = process_smiles_dataframe(organic_inactives)

Big organic molecule for CID 23640915 does not pass Lipinski's rule of five
Cannot decide between ['CC1=CC=C(C=C1)S(=O)(=O)[O-]', 'CN1C=C[N+](=C1COC(=O)N(C)C)C'] for CID 15945791
Big organic molecule for CID 15944844 does not pass Lipinski's rule of five
Big organic molecule for CID 12004634 does not pass Lipinski's rule of five
Big organic molecule for CID 15945428 does not pass Lipinski's rule of five
Big organic molecule for CID 12005684 does not pass Lipinski's rule of five
Big organic molecule for CID 12005677 does not pass Lipinski's rule of five
Big organic molecule for CID 15945399 does not pass Lipinski's rule of five
Big organic molecule for CID 12005527 does not pass Lipinski's rule of five
Big organic molecule for CID 15944995 does not pass Lipinski's rule of five
Cannot decide between ['CC1=CC=C(C=C1)S(=O)(=O)[O-]', 'CN1C=C[N+](=C1CCOC(=O)N(C)C)C'] for CID 15945641
Big organic molecule for CID 15945589 does not pass Lipinski's rule of five
Cannot decide between ['CC1=NCCC2



Big organic molecule for CID 135433258 does not pass Lipinski's rule of five
Cannot decide between ['CC1=CC=C(C=C1)S(=O)(=O)[O-]', 'C1=CC2=C(C3=C(C=CC=N3)C=C2)[NH+]=C1'] for CID 16196049
Cannot decide between ['C1=CC=C2C=C(C=CC2=C1)S(=O)(=O)[O-]', 'C1=CC(=C[N+](=C1)CC(=O)NC2=CC=C(C=C2)Br)O'] for CID 16196732
Big organic molecule for CID 56642840 does not pass Lipinski's rule of five
Big organic molecule for CID 16193570 does not pass Lipinski's rule of five
Big organic molecule for CID 16193495 does not pass Lipinski's rule of five
Big organic molecule for CID 10864994 does not pass Lipinski's rule of five
Big organic molecule for CID 16193568 does not pass Lipinski's rule of five
Big organic molecule for CID 16193567 does not pass Lipinski's rule of five
Big organic molecule for CID 9551917 does not pass Lipinski's rule of five
Big organic molecule for CID 11958671 does not pass Lipinski's rule of five
Cannot decide between ['CN(C)C1=NC2=CC=CC=C2C(=C1)N', 'C1=C(NC(=O)NC1=O)C(=O)O'] fo

In [60]:
# Create a new step folder
if not os.path.exists(f'{data_folder}/before_finished/step_8'):
    os.makedirs(f'{data_folder}/before_finished/step_8')

In [61]:
#Generate df with the smiles column in the cleaned_hits or cleaned_inactives dictionary:
cleaned_hits_df = pd.DataFrame(list(cleaned_hits.values()), columns=['PUBCHEM_EXT_DATASOURCE_SMILES'])
cleaned_inactives_df = pd.DataFrame(list(cleaned_inactives.values()), columns=['PUBCHEM_EXT_DATASOURCE_SMILES'])

In [62]:
#Export the cleaned hits and inactives to csv:
cleaned_hits_df.to_csv(f'{data_folder}/before_finished/step_8/cleaned_mixtures_hits.csv', index=False, header=False)
cleaned_inactives_df.to_csv(f'{data_folder}/before_finished/step_8/cleaned_mixtures_inactives.csv', index=False, header=False)

In [63]:
def process_mixture_df(name, df, processed, removed, small_organic, small_inorganic):
    """
    Update a given dataframe with information on mixture handling
    """
    indices_to_drop = []  # List to keep track of row indices that should be dropped
    
    for index, row in df.iterrows():
        cid = row['PUBCHEM_CID']
        if cid in processed:
            df.loc[index, 'PUBCHEM_EXT_DATASOURCE_SMILES'] = processed[cid]
            if cid in removed:
                df.loc[index, 'Mol removed from mixture'] = removed[cid]
            if cid in small_organic:
                df.loc[index, 'Small organic molecule'] = small_organic[cid]
            if cid in small_inorganic:
                df.loc[index, 'Small inorganic molecule'] = small_inorganic[cid]
        else:
            indices_to_drop.append(index)
            print(f"{cid} has been removed from {name} because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.")
    
    # Drop rows outside the loop and reset index if needed
    new_df = df.drop(indices_to_drop).reset_index(drop=True)
    
    return new_df

processed_hits_df = process_mixture_df('hits_M1_antagonist', organic_hits, processed_hits, removed_hits, small_organic_hits, small_inorganic_hits)
processed_inactives_df = process_mixture_df('inactives_M1_antagonist', organic_inactives, processed_inactives, removed_inactives, small_organic_inactives, small_inorganic_inactives)

23640915 has been removed from hits_M1_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
15945791 has been removed from inactives_M1_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
15944844 has been removed from inactives_M1_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
12004634 has been removed from inactives_M1_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
15945428 has been removed from inactives_M1_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
12005684 has been removed from inactives_M1_antagonist because it is a mixture with less than 5 atoms dif

In [64]:
with open(f'{data_folder}/before_finished/step_8/mixture.txt', 'w') as f:
    f.write(f"""
Hits before processing: {len(organic_hits)}
Hits before processing: {len(organic_hits)}
Hits after processing: {len(processed_hits_df)}
Mixtures detected: {len(removed_hits)}
Mixtures with small inorganic molecules: {len(small_inorganic_hits)}
Mixtures with big organic molecules passing Lipinski: {len(small_organic_hits)}
Mixtures with big organic molecules not passing Lipinski: {len(not_lipinski_hits)}

Inactives before processing: {len(organic_inactives)}
Inactives after processing: {len(processed_inactives_df)}
Mixtures detected: {len(removed_inactives)}
Mixtures with small inorganic molecules: {len(small_inorganic_inactives)}
Mixtures with big organic molecules passing Lipinski: {len(small_organic_inactives)}
Mixtures with big organic molecules not passing Lipinski: {len(not_lipinski_inactives)}
""")

# Save the processed dataframes to csv
processed_hits_df.to_csv(f'{data_folder}/before_finished/step_8/post8_hits.csv', index=False)
processed_inactives_df.to_csv(f'{data_folder}/before_finished/step_8/post8_inactives.csv', index=False)

print('Dataframes saved successfully')

Dataframes saved successfully


# 9. Neutralize & 10. Aromatize molecules

In [65]:
# Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_9_10'):
    os.makedirs(f'{data_folder}/before_finished/step_9_10')

In [66]:
#import:
pre9_hits = pd.read_csv(f'{data_folder}/before_finished/step_8/post8_hits.csv', sep=',', header=0)
pre9_inactives = pd.read_csv(f'{data_folder}/before_finished/step_8/post8_inactives.csv', sep=',', header=0)

In [67]:
def neutralize_atoms(mol):
    """
    Code adapted from https://www.rdkit.org/docs/Cookbook.html. 
    Source: https://baoilleach.blogspot.com/2019/12/no-charge-simple-approach-to.html
    (Noel O’Boyle, 2019)

    This function return a neutralized molecules for a given input Mol object. 
    Additional handling was added for molecules with tetracoordinated boron. 
    """
    pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    at_matches = mol.GetSubstructMatches(pattern)
    at_matches_list = [y[0] for y in at_matches]
    if len(at_matches_list) > 0:
        for at_idx in at_matches_list:
            atom = mol.GetAtomWithIdx(at_idx)
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            
            #Skip adjustment for tetracoordinated boron
            if atom.GetAtomicNum() == 5 and atom.GetDegree() == 4: #ADD COMMENT
                continue  # Just bypass the problematic atom

            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
    return mol

In [68]:
def aromatize_smile(mol):
    """
    This function dekekulize an input Mol object and return the aromatic form of isomeric SMILES. 
    """
    Chem.Kekulize(mol)
    Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_ALL)
    aromatic_smiles = Chem.MolToSmiles(mol, isomericSmiles = True)
    return aromatic_smiles

In [69]:
updated_smi = []

#Update dataset with neutralized, aromatic SMILES
for smi in pre9_hits['PUBCHEM_EXT_DATASOURCE_SMILES']: 
    mol = Chem.MolFromSmiles(smi)
    mol_neu = neutralize_atoms(mol)
    smi_arom = aromatize_smile(mol_neu)
    updated_smi.append(smi_arom)
    
#update the smiles in this df
pre9_hits['PUBCHEM_EXT_DATASOURCE_SMILES'] = updated_smi

In [70]:
updated_smi = []

#Update dataset with neutralized, aromatic SMILES
for smi in pre9_inactives['PUBCHEM_EXT_DATASOURCE_SMILES']: 
    mol = Chem.MolFromSmiles(smi)
    mol_neu = neutralize_atoms(mol)
    smi_arom = aromatize_smile(mol_neu)
    updated_smi.append(smi_arom)

#update the smiles in this df
pre9_inactives['PUBCHEM_EXT_DATASOURCE_SMILES'] = updated_smi

In [71]:
#Save
pre9_hits.to_csv(f'{data_folder}/before_finished/step_9_10/post10_hits.csv', index=False)
pre9_inactives.to_csv(f'{data_folder}/before_finished/step_9_10/post10_inactives.csv', index=False)

# Post 9+10: Update InChI


Is it important to now update InChI in our datasets, for 2 reasons:

(1) Some mixture compounds have been modified (removal of small inorganic or organic molecules) in SMILES representation but not InChIs.

(2) The SMILES representations have been neutralized and aromatized, but not InChIs.

In [72]:
#export the smiles columns to txt
pre9_hits['PUBCHEM_EXT_DATASOURCE_SMILES'].to_csv(f'{data_folder}/before_finished/step_9_10/smiles_hits.txt', index=False, header=False)
pre9_inactives['PUBCHEM_EXT_DATASOURCE_SMILES'].to_csv(f'{data_folder}/before_finished/step_9_10/smiles_inactives.txt', index=False, header=False)

In [73]:
"""
Submit the smiles files to PubChem Identifier Exchange Service: 
    Input IDs: "SMILES"
    Operator type: "same CID" 
    Output IDs: "InChI"
    Output method: "Two column file showing each input output-correspondence"
    Compression: "No compression"
InChI list should be saved into "step_9_10" folder, named as "inchi_hits.txt" and "inchi_inactives" 
"""
#Import the converted InChIs
cleaned_inchi_hits = pd.read_csv(f'{data_folder}/before_finished/step_9_10/inchi_hits.txt', sep='\t', header=None)
cleaned_inchi_inactives = pd.read_csv(f'{data_folder}/before_finished/step_9_10/inchi_inactives.txt', sep='\t', header=None)

#a dictionary of smiles and corresponding inchi in cleaned_inchi_hits
hits_smi_inchi_dict = dict(zip(cleaned_inchi_hits[0], cleaned_inchi_hits[1]))
inactives_smi_inchi_dict = dict(zip(cleaned_inchi_inactives[0], cleaned_inchi_inactives[1]))
                             
#update the pre9_hits by matching the smiles with keys and replace inchi with values:
pre9_hits['InChI'] = pre9_hits['PUBCHEM_EXT_DATASOURCE_SMILES'].map(hits_smi_inchi_dict) 
pre9_inactives['InChI'] = pre9_inactives['PUBCHEM_EXT_DATASOURCE_SMILES'].map(inactives_smi_inchi_dict)


In [74]:
#export: 
pre9_hits.to_csv(f'{data_folder}/before_finished/step_9_10/post10_hits.csv', index=False)
pre9_inactives.to_csv(f'{data_folder}/before_finished/step_9_10/post10_inactives.csv', index=False)

# 11. PAIN filters

## 11.1. Frequency of hits (FoH)

Frequency of Hits is a complex concept that requires a merticulous approach. In general, the rule is if a compound was tested active in multiple assays, it is likely to be a promiscuous compound. 
1. For each compounds, retrieve the information on its tested assays
2. For each of the assay tested, retrieve the sequence of the protein target. 
3. Given all sequence of the protein tested, do a multiple sequence alignment to find the percentage Percent Identity (similarty) between these proteins. If an assay has high percentage to other targets, then these assays contribute less to promiscuousity of the compound. 
4. Use the percentage identity as a weight: 
w = 1 - %SI/100
Calculate the frequency of hits for each compound:
FoH = wACC/TAC
wACC is the weighed total number of assay tested where the compounds were identified acitives. TAC is the total number of assays tested. 

In [4]:
pre11_hits = pd.read_csv(f'{data_folder}/before_finished/step_9_10/post10_hits.csv', sep=',', header=0)
pre11_inactives = pd.read_csv(f'{data_folder}/before_finished/step_9_10/post10_inactives.csv', sep=',', header=0)

In [76]:
# Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_11/11_1'):
    os.makedirs(f'{data_folder}/before_finished/step_11/11_1')

### 11.1.1. PubChem testing information for each compound

This part illustrates how to retrieve the information of how each compound was tested from the PubChem database. Bulk data retrieval from the ftp server is used to get the information of every bioassay in PubChem:

In [77]:
url = 'https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/Extras/bioassays.tsv.gz' #this FTP file records the summary data of all available AIDs in PubChem

local_save_dir = 'H:/coding/HiChem/curation/pubchem_sum'
local_save_path = os.path.join(local_save_dir, 'bioassays.tsv.gz')

if not os.path.exists(local_save_dir):
    os.makedirs(local_save_dir)
r = requests.get(url, stream=True)

with open(local_save_path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)
print('Downloaded to %s' % local_save_path)

Downloaded to H:/coding/HiChem/curation/pubchem_sum\bioassays.tsv.gz


In [5]:
path = 'pubchem_sum/bioassays.tsv.gz'

# Read the TSV file
all_bioassay = pd.read_csv(path, delimiter='\t')

In [6]:
all_bioassay.head()

Unnamed: 0,AID,BioAssay Name,Deposit Date,Modify Date,Source Name,Source ID,Substance Type,Outcome Type,Project Category,BioAssay Group,BioAssay Types,Protein Accessions,UniProts IDs,Gene IDs,Target TaxIDs,Taxonomy IDs,Number of Tested SIDs,Number of Active SIDs,Number of Tested CIDs,Number of Active CIDs
0,1,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,55228,3318,53214,3094
1,3,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,51435,2615,49564,2467
2,5,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,54079,2503,52046,2317
3,7,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,54062,4335,52033,4098
4,9,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,53977,3159,52001,2981


### 11.1.2 Retrieving protein sequences for assays tested:

Then, the testing information for each compound is retrieved from the PugREST API

In [7]:
# Cache to store the number of compounds tested per AID to avoid redundant call. 
num_compounds_tested_cache = {}

def get_num_compounds_tested(aid, all_bioassay=all_bioassay):
    """
    This function retrieves the information of how many compounds were tested in a given assay (by AID).
    """
    if aid in num_compounds_tested_cache:
        return num_compounds_tested_cache[aid]
    else: 
        #return the 'Number of Tested CIDs' column value at the row where the 'AID' column is equal to aid in the all_bioassay dataframe
        num_compounds_tested = all_bioassay[all_bioassay['AID'] == aid]['Number of Tested CIDs'].values[0]
    return num_compounds_tested

def get_assay_data(cid):
    """
    Return a dictionary of all targets that a given compound (by CID) was tested on in PubChem 
    and the activity values of the compound. 
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON" #PUG-REST compound summary by CID
    response = requests.get(url)
    data = response.json()

    target_activity = {}

    if 'Table' in data and 'Row' in data['Table']:
        for row in data['Table']['Row']:
            cells = row['Cell']
            aid = int(cells[0])  # Extracting the AID from the first cell

            # Proceed only if the assay is a screening assay
            if cells[10] == 'Screening':

                # Proceed only if more than 10,000 compounds were tested
                num_compounds_tested = get_num_compounds_tested(aid)
                if num_compounds_tested > 10000:
                    target_gi = cells[5] # Retrieve the protein target's GI
                    activity_outcome = cells[4].lower()

                    if target_gi not in target_activity:
                        target_activity[target_gi] = activity_outcome == 'active'
                    elif activity_outcome == 'active':
                        target_activity[target_gi] = True # If a compound was tested multiple times on the same protein, priotize "active" outcome.
            
            else:
                continue

    return (cid, target_activity)

In [8]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

cids_list = pre11_hits['PUBCHEM_CID'].tolist()

def execute_with_multiprocessing(cids_list):
    """
    For a given list of CIDs, return a dictionary of dictionaries 
    of protein targets these compounds were tested on and the activity outcomes
    Input: 
        [list of CIDs]
    Output: 
        Dictionary of testing information for all CIDs, such as:
        {CID1:{target1:activity1, target3:activity3, ...},{CID2:{target2:activity2, target4:activity4, ...}, ...}}
    """
    results_dict = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Prepare futures for all CIDs
        futures = [executor.submit(get_assay_data, cid) for cid in cids_list]
        
        # Process futures as they complete
        for future in tqdm(as_completed(futures), total=len(cids_list), desc="Processing CIDs"):
            try:
                cid, target_activity = future.result()
                results_dict[cid] = target_activity
            except Exception as e:
                print(f"Error processing CID: {e}")
    return results_dict

results_dict = execute_with_multiprocessing(cids_list)

Processing CIDs:   0%|          | 0/171 [00:00<?, ?it/s]

Processing CIDs: 100%|██████████| 171/171 [00:37<00:00,  4.57it/s]


In [82]:
#export results_dict
with open(f'{data_folder}/before_finished/step_11/11_1/results_dict.json', 'w') as f:
    json.dump(results_dict, f)

In [9]:
#import results_dict:
with open(f'{data_folder}/before_finished/step_11/11_1/results_dict.json', 'r') as f:
    results_dict = json.load(f)

In [10]:
#get the list of all keys of the values in the dictionary:
protein_ids = []
for value in results_dict.values():
    protein_ids.extend(value.keys())

#clean the list
protein_ids = list(set(protein_ids))
protein_ids = [id for id in protein_ids if id != '']

print(protein_ids)
print(f'Require multiple sequencing alignment for {len(protein_ids)} proteins.')

['73747889', '78486550', '21618340', '14149746', '11141885', '4503383', '4505209', '398366139', '164058', '126698238', '4581413', '115347926', '4504343', '59036749', '148378801', '111305821', '13124881', '83779224', '5730106', '48428097', '536029', '1709543', '9937384', '4507793', '156416009', '12830367', '47678551', '42741659', '4503351', '62526033', '31542303', '1927', '67463988', '32425330', '14719829', '23943882', '139424501', '13177715', '55976631', '10092597', '149631', '10835013', '75495260', '27368096', '270133071', '225543099', '56202836', '2358024', '2935630', '118764400', '111034851', '124263658', '90421313', '597517265', '4507615', '8574038', '208342286', '90111653', '156104889', '160877737', '194306653', '341916350', '339641', '7108463', '166209887', '1166512', '74356043', '119579178', '6009644', '166202459', '171229', '27807367', '71746704', '32307126', '216409728', '74752344', '13272532', '5729858', '134244587', '21359873', '4503219', '76364066', '4757840', '1519312078',

Now we retrieve all the FASTA sequences of proteins tested for all of our compounds with Biopython API to Entrez of NCBI. The FASTA sequence is saved as "sequences.fasta"

In [85]:
# Always tell NCBI who you are
Entrez.email = "hdong26@amherst.edu"

# The filename where you want to save the sequences
output_filename = f'{data_folder}/before_finished/step_11/11_1/sequences.fasta'

# Open a file to write the sequences
with open(output_filename, "w") as output_file:
    for id in protein_ids:
        try:
            # Fetch the sequence from NCBI
            handle = Entrez.efetch(db="protein", id=id, rettype="fasta", retmode="text")
            sequence_data = handle.read()
            handle.close()
            
            # Write the sequence data to the file
            output_file.write(sequence_data)
        except Exception as e:
            print(f"An error occurred while fetching {id}: {e}")

From the FASTA sequence, we also need to retrieve the list of protein names, since these are different from the protein GIs

In [11]:
def extract_protein_names(file_path):
    protein_names = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                # Split the line at spaces and take the first item
                parts = line.split(' ')
                protein_name = parts[0]
                # Remove the leading '>' character
                protein_name = protein_name[1:]
                protein_names.append(protein_name)
    return protein_names

In [12]:
file_path = f'{data_folder}/before_finished/step_11/11_1/sequences.fasta'
protein_names = extract_protein_names(file_path)

In [13]:
# Create a dictionary to map protein IDs to protein names by index 
protein_id_to_name = {protein_ids[i]: protein_names[i] for i in range(len(protein_ids))}

### 11.1.3 Percent Sequence Identity by Multiple Sequence Alignment

The sequences.fasta file is submitted to https://www.ebi.ac.uk/jdispatcher/msa/clustalo for multiple sequencing alignment. The resulted table of percent sequence identity matrix is saved and imported for the calculation of FoH

In [15]:
"""
Submit sequences.fasta to https://www.ebi.ac.uk/jdispatcher/msa/clustalo
    Input sequence type: Protein
    Output format: ClustalW with character counts
Download the resulted Percent Identity Matrix file file and save as "percent_identity_matrix.txt"
"""

#import the identity matrix:
protein_si = pd.read_csv(
    f'{data_folder}/before_finished/step_11/11_1/percent_identity_matrix.txt',
    delimiter='\s+',
    header=None,
    skiprows=6 
)

In [16]:
#remove the first column:
protein_si = protein_si.drop(protein_si.columns[0], axis=1)

In [17]:
name = protein_si[1].tolist()
name = ['protein name'] + name
protein_si.columns = name

In [18]:
protein_si

Unnamed: 0,protein name,CAI16307.1,NP_001903.1,NP_987096.1,AAH19268.2,NP_848757.5,NP_598230.2,NP_000876.3,sp|Q9HBX9.2|RXFP1_HUMAN,AAD14062.3,...,NP_003812.1,CAA96025.1,NP_005021.2,CAC29064.1,NP_112168.1,sp|P31749.2|AKT1_HUMAN,CAD53427.1,sp|P53779.2|MK10_HUMAN,NP_063937.2,NP_002084.2
0,CAI16307.1,100.00,92.44,14.12,13.92,12.87,15.79,12.90,13.22,16.67,...,12.12,9.33,10.34,13.19,13.19,12.05,7.69,9.20,6.98,6.98
1,NP_001903.1,92.44,100.00,14.29,10.23,9.46,13.53,12.78,11.81,7.14,...,14.06,8.99,10.00,13.51,13.51,12.24,7.69,8.33,7.07,6.06
2,NP_987096.1,14.12,14.29,100.00,13.04,14.34,15.59,16.75,19.48,22.22,...,12.85,10.13,12.42,10.62,10.62,10.61,11.11,14.01,11.11,10.42
3,AAH19268.2,13.92,10.23,13.04,100.00,14.89,10.87,13.04,17.36,13.33,...,7.59,14.93,7.35,14.52,14.52,5.36,13.89,9.21,10.96,12.33
4,NP_848757.5,12.87,9.46,14.34,14.89,100.00,13.33,16.36,20.07,13.33,...,8.43,10.14,12.08,14.47,14.47,11.94,12.40,11.85,12.95,16.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,sp|P31749.2|AKT1_HUMAN,12.05,12.24,10.61,5.36,11.94,5.13,12.31,12.90,0.00,...,22.40,13.70,26.02,29.95,29.95,100.00,9.73,21.55,22.92,22.84
417,CAD53427.1,7.69,7.69,11.11,13.89,12.40,10.31,9.47,10.67,28.57,...,10.49,18.00,8.94,8.57,8.57,9.73,100.00,13.98,28.21,16.67
418,sp|P53779.2|MK10_HUMAN,9.20,8.33,14.01,9.21,11.85,9.52,10.06,8.65,18.75,...,17.82,15.61,22.41,25.67,25.67,21.55,13.98,100.00,26.52,25.25
419,NP_063937.2,6.98,7.07,11.11,10.96,12.95,11.97,10.73,11.48,6.25,...,19.50,14.01,23.74,21.51,21.51,22.92,28.21,26.52,100.00,76.85


### 11.1.4 Calculation of FoHs:

Until now, we have a dictionary of (cid: assays tested); (assay_tested:protein name), and percentage identity matrix with first columns as protein names. 
For each compound, we retrieve the list of all protein names tested on that compounds by matching between the two first dictionary. From this list, we retrieve the corresponding matrix of percentages identitiy of these proteins corresponding to these compounds and calculate the FoH

In [19]:
protein_si_dict = {}
for name in protein_si['protein name']: 
    for other_name in protein_si['protein name']: 
        if other_name != name: 
            protein_si_dict[(name, other_name)] = protein_si.loc[protein_si['protein name'] == name, other_name].values[0]

In [20]:
import tqdm

foh_dict = {}

for cid, targets in tqdm.tqdm(results_dict.items()):
    active_weight_list = []
    total_weight_list = []

    for target_id, result in targets.items():
        if target_id == '':
            continue

        protein_name = protein_id_to_name[target_id]
        max_weight = 0

        for other_id, other_result in targets.items():
            if other_id != target_id and other_id != '':
                other_protein_name = protein_id_to_name[other_id]
                value = protein_si_dict[(protein_name, other_protein_name)]
                max_weight = max(max_weight, value)

        target_weight = 1 - max_weight / 100

        if result:
            active_weight_list.append(target_weight)
        total_weight_list.append(target_weight)

    if total_weight_list:
        foh_score = sum(active_weight_list) / sum(total_weight_list)
        foh_dict[cid] = foh_score
    else: 
        foh_dict[cid] = 0


100%|██████████| 171/171 [00:11<00:00, 14.76it/s]


In [21]:
#export foh_dict
with open(f'{data_folder}/before_finished/step_11/11_1/foh_dict.json', 'w') as f:
    json.dump(foh_dict, f)

For compounds with FoH larger than 0.26, we remove them

In [22]:
to_drop = []
for cid, foh_score in foh_dict.items():
    if foh_score > 0.26: 
        to_drop.append(cid)

post_FoH_hits = pre11_hits[~pre11_hits['PUBCHEM_CID'].isin(to_drop)]
print(f'Dropped {(len(to_drop))} compounds with FoH larger than 0.26')

Dropped 0 compounds with FoH larger than 0.26


In [23]:
# Save post_FoH_hits: 
post_FoH_hits.to_csv(f'{data_folder}/before_finished/step_11/11_1/post_FoH_hits.csv', index=False)

In [24]:
pre11_inactives.to_csv(f'{data_folder}/before_finished/step_11/11_1/post_FoH_inactives.csv', index=False)

## 11.2 Autofluoresence & Luceferase inhibition

When finding false positive due to autofluorescence and luceferase inhibition, it is important to check if the particular assays use one of these technologies. Here, all three assays (AID626, AID1488, and AID1741) use fluorescence technologies, so it is optimal to remove compounds that are active in AIDs: 587, 588, 590, 591, 592, 593, 594

In [25]:
autofluorescence_aids = ['587', '588', '590', '591', '592', '593', '594']
autofluorescence_cids = []
col_list = ['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME']

count = 0
for AID in autofluorescence_aids:
    url = f'https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?query=download&record_type=datatable&actvty=all&response_type=save&aid={AID}'
    autofluorescence_df = pd.read_csv(url, usecols=col_list)
    #delete rows with nan values
    autofluorescence_df = autofluorescence_df.dropna(subset=['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME'])

    #convert cids to int: 
    autofluorescence_df['PUBCHEM_CID'] = autofluorescence_df['PUBCHEM_CID'].astype(int)

    #keep only rows said "Active"
    autofluorescence_df = autofluorescence_df[autofluorescence_df['PUBCHEM_ACTIVITY_OUTCOME'] == 'Active']
    autofluorescence_cids.extend(autofluorescence_df['PUBCHEM_CID'].tolist())


In [26]:
#drop duplicates in the list:
autofluorescence_cids = [item for item in set(autofluorescence_cids)]

In [27]:
to_drop_hits = []
for cid in post_FoH_hits: 
    if cid in autofluorescence_cids:
        to_drop.append(cid)
post_autofluorescence = post_FoH_hits[~post_FoH_hits['PUBCHEM_CID'].isin(to_drop_hits)]
print(f'Dropped {(len(to_drop_hits))} autofluorescence compounds')

Dropped 0 autofluorescence compounds


In [28]:
post_autofluorescence

Unnamed: 0,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,InChI,Mol removed from mixture,Small inorganic molecule,Small organic molecule
0,17367817,COc1ccc(CNCc2cccc3ccccc23)cc1,Active,InChI=1S/C19H19NO/c1-21-18-11-9-15(10-12-18)13...,C(=O)(C(=O)O)O,C(=O)(C(=O)O)O,
1,776996,COc1ccc(CNCc2cccc(Oc3ccccc3)c2)cc1,Active,InChI=1S/C21H21NO2/c1-23-19-12-10-17(11-13-19)...,,,
2,656332,Cc1cc(C)nc(SCc2nnc(SCC(=O)OC3CCCCC3)o2)n1,Active,InChI=1S/C17H22N4O3S2/c1-11-8-12(2)19-16(18-11...,,,
3,6602913,COc1ccc(Nc2ccnc3ccccc23)cc1,Active,InChI=1S/C16H14N2O/c1-19-13-8-6-12(7-9-13)18-1...,Cl,Cl,
4,24761107,COc1ccc(CNCc2ccc3c(c2)OCO3)cc1Br,Active,InChI=1S/C16H16BrNO3/c1-19-14-4-2-11(6-13(14)1...,C(=O)(C(=O)O)O,C(=O)(C(=O)O)O,
...,...,...,...,...,...,...,...
166,2813491,O=S(=O)(Nc1ccccc1N1CCOCC1)c1ccc(F)c(Cl)c1,Active,InChI=1S/C16H16ClFN2O3S/c17-13-11-12(5-6-14(13...,,,
167,6485989,COc1cccc(CNc2ccc3[nH]c(=O)[nH]c3c2)c1OCc1ccccc1F,Active,InChI=1S/C22H20FN3O3/c1-28-20-8-4-6-14(21(20)2...,,,
168,9551932,CNCCC1Sc2ccccc2Oc2ccccc21,Active,InChI=1S/C16H17NOS/c1-17-11-10-15-12-6-2-3-7-1...,C(=O)(C(=O)O)O,C(=O)(C(=O)O)O,
169,666539,COc1ccc(Oc2c(-c3ccc(O)cc3O)n[nH]c2C)cc1,Active,InChI=1S/C17H16N2O4/c1-10-17(23-13-6-4-12(22-2...,,,


In [29]:
if not os.path.exists(f'{data_folder}/before_finished/step_11/11_2'):
    os.makedirs(f'{data_folder}/before_finished/step_11/11_2')

In [30]:
#save: 
post_autofluorescence.to_csv(f'{data_folder}/before_finished/step_11/11_2/post_autofluorescence_hits.csv', index=False)
pre11_inactives.to_csv(f'{data_folder}/before_finished/step_11/11_2/post_autofluorescence_inactives.csv', index=False)

## 11.3 RDKit PAIN filter

In [31]:
params = FilterCatalogParams()
params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
catalog = FilterCatalog(params)

In [32]:
def detect_pains(df):
    pains = []
    count_pains = 0
    count_not_pains = 0
    smiles_column = 'PUBCHEM_EXT_DATASOURCE_SMILES'
    cids_column = 'PUBCHEM_CID'
    for i in df.index:
        smile = str(df.loc[i, smiles_column])
        cid = df.loc[i, cids_column]
        mol = Chem.MolFromSmiles(smile)
        if mol is not None:
            if catalog.HasMatch(mol):
                pains.append(cid)
                count_pains += 1
                print(f'{count_pains} pains detected')
            else: 
                count_not_pains += 1
    return pains

pains = detect_pains(post_autofluorescence)

1 pains detected
2 pains detected
3 pains detected
4 pains detected
5 pains detected
6 pains detected
7 pains detected
8 pains detected
9 pains detected
10 pains detected


In [33]:
post_pains_hits = post_autofluorescence[~post_autofluorescence['PUBCHEM_CID'].isin(pains)]

In [34]:
if not os.path.exists(f'{data_folder}/before_finished/step_11/11_3'):
    os.makedirs(f'{data_folder}/before_finished/step_11/11_3')

post_pains_hits.to_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_hits.csv', index=False)
pre11_inactives.to_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_inactives.csv', index=False)

# 12. Drug-likeness filter: 

In [35]:
pre12_hits = pd.read_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_hits.csv', sep=',', header=0)
pre12_inactives = pd.read_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_inactives.csv', sep=',', header=0)

In [36]:
from tqdm import tqdm

def drug_likeness_filter(smiles):
    """
    This functions check if a given smiles satisfies the common standard conditions for drug-likeness. 
    Input:
        SMILES (str)
    Output: 
        Result (bool)
    """

    # Convert SMILES string to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False  # Return False if the molecule cannot be parsed
    
    # Check molecular weight
    mw = Chem.rdMolDescriptors.CalcExactMolWt(mol)
    if not (150 < mw < 800):
        return False
    
    # Check AlogP
    logp = Chem.Crippen.MolLogP(mol)
    if not (-0.3 < logp < 5):
        return False
    
    # Check number of rotatable bonds
    rotatable_bonds = Lipinski.NumRotatableBonds(mol)
    if rotatable_bonds >= 15:
        return False
    
    # Check H-bond acceptor count and H-bond donor count
    hba = Lipinski.NumHAcceptors(mol)
    hbd = Lipinski.NumHDonors(mol)
    if hba >= 15 or hbd >= 15:
        return False
    
    # Check total formal charge
    total_charge = sum(atom.GetFormalCharge() for atom in mol.GetAtoms())
    if not (-2 < total_charge < 2):
        return False
    
    # If all filters passed, return True
    return True

def drug_likeness_filter_multiprocessing(df):
    """
    This function update a given dataframe by dropping molecules that don't pass the drug-likeness filter.
    """
    to_drop = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Create future tasks for each SMILES string in the dataframe
        futures = {executor.submit(drug_likeness_filter, row['PUBCHEM_EXT_DATASOURCE_SMILES']): row['PUBCHEM_CID'] for index, row in df.iterrows()}
        
        # Use tqdm to display progress bar
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing SMILES"):
            cid = futures[future]
            if not future.result():
                to_drop.append(cid)
    return to_drop

In [37]:
not_drug_hits = drug_likeness_filter_multiprocessing(pre12_hits)

Processing SMILES: 100%|██████████| 161/161 [00:00<00:00, 952.32it/s]


In [38]:
not_drug_inactives = drug_likeness_filter_multiprocessing(pre12_inactives)

Processing SMILES: 100%|██████████| 302812/302812 [00:02<00:00, 149675.58it/s]


In [39]:
post12_hits = pre12_hits[~pre12_hits['PUBCHEM_CID'].isin(not_drug_hits)]
post12_inactives = pre12_inactives[~pre12_inactives['PUBCHEM_CID'].isin(not_drug_inactives)]

print(f'Dropped {(len(not_drug_hits))} hit compounds that do not pass the drug likeness filter')
print(f'Dropped {(len(not_drug_inactives))} inactive compounds that do not pass the drug likeness filter')


Dropped 6 hit compounds that do not pass the drug likeness filter
Dropped 14549 inactive compounds that do not pass the drug likeness filter


In [40]:
if not os.path.exists(f'{data_folder}/before_finished/step_12'):
    os.makedirs(f'{data_folder}/before_finished/step_12')

#Export not_drug_hits and inactives:
with open(f'{data_folder}/before_finished/step_12/not_drug_hits.json', 'w') as f:
    json.dump(not_drug_hits, f)
with open(f'{data_folder}/before_finished/step_12/not_drug_inactives.json', 'w') as f:
    json.dump(not_drug_inactives, f)

In [41]:
# save: 
post12_hits.to_csv(f'{data_folder}/before_finished/step_12/post12_hits.csv', index=False)
post12_inactives.to_csv(f'{data_folder}/before_finished/step_12/post12_inactives.csv', index=False)

# 13. ChemBL Curation Pipeline

In [3]:
pre13_hits = pd.read_csv(f'{data_folder}/before_finished/step_12/post12_hits.csv', sep=',', header=0)
pre13_inactives = pd.read_csv(f'{data_folder}/before_finished/step_12/post12_inactives.csv', sep=',', header=0)

In [4]:
def checker_score(smiles, cid):
    result = checker.check_molblock(Chem.MolToMolBlock(Chem.MolFromSmiles(smiles)))
    if result == ():
        penalty_score = 0
    else:
        penalty_score = result[0][0]
    return cid, penalty_score

def checker_multiprocessing(df):
    chembl_score_dict = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map futures to CIDs directly for easier reference
        futures = {executor.submit(checker_score, row['PUBCHEM_EXT_DATASOURCE_SMILES'], row['PUBCHEM_CID']): row['PUBCHEM_CID'] for _, row in df.iterrows()}
        # Properly use tqdm to create a progress bar
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing SMILES"):
            cid, penalty_score = future.result()
            chembl_score_dict[cid] = penalty_score
    return chembl_score_dict

In [5]:
score_hits = checker_multiprocessing(pre13_hits)
score_inactives = checker_multiprocessing(pre13_inactives)

Processing SMILES: 100%|██████████| 155/155 [00:00<?, ?it/s]
[21:29:21] Conflicting single bond directions around double bond at index 4.
[21:29:21]   BondStereo set to STEREONONE and single bond directions set to NONE.
[21:29:36] Conflicting single bond directions around double bond at index 7.
[21:29:36]   BondStereo set to STEREONONE and single bond directions set to NONE.
[21:33:25] Conflicting single bond directions around double bond at index 4.
[21:33:25]   BondStereo set to STEREONONE and single bond directions set to NONE.
[21:33:28] Conflicting single bond directions around double bond at index 4.
[21:33:28]   BondStereo set to STEREONONE and single bond directions set to NONE.
[21:38:03] Conflicting single bond directions around double bond at index 5.
[21:38:03]   BondStereo set to STEREONONE and single bond directions set to NONE.
[21:38:05] Conflicting single bond directions around double bond at index 4.
[21:38:05]   BondStereo set to STEREONONE and single bond direction

In [7]:
#print all unique values in the dictionary:
print(set(score_hits.values()))
print(set(score_inactives.values()))

{0, 2}
{0, 2, 5, 6}


In [8]:
# Create a new step folder
if not os.path.exists(f'{data_folder}/before_finished/step_13'):
    os.makedirs(f'{data_folder}/before_finished/step_13')

# save the scores:
with open(f'{data_folder}/before_finished/step_13/score_hits.json', 'w') as f:
    json.dump(score_hits, f)
with open(f'{data_folder}/before_finished/step_13/score_inactives.json', 'w') as f:
    json.dump(score_inactives, f)

In [9]:
#drop all compounds with a penalty score of 7:
to_drop_hits = []
to_drop_inactives = []
for cid, penalty_score in score_hits.items():
    if penalty_score == 7:
        to_drop_hits.append(cid)
for cid, penalty_score in score_inactives.items():
    if penalty_score == 7:
        to_drop_inactives.append(cid)

post13_hits = pre13_hits[~pre13_hits['PUBCHEM_CID'].isin(to_drop_hits)]
post13_inactives = pre13_inactives[~pre13_inactives['PUBCHEM_CID'].isin(to_drop_inactives)]

In [10]:
#save final_hits and inactives:
post13_hits.to_csv(f'{data_folder}/before_finished/step_13/post13_hits.csv', index=False)
post13_inactives.to_csv(f'{data_folder}/before_finished/step_13/post13_inactives.csv', index=False)

# 14. Final handling of chemical representation

There are two problems that requires InChI update: 

(1) Some of the InChI will be missing, since the PubChem Identifier Exchange service might not able to find the corresponding InChI for the aromatized, neutralized SMILES. 

(2) While handling mixtures, some mixtures whose component molecules are identical will result in duplicates. Therefore, we need to check their activities. 
- If all duplicates share the same results (active/inactive), we keep one of them. 
- If duplicates of the same molecules returned different activity, we remove both of them. 

In [11]:
#import: 
pre14_hits = pd.read_csv(f'{data_folder}/before_finished/step_13/post13_hits.csv', sep=',', header=0)
pre14_inactives = pd.read_csv(f'{data_folder}/before_finished/step_13/post13_inactives.csv', sep=',', header=0)

## 14.1 Update InChI

In [12]:
def smi_to_inchi(smi):
    mol = Chem.MolFromSmiles(smi)
    inchi = Chem.inchi.MolToInchi(mol)
    return inchi

In [13]:
count = 0 
for index, row in pre14_hits.iterrows():
    if row['InChI'] != row['InChI']:
        pre14_hits.at[index, 'InChI'] = smi_to_inchi(row['PUBCHEM_EXT_DATASOURCE_SMILES'])
        count += 1
print(f'Updated {count} InChI values in pre14_hits')

count = 0
for index, row in pre14_inactives.iterrows():
    if row['InChI'] != row['InChI']:
        pre14_inactives.at[index, 'InChI'] = smi_to_inchi(row['PUBCHEM_EXT_DATASOURCE_SMILES'])
        count += 1
print(f'Updated {count} InChI values in pre14_inactives')

Updated 0 InChI values in pre14_hits







































Updated 70 InChI values in pre14_inactives


## 14.2 Handle duplicates

In [14]:
#Check if a mol in hit set appeared in inactive set:
for i in pre14_hits['PUBCHEM_EXT_DATASOURCE_SMILES']:
    if i in list(pre14_inactives['PUBCHEM_EXT_DATASOURCE_SMILES']):
        print(f'{i} SMILES appeared in both hit and inactive sets')
for i in pre14_hits['InChI']:
    if i in list(pre14_inactives['InChI']):
        print(f'{i} InChI appeared in both hit and inactive sets')

#Return all duplicates by comparing InChI:
final_hits_duplicates_InChI = pre14_hits[pre14_hits.duplicated(subset=['InChI'], keep=False)]
final_inactives_duplicates_InChI = pre14_inactives[pre14_inactives.duplicated(subset=['InChI'], keep=False)]
final_hits_duplicates_smi = pre14_hits[pre14_hits.duplicated(subset=['PUBCHEM_EXT_DATASOURCE_SMILES'], keep=False)]
final_inactives_duplicates_smi = pre14_inactives[pre14_inactives.duplicated(subset=['PUBCHEM_EXT_DATASOURCE_SMILES'], keep=False)]

print('Number of InChI duplicates in hits: ', len(final_hits_duplicates_InChI))
print('Number of InChI duplicates in inactives: ', len(final_inactives_duplicates_InChI))
print('Number of SMILES duplicates in hits: ', len(final_hits_duplicates_smi))
print('Number of SMILES duplicates in inactives: ', len(final_inactives_duplicates_smi))

Number of InChI duplicates in hits:  0
Number of InChI duplicates in inactives:  279
Number of SMILES duplicates in hits:  0
Number of SMILES duplicates in inactives:  277


In [15]:
if not os.path.exists(f'{data_folder}/before_finished/step_14'):
    os.makedirs(f'{data_folder}/before_finished/step_14')

#write all the duplicates to a file:
#write duplicates to a txt file: 
with open(f'{data_folder}/before_finished/step_14/duplicates.txt', 'w') as f:
    f.write('InChI duplicates in hits: \n')
    f.write(final_hits_duplicates_InChI.to_string())
    f.write('\n\n')
    f.write('InChI duplicates in inactives: \n')
    f.write(final_inactives_duplicates_InChI.to_string())
    f.write('\n\n')
    f.write('SMILES duplicates in hits: \n')
    f.write(final_hits_duplicates_smi.to_string())
    f.write('\n\n')
    f.write('SMILES duplicates in inactives: \n')
    f.write(final_inactives_duplicates_smi.to_string())

In [16]:
#remove these duplicates, keep the first one: 
#by inchi:
final_hits = pre14_hits.drop_duplicates(subset=['InChI'], keep='first')
final_inactives = pre14_inactives.drop_duplicates(subset=['InChI'], keep='first')

In [17]:
#as expected, there might still be SMILES duplicates:
if len(final_hits[final_hits.duplicated(subset=['PUBCHEM_EXT_DATASOURCE_SMILES'], keep=False)]) == 0:
    print('No more duplicates in hits')

if len(final_inactives[final_inactives.duplicated(subset=['PUBCHEM_EXT_DATASOURCE_SMILES'], keep=False)]) == 0:
    print('No more duplicates in inactives')

No more duplicates in hits
No more duplicates in inactives


In [18]:
# save: 
if not os.path.exists(f'{data_folder}/finished'):
    os.makedirs(f'{data_folder}/finished')
final_hits.to_csv(f'{data_folder}/finished/final_hits.csv',sep=',', index=False)
final_inactives.to_csv(f'{data_folder}/finished/final_inactives.csv',sep=',', index=False)

# 0.1. Adjust columns: 

In [22]:
final_hits = pd.read_csv(f'{data_folder}/finished/final_hits.csv', sep=',', header=0)
final_inactives = pd.read_csv(f'{data_folder}/finished/final_inactives.csv', sep=',', header=0)

In [23]:
# Add another column: "activity_value" with all empty NaN values: 
final_hits.loc[:, 'activity_value'] = np.nan
final_inactives.loc[:, 'activity_value'] = np.nan

In [24]:
# Rename the columns:
final_hits = final_hits.rename(columns={
    'PUBCHEM_CID': 'CID', 
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
    'Mol removed from mixture': 'mol_removed_from_mixture',
    'Small inorganic molecule': 'small_inorganic_mol_from_mixture',
    'Small organic molecule': 'small_organic_mol_from_mixture'
})
final_inactives = final_inactives.rename(columns={
    'PUBCHEM_CID': 'CID', 
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
    'Mol removed from mixture': 'mol_removed_from_mixture',
    'Small inorganic molecule': 'small_inorganic_mol_from_mixture',
    'Small organic molecule': 'small_organic_mol_from_mixture'
})

In [25]:
#swap the positions of the columns InChI and activity_outcome: 
final_hits = final_hits[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]
final_inactives = final_inactives[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]

In [27]:
#export: 
final_hits.to_csv(f'{data_folder}/finished/final_hits.csv', sep=',', index=False)
final_inactives.to_csv(f'{data_folder}/finished/final_inactives.csv', sep=',', index=False)

# 0.2. Exporting raw data without further curation (only smiles, inchi) for control experiments

In [30]:
for AID in AIDs: 
    exec(f"raw{AID} = pd.read_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv', sep=',', header=0)")

#import inchi:
for AID in AIDs: 
    exec(f"std_inchi{AID} = pd.read_csv(f'{data_folder}/before_finished/step_3/std_inchi_{AID}.txt', sep='\t', header=None)")

#Update inchi
for AID in AIDs: 
    exec(f"""
raw_inchi_dict{AID} = dict(zip(std_inchi{AID}[0], std_inchi{AID}[1]))
raw{AID}['InChI'] = raw{AID}['PUBCHEM_CID'].map(raw_inchi_dict{AID})
""")

In [31]:
raw_hits = raw1672[raw1672['PUBCHEM_ACTIVITY_OUTCOME'] == 'Active']
raw_inactives = raw1672[raw1672['PUBCHEM_ACTIVITY_OUTCOME'] == 'Inactive']

In [33]:
# Add some other columns to match the format of the curated data: 
raw_hits.loc[:, 'activity_value'] = np.nan
raw_hits.loc[:, 'mol_removed_from_mixture'] = np.nan
raw_hits.loc[:, 'small_inorganic_mol_from_mixture'] = np.nan
raw_hits.loc[:, 'small_organic_mol_from_mixture'] = np.nan

raw_inactives.loc[:, 'activity_value'] = np.nan
raw_inactives.loc[:, 'mol_removed_from_mixture'] = np.nan
raw_inactives.loc[:, 'small_inorganic_mol_from_mixture'] = np.nan
raw_inactives.loc[:, 'small_organic_mol_from_mixture'] = np.nan

In [34]:
# Rename the columns:
raw_hits = raw_hits.rename(columns={
    'PUBCHEM_CID': 'CID', 
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
})
raw_inactives = raw_inactives.rename(columns={
    'PUBCHEM_CID': 'CID', 
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
})

#swap the positions of the columns InChI and activity_outcome: 
raw_hits = raw_hits[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]
raw_inactives = raw_inactives[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]

In [36]:
if not os.path.exists(f'{data_folder}/finished/control_data'):
    os.makedirs(f'{data_folder}/finished/control_data')

#save the hits and inactives
raw_hits.to_csv(f'{data_folder}/finished/control_data/raw_hits.csv', sep=',', index=False)
raw_inactives.to_csv(f'{data_folder}/finished/control_data/raw_inactives.csv', sep=',', index=False)

#save as txt: 
raw_hits.to_csv(f'{data_folder}/finished/control_data/raw_hits.txt', sep=';', index=False, header=False)
raw_inactives.to_csv(f'{data_folder}/finished/control_data/raw_inactives.txt', sep=';', index=False, header=False)

# 0.3. Convert the files to txt. for CORINA Classic

In [37]:
final_hits = pd.read_csv(f'{data_folder}/finished/final_hits.csv', sep=',', header=0)
final_inactives = pd.read_csv(f'{data_folder}/finished/final_inactives.csv', sep=',', header=0)

# export to .txt files:
final_hits.to_csv(f'{data_folder}/finished/final_hits.txt', sep=';', index=False, header=False)
final_inactives.to_csv(f'{data_folder}/finished/final_inactives.txt', sep=';', index=False, header=False)