# 0. Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import requests
import os
import json
import tqdm

from rdkit import Chem
from tqdm import tqdm
from thermo import functional_groups
from Bio import Entrez
from chembl_structure_pipeline import checker
from rdkit.Chem import rdMolDescriptors, Descriptors, Lipinski, Crippen, inchi
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from concurrent.futures import ThreadPoolExecutor, as_completed

import utils
import filters
from data_gathering import download_and_save

[15:15:21] Initializing Normalizer


In [2]:
# Save the path to your curation folder to "curation_path"
dataset_name = 'AID435008'
curation_path = 'S:/coding/WelQrate/'
data_folder = f'{curation_path}/data/{dataset_name}'

# Columns to be extracted from the assays:
# Modify if your datasets have different format
smi_col = 'PUBCHEM_EXT_DATASOURCE_SMILES' # Column containing SMILES
cid_col = 'PUBCHEM_CID' # Column containing identifiers (e.g, CIDs)
activity_col = 'PUBCHEM_ACTIVITY_OUTCOME' # Column containing activity outcomes
col_list = [cid_col, smi_col, activity_col]

# 1. Data Gathering

Before importing data, need to identify which AIDs will be included. 

Data will be imported from https://pubchem.ncbi.nlm.nih.gov/assay/. For more information on PubChem's programmatic access, refer to: https://pubchem.ncbi.nlm.nih.gov/docs/bioassays. Some other programmatic access options available such as PUG-REST. However, these might not be optimal for bulk retrieval or handling of large dataset due to the limitation of request volume.

Data for individual assays include 7 required columns (CIDs, isomeric SMILES, etc.) and optional test results. Refer to https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/README for further details. For datasets intended for regression model, additional columns could be extracted accordingly.

In [3]:
# Desired AIDs:
AIDs = [485270, 434989, 463079, 492964, 493232, 492965, 492963, 504701, 504699]

In [4]:
#Keep unique values in list AIDs (since there could be overlapping AIDs from different targets or project)
AIDs = list(set(AIDs))
AIDs = [str(AID) for AID in AIDs]
print('Number of datasets retrieving: ', len(AIDs))

Number of datasets retrieving:  9


In [5]:
download_and_save(AIDs, data_folder, col_list, smi_col, cid_col, activity_col)

Completed 1 out of 9 datasets.
Completed 2 out of 9 datasets.
Completed 3 out of 9 datasets.
Completed 4 out of 9 datasets.
Completed 5 out of 9 datasets.
Completed 6 out of 9 datasets.
Completed 7 out of 9 datasets.
Completed 8 out of 9 datasets.
Completed 9 out of 9 datasets.


# 2. Isomeric SMILES

For the purpose of our project, we would like to include isomeric form of SMILES representation in our final dataset. Although PubChem claimed that their datatable should include isomeric SMILES (https://pubchem.ncbi.nlm.nih.gov/docs/bioassays), some dataset might include non-isomeric SMILES. This step is to import isomeric SMILES based on CIDs.

Several packages such as RDkit have modules to return isomeric SMILES from a given input SMILES. However, for consistency, we decided to use the PubChem Identifier Exchange Service, which take an input identifier (CIDs, SMILES, InChI, etc.)  and return the corresponding identifier (CIDs, isomeric SMILES, InChIs, etc.). Here, we export the list of CIDs for compounds in our dataset and use this server to retrieve their isomeric SMILES. For more information, refer to: https://pubchem.ncbi.nlm.nih.gov/docs/identifier-exchange-service

In [None]:
#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_2'):
    os.makedirs(f'{data_folder}/before_finished/step_2')

#Export list of CIDs to csv with one column (without the column name):
for AID in AIDs:
    assay = pd.read_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv')
    cids = assay['PUBCHEM_CID'].astype(int)  # Ensure the CIDs are integers
    cids.to_csv(f'{data_folder}/before_finished/step_2/CID{AID}.csv', index=False, header=False)

#After this step, we submit the list at https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi with operator type "same CID" and Output IDs "SMILES" (isomeric SMILES by default)
#https://pubchem.ncbi.nlm.nih.gov/docs/identifier-exchange-service for more details
#Here I named the output file as "SMILES{AID}.txt"

In [None]:
def check_isomeric_smiles(AIDs):
    """
    Check if the SMILES in the assay are the same as the isomeric forms returned by pubchem idexchange.
    Input: AIDs (list of strings)
    Output: non_isomeric_smi_cids (dictionary with AID as key and list of CIDs as values for the datasets in AIDs
    """
    non_isomeric_smi_cids = {}
    for AID in AIDs:    
        non_isomeric_smi_cids[AID] = []
        #import SMILES.txt file as a table:
        correct_isomeric_smiles = pd.read_csv(f'{data_folder}/before_finished/step_2/isomeric_smi_{AID}.txt', sep='\t', header=None)
        assay = pd.read_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv')

        #compare smiles in assay with smiles in correct_smiles:
        for cid in assay['PUBCHEM_CID']:
            if assay.loc[assay['PUBCHEM_CID'] == cid, 'PUBCHEM_EXT_DATASOURCE_SMILES'].values[0] != correct_isomeric_smiles.loc[correct_isomeric_smiles[0] == cid, 1].values[0]:
                non_isomeric_smi_cids[AID].append(cid)

        if len(non_isomeric_smi_cids[AID]) == 0:
            print(f'All SMILES in AID {AID} are isomeric')
        else:
            print(f'There are some potential non-isomeric SMILES in AID {AID}:')
            print(non_isomeric_smi_cids[AID])

    return non_isomeric_smi_cids

def update_isomeric(AIDs, non_isomeric_smi_cids):
    """
    Update the SMILES in the assay to isomeric SMILES.
    Input: AIDs (list of strings), non_isomeric_smi_cids (dictionary with AID as key and list of non-isomeric CIDs as values)
    """
    with open(f'{data_folder}/before_finished/step_2/non_isomeric_smi_cids.txt', 'w') as f:
        # record the non-isomeric SMILES 
        for AID in AIDs:
            assay = pd.read_csv(f'{data_folder}/before_finished/step_1/AID{AID}.csv')
            correct_isomeric_smiles = pd.read_csv(f'{data_folder}/before_finished/step_2/isomeric_smi_{AID}.txt', sep='\t', header=None)
            f.write(f'AID {AID}: {non_isomeric_smi_cids[AID]}\n')

            for cid in non_isomeric_smi_cids[AID]:
                f.write(f'CID {cid}: {assay.loc[assay["PUBCHEM_CID"] == cid, "PUBCHEM_EXT_DATASOURCE_SMILES"].values[0]} -> {correct_isomeric_smiles.loc[correct_isomeric_smiles[0] == cid, 1].values[0]}\n')
                assay.loc[assay['PUBCHEM_CID'] == cid, 'PUBCHEM_EXT_DATASOURCE_SMILES'] = correct_isomeric_smiles.loc[correct_isomeric_smiles[0] == cid, 1].values[0]

            f.write(f'===\n')
            assay.to_csv(f'{data_folder}/before_finished/step_2/AID{AID}.csv', index=False)       

In [None]:
non_isomeric_smi_cids = check_isomeric_smiles(AIDs)

All SMILES in AID 1488 are isomeric
There are some non-isomeric SMILES in AID 626:
[2997662, 2997957, 2999888]
All SMILES in AID 1741 are isomeric


Note: Here they returned that three smiles in AID626 were not isomeric. Again, this shows that the SMILES representation of some compounds in the given datasets might not be isomeric.

In [None]:
update_isomeric(AIDs, non_isomeric_smi_cids)

# 3. Import InChI 

We would like to include standard InChI to diversify users' choice of which data they would like to use for their own benchmark.

In [None]:
# Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_3'):
    os.makedirs(f'{data_folder}/before_finished/step_3')

Again, it is convenient to use the PubChem Identifier Exchange Service (https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi) with operator type "same CID" and Output IDs "InChI" to retrieve InChI from a given list of input CIDs. The same CID lists from STEP 2 could be used here. The resulted InChIs could be checked if being standard by indentifying the presence of 'InChI=1S' at the begining of each InChI string.

In [None]:
"""
CID lists (in "step_2" folder should be submitted to PubChem Identifier Exchange Service)
    Operator type: "same CID" 
    Output IDs "InChI"
    Output method: "Two column file showing each input output-correspondence"
    Compression: "No compression"
InChI list should be saved into "step_3" folder, named as "std_inchi_{AID}.txt" 
"""
# Import dataframes:
for AID in AIDs: 
    exec(f'AID{AID} = pd.read_csv("{data_folder}/before_finished/step_2/AID{AID}.csv")')
    exec(f'AID{AID}_InChI = pd.read_csv("{data_folder}/before_finished/step_3/std_inchi_{AID}.txt", sep="\\t", header=None)')

In [None]:
#Check if they are all standard InChI:
for AID in AIDs:
    check_inchi = f"""
non_standard_InChI = []
for i in range(len(AID{AID}_InChI[1])):
    if not AID{AID}_InChI[1][i].startswith('InChI=1S'):
        non_standard_InChI.append(AID{AID}_InChI[1][i])
if not non_standard_InChI:
    print('All InChI in AID{AID} are standard')
else:
    print('There are some non-standard InChI in AID{AID}')
    print(non_standard_InChI)
    print('===')
"""
    exec(check_inchi)

All InChI in AID492963 are standard
All InChI in AID492964 are standard
All InChI in AID492965 are standard
All InChI in AID463079 are standard
All InChI in AID434989 are standard
All InChI in AID493232 are standard
All InChI in AID485270 are standard
All InChI in AID504699 are standard
All InChI in AID504701 are standard


Now we concatenate the InChIs in our tables:

In [None]:
# Update and save the files
for AID in AIDs: 
    update_inchi = f"""
AID{AID}_InChI_dict = dict(zip(AID{AID}_InChI[0], AID{AID}_InChI[1]))
AID{AID}['InChI'] = AID{AID}[cid_col].map(AID{AID}_InChI_dict)
AID{AID}['InChI'] = AID{AID}['InChI'].astype(str)
AID{AID}[cid_col] = AID{AID}[cid_col].astype(int)
AID{AID}.to_csv(r"{data_folder}/before_finished/step_3/AID{AID}.csv", index=False)
"""
    exec(update_inchi)

# 4. Check Duplicates

When checking duplicates in the datasets, we would like to know if there are
1) Multiple identical molecules
2) Molecules with identical CID but different InChIs or SMILES
3) Molecules with identical InChI but with different CIDs or SMILES

In [None]:
#import:
for AID in AIDs:
    exec(f"AID{AID} = pd.read_csv(r'{data_folder}/before_finished/step_3/AID{AID}.csv', sep=',', header=0)")

#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_4'):
    os.makedirs(f'{data_folder}/before_finished/step_4')

## 4.1. Checking identical molecules

In [None]:
#Return all duplicates by comparing InChI, SMILES, and CIDs:
for AID in AIDs:
    check_duplicate = f"""
AID{AID}_duplicates_InChI = AID{AID}[AID{AID}.duplicated(subset=['InChI'], keep=False)]
AID{AID}_duplicates_SMILES = AID{AID}[AID{AID}.duplicated(subset=[smi_col], keep=False)]
AID{AID}_duplicates_CIDs = AID{AID}[AID{AID}.duplicated(subset=[cid_col], keep=False)]
print('Number of AID{AID} InChI duplicates: ', len(AID{AID}_duplicates_InChI))
print('Number of AID{AID} SMILES duplicates: ', len(AID{AID}_duplicates_SMILES))
print('Number of AID{AID} CID duplicates: ', len(AID{AID}_duplicates_CIDs))
"""
    exec(check_duplicate)

Number of AID492963 InChI duplicates:  0
Number of AID492963 SMILES duplicates:  0
Number of AID492963 CID duplicates:  0
Number of AID492964 InChI duplicates:  2
Number of AID492964 SMILES duplicates:  2
Number of AID492964 CID duplicates:  2
Number of AID492965 InChI duplicates:  0
Number of AID492965 SMILES duplicates:  0
Number of AID492965 CID duplicates:  0
Number of AID463079 InChI duplicates:  273
Number of AID463079 SMILES duplicates:  221
Number of AID463079 CID duplicates:  219
Number of AID434989 InChI duplicates:  273
Number of AID434989 SMILES duplicates:  221
Number of AID434989 CID duplicates:  219
Number of AID493232 InChI duplicates:  2
Number of AID493232 SMILES duplicates:  2
Number of AID493232 CID duplicates:  2
Number of AID485270 InChI duplicates:  273
Number of AID485270 SMILES duplicates:  221
Number of AID485270 CID duplicates:  219
Number of AID504699 InChI duplicates:  0
Number of AID504699 SMILES duplicates:  0
Number of AID504699 CID duplicates:  0
Number

In [None]:
#write duplicates to a txt file: 
with open(f'{data_folder}/before_finished/step_4/duplicates.txt', 'w') as f:
    for AID in AIDs: 
        duplicates_InChI = eval(f'AID{AID}_duplicates_InChI')
        duplicates_SMILES = eval(f'AID{AID}_duplicates_SMILES')
        duplicates_CIDs = eval(f'AID{AID}_duplicates_CIDs')
        f.write(f'\n\nAID{AID} InChI duplicates:\n')
        f.write(duplicates_InChI.to_string())
        f.write(f'\nAID{AID} SMILES duplicates:\n')
        f.write(duplicates_SMILES.to_string())
        f.write(f'\nAID{AID} CID duplicates:\n')
        f.write(duplicates_CIDs.to_string())

## 4.2. Same CIDs but different chemical representations

In [None]:
#reindex
for AID in AIDs: 
    exec(f"AID{AID}_duplicates_CIDs.reset_index(drop=True, inplace=True)")

In [None]:
with open(f'{data_folder}/before_finished/step_4/sameCID_different_others.txt', 'w') as f:
    for AID in AIDs: 
        sameCID_differentInChI = []
        sameCID_differentSMILES = []
        duplicates_CIDs = eval(f'AID{AID}_duplicates_CIDs')
        for i in range(len(duplicates_CIDs[cid_col])):
            for j in range(i+1, len(duplicates_CIDs[cid_col])):
                if duplicates_CIDs[cid_col][i] == duplicates_CIDs[cid_col][j]:
                    if duplicates_CIDs['InChI'][i] != duplicates_CIDs['InChI'][j]:
                        sameCID_differentInChI.append((duplicates_CIDs[cid_col][i], duplicates_CIDs[cid_col][j]))
                    if duplicates_CIDs[smi_col][i] != duplicates_CIDs[smi_col][j]:
                        sameCID_differentSMILES.append((duplicates_CIDs[cid_col][i], duplicates_CIDs[cid_col][j]))

        if sameCID_differentInChI == []:
            f.write(f'No duplicate CIDs with different InChIs in AID{AID}\n')
        else:
            f.write('Found duplicate CIDs with different InChIs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameCID_differentInChI))
            f.write("\n")
        
        if sameCID_differentSMILES == []:
            f.write(f'No duplicate CIDs with different SMILES in AID{AID}\n')
        else:
            f.write(f'Found duplicate CIDs with different SMILES in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameCID_differentSMILES))
            f.write("\n")
        f.write("===\n")

## 4.3. Same InChI but with different CIDs or SMILES

In [None]:
#reindex
for AID in AIDs:
    exec(f"AID{AID}_duplicates_InChI.reset_index(drop=True, inplace=True)")

In [None]:
with open(f'{data_folder}/before_finished/step_4/sameInChI_different_others.txt', 'w') as f:
    for AID in AIDs: 
        sameInChI_differentCID = []
        sameInChI_differentSMILES = []
        duplicates_InChI = eval(f'AID{AID}_duplicates_InChI')
        for i in range(len(duplicates_InChI['InChI'])):
            for j in range(i+1, len(duplicates_InChI['InChI'])):
                if duplicates_InChI['InChI'][i] == duplicates_InChI['InChI'][j]:
                    if duplicates_InChI[cid_col][i] != duplicates_InChI[cid_col][j]:
                        sameInChI_differentCID.append((duplicates_InChI[cid_col][i], duplicates_InChI[cid_col][j]))
                    if duplicates_InChI[smi_col][i] != duplicates_InChI[smi_col][j]:
                        sameInChI_differentSMILES.append((duplicates_InChI[cid_col][i], duplicates_InChI[cid_col][j]))
        
        if sameInChI_differentCID == []:
            f.write(f'No duplicate InChIs with different CIDs in AID{AID}\n')
        else:
            f.write('Found duplicate InChIs with different CIDs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameInChI_differentCID))
            f.write("\n")
        
        if sameInChI_differentSMILES == []:
            f.write(f'No duplicate InChIs with different SMILES in AID{AID}\n')
        else:
            f.write(f'Found duplicate InChIs with different SMILES in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameInChI_differentSMILES))
            f.write("\n")
        f.write("===\n")

## 4.4. Same SMILES but with different CIDs or SMILES

In [None]:
#reindex
for AID in AIDs:
    exec(f"AID{AID}_duplicates_SMILES.reset_index(drop=True, inplace=True)")

In [None]:
with open(f'{data_folder}/before_finished/step_4/sameSMILES_different_others.txt', 'w') as f:
    for AID in AIDs: 
        sameSMILES_differentCID = []
        sameSMILES_differentInChI = []
        duplicates_SMILES = eval(f'AID{AID}_duplicates_SMILES')
        for i in range(len(duplicates_SMILES[smi_col])):
            for j in range(i+1, len(duplicates_SMILES[smi_col])):
                if duplicates_SMILES[smi_col][i] == duplicates_SMILES[smi_col][j]:
                    if duplicates_SMILES[cid_col][i] != duplicates_SMILES[cid_col][j]:
                        sameSMILES_differentCID.append((duplicates_SMILES[cid_col][i], duplicates_SMILES[cid_col][j]))
                    if duplicates_SMILES['InChI'][i] != duplicates_SMILES['InChI'][j]:
                        sameSMILES_differentInChI.append((duplicates_SMILES[cid_col][i], duplicates_SMILES[cid_col][j]))
        
        if sameSMILES_differentCID == []:
            f.write(f'No duplicate SMILES with different CIDs in AID{AID}\n')
        else:
            f.write(f'Found duplicate SMILES with different CIDs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameSMILES_differentCID))
            f.write("\n")
        
        if sameSMILES_differentInChI == []:
            f.write(f'No duplicate SMILES with different InChIs in AID{AID}\n')
        else:
            f.write(f'Found duplicate SMILES with different InChIs in AID{AID}:\n')
            f.write('\n'.join(str(item) for item in sameSMILES_differentInChI))
            f.write("\n")
        f.write("===\n")

## 4.5. Drop duplicates

When dropping duplicates, we will keep the first molecule in a pair or a group of duplicates. For example, here there are 12 duplicates (6 pairs) so we keep 6 of them.

In [None]:
# Keep only the first duplicate in the dataframes:
for AID in AIDs: 
    exec(f"AID{AID}.drop_duplicates(subset=['InChI'], keep='first', inplace=True)")

    last_check = f"""
if len(AID{AID}[AID{AID}.duplicated(subset=['InChI'], keep=False)]) == 0:
    print('No more duplicate InChI in AID{AID}')
else:
    print('There are still duplicate InChI in AID{AID}')   
    """
    exec(last_check)

No more duplicate InChI in AID492963
No more duplicate InChI in AID492964
No more duplicate InChI in AID492965
No more duplicate InChI in AID463079
No more duplicate InChI in AID434989
No more duplicate InChI in AID493232
No more duplicate InChI in AID485270
No more duplicate InChI in AID504699
No more duplicate InChI in AID504701


In [None]:
for AID in AIDs: 
    last_check = f"""
if len(AID{AID}[AID{AID}.duplicated(subset=[smi_col], keep=False)]) == 0:
    print('No more duplicate SMILES in AID{AID}')
else:
    print('There are still duplicate SMILES in AID{AID}')   
    """
    exec(last_check)

In [None]:
for AID in AIDs: 
    last_check = f"""
if len(AID{AID}[AID{AID}.duplicated(subset=[cid_col], keep=False)]) == 0:
    print('No more duplicate CID in AID{AID}')
else:
    print('There are still duplicate CID in AID{AID}')   
    """
    exec(last_check)

In [None]:
# Save the dataframes to csv:
for AID in AIDs: 
    exec(f"AID{AID}.to_csv(r'{data_folder}/before_finished/step_4/AID{AID}.csv', index=False)")

# 5. Hierarchical Curation

For the hierarchical curation, there are some rules:

(1) All assays used should be on the same or close species/cell lines. Optimally, they should also be from the same project/laboratory.

(2) Primary actives (PrA) will have a large false-positive rate. Therefore, they should be tested in follow-up confirmatory screens (optimally dose-reponse).

(3) Actives could be promiscuous. Therefore, it is optimal to have counter-screens on different targets to test specificity.

(4) For some projects, compounds were tested in multiple rounds. Therefore, assays often have hierarchical relations. From a single primary screen (Pr), active compounds (Pr_A) could be tested in multiple rounds of confirmatory screens (Cf_1, Cf_2, ..., Cf_final) or counter screens (Ct_1, Ct_2, etc.). Actives from confirmatory screens (Cf_actives) have a higher possibility of being true active. If an active compound is tested active in counter screens (Cf_actives), it is likely to be a promiscuous compound and should not be included.

(4) It is important to know the relationship between assays. Active sets from downstream screens always have a lower false-positive rate than active sets from upstream screens due to better assay technologies on a smaller set of compounds. Therefore, final hits should be taken from the intersection of the very last confirmatory assays, without tested active in any counter-screen:
Final hits = [Cf_final1_actives ∩ Cf_final2_actives ∩ ...] \ [Ct_1_actives ∪ Ct_2_actives ∪ ...]

However, if the confirmatory assays are unrelated (tested on different set of compounds), then we might have to take the union of their active sets instead of the intersections as in this formula.

(5) The hierarchical relations should be inspected carefully to see if follow-up confirmatory screens include extra compounds (Ex) that were not tested in earlier screens or tested inactive in earlier screens. If exist, these compounds require manual inspection.

(6) Final inactives should be taken from primary inactives (Pr_inactives) (not inconclusive, unspecified, or probes), plus extra compounds that were tested inactive in conformatory screens (Ex_inactives), if justified.
Final inactives = Pr_inactives ∪ Ex_inactives

## 5.1. Classify groups of compounds in each assay by activities

In [None]:
path = f'{data_folder}/before_finished/step_4'
keynumbers = [485270, 434989, 463079, 492964, 493232, 492965, 492963, 504701, 504699] # specify the keynumbers you want to import

for keynumber in keynumbers:
    filename = os.path.join(path, f'AID{keynumber}.csv')
    if os.path.exists(filename):
        df = pd.read_csv(filename, index_col=None, header=0)
        exec(f'AID{keynumber} = df')
        exec(f'AID{keynumber}_active = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Active"]')
        exec(f'AID{keynumber}_inactive = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Inactive"]')
        exec(f'AID{keynumber}_inconclusive = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Inconclusive"]')
        exec(f'AID{keynumber}_unspecified = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Unspecified"]')
        exec(f'AID{keynumber}_probe = df[df["PUBCHEM_ACTIVITY_OUTCOME"]=="Probe"]')

In [None]:
#Create a df with first column the variables name, and the second column the number of rows:
df = pd.DataFrame(columns=['AID', 'Tested Compounds', 'Active', 'Inactive', 'Inconclusive', 'Unspecified', 'Probe'])
for keynumber in keynumbers:
    exec(f'df.loc[len(df)] = ["AID{keynumber}", len(AID{keynumber}), len(AID{keynumber}_active), len(AID{keynumber}_inactive), len(AID{keynumber}_inconclusive), len(AID{keynumber}_unspecified), len(AID{keynumber}_probe)]')
df

Unnamed: 0,AID,Tested Compounds,Active,Inactive,Inconclusive,Unspecified,Probe
0,AID485270,325997,5748,320249,0,0,0
1,AID434989,325997,1387,324610,0,0,0
2,AID463079,325997,2039,323958,0,0,0
3,AID492964,1806,883,923,0,0,0
4,AID493232,1806,862,944,0,0,0
5,AID492965,1095,102,993,0,0,0
6,AID492963,1095,421,674,0,0,0
7,AID504701,247,228,19,0,0,0
8,AID504699,247,161,86,0,0,0


## 5.2. Check the hierachical relations

In [None]:
def check_is_in(downstream, upstream):
    downstream_in_upstream = downstream[downstream['PUBCHEM_CID'].isin(upstream['PUBCHEM_CID'])]
    downstream_notin_upstream = downstream[~downstream['PUBCHEM_CID'].isin(upstream['PUBCHEM_CID'])]
    return downstream_in_upstream, downstream_notin_upstream

### Flow 1: AID485270 (Pr), AID492964 (Cf_1), AID493232 (Ct_1), and AID504701 (Cf_2)

Here, we have the flow from Pr, to Cf_1 and Ct_2. Then actives from Cf_1 that were not active in Ct_1 be tested in Cf_2. Therefore we need to confirm three layers of hierarchical relation:

- We want to know if any compounds in Cf_2 were inactive in Cf_1 or Pr, or active in Ct_1. If there are, they are probably some compounds that the assays' authors found valuable (e.g, having some similar structures to the confirmed actives) and wanted to re-test. If they became active finally, we would like to include them as potential_hits and remove them from the potential_inactives (given the high rigor of dose-response conditions). If they became inactive finally, we would like to include them in potential_inactives since they could offer a more challenging training by having similar structure to some of the confirmed actives.

- We want to know if any compounds in Cf_2 were not even tested in Pr at all. If there are, they are probably some compounds that the assays' authors found valuable (e.g, having some similar structures to the confirmed actives) and wanted to test in dose-response. If they became active finally, we would like to include them as potential_hits (given the high rigor of dose-response conditions). If they became inactive finally, we would like to include them in potential_inactives since they could offer a more challenging training by having similar structure to some of the confirmed actives.

- We also want to know if any compounds in Cf_2 were actually active in Ct_1. If they are active finally, they likely to be promicuous compounds and should be removed from the final_hits.

In [None]:
# Between Cf_2 and Cf_1
a1, a2 = check_is_in(AID504701, AID492964_inactive)
a3, a4 = check_is_in(a1, AID504701_active)
print(f'Among AID504701, {len(a1)} were tested inactive in AID492964. Among these, {len(a3)} became active')

# Between Cf_2 and Pr
b1, b2 = check_is_in(AID504701, AID485270_inactive)
b3, b4 = check_is_in(b1, AID504701_active)
b5, b6 = check_is_in(AID504701, AID485270)
b7, b8 = check_is_in(b6, AID504701_active)
print(f'Among AID504701, {len(b1)} were tested inactive in AID485270. Among these, {len(b3)} became active')
print(f'Among AID504701, {len(b6)} were not tested in the AID485270. Among these, {len(b7)} became active')

# Between Cf_2 and Ct_1
c1, c2 = check_is_in(AID504701, AID493232_active)
c3, c4 = check_is_in(c1, AID504701_active)
print(f'Among AID504701, {len(c1)} were tested active in the counter AID493232. Among these, {len(c3)} became active')

Among AID504701, 5 were tested inactive in AID492964. Among these, 4 became active
Among AID504701, 34 were tested inactive in AID485270. Among these, 16 became active
Among AID504701, 0 were not tested in the AID485270. Among these, 0 became active
Among AID504701, 6 were tested active in the counter AID493232. Among these, 6 became active


### Flow 2: AID434989 (Pr), AID463079 (Ct_0), AID492965 (Ct_1), AID492963 (Cf_1), and AID 504699 (Cf_2):

In [None]:
d1, d2 = check_is_in(AID504699, AID492963_inactive)
d3, d4 = check_is_in(d1, AID504699_active)
print(f'Among AID504699, {len(d1)} were tested inactive in AID492963. Among these, {len(d3)} became active')

e1, e2 = check_is_in(AID504699, AID434989_inactive)
e3, e4 = check_is_in(e1, AID504699_active)
e5, e6 = check_is_in(AID504699, AID434989)
e7, e6 = check_is_in(e6, AID504699_active)
print(f'Among AID504699, {len(e1)} were tested inactive in primary AID434989. Among these, {len(e3)} became active')
print(f'Among AID504699, {len(e6)} were not tested in primary AID434989. Among these, {len(e7)} became active')

f1, f2 = check_is_in(AID504699, AID492965_active)
f3, f4 = check_is_in(f1, AID504699_active)
print(f'Among AID504699, {len(f1)} were tested active in the counter AID492965. Among these, {len(f3)} became active')

g1, g2 = check_is_in(AID504699, AID463079_active)
g3, g4 = check_is_in(g1, AID504699_active)
print(f'Among AID504699, {len(g1)} were tested active in the counter AID463079. Among these, {len(g3)} became active')


Among AID504699, 19 were tested inactive in AID492963. Among these, 5 became active
Among AID504699, 106 were tested inactive in primary AID434989. Among these, 51 became active
Among AID504699, 0 were not tested in primary AID434989. Among these, 0 became active
Among AID504699, 1 were tested active in the counter AID492965. Among these, 0 became active
Among AID504699, 0 were tested active in the counter AID463079. Among these, 0 became active


We hypothesized that some compounds for some reasons were re-tested in the follow-up confirmatory screens even though they were tested inactives in earlier screens or active in counter screens. This is most likely because the assays authors find these molecules promissing and would like to test them again. Therefore, we decided that for all of these Extra compounds (Ex), the active ones should go to potential hits, and the inactive ones should go to potential inactives, given the rigorous conditions of the confirmatory screens.

It should be noted that the Ex compounds that became active in the final confirmatory screens should be removed from the potential inactives (taken from primary inactives). Also since there are two flows here, there might be some inactives in one flow but active in another flow. These should also be removed from the potential inactives.

Here, none of the compounds were not tested in the primary screens but appeared in the final confirmatory screens.

6 compound was tested active in one of the counter screen but was still included in the final confirmatory screen and became active at the end. These 6 compounds should also be removed from potential hits.

In [None]:
# Ex1_inactives (Extra compounds that became inactive from Flow 1) is the union of dataframes a4 and b4 (re-tested inactives that remains inactive))
Ex1_inactives = pd.concat([a4, b4])
Ex1_inactives = Ex1_inactives.drop_duplicates(subset=cid_col)
flow1_inactives = pd.concat([AID485270_inactive, Ex1_inactives])
flow1_inactives = flow1_inactives.drop_duplicates(subset=cid_col)

# Ex2_inactives (Extra compounds that became inactive from Flow 2) is the union of dataframes d4 and e4 (re-tested inactives that remains inactive))
Ex2_inactives = pd.concat([d4, e4])
Ex2_inactives = Ex2_inactives.drop_duplicates(subset=cid_col)
flow2_inactives = pd.concat([AID434989_inactive, Ex2_inactives])
flow2_inactives = flow2_inactives.drop_duplicates(subset=cid_col)

In [None]:
flow1_actives = AID504701_active[~AID504701_active[cid_col].isin(c3[cid_col])]
flow2_actives = AID504699_active

## 5.3. Export the data

In [None]:
potential_actives = pd.concat([flow1_actives, flow2_actives])
potential_actives = potential_actives.drop_duplicates(subset=cid_col)

potential_inactives = pd.concat([flow1_inactives, flow2_inactives])
potential_inactives = potential_inactives.drop_duplicates(subset=cid_col)

In [None]:
# Remove any other compound that appear in both potential hits and potential inactives:
potential_inactives = potential_inactives[~potential_inactives[cid_col].isin(potential_actives[cid_col])]

In [None]:
if not os.path.exists(f'{data_folder}/before_finished/step_5'):
    os.makedirs(f'{data_folder}/before_finished/step_5')

In [None]:
#export the potential hits and inactives to csv:
potential_actives.to_csv(f'{data_folder}/before_finished/step_5/potential_actives.csv', index=False)
potential_inactives.to_csv(f'{data_folder}/before_finished/step_5/potential_inactives.csv', index=False)

# 6. RDkit Parse Check

In [None]:
potential_actives = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_actives.csv', sep=',', header=0)
potential_inactives = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_inactives.csv', sep=',', header=0)

In [None]:
problems_actives, cannot_parse_actives = filters.rdkit_parse(potential_actives, smi_col, cid_col)
problems_inactives, cannot_parse_inactives = filters.rdkit_parse(potential_inactives, smi_col, cid_col)

No problems detected
No problems detected


In [None]:
#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_6'):
    os.makedirs(f'{data_folder}/before_finished/step_6')

with open(f'{data_folder}/before_finished/step_6/problem_list_actives.txt', 'w') as f:
    f.write("Problems:\n")
    for item in problems_actives:
        f.write("%s\n" % item)
    f.write("Cannot parse:\n")
    for item in cannot_parse_actives:
        f.write("%s\n" % item)

with open(f'{data_folder}/before_finished/step_6/problem_list_inactives.txt', 'w') as f:
    f.write("Problems:\n")
    for item in problems_inactives:
        f.write("%s\n" % item)
    f.write("Cannot parse:\n")
    for item in cannot_parse_inactives:
        f.write("%s\n" % item)

Our dataset returned no problem or non-parsable molecule.

# 7. Inorganics Filter

In [None]:
# Import data
potential_actives = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_actives.csv', sep=',', header=0)
potential_inactives = pd.read_csv(f'{data_folder}/before_finished/step_5/potential_inactives.csv', sep=',', header=0)

In [None]:
inorganic_actives_cids, organic_actives_cids = filters.inorganic_filter(potential_actives, smi_col, cid_col, type='smiles')
print(f'Among actives, there are {len(organic_actives_cids)} organic molecules and {len(inorganic_actives_cids)} inorganic molecules')

inorganic_inactives_cids, organic_inactives_cids = filters.inorganic_filter(potential_inactives, smi_col, cid_col, type='smiles')
print(f'In inactives, there are {len(organic_inactives_cids)} organic molecules and {len(inorganic_inactives_cids)} inorganic molecules')

In [None]:
#Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_7'):
    os.makedirs(f'{data_folder}/before_finished/step_7')

with open(f'{data_folder}/before_finished/step_7/inorganic.txt', 'w') as f:
    f.write("Actives:\n")
    for item in inorganic_actives_cids:
        f.write("%s\n" % item)
    f.write("\n\nInactives:\n")
    for item in inorganic_inactives_cids:
        f.write("%s\n" % item)

# Drop inorganics: 
potential_actives = potential_actives[~potential_actives[cid_col].isin(inorganic_actives_cids)]
potential_inactives = potential_inactives[~potential_inactives[cid_col].isin(inorganic_inactives_cids)]

#save: 
potential_actives.to_csv(f'{data_folder}/before_finished/step_7/organic_actives.csv', index=False)
potential_inactives.to_csv(f'{data_folder}/before_finished/step_7/organic_inactives.csv', index=False)

print('Dropped inorganic and saved organic compounds into step_7 folder.')

# 8. Mixtures Handling

## 8.1. Quick Check

In [None]:
#import: 
organic_actives = pd.read_csv(f'{data_folder}/before_finished/step_7/organic_actives.csv', sep=',', header=0)
organic_inactives = pd.read_csv(f'{data_folder}/before_finished/step_7/organic_inactives.csv', sep=',', header=0)

In [None]:
filters.quick_check_mixtures('actives', organic_actives[smi_col])
filters.quick_check_mixtures('inactives', organic_inactives[smi_col])

Total number of mixtures in hits is 4
Total number of mixtures in inactives is 12905


## 8.2. Handling mixture

In [None]:
processed_actives, removed_actives, small_organic_actives, small_inorganic_actives, not_lipinski_actives, cleaned_actives = filters.process_smi_mixtures(organic_actives, smi_col, cid_col)
processed_inactives, removed_inactives, small_organic_inactives, small_inorganic_inactives, not_lipinski_inactives, cleaned_inactives = filters.process_smi_mixtures(organic_inactives, smi_col, cid_col)

Big organic molecule for CID 24892478 does not pass Lipinski's rule of five
Big organic molecule for CID 24891768 does not pass Lipinski's rule of five
Big organic molecule for CID 24892697 does not pass Lipinski's rule of five
Big organic molecule for CID 24891798 does not pass Lipinski's rule of five
Big organic molecule for CID 24892529 does not pass Lipinski's rule of five
Cannot decide between ['CC1=NC2=C(S1)C3=CC=CC=C3C=C2', 'C1=C(C=C(C(=C1[N+](=O)[O-])O)[N+](=O)[O-])[N+](=O)[O-]'] for CID 654127
Big organic molecule for CID 135675926 does not pass Lipinski's rule of five
Big organic molecule for CID 24892525 does not pass Lipinski's rule of five
Big organic molecule for CID 24891822 does not pass Lipinski's rule of five
Big organic molecule for CID 24891816 does not pass Lipinski's rule of five
Big organic molecule for CID 24982208 does not pass Lipinski's rule of five
Big organic molecule for CID 24891814 does not pass Lipinski's rule of five
Big organic molecule for CID 248918



Big organic molecule for CID 24892292 does not pass Lipinski's rule of five
Big organic molecule for CID 53312878 does not pass Lipinski's rule of five
Big organic molecule for CID 53299506 does not pass Lipinski's rule of five
Big organic molecule for CID 12005684 does not pass Lipinski's rule of five
Big organic molecule for CID 24891796 does not pass Lipinski's rule of five
Big organic molecule for CID 24892534 does not pass Lipinski's rule of five
Big organic molecule for CID 44143187 does not pass Lipinski's rule of five
Big organic molecule for CID 24891780 does not pass Lipinski's rule of five
Big organic molecule for CID 24892466 does not pass Lipinski's rule of five
Big organic molecule for CID 10864994 does not pass Lipinski's rule of five
Big organic molecule for CID 24892699 does not pass Lipinski's rule of five
Big organic molecule for CID 16682212 does not pass Lipinski's rule of five
Big organic molecule for CID 56603528 does not pass Lipinski's rule of five
Big organic 



Big organic molecule for CID 53312841 does not pass Lipinski's rule of five
Big organic molecule for CID 24891868 does not pass Lipinski's rule of five
Big organic molecule for CID 24892288 does not pass Lipinski's rule of five
Big organic molecule for CID 24891886 does not pass Lipinski's rule of five
Big organic molecule for CID 24982687 does not pass Lipinski's rule of five
Big organic molecule for CID 24892341 does not pass Lipinski's rule of five
Big organic molecule for CID 24892474 does not pass Lipinski's rule of five
Cannot decide between ['C1=CC=C(C=C1)OC2=CC=NC=C2', 'C1=C(C=C(C(=C1[N+](=O)[O-])O)[N+](=O)[O-])[N+](=O)[O-]'] for CID 23723127
Cannot decide between ['CC(C)NC[C@H](C1=CC(=C(C=C1)O)O)O', 'C(C(C(=O)O)O)(C(=O)O)O'] for CID 6852409
Big organic molecule for CID 24892293 does not pass Lipinski's rule of five
Big organic molecule for CID 3244813 does not pass Lipinski's rule of five
Big organic molecule for CID 24892536 does not pass Lipinski's rule of five
Big organic m

In [None]:
# Create a new step folder
if not os.path.exists(f'{data_folder}/before_finished/step_8'):
    os.makedirs(f'{data_folder}/before_finished/step_8')

#Generate df with the smiles column in the cleaned_actives or cleaned_inactives dictionary:
cleaned_actives_df = pd.DataFrame(list(cleaned_actives.values()), columns=[smi_col])
cleaned_inactives_df = pd.DataFrame(list(cleaned_inactives.values()), columns=[smi_col])

#Export the cleaned hits and inactives to csv:
cleaned_actives_df.to_csv(f'{data_folder}/before_finished/step_8/cleaned_mixtures_actives.csv', index=False, header=False)
cleaned_inactives_df.to_csv(f'{data_folder}/before_finished/step_8/cleaned_mixtures_inactives.csv', index=False, header=False)

In [None]:
processed_hits_df = filters.process_mixture_df('actives_ORX1R_antagonist', organic_actives, processed_actives, removed_actives, small_organic_actives, small_inorganic_actives, smi_col, cid_col)
processed_inactives_df = filters.process_mixture_df('inactives_ORX1R_antagonist', organic_inactives, processed_inactives, removed_inactives, small_organic_inactives, small_inorganic_inactives, smi_col, cid_col)

24892478 has been removed from inactives_ORX1R_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
24891768 has been removed from inactives_ORX1R_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
24892697 has been removed from inactives_ORX1R_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
24891798 has been removed from inactives_ORX1R_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
24892529 has been removed from inactives_ORX1R_antagonist because it is a mixture with less than 5 atoms difference or the big organic molecule does not pass Lipinski's rule of five.
654127 has been removed from inactives_ORX1R_antagonist because it is a mixture with 

In [None]:
with open(f'{data_folder}/before_finished/step_8/mixture.txt', 'w') as f:
    f.write(f"""
Hits before processing: {len(organic_actives)}
Hits before processing: {len(organic_actives)}
Hits after processing: {len(processed_hits_df)}
Mixtures detected: {len(removed_actives)}
Mixtures with small inorganic molecules: {len(small_inorganic_actives)}
Mixtures with big organic molecules passing Lipinski: {len(small_organic_actives)}
Mixtures with big organic molecules not passing Lipinski: {len(not_lipinski_actives)}

Inactives before processing: {len(organic_inactives)}
Inactives after processing: {len(processed_inactives_df)}
Mixtures detected: {len(removed_inactives)}
Mixtures with small inorganic molecules: {len(small_inorganic_inactives)}
Mixtures with big organic molecules passing Lipinski: {len(small_organic_inactives)}
Mixtures with big organic molecules not passing Lipinski: {len(not_lipinski_inactives)}
""")

# Save the processed dataframes to csv
processed_hits_df.to_csv(f'{data_folder}/before_finished/step_8/post8_actives.csv', index=False)
processed_inactives_df.to_csv(f'{data_folder}/before_finished/step_8/post8_inactives.csv', index=False)

print('Dataframes saved successfully')

Dataframes saved successfully


# 9. Neutralize & 10. Aromatize Molecules

In [None]:
# Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_9_10'):
    os.makedirs(f'{data_folder}/before_finished/step_9_10')

#import:
pre9_actives = pd.read_csv(f'{data_folder}/before_finished/step_8/post8_actives.csv', sep=',', header=0)
pre9_inactives = pd.read_csv(f'{data_folder}/before_finished/step_8/post8_inactives.csv', sep=',', header=0)

In [None]:
updated_smi = []

#Update dataset with neutralized, aromatic SMILES
for smi in pre9_actives[smi_col]: 
    mol = Chem.MolFromSmiles(smi)
    mol_neu = utils.neutralize_atoms(mol)
    smi_arom = utils.aromatize_smile(mol_neu)
    updated_smi.append(smi_arom)
    
#update the smiles in this df
pre9_actives[smi_col] = updated_smi

In [None]:
updated_smi = []

#Update dataset with neutralized, aromatic SMILES
for smi in pre9_inactives[smi_col]: 
    mol = Chem.MolFromSmiles(smi)
    mol_neu = utils.neutralize_atoms(mol)
    smi_arom = utils.aromatize_smile(mol_neu)
    updated_smi.append(smi_arom)

#update the smiles in this df
pre9_inactives[smi_col] = updated_smi

In [None]:
#Save
pre9_actives.to_csv(f'{data_folder}/before_finished/step_9_10/post10_actives.csv', index=False)
pre9_inactives.to_csv(f'{data_folder}/before_finished/step_9_10/post10_inactives.csv', index=False)

# Post 9+10: Update InChI


Is it important to now update InChI in our datasets, for 2 reasons:

(1) Some mixture compounds have been modified (removal of small inorganic or organic molecules) in SMILES representation but not InChIs.

(2) The SMILES representations have been neutralized and aromatized, but not InChIs.

In [None]:
#export the smiles columns to txt
pre9_actives[smi_col].to_csv(f'{data_folder}/before_finished/step_9_10/smiles_actives.txt', index=False, header=False)
pre9_inactives[smi_col].to_csv(f'{data_folder}/before_finished/step_9_10/smiles_inactives.txt', index=False, header=False)

In [None]:
"""
Submit the smiles files to PubChem Identifier Exchange Service: 
    Input IDs: "SMILES"
    Operator type: "same CID" 
    Output IDs: "InChI"
    Output method: "Two column file showing each input output-correspondence"
    Compression: "No compression"
InChI list should be saved into "step_9_10" folder, named as "inchi_actives.txt" and "inchi_inactives" 
"""
#Import the converted InChIs
cleaned_inchi_hits = pd.read_csv(f'{data_folder}/before_finished/step_9_10/inchi_actives.txt', sep='\t', header=None)
cleaned_inchi_inactives = pd.read_csv(f'{data_folder}/before_finished/step_9_10/inchi_inactives.txt', sep='\t', header=None)

#a dictionary of smiles and corresponding inchi in cleaned_inchi_hits
hits_smi_inchi_dict = dict(zip(cleaned_inchi_hits[0], cleaned_inchi_hits[1]))
inactives_smi_inchi_dict = dict(zip(cleaned_inchi_inactives[0], cleaned_inchi_inactives[1]))
                             
#update the pre9_hits by matching the smiles with keys and replace inchi with values:
pre9_actives['InChI'] = pre9_actives[smi_col].map(hits_smi_inchi_dict) 
pre9_inactives['InChI'] = pre9_inactives[smi_col].map(inactives_smi_inchi_dict)

In [None]:
#export: 
pre9_actives.to_csv(f'{data_folder}/before_finished/step_9_10/post10_actives.csv', index=False)
pre9_inactives.to_csv(f'{data_folder}/before_finished/step_9_10/post10_inactives.csv', index=False)

# 11. PAIN filters

## 11.1. Frequency of Hits (FoH) Filter

Frequency of Hits is a complex concept that requires a merticulous approach. In general, the rule is if a compound was tested active in multiple assays, it is likely to be a promiscuous compound.
1. For each compounds, retrieve the information on its tested assays
2. For each of the assay tested, retrieve the sequence of the protein target.
3. Given all sequence of the protein tested, do a multiple sequence alignment to find the percentage Percent Identity (similarty) between these proteins. If an assay has high percentage to other targets, then these assays contribute less to promiscuousity of the compound.
4. Use the percentage identity as a weight:
w = 1 - %SI/100
Calculate the frequency of hits for each compound:
FoH = wACC/TAC
wACC is the weighed total number of assay tested where the compounds were identified acitives. TAC is the total number of assays tested.

In [None]:
pre11_actives = pd.read_csv(f'{data_folder}/before_finished/step_9_10/post10_actives.csv', sep=',', header=0)
pre11_inactives = pd.read_csv(f'{data_folder}/before_finished/step_9_10/post10_inactives.csv', sep=',', header=0)

# Create a new step folder:
if not os.path.exists(f'{data_folder}/before_finished/step_11/11_1'):
    os.makedirs(f'{data_folder}/before_finished/step_11/11_1')

### 11.1.1. PubChem testing information for each compound

This part illustrates how to retrieve the information of how each compound was tested from the PubChem database. Bulk data retrieval from the ftp server is used to get the information of every bioassay in PubChem:

In [None]:
url = 'https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/Extras/bioassays.tsv.gz' #this FTP file records the summary data of all available AIDs in PubChem

local_save_dir = 'S:\coding\WelQrate\pubchem_sum'
local_save_path = os.path.join(local_save_dir, 'bioassays.tsv.gz')

if not os.path.exists(local_save_dir):
    os.makedirs(local_save_dir)
r = requests.get(url, stream=True)

with open(local_save_path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)
print('Downloaded to %s' % local_save_path)

path = 'pubchem_sum/bioassays.tsv.gz'

# Read the TSV file
all_bioassay = pd.read_csv(path, delimiter='\t')

In [None]:
all_bioassay.head()

Unnamed: 0,AID,BioAssay Name,Deposit Date,Modify Date,Source Name,Source ID,Substance Type,Outcome Type,Project Category,BioAssay Group,BioAssay Types,Protein Accessions,UniProts IDs,Gene IDs,Target TaxIDs,Taxonomy IDs,Number of Tested SIDs,Number of Active SIDs,Number of Tested CIDs,Number of Active CIDs
0,1,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,55228,3318,53214,3094
1,3,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,51435,2615,49564,2467
2,5,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,54079,2503,52046,2317
3,7,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,54062,4335,52033,4098
4,9,NCI human tumor cell line growth inhibition as...,20040815,20240410,DTP/NCI,NCI human tumor cell line growth inhibition as...,small-molecule,Confirmatory,Other,NCI-60_DOSERESP,,,,,,,53977,3159,52001,2981


### 11.1.2 Retrieving protein sequences for assays tested:

Then, the testing information for each compound is retrieved from the PugREST API

In [None]:
# Cache to store the number of compounds tested per AID to avoid redundant call. 
num_compounds_tested_cache = {}

def get_num_compounds_tested(aid, all_bioassay=all_bioassay):
    """
    This function retrieves the information of how many compounds were tested in a given assay (by AID).
    """
    if aid in num_compounds_tested_cache:
        return num_compounds_tested_cache[aid]
    else: 
        #return the 'Number of Tested CIDs' column value at the row where the 'AID' column is equal to aid in the all_bioassay dataframe
        num_compounds_tested = all_bioassay[all_bioassay['AID'] == aid]['Number of Tested CIDs'].values[0]
    return num_compounds_tested

def get_assay_data(cid):
    """
    Return a dictionary of all targets that a given compound (by CID) was tested on in PubChem 
    and the activity values of the compound. 
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON" #PUG-REST compound summary by CID
    response = requests.get(url)
    data = response.json()

    target_activity = {}

    if 'Table' in data and 'Row' in data['Table']:
        for row in data['Table']['Row']:
            cells = row['Cell']
            aid = int(cells[0])  # Extracting the AID from the first cell

            # Proceed only if the assay is a screening assay
            if cells[10] == 'Screening':

                # Proceed only if more than 10,000 compounds were tested
                num_compounds_tested = get_num_compounds_tested(aid)
                if num_compounds_tested > 10000:
                    target_gi = cells[5] # Retrieve the protein target's GI
                    activity_outcome = cells[4].lower()

                    if target_gi not in target_activity:
                        target_activity[target_gi] = activity_outcome == 'active'
                    elif activity_outcome == 'active':
                        target_activity[target_gi] = True # If a compound was tested multiple times on the same protein, priotize "active" outcome.
            
            else:
                continue

    return (cid, target_activity)

In [None]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

cids_list = pre11_actives[cid_col].tolist()

def execute_with_multiprocessing(cids_list):
    """
    For a given list of CIDs, return a dictionary of dictionaries 
    of protein targets these compounds were tested on and the activity outcomes
    Input: 
        [list of CIDs]
    Output: 
        Dictionary of testing information for all CIDs, such as:
        {CID1:{target1:activity1, target3:activity3, ...},{CID2:{target2:activity2, target4:activity4, ...}, ...}}
    """
    results_dict = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Prepare futures for all CIDs
        futures = [executor.submit(get_assay_data, cid) for cid in cids_list]
        
        # Process futures as they complete
        for future in tqdm(as_completed(futures), total=len(cids_list), desc="Processing CIDs"):
            try:
                cid, target_activity = future.result()
                results_dict[cid] = target_activity
            except Exception as e:
                print(f"Error processing CID: {e}")
    return results_dict

results_dict = execute_with_multiprocessing(cids_list)

Processing CIDs:   0%|          | 0/234 [00:00<?, ?it/s]

Processing CIDs: 100%|██████████| 234/234 [01:03<00:00,  3.66it/s]


In [None]:
#export results_dict
with open(f'{data_folder}/before_finished/step_11/11_1/results_dict.json', 'w') as f:
    json.dump(results_dict, f)

In [None]:
#import results_dict:
with open(f'{data_folder}/before_finished/step_11/11_1/results_dict.json', 'r') as f:
    results_dict = json.load(f)

#get the list of all keys of the values in the dictionary:
protein_ids = []
for value in results_dict.values():
    protein_ids.extend(value.keys())

#clean the list
protein_ids = list(set(protein_ids))
protein_ids = [id for id in protein_ids if id != '']

print(protein_ids)
print(f'Require multiple sequencing alignment for {len(protein_ids)} proteins.')

['7657550', '4505209', '21618340', '4506537', '67191027', '5729858', '48146199', '16306916', '27368096', '14790119', '486173', '55956923', '7108463', '187960042', '70832125', '1782953264', '20336315', '124376142', '194068499', '253722402', '68474550', '17507875', '7669492', '120538355', '15610945', '16130724', '4758484', '14719829', '164058', '11528014', '15607504', '89191863', '109633019', '6009644', '116076351', '6978787', '6325022', '6679827', '1237937630', '38174238', '9937384', '38788193', '78070770', '21315078', '296434520', '218891639', '31563518', '24119166', '139424501', '2935630', '119580345', '160707929', '38027923', '10835013', '47678551', '23893623', '7381449', '195969650', '13124881', '1302091', '55958172', '6680530', '216548193', '5453722', '46367787', '31881630', '4758878', '6016094', '168184763', '160333370', '125541954', '111305821', '15680217', '219518789', '62526033', '597517265', '14389423', '149631', '4506243', '154146191', '317373446', '528078313', '4826834', '15

Now we retrieve all the FASTA sequences of proteins tested for all of our compounds with Biopython API to Entrez of NCBI. The FASTA sequence is saved as "sequences.fasta"

In [None]:
# Always tell NCBI who you are
Entrez.email = "hdong26@amherst.edu"

# The filename where you want to save the sequences
output_filename = f'{data_folder}/before_finished/step_11/11_1/sequences.fasta'

# Open a file to write the sequences
with open(output_filename, "w") as output_file:
    for id in protein_ids:
        try:
            # Fetch the sequence from NCBI
            handle = Entrez.efetch(db="protein", id=id, rettype="fasta", retmode="text")
            sequence_data = handle.read()
            handle.close()
            
            # Write the sequence data to the file
            output_file.write(sequence_data)
        except Exception as e:
            print(f"An error occurred while fetching {id}: {e}")

From the FASTA sequence, we also need to retrieve the list of protein names, since these are different from the protein GIs

In [None]:
def extract_protein_names(file_path):
    protein_names = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                # Split the line at spaces and take the first item
                parts = line.split(' ')
                protein_name = parts[0]
                # Remove the leading '>' character
                protein_name = protein_name[1:]
                protein_names.append(protein_name)
    return protein_names

file_path = f'{data_folder}/before_finished/step_11/11_1/sequences.fasta'
protein_names = extract_protein_names(file_path)

# Create a dictionary to map protein IDs to protein names by index 
protein_id_to_name = {protein_ids[i]: protein_names[i] for i in range(len(protein_ids))}

In [None]:
print(protein_id_to_name)

{'7657550': 'NP_002418.1', '4505209': 'NP_000762.2', '21618340': 'NP_417296.1', '4506537': 'NP_066285.1', '67191027': 'NP_003989.2', '5729858': 'AAB23646.1', '48146199': 'NP_579856.2', '16306916': 'NP_000256.4', '27368096': 'NP_899192.1', '14790119': 'AAI36361.1', '486173': 'DAA06693.1', '55956923': 'pdb|1ZHH|B', '7108463': 'NP_652928.1', '187960042': 'NP_848757.5', '70832125': 'sp|P08482.1|ACM1_RAT', '1782953264': 'NP_217253.1', '20336315': 'BAH02301.1', '124376142': 'NP_008837.1', '194068499': 'CAH72619.1', '253722402': 'NP_004337.2', '68474550': 'sp|P86926.1|RLGM1_TRYBB', '17507875': 'EAW58811.1', '7669492': 'NP_001370.1', '120538355': 'NP_663745.1', '15610945': 'CAG33189.1', '16130724': 'NP_004070.3', '4758484': 'NP_065688.1', '14719829': 'AAC51766.1', '164058': 'NP_001136020.1', '11528014': 'AAA36557.1', '15607504': 'NP_000015.2', '89191863': 'AAH70052.1', '109633019': 'CAI19360.1', '6009644': 'NP_001357589.1', '116076351': 'CAB40158.1', '6978787': 'AAH94064.1', '6325022': 'AAG293

### 11.1.3 Percent Sequence Identity by Multiple Sequence Alignment

The sequences.fasta file is submitted to https://www.ebi.ac.uk/jdispatcher/msa/clustalo for multiple sequencing alignment. The resulted table of percent sequence identity matrix is saved and imported for the calculation of FoH

In [None]:
"""
Submit sequences.fasta to https://www.ebi.ac.uk/jdispatcher/msa/clustalo
    Input sequence type: Protein
    Output format: ClustalW with character counts
Download the resulted Percent Identity Matrix file file and save as "percent_identity_matrix.txt"
"""

#import the identity matrix:
protein_si = pd.read_csv(
    f'{data_folder}/before_finished/step_11/11_1/percent_identity_matrix.txt',
    delimiter='\s+',
    header=None,
    skiprows=6 
)

In [None]:
#remove the first column:
protein_si = protein_si.drop(protein_si.columns[0], axis=1)

name = protein_si[1].tolist()
name = ['protein name'] + name
protein_si.columns = name

In [None]:
protein_si

Unnamed: 0,protein name,NP_065132.1,NP_001017408.1,CAD52000.1,NP_037457.3,AAI51237.1,pdb|1ZHH|A,AAI02981.1,NP_417299.1,AAB23646.1,...,sp|Q8N884.2|CGAS_HUMAN,NP_001610.2,NP_208344.1,AAC14371.1,NP_036543.4,NP_057051.4,NP_001138884.1,NP_207880.1,NP_976226.1,EAW58774.1
0,NP_065132.1,100.00,100.00,11.17,12.20,15.69,12.73,13.79,15.13,0.00,...,10.19,3.95,11.26,5.93,7.14,6.94,8.15,12.15,4.35,12.28
1,NP_001017408.1,100.00,100.00,11.56,12.66,15.66,13.73,12.66,15.58,0.00,...,9.90,4.00,11.19,5.93,7.58,7.35,8.46,12.38,4.60,12.84
2,CAD52000.1,11.17,11.56,100.00,9.52,14.94,15.70,6.17,14.75,33.33,...,6.93,6.74,7.45,5.31,7.81,7.58,16.36,14.53,6.48,10.92
3,NP_037457.3,12.20,12.66,9.52,100.00,8.00,8.62,10.75,9.84,33.33,...,6.45,15.69,10.43,8.40,8.72,8.50,9.09,12.04,6.86,8.51
4,AAI51237.1,15.69,15.66,14.94,8.00,100.00,12.61,7.69,12.98,16.67,...,5.69,9.78,6.83,9.90,7.14,6.93,12.14,10.56,6.30,10.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,NP_057051.4,6.94,7.35,7.58,8.50,6.93,9.89,9.17,9.02,7.69,...,13.71,13.71,13.68,17.87,99.87,100.00,14.29,18.30,14.95,18.24
411,NP_001138884.1,8.15,8.46,16.36,9.09,12.14,11.76,10.71,8.58,0.00,...,9.17,18.66,14.36,16.62,14.29,14.29,100.00,18.07,16.88,17.15
412,NP_207880.1,12.15,12.38,14.53,12.04,10.56,9.30,10.00,10.60,0.00,...,13.37,22.51,17.38,16.10,17.85,18.30,18.07,100.00,13.39,21.07
413,NP_976226.1,4.35,4.60,6.48,6.86,6.30,7.69,8.33,7.88,0.00,...,9.13,16.54,16.79,16.63,15.51,14.95,16.88,13.39,100.00,25.21


### 11.1.4 Calculation of FoHs:

Until now, we have a dictionary of (cid: assays tested); (assay_tested:protein name), and percentage identity matrix with first columns as protein names.
For each compound, we retrieve the list of all protein names tested on that compounds by matching between the two first dictionary. From this list, we retrieve the corresponding matrix of percentages identitiy of these proteins corresponding to these compounds and calculate the FoH

In [None]:
protein_si_dict = {}
for name in protein_si['protein name']: 
    for other_name in protein_si['protein name']: 
        if other_name != name: 
            protein_si_dict[(name, other_name)] = protein_si.loc[protein_si['protein name'] == name, other_name].values[0]

In [None]:
foh_dict = {}

for cid, targets in tqdm.tqdm(results_dict.items()):
    active_weight_list = []
    total_weight_list = []

    for target_id, result in targets.items():
        if target_id == '':
            continue

        protein_name = protein_id_to_name[target_id]
        max_weight = 0

        for other_id, other_result in targets.items():
            if other_id != target_id and other_id != '':
                other_protein_name = protein_id_to_name[other_id]
                value = protein_si_dict[(protein_name, other_protein_name)]
                max_weight = max(max_weight, value)

        target_weight = 1 - max_weight / 100

        if result:
            active_weight_list.append(target_weight)
        total_weight_list.append(target_weight)

    if total_weight_list:
        foh_score = sum(active_weight_list) / sum(total_weight_list)
        foh_dict[cid] = foh_score
    else: 
        foh_dict[cid] = 0

100%|██████████| 234/234 [00:12<00:00, 19.13it/s]


In [None]:
#export foh_dict
with open(f'{data_folder}/before_finished/step_11/11_1/foh_dict.json', 'w') as f:
    json.dump(foh_dict, f)

For compounds with FoH larger than 0.26, we remove them

In [None]:
to_drop = []
for cid, foh_score in foh_dict.items():
    if foh_score > 0.26: 
        to_drop.append(cid)

post_FoH_actives = pre11_actives[~pre11_actives[cid_col].isin(to_drop)]
print(f'Dropped {(len(to_drop))} compounds with FoH larger than 0.26')

Dropped 0 compounds with FoH larger than 0.26


In [None]:
post_FoH_actives.to_csv(f'{data_folder}/before_finished/step_11/11_1/post_FoH_actives.csv', index=False)
pre11_inactives.to_csv(f'{data_folder}/before_finished/step_11/11_1/post_FoH_inactives.csv', index=False)

## 11.2. Autofluoresence Filter

In [None]:
post_FoH_actives = pd.read_csv(f'{data_folder}/before_finished/step_11/11_1/post_FoH_actives.csv', sep=',', header=0)
post_FoH_inactives = pd.read_csv(f'{data_folder}/before_finished/step_11/11_1/post_FoH_inactives.csv', sep=',', header=0)

When finding false positive due to autofluorescence and luceferase inhibition, it is important to check if the particular assays use one of these technologies. Here, all three assays (AID626, AID1488, and AID1741) use fluorescence technologies, so it is optimal to remove compounds that are active in AIDs: 587, 588, 590, 591, 592, 593, 594

In [None]:
autofluorescence_cids = filters.load_autofluorescence_cids(data_folder)

In [None]:
to_drop_actives = []
for cid in post_FoH_actives: 
    if cid in autofluorescence_cids:
        to_drop.append(cid)
post_autofluorescence_actives = post_FoH_actives[~post_FoH_actives['PUBCHEM_CID'].isin(to_drop_actives)]
print(f'Dropped {(len(to_drop_actives))} autofluorescence compounds')

Dropped 0 autofluorescence compounds


In [None]:
if not os.path.exists(f'{data_folder}/before_finished/step_11/11_2'):
    os.makedirs(f'{data_folder}/before_finished/step_11/11_2')

#save: 
post_autofluorescence_actives.to_csv(f'{data_folder}/before_finished/step_11/11_2/post_autofluorescence_actives.csv', index=False)
pre11_inactives.to_csv(f'{data_folder}/before_finished/step_11/11_2/post_autofluorescence_inactives.csv', index=False)

## 11.3 RDKit PAIN Filter

In [None]:
post_autofluorescence_actives = pd.read_csv(f'{data_folder}/before_finished/step_11/11_2/post_autofluorescence_actives.csv', sep=',', header=0)
post_autofluorescence_inactives = pd.read_csv(f'{data_folder}/before_finished/step_11/11_2/post_autofluorescence_inactives.csv', sep=',', header=0)

In [None]:
pains_actives = filters.detect_pains(post_autofluorescence_actives, smi_col, cid_col)

1 pains detected
2 pains detected
3 pains detected
4 pains detected
5 pains detected
6 pains detected
7 pains detected
8 pains detected
9 pains detected
10 pains detected
11 pains detected
12 pains detected
13 pains detected
14 pains detected
15 pains detected
16 pains detected
17 pains detected
18 pains detected
19 pains detected
20 pains detected
21 pains detected
22 pains detected
23 pains detected


In [None]:
post_pains_actives = post_autofluorescence_actives[~post_autofluorescence_actives['PUBCHEM_CID'].isin(pains_actives)]

In [None]:
if not os.path.exists(f'{data_folder}/before_finished/step_11/11_3'):
    os.makedirs(f'{data_folder}/before_finished/step_11/11_3')

post_pains_actives.to_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_actives.csv', index=False)
post_autofluorescence_inactives.to_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_inactives.csv', index=False)

# 12. Drug-likeness filter:

In [None]:
pre12_actives = pd.read_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_actives.csv', sep=',', header=0)
pre12_inactives = pd.read_csv(f'{data_folder}/before_finished/step_11/11_3/post_pains_inactives.csv', sep=',', header=0)

In [None]:
not_drug_actives = filters.drug_likeness_filter_multiprocessing(pre12_actives, smi_col, cid_col)

Processing SMILES: 100%|██████████| 211/211 [00:00<00:00, 4981.30it/s]


In [None]:
not_drug_inactives = filters.drug_likeness_filter_multiprocessing(pre12_inactives, smi_col, cid_col)

Processing SMILES: 100%|██████████| 324990/324990 [00:07<00:00, 42575.21it/s] 


In [None]:
post12_actives = pre12_actives[~pre12_actives['PUBCHEM_CID'].isin(not_drug_actives)]
post12_inactives = pre12_inactives[~pre12_inactives['PUBCHEM_CID'].isin(not_drug_inactives)]

print(f'Dropped {(len(not_drug_actives))} hit compounds that do not pass the drug likeness filter')
print(f'Dropped {(len(not_drug_inactives))} inactive compounds that do not pass the drug likeness filter')

Dropped 35 hit compounds that do not pass the drug likeness filter
Dropped 17290 inactive compounds that do not pass the drug likeness filter


In [None]:
if not os.path.exists(f'{data_folder}/before_finished/step_12'):
    os.makedirs(f'{data_folder}/before_finished/step_12')

#Export not_drug_actives and inactives:
with open(f'{data_folder}/before_finished/step_12/not_drug_actives.json', 'w') as f:
    json.dump(not_drug_actives, f)
with open(f'{data_folder}/before_finished/step_12/not_drug_inactives.json', 'w') as f:
    json.dump(not_drug_inactives, f)

In [None]:
# save: 
post12_actives.to_csv(f'{data_folder}/before_finished/step_12/post12_actives.csv', index=False)
post12_inactives.to_csv(f'{data_folder}/before_finished/step_12/post12_inactives.csv', index=False)

# 13. ChemBL Curation Pipeline

Besides PubChem, the ChEMBL database is one of several public databases containing bioactivity data on small molecule compounds curated from various sources. Incoming compounds are typically not standardized according to consistent rules. To maintain the quality of the final database and to facilitate the comparison and integration of data on the same compound from different sources, it is essential to appropriately standardize the chemical structures in the database. This chemical curation pipeline has been developed by Bento, A.P, et al., using the open-source toolkit RDKit, including a Checker module that tests the validity of chemical structures and flags any serious errors. For ChEMBL, a penalty score of 7 is considered a fatal error, and the molfile is not loaded into the database.

In our dataset, we passed all compounds through this Checker module to evaluate the validity of the included chemical structures. For compounds that returned a penalty score of 7, we will check them manually.

Reference:
Bento, A.P., Hersey, A., Félix, E. et al. An open source chemical structure curation pipeline using RDKit. J Cheminform 12, 51 (2020). https://doi.org/10.1186/s13321-020-00456-1

In [None]:
pre13_actives = pd.read_csv(f'{data_folder}/before_finished/step_12/post12_actives.csv', sep=',', header=0)
pre13_inactives = pd.read_csv(f'{data_folder}/before_finished/step_12/post12_inactives.csv', sep=',', header=0)

In [None]:
# Apply the ChemBL Curation Pipeline Checker module:
score_actives = utils.checker_multiprocessing(pre13_actives, smi_col, cid_col)
score_inactives = utils.checker_multiprocessing(pre13_inactives, smi_col, cid_col)

Processing SMILES: 100%|██████████| 176/176 [00:00<?, ?it/s]
[05:49:48] Conflicting single bond directions around double bond at index 7.
[05:49:48]   BondStereo set to STEREONONE and single bond directions set to NONE.
[05:50:18] Conflicting single bond directions around double bond at index 4.
[05:50:18]   BondStereo set to STEREONONE and single bond directions set to NONE.
[05:50:44] Conflicting single bond directions around double bond at index 5.
[05:50:44]   BondStereo set to STEREONONE and single bond directions set to NONE.
[05:50:45] Conflicting single bond directions around double bond at index 4.
[05:50:45]   BondStereo set to STEREONONE and single bond directions set to NONE.
[05:52:30] Conflicting single bond directions around double bond at index 4.
[05:52:30]   BondStereo set to STEREONONE and single bond directions set to NONE.
[05:55:27] Conflicting single bond directions around double bond at index 4.
[05:55:27]   BondStereo set to STEREONONE and single bond direction

In [None]:
#print all unique values in the dictionary:
print(set(score_actives.values()))
print(set(score_inactives.values()))

{0, 2}
{0, 2, 5, 6}


In [None]:
# Create a new step folder
if not os.path.exists(f'{data_folder}/before_finished/step_13'):
    os.makedirs(f'{data_folder}/before_finished/step_13')

# save the scores:
with open(f'{data_folder}/before_finished/step_13/score_actives.json', 'w') as f:
    json.dump(score_actives, f)
with open(f'{data_folder}/before_finished/step_13/score_inactives.json', 'w') as f:
    json.dump(score_inactives, f)

In [None]:
#drop all compounds with a penalty score of 7:
to_drop_actives = []
to_drop_inactives = []
for cid, penalty_score in score_actives.items():
    if penalty_score == 7:
        to_drop_actives.append(cid)
for cid, penalty_score in score_inactives.items():
    if penalty_score == 7:
        to_drop_inactives.append(cid)

post13_actives = pre13_actives[~pre13_actives[cid_col].isin(to_drop_actives)]
post13_inactives = pre13_inactives[~pre13_inactives[cid_col].isin(to_drop_inactives)]

In [None]:
#save final_hits and inactives:
post13_actives.to_csv(f'{data_folder}/before_finished/step_13/post13_actives.csv', index=False)
post13_inactives.to_csv(f'{data_folder}/before_finished/step_13/post13_inactives.csv', index=False)

# 14. Final handling of chemical representation

Any kind of molecular processing should come with a special attention to post-processing adjustment. This is because during molecular processing, changes in data representation and format could lead to inconsistency or redundancy in the datasets. Below are the two adjustments performed on our datasets. 

(1) Some of the InChIs in our datasets should be updated. This is because some of the InChI will be missing, since the PubChem Identifier Exchange service might not able to find the corresponding InChI for the aromatized, neutralized SMILES.

(2) Presence of some additional duplicates resulted from molecular processing: While handling mixtures, there might be some mixtures whose component molecules are identical. For example, mixtures of organic and inorganic molecules (molX-ionA and molX-ionB) after removed the ions (ionA and ionB) will result in duplicates (molX). Moreover, since the original mixtures are different, their activities could be different. Therefore, we also need to check their activities while handling these duplicates.
- If all duplicates share the same results (active/inactive), we keep one of them, since it is likely that the organic molecule kept contributed more significantly to the activity of the mixture. 
- If duplicates of the same molecules returned different activity, it is safer to remove both of them.

In [None]:
#import: 
pre14_actives = pd.read_csv(f'{data_folder}/before_finished/step_13/post13_actives.csv', sep=',', header=0)
pre14_inactives = pd.read_csv(f'{data_folder}/before_finished/step_13/post13_inactives.csv', sep=',', header=0)

## 14.1 Update InChI

In [None]:
def smi_to_inchi(smi):
    mol = Chem.MolFromSmiles(smi)
    inchi = Chem.inchi.MolToInchi(mol)
    return inchi

In [None]:
count = 0 
for index, row in pre14_actives.iterrows():
    if row['InChI'] != row['InChI']:
        pre14_actives.at[index, 'InChI'] = smi_to_inchi(row[smi_col])
        count += 1
print(f'Updated {count} InChI values in pre14_actives')

count = 0
for index, row in pre14_inactives.iterrows():
    if row['InChI'] != row['InChI']:
        pre14_inactives.at[index, 'InChI'] = smi_to_inchi(row[smi_col])
        count += 1
print(f'Updated {count} InChI values in pre14_inactives')

Updated 0 InChI values in pre14_hits


































Updated 69 InChI values in pre14_inactives







## 14.2 Handle duplicates

In [None]:
#Check if a mol in active set appeared in inactive set:
for i in pre14_actives[smi_col]:
    if i in list(pre14_inactives[smi_col]):
        print(f'{i} SMILES appeared in both active and inactive sets')
for i in pre14_actives['InChI']:
    if i in list(pre14_inactives['InChI']):
        print(f'{i} InChI appeared in both active and inactive sets')

#Return all duplicates by comparing InChI:
final_actives_duplicates_InChI = pre14_actives[pre14_actives.duplicated(subset=['InChI'], keep=False)]
final_inactives_duplicates_InChI = pre14_inactives[pre14_inactives.duplicated(subset=['InChI'], keep=False)]
final_actives_duplicates_smi = pre14_actives[pre14_actives.duplicated(subset=[smi_col], keep=False)]
final_inactives_duplicates_smi = pre14_inactives[pre14_inactives.duplicated(subset=[smi_col], keep=False)]

print('Number of InChI duplicates in actives: ', len(final_actives_duplicates_InChI))
print('Number of InChI duplicates in inactives: ', len(final_inactives_duplicates_InChI))
print('Number of SMILES duplicates in actives: ', len(final_actives_duplicates_smi))
print('Number of SMILES duplicates in inactives: ', len(final_inactives_duplicates_smi))

In [None]:
if not os.path.exists(f'{data_folder}/before_finished/step_14'):
    os.makedirs(f'{data_folder}/before_finished/step_14')

#write all the duplicates to a file:
#write duplicates to a txt file: 
with open(f'{data_folder}/before_finished/step_14/duplicates.txt', 'w') as f:
    f.write('InChI duplicates in actives: \n')
    f.write(final_actives_duplicates_InChI.to_string())
    f.write('\n\n')
    f.write('InChI duplicates in inactives: \n')
    f.write(final_inactives_duplicates_InChI.to_string())
    f.write('\n\n')
    f.write('SMILES duplicates in actives: \n')
    f.write(final_actives_duplicates_smi.to_string())
    f.write('\n\n')
    f.write('SMILES duplicates in inactives: \n')
    f.write(final_inactives_duplicates_smi.to_string())

In [None]:
#remove these duplicates, keep the first one: 
#by inchi:
final_actives = pre14_actives.drop_duplicates(subset=['InChI'], keep='first')
final_inactives = pre14_inactives.drop_duplicates(subset=['InChI'], keep='first')

In [None]:
if len(final_actives[final_actives.duplicated(subset=[smi_col], keep=False)]) == 0:
    print('No more duplicates in hits')

if len(final_inactives[final_inactives.duplicated(subset=[smi_col], keep=False)]) == 0:
    print('No more duplicates in inactives')

No more duplicates in hits
No more duplicates in inactives


In [None]:
# save: 
if not os.path.exists(f'{data_folder}/finished'):
    os.makedirs(f'{data_folder}/finished')
final_actives.to_csv(f'{data_folder}/finished/final_actives.csv',sep=',', index=False)
final_inactives.to_csv(f'{data_folder}/finished/final_inactives.csv',sep=',', index=False)

# Some additional modifications

## A. Import Regression Data (only for regression datasets) & Adjust Column Names

In [None]:
final_actives = pd.read_csv(f'{data_folder}/finished/final_actives.csv', sep=",", header=0)
final_inactives = pd.read_csv(f'{data_folder}/finished/final_inactives.csv', sep=",", header=0)

In [None]:
#Data to be extracted from the assay:
regression_type = 'IC50'
col_list = ['PUBCHEM_CID', regression_type]
float_available_AIDs = [504699, 504701]

count = 0
for AID in float_available_AIDs:
    url = f'https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?query=download&record_type=datatable&actvty=all&response_type=save&aid={AID}'
    assay = pd.read_csv(url, usecols=col_list)

    #delete rows with nan values
    assay = assay.dropna(subset=col_list)

    #convert regression data to float
    assay[regression_type] = assay[regression_type].astype(float)

    #convert cids to int:
    assay['PUBCHEM_CID'] = assay['PUBCHEM_CID'].astype(int)

    #reindex assay dataframe from 0:
    assay.reset_index(drop=True, inplace=True)

    #Create a new folder to save the data (with relative path):
    if not os.path.exists(f'{data_folder}/before_finished/regression_data'):
        os.makedirs(f'{data_folder}/before_finished/regression_data')
    assay.to_csv(f'{data_folder}/before_finished/regression_data/AID{AID}_{regression_type}.csv', index=False)
    count += 1
    print(f'{count} out of {len(float_available_AIDs)} complete')

1 out of 2 complete
2 out of 2 complete


In [None]:
for AID in float_available_AIDs:
    exec(f'AID{AID}_regression = pd.read_csv("{data_folder}/before_finished/regression_data/AID{AID}_{regression_type}.csv", sep=",", header=0)')

In [None]:
regression_CIDs = AID504699_regression['PUBCHEM_CID'].tolist() + AID504701_regression['PUBCHEM_CID'].tolist()
#drop duplicates:
regression_CIDs = list(set(regression_CIDs))

In [None]:
#Make the regression data a dictionary:
regression_dict1 = {}
for index, row in AID504699_regression.iterrows():
    regression_dict1[row['PUBCHEM_CID']] = row['IC50']

regression_dict2 = {}
for index, row in AID504701_regression.iterrows():
    regression_dict2[row['PUBCHEM_CID']] = row['IC50']

In [None]:
regression_dict = {}
for CID in regression_CIDs:
    if CID in regression_dict1 and CID in regression_dict2:
        regression_dict[CID] = (regression_dict1[CID] + regression_dict2[CID]) / 2
    elif CID in regression_dict1 and CID not in regression_dict2:
        regression_dict[CID] = regression_dict1[CID]
    elif CID not in regression_dict1 and CID in regression_dict2:
        regression_dict[CID] = regression_dict2[CID]

In [None]:
cut_off = 10 #note the cut off value for hits
inactive_fixed = 1000 #set a fixed value for inactives
unit = 'uM' #unit of the regression values
URLs = []
for AID in float_available_AIDs:
    URL = f'https://pubchem.ncbi.nlm.nih.gov/bioassay/{AID}'
    URLs.append(URL)

In [None]:
#Export the regression dict:
with open(f'{data_folder}/before_finished/regression_data/regression_dict.json', 'w') as f:
    json.dump(regression_dict, f)

with open(f'{data_folder}/before_finished/regression_data/activity_value_details.txt', 'w') as f:
    f.write(f"""The regression data was extracted from AIDs: {float_available_AIDs}.
The {regression_type} values were averaged if there were multiple assays.
The final {regression_type} data exported for {len(regression_dict)} unique compounds, parsable from column "activity_value" in the .csv files with unit {unit}.
The range of {regression_type} values reported were from {min(regression_dict.values())} to {max(regression_dict.values())}.
For final inactives, the {regression_type} value was set to {inactive_fixed} {unit} if dose-response info was not available. This value could be modified by users.
The cut-off for activity was {cut_off} uM, which was retrieved from the PubChem database from the following URL(s):
""")
    f.write(f'\n'.join(URLs))

In [None]:
for index, row in final_actives.iterrows():
    if row['PUBCHEM_CID'] in regression_dict:
        final_actives.at[index, 'activity_value'] = regression_dict[row['PUBCHEM_CID']]

for index, row in final_inactives.iterrows():
    if row['PUBCHEM_CID'] in regression_dict:
        final_inactives.at[index, 'activity_value'] = regression_dict[row['PUBCHEM_CID']]
    else:
        final_inactives.at[index, 'activity_value'] = inactive_fixed

In [None]:
# Rename the columns:
final_actives = final_actives.rename(columns={
    'PUBCHEM_CID': 'CID',
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
    'Mol removed from mixture': 'mol_removed_from_mixture',
    'Small inorganic molecule': 'small_inorganic_mol_from_mixture',
    'Small organic molecule': 'small_organic_mol_from_mixture'
})
final_inactives = final_inactives.rename(columns={
    'PUBCHEM_CID': 'CID',
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
    'Mol removed from mixture': 'mol_removed_from_mixture',
    'Small inorganic molecule': 'small_inorganic_mol_from_mixture',
    'Small organic molecule': 'small_organic_mol_from_mixture'
})

In [None]:
#swap the positions of the columns InChI and activity_outcome:
final_actives = final_actives[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]
final_inactives = final_inactives[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]

In [None]:
#export:
final_actives.to_csv(f'{data_folder}/finished/final_actives.csv', sep=',', index=False)
final_inactives.to_csv(f'{data_folder}/finished/final_inactives.csv', sep=',', index=False)

## B. Compile Control Data

This part of the notebook includes our complilation of the control data (poorly curated data without applying hierarchical curation, molecular processing, or filters). Data format should be similar to curated data for convenience.

In [None]:
# Import raw data from the primary screen(s)
raw1 = pd.read_csv(f'{data_folder}/before_finished/step_1/AID485270.csv', sep=',', header=0)
raw2 = pd.read_csv(f'{data_folder}/before_finished/step_1/AID434989.csv', sep=',', header=0)

# Import Inchi:
std_inchi1 = pd.read_csv(f'{data_folder}/before_finished/step_3/std_inchi_485270.txt', sep='\t', header=None)
std_inchi2 = pd.read_csv(f'{data_folder}/before_finished/step_3/std_inchi_434989.txt', sep='\t', header=None)

In [None]:
# Update the raw data with Inchi
raw_inchi_dict1 = dict(zip(std_inchi1[0], std_inchi1[1]))
raw1['InChI'] = raw1['PUBCHEM_CID'].map(raw_inchi_dict1)

raw_inchi_dict2 = dict(zip(std_inchi2[0], std_inchi2[1]))
raw2['InChI'] = raw2['PUBCHEM_CID'].map(raw_inchi_dict2)

In [None]:
raw = pd.concat([raw1, raw2[~raw2['PUBCHEM_CID'].isin(raw1['PUBCHEM_CID'])]], ignore_index=True)

In [None]:
raw_hits = raw[raw['PUBCHEM_ACTIVITY_OUTCOME'] == 'Active']
raw_inactives = raw[raw['PUBCHEM_ACTIVITY_OUTCOME'] == 'Inactive']

In [None]:
# Get the average activity values of hits:
final_hits = pd.read_csv(f'{data_folder}/finished/final_hits.csv', sep=',', header=0)
activity_values = final_hits['activity_value'].tolist()
average_activity_value = sum(activity_values) / len(activity_values)

In [None]:
# Update the activity values:
for index, row in raw_hits.iterrows():
    if row['PUBCHEM_CID'] in regression_dict:
        raw_hits.loc[index, 'activity_value'] = regression_dict[row['PUBCHEM_CID']]
    else:
        raw_hits.loc[index, 'activity_value'] = average_activity_value

for index, row in raw_inactives.iterrows():
    if row['PUBCHEM_CID'] in regression_dict:
        raw_inactives.loc[index, 'activity_value'] = regression_dict[row['PUBCHEM_CID']]
    else:
        raw_inactives.loc[index, 'activity_value'] = inactive_fixed

In [None]:
# Add some other columns to match the format of the curated data:
# raw_hits.loc[:, 'activity_value'] = np.nan # Comment out if using regression
raw_hits.loc[:, 'mol_removed_from_mixture'] = np.nan
raw_hits.loc[:, 'small_inorganic_mol_from_mixture'] = np.nan
raw_hits.loc[:, 'small_organic_mol_from_mixture'] = np.nan

# raw_inactives.loc[:, 'activity_value'] = np.nan # Comment out if using regression
raw_inactives.loc[:, 'mol_removed_from_mixture'] = np.nan
raw_inactives.loc[:, 'small_inorganic_mol_from_mixture'] = np.nan
raw_inactives.loc[:, 'small_organic_mol_from_mixture'] = np.nan

In [None]:
# Rename the columns:
raw_hits = raw_hits.rename(columns={
    'PUBCHEM_CID': 'CID',
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
})
raw_inactives = raw_inactives.rename(columns={
    'PUBCHEM_CID': 'CID',
    'PUBCHEM_ACTIVITY_OUTCOME': 'activity_outcome',
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'SMILES',
})

In [None]:
#swap the positions of the columns InChI and activity_outcome:
raw_hits = raw_hits[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]
raw_inactives = raw_inactives[['CID', 'SMILES', 'InChI', 'activity_outcome', 'activity_value', 'mol_removed_from_mixture', 'small_inorganic_mol_from_mixture', 'small_organic_mol_from_mixture']]

In [None]:
if not os.path.exists(f'{data_folder}/finished/control_data'):
    os.makedirs(f'{data_folder}/finished/control_data')

#save the hits and inactives
raw_hits.to_csv(f'{data_folder}/finished/control_data/raw_hits.csv', sep=',', index=False)
raw_inactives.to_csv(f'{data_folder}/finished/control_data/raw_inactives.csv', sep=',', index=False)

#save as txt:
raw_hits.to_csv(f'{data_folder}/finished/control_data/raw_hits.txt', sep=';', index=False, header=False)
raw_inactives.to_csv(f'{data_folder}/finished/control_data/raw_inactives.txt', sep=';', index=False, header=False)