<a href="https://colab.research.google.com/github/HEK-Research/Multitask-Deep-Learning-Affinity-Prediction/blob/Luis/00_Active_Compound_Curation_from_ChEMBL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Do NOT edit this notebook.
## It is shared to show the method I used to curate active compounds from the original ChEMBL bioactivity dataset for an individual target.

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Directory prefix for the ChEMBL CSV files. File names are appended to it with '+',
# so the trailing '/' must be kept.
file_path = '/content/drive/MyDrive/Project_4_MTDNN/ChEMBL Datasets/'

### Filter the dataset with multiple criteria:
Filters:
* 1	Human targets (Homo sapiens), single protein (target confidence score: 9) 
* 2	Only standard potency measurements (for example: EC50, IC50, Kd, Ki) were considered 
* 3	All compounds annotated as (‘inactive’, ‘not active’, ‘inconclusive’, ‘potential transcription error’, or ‘pan assay interference compounds (PAINS)’) were discarded.
* 4	Only compounds with reported direct interactions (assay type: “B”, i.e. the "Assay Type" column equals "B")
* 5	Exact activity measurements (“=”)
* 6	Molecular weight <= 1000 Da 
* 7	pChEMBL value (>=6, as "Active")

In [1]:
# For step 2. 
# Use groupby analysis to identify "Standard Type" with only numerical "pChEMBL Value"
# Return a list of selected "Standard Type"
def Select_Standard_Type(DF):
    """Return the "Standard Type" values whose "pChEMBL Value" entries are all numeric.

    The frame is grouped by "Standard Type". Within each group the missing
    pChEMBL values are dropped, and the group is kept only if every remaining
    value is a real number (Python or NumPy int/float, but not bool). A
    one-line verdict is printed for every group, followed by the final list.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain the columns "Standard Type" and "pChEMBL Value".

    Returns
    -------
    list
        The selected "Standard Type" names, in groupby iteration order.
    """
    # Integers count as numerical too: an integer-valued column (or ints stored
    # in an object column) must not be reported as "mixed data types".
    numeric_types = (int, float, np.integer, np.floating)
    # bool is a subclass of int, so it has to be excluded explicitly.
    boolean_types = (bool, np.bool_)

    # group the dataframe by type and inspect the pchembl_value column
    Grouped = DF.groupby('Standard Type')['pChEMBL Value']

    Standard_Type = []

    # check if any group has only numerical values or only None values
    for group_name, group_values in Grouped:
        present_values = group_values.dropna()
        if present_values.empty:
            print(f"All values in group {group_name} are None.")
        elif all(
            isinstance(val, numeric_types) and not isinstance(val, boolean_types)
            for val in present_values
        ):
            print(f"All values in group {group_name} are numerical.")
            Standard_Type.append(group_name)
        else:
            print(f"Group {group_name} has mixed data types.")

    print("Standard potency measurement types to keep are:", Standard_Type)

    return Standard_Type

# For step 7. 
# Some compounds have multiple bioactivity data reported. 
# The mean and standard deviation (std) of the pChEMBL Values are calculated for each unique compound; any compound with disagreeing pChEMBL Values (large std) is discarded
# 'mean' is merged to the dataset
def Mean_Std_pChEMBL_Value(Filtered_DF, std_threshold=2.0):
    """Collapse the dataset to one row per compound and attach the mean pChEMBL value.

    The mean and standard deviation of "pChEMBL Value" are computed per
    "Molecule ChEMBL ID". Compounds whose std exceeds ``std_threshold`` are
    discarded; compounds with a single measurement (std is NaN) are kept.
    For each remaining compound only the first row is retained.

    Parameters
    ----------
    Filtered_DF : pandas.DataFrame
        Must contain the columns "Molecule ChEMBL ID" and "pChEMBL Value".
    std_threshold : float, optional
        Largest per-compound std of pChEMBL values that is still accepted.
        Defaults to 2.0.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Molecule ChEMBL ID" (unique, in groupby order), holding the
        remaining original columns plus a "mean" column.
    """
    print("Number of unique compounds: ", Filtered_DF['Molecule ChEMBL ID'].nunique())

    # Per-compound mean and standard deviation of "pChEMBL Value"
    Compound_Stats = Filtered_DF.groupby('Molecule ChEMBL ID')['pChEMBL Value'].agg(['mean', 'std'])

    # Number of unique compounds whose measurements disagree too much
    n_drop = int((Compound_Stats['std'] > std_threshold).sum())
    print("There are", n_drop, f"compounds with std pChEMBL Value > {std_threshold}")

    # Keep compounds with an acceptable std; NaN std means a single measurement
    keep_mask = Compound_Stats['std'].isna() | (Compound_Stats['std'] <= std_threshold)
    Remaining_IDs = Compound_Stats.loc[keep_mask].index.tolist()
    print("Number of unique compounds to keep: ", len(Remaining_IDs))

    # Attach the per-compound mean to every measurement row
    Merged_DF = pd.merge(
        Filtered_DF,
        Compound_Stats[['mean']],
        left_on='Molecule ChEMBL ID',
        right_index=True,
        how='left',
    )
    # Keep only the rows of the retained compounds
    Merged_DF_subset = Merged_DF.set_index('Molecule ChEMBL ID').loc[Remaining_IDs]
    # Drop duplicates based on the index ('Molecule ChEMBL ID'), keeping the first row
    Merged_DF_unique = Merged_DF_subset[~Merged_DF_subset.index.duplicated(keep='first')]

    return Merged_DF_unique

# Step 1 - 7
# This function read in the original dataset, then return the filtered dataset with unique compounds. 
# Additionally, also clean out the smiles of all unique compounds. 
def get_filtered_df(DF):
    """Filter a ChEMBL bioactivity table down to unique active compounds.

    Filters 2-7 of the curation protocol are applied here (filter 1 is not
    performed by this function -- presumably it is done when the data is
    exported; TODO confirm):

    * step 2: keep only "Standard Type" values selected by ``Select_Standard_Type``
    * step 3: keep only rows whose "Comment" is empty, purely numeric, or
      exactly "Active"/"active"
    * step 4: keep only rows with "Assay Type" equal to "B"
    * step 5: keep only rows with "Standard Relation" equal to "'='"
    * step 6: keep only rows with "Molecular Weight" <= 1000
    * step 7: drop rows without a "pChEMBL Value", collapse to one row per
      compound via ``Mean_Std_pChEMBL_Value`` and keep compounds whose mean
      pChEMBL value is >= 6

    Finally, rows without "Smiles" are dropped and multi-component SMILES
    (containing ".") are reduced to their longest component. Progress is
    printed along the way.

    Parameters
    ----------
    DF : pandas.DataFrame
        Original bioactivity dataset. Must contain the columns
        "Molecule ChEMBL ID", "Molecular Weight", "Standard Type",
        "Standard Relation", "pChEMBL Value", "Comment", "Assay Type" and
        "Smiles". It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per retained compound, as returned by
        ``Mean_Std_pChEMBL_Value`` (including its "mean" column), with
        cleaned "Smiles".
    """
    # Print out the first 20 rows of the original dataset
    print("*"*20,"Original bioactivity dataset", "*"*20)
    print(DF.loc[:,['Molecular Weight','Standard Type','pChEMBL Value','Comment','Assay Type']].head(20))
    print("Number of bioactivity data points:", DF.shape[0])
    print("Number of unique compounds:",DF['Molecule ChEMBL ID'].nunique())
    print("\n")

    # Work on a copy so the column conversions below do not alter the caller's frame
    DF = DF.copy()

    # For step 6
    # Convert 'Molecular Weight' to float, since we need to filter out compounds that are too large.
    # errors='coerce' already turns unparseable entries such as 'None' into NaN.
    DF['Molecular Weight'] = pd.to_numeric(DF['Molecular Weight'], errors='coerce').astype(float)

    # For step 3
    # Convert comments that are acceptable to NaN, then only keep NaN comments:
    #   - comments made of digits and nothing else
    #   - comments that are exactly 'Active' or 'active'
    # The patterns are anchored on purpose: an unanchored 'active' would also match
    # 'inactive' / 'Not Active' and let those records through.
    DF['Comment'] = DF['Comment'].replace(
        to_replace=[r'^\d+$', r'^(?:Active|active)$'], value=np.nan, regex=True
    )

    # For step 2
    # Identify the 'Standard Type' that has only numerical 'pChEMBL Value' and keep only those
    print("Calling function Select_Standard_Type")
    Standard_Type = Select_Standard_Type(DF)

    # Apply the data filters
    Filtered_DF = DF[DF['Comment'].isna()] # Step 3: Drop any non NaN comments
    Filtered_DF = Filtered_DF[Filtered_DF['Assay Type']=='B'] # Step 4: Keep only 'B' Assay Type
    Filtered_DF = Filtered_DF[Filtered_DF['Standard Relation']=="'='"] # Step 5: the relation is stored with its quotes
    Filtered_DF = Filtered_DF.dropna(subset=['pChEMBL Value']) # Step 7: Drop any NaN pChEMBL Values
    Filtered_DF = Filtered_DF[Filtered_DF['Standard Type'].isin(Standard_Type)] # Step 2

    Filtered_DF = Filtered_DF[Filtered_DF['Molecular Weight']<=1000] # Step 6

    # Print out the first 20 rows of the filtered dataset with above dataframe sub settings
    print("\n")
    print("*"*20,"Filtered bioactivity dataset", "*"*20)
    print(Filtered_DF.loc[:,['Molecular Weight','Standard Type','pChEMBL Value','Comment','Assay Type']].head(20))
    print("Number of bioactivity data points:",Filtered_DF.shape[0])
    print("Number of unique compounds:",Filtered_DF['Molecule ChEMBL ID'].nunique())

    print("\n")
    print("*"*20,"Filtered unique bioactivity dataset", "*"*20)
    print("Calling function Mean_Std_pChEMBL_Value")
    Merged_DF_unique = Mean_Std_pChEMBL_Value(Filtered_DF)

    print("\n")
    print("*"*20,"Filtered unique bioactivity dataset with pChEMBL Value >= 6.0", "*"*20)
    Merged_DF_unique = Merged_DF_unique[Merged_DF_unique['mean']>=6] # Step 7
    print("Final Number of Active:",Merged_DF_unique.shape[0])

    # Drop any row without 'Smiles'
    print("\n")
    print("*"*20,"Process and clean SMILES", "*"*20)
    # Explicit copy so the column assignment below never writes into a slice of another frame
    Merged_DF_unique = Merged_DF_unique.dropna(subset=['Smiles']).copy()

    # For molecule that has "." in the "Smiles" string, split the "Smiles" and keep the longest component
    Merged_DF_unique['Smiles'] = Merged_DF_unique['Smiles'].apply(lambda x: max(x.split("."), key=len))
    # Counted after the cleaning step above, so this reports how many '.' remain
    print("There are", sum(Merged_DF_unique['Smiles'].str.find('.') != -1), "records with '.' in their canonical smiles string")

    return Merged_DF_unique

In [4]:
# Load the bioactivity file CHEMBL3371.csv from file_path, parsed with ';' as the field delimiter.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of per chunk.
df = pd.read_csv(file_path+"CHEMBL3371.csv", delimiter=';', skiprows=0, low_memory=False)

NameError: ignored

In [None]:
# Run the full curation pipeline on the loaded dataset; the result holds one row per retained active compound.
filtered_df_unique = get_filtered_df(df)