In [1]:
#importing necessary dependencies
#rdkit is a cheminformatics toolkit

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [2]:
#this is the data produced by the data_collection file
df = pd.read_csv('data2.csv')
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,128.0,active
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,220.0,active
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,8790.0,intermeidate
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,1910.0,intermeidate
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,30000.0,inactive
...,...,...,...,...
5632,,,,intermeidate
5633,,,,active
5634,,,,active
5635,,,,active


In [3]:
# removing the rows that have NaN in canonical_smiles or in bioactivity

df1 = df[df.canonical_smiles.notna() & df.bioactivity.notna()]
df1.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,128.0,active
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,220.0,active
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,8790.0,intermeidate
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,1910.0,intermeidate
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,30000.0,inactive


## Lipinski's Rule of Five

This is a guideline to assess the drug-likeness of a molecule, indicating its potential for good absorption and permeation.
The rule states that a molecule is more likely to be orally active if it has: 

1) a molecular weight less than 500 Daltons
2) no more than 5 hydrogen bond donors
3) no more than 10 hydrogen bond acceptors
4) an octanol-water partition coefficient (LogP) less than 5

* Chem.MolFromSmiles(elem): Converts each SMILES string into an RDKit molecule object.
* Descriptors.MolWt(mol): Calculates the Molecular Weight.
* Descriptors.MolLogP(mol): Calculates the Octanol-water partition coefficient (LogP).
* Lipinski.NumHDonors(mol): Counts the number of Hydrogen Bond Donors.
* Lipinski.NumHAcceptors(mol): Counts the number of Hydrogen Bond Acceptors.

In [4]:
#function takes a list of SMILES strings (Simplified Molecular Input Line Entry System)
#the function will return a pandas DataFrame with MW, LogP, NumHDonors, NumHAcceptors
#an empty input returns an empty df; a SMILES that RDKit cannot parse gives a row of NaN

def lipinski(smiles_list):
    """Compute the four Lipinski descriptors for a collection of SMILES strings.

    Parameters
    ----------
    smiles_list : list, tuple, numpy array or pandas Series of str
        One SMILES string per molecule. ``None`` is treated as an empty input.

    Returns
    -------
    pandas.DataFrame
        Columns "MW", "LogP", "NumHDonors", "NumHAcceptors", one row per input
        element, in input order. An element that is not a string (e.g. NaN or
        None) or that RDKit cannot parse yields a row of NaN. When the input is
        a pandas Series the result carries the same index, so it can be joined
        back onto the frame the SMILES came from; otherwise the index is the
        default 0..n-1. An empty input gives an empty frame with the four
        columns.
    """
    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    # Remember the caller's row labels before flattening to a plain list.
    index = smiles_list.index if isinstance(smiles_list, pd.Series) else None
    smiles_values = [] if smiles_list is None else list(smiles_list)

    # Works for any container type, not only objects exposing `.empty`.
    if not smiles_values:
        return pd.DataFrame(columns=columnNames)

    descriptor_data = []
    for element in smiles_values:
        # MolFromSmiles only accepts strings; missing values (NaN/None) are
        # treated like unparsable SMILES instead of raising.
        mol = Chem.MolFromSmiles(element) if isinstance(element, str) else None
        if mol is None:
            # keep a placeholder row so output rows stay aligned with the input
            descriptor_data.append([np.nan] * len(columnNames))
            continue
        descriptor_data.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
        ])

    return pd.DataFrame(data=descriptor_data, columns=columnNames, index=index)

In [5]:
#applying the function on the canonical_smiles column

df_lipinski = lipinski(df1.canonical_smiles)
df_lipinski

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,576.742,5.28050,1,8
1,562.715,5.03450,1,8
2,543.672,4.50748,1,8
3,543.672,4.36498,1,8
4,493.615,4.59032,2,7
...,...,...,...,...
5372,479.501,-0.57000,5,11
5373,470.534,-0.46250,5,11
5374,471.518,-0.15700,5,11
5375,442.480,-1.14650,4,11


In [6]:
# combine the two datasets: we want the new Lipinski descriptors added as columns
# lipinski() emits one row per input SMILES, in order, so the frames are matched
# by position. Both indexes are reset because concat(axis=1) aligns on index
# labels, and df1 still carries the labels of the unfiltered df.

assert len(df1) == len(df_lipinski), "descriptor rows must match the filtered data"
df_combined = pd.concat(
    [df1.reset_index(drop=True), df_lipinski.reset_index(drop=True)],
    axis=1,
)
df_combined.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,128.0,active,576.742,5.2805,1,8
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,220.0,active,562.715,5.0345,1,8
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,8790.0,intermeidate,543.672,4.50748,1,8
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,1910.0,intermeidate,543.672,4.36498,1,8
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,30000.0,inactive,493.615,4.59032,2,7


In [7]:
#removing the rows whose descriptors could not be computed (MW is NaN)
df2 = df_combined[df_combined.MW.notna()]

In [8]:
#converting the nM values to M and then to pIC50 = -log10(value in M)
# we are using vectorized operations (whole column at once, no Python loop)

def calculate_pIC50(data_frame):
    """Return a copy of ``data_frame`` with 'standard_value_norm' replaced by 'pIC50'.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a numeric 'standard_value_norm' column. The values are
        multiplied by 1e-9 (nM -> M per the notebook's convention) before the
        logarithm is taken.

    Returns
    -------
    pandas.DataFrame
        A new frame with a 'pIC50' column equal to
        ``-log10(standard_value_norm * 1e-9)`` and without the
        'standard_value_norm' column. The input frame is left untouched.

    Raises
    ------
    KeyError
        If 'standard_value_norm' is missing.
    """
    # Calculate molar concentration for the entire column at once
    molar_concentration = data_frame['standard_value_norm'] * (10**-9)

    # assign() builds a new frame, so the caller's frame does not silently
    # gain a 'pIC50' column as a side effect.
    return (
        data_frame
        .assign(pIC50=-np.log10(molar_concentration))
        .drop(columns=['standard_value_norm'])
    )
        

In [9]:
# Normalizing the 'standard_value' column by capping values above 100,000,000 and creating a new 'standard_value_norm' column.
# the argument here is a dataframe containing the 'standard_value' column

def normalize_value(data_frame, cap=100000000):
    """Return a copy of ``data_frame`` with 'standard_value' capped and renamed.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a numeric 'standard_value' column.
    cap : int or float, default 100000000
        Upper limit; every value greater than ``cap`` is replaced by ``cap``.
        Missing values stay missing.

    Returns
    -------
    pandas.DataFrame
        A new frame with a 'standard_value_norm' column holding the capped
        values and without the 'standard_value' column. The input frame is
        left untouched.

    Raises
    ------
    KeyError
        If 'standard_value' is missing.
    """
    # clip(upper=cap) replaces only values above the cap
    capped = data_frame['standard_value'].clip(upper=cap)

    # assign() + drop() build a new frame instead of writing into the caller's
    return (
        data_frame
        .assign(standard_value_norm=capped)
        .drop(columns=['standard_value'])
    )
    

In [10]:
# the .copy() ensures we do not change the original data (df2)

df_norm = normalize_value(df2.copy())
df_norm.describe()
df_norm

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity,MW,LogP,NumHDonors,NumHAcceptors,standard_value_norm
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,active,576.742,5.28050,1,8,128.0
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,active,562.715,5.03450,1,8,220.0
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,intermeidate,543.672,4.50748,1,8,8790.0
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,intermeidate,543.672,4.36498,1,8,1910.0
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,inactive,493.615,4.59032,2,7,30000.0
...,...,...,...,...,...,...,...,...
5372,CHEMBL5440493,Cn1ccc(-c2cn([C@@H]3O[C@H](C(=O)NC4CCc5nc[nH]c...,active,479.501,-0.57000,5,11,1000.0
5373,CHEMBL5432989,C[C@H](NC(=O)[C@H]1O[C@@H](n2cc(-c3ccn(C)n3)c3...,active,470.534,-0.46250,5,11,1000.0
5374,CHEMBL5439237,Cn1ccc(-c2cn([C@@H]3O[C@H](C(=O)N[C@H]4CC[C@](...,active,471.518,-0.15700,5,11,1000.0
5375,CHEMBL5396892,CN1CC[C@H](NC(=O)[C@H]2O[C@@H](n3cc(-c4ccn(C)n...,active,442.480,-1.14650,4,11,1000.0


In [11]:
#converting IC50 to pIC50 by calling the earlier function
#again using .copy() to avoid affecting my data

df3 = calculate_pIC50(df_norm.copy())

df3
df3.pIC50.describe()

count    5377.000000
mean        6.546059
std         1.432280
min         1.900000
25%         5.000000
50%         6.443697
75%         7.721246
max        10.823909
Name: pIC50, dtype: float64

In [16]:
# removing the intermediate rows from the bioactivity column
# (the label is spelled 'intermeidate' in the data, so the filter must use that spelling)

df4 = df3[df3['bioactivity'] != 'intermeidate']
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,active,576.742,5.28050,1,8,6.892790
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,active,562.715,5.03450,1,8,6.657577
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,inactive,493.615,4.59032,2,7,4.522879
5,CHEMBL124035,COCCOc1cc2ncnc(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC...,active,495.580,3.80490,1,8,6.346787
6,CHEMBL125898,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,active,564.687,3.88080,1,9,7.397940
...,...,...,...,...,...,...,...,...
5372,CHEMBL5440493,Cn1ccc(-c2cn([C@@H]3O[C@H](C(=O)NC4CCc5nc[nH]c...,active,479.501,-0.57000,5,11,6.000000
5373,CHEMBL5432989,C[C@H](NC(=O)[C@H]1O[C@@H](n2cc(-c3ccn(C)n3)c3...,active,470.534,-0.46250,5,11,6.000000
5374,CHEMBL5439237,Cn1ccc(-c2cn([C@@H]3O[C@H](C(=O)N[C@H]4CC[C@](...,active,471.518,-0.15700,5,11,6.000000
5375,CHEMBL5396892,CN1CC[C@H](NC(=O)[C@H]2O[C@@H](n3cc(-c4ccn(C)n...,active,442.480,-1.14650,4,11,6.000000


In [17]:
df4.bioactivity.value_counts()

bioactivity
active      3181
inactive    1480
Name: count, dtype: int64

In [18]:
df4.to_csv('cleaned_drug_data.csv')