In [2]:
``
## ---------------------------------------------------------------- ##
## --------- Feature Engineering with Cell Based Format ----------- ##
## ---------------------------------------------------------------- ##

import numpy as np
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors
import plotly.graph_objects as go


# Load the activity table from disk. The recorded preview shows a
# `canonical_smiles` column and a `-log(M)` column, plus two leftover
# `Unnamed: 0*` index columns from earlier CSV round-trips.
CBF_CSV_PATH = "erbb1_singleprotein_neglog10_ic50.csv"
cbf_df = pd.read_csv(filepath_or_buffer=CBF_CSV_PATH)

# Keep this as the cell's last expression so the notebook renders the preview.
cbf_df.head(n=5)


Unnamed: 0.1,Unnamed: 0,canonical_smiles,-log(M)
0,0,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,7.387216
1,3,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,6.769551
2,6,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,5.031517
3,9,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],4.017729
4,10,Cc1cc(C(=O)NCCN2CCOCC2)[nH]c1/C=C1\C(=O)N(C)c2...,5.274905


In [3]:
def RDkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` per SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    Mol_descriptors : list of tuple
        One tuple of descriptor values per input entry, in input order.
        Entries that are not strings, or that RDKit cannot parse, yield a
        tuple of NaN (and a warning) so rows stay aligned with the input
        instead of aborting the whole batch.
    desc_names : sequence of str
        Descriptor names as reported by ``calc.GetDescriptorNames()``; use
        them as column labels for ``Mol_descriptors``.
    """
    import warnings

    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()

    # Placeholder row for molecules that cannot be built; same width as a
    # real descriptor row so the result still forms a rectangular table.
    missing_row = (float("nan"),) * len(desc_names)

    Mol_descriptors = []
    for position, smi in enumerate(smiles):
        # MolFromSmiles returns None for an unparsable SMILES; passing that
        # None on to AddHs would raise. Non-string entries (e.g. NaN from a
        # blank CSV cell) are treated as unparsable too.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            warnings.warn(
                f"Could not parse SMILES at position {position}: {smi!r}; "
                "descriptor row filled with NaN."
            )
            Mol_descriptors.append(missing_row)
            continue

        # add hydrogens to molecules
        mol = Chem.AddHs(mol)
        # Calculate every descriptor known to the calculator (the exact
        # count depends on the installed RDKit version).
        Mol_descriptors.append(calc.CalcDescriptors(mol))
    return Mol_descriptors, desc_names

# Compute one descriptor row per SMILES in the activity table.
Mol_descriptors, desc_names = RDkit_descriptors(cbf_df.loc[:, 'canonical_smiles'])

# Assemble the rows into a labelled table. The variable name says "200",
# but the real column count is whatever RDkit_descriptors returned.
df_with_200_descriptors = pd.DataFrame(data=Mol_descriptors, columns=desc_names)

# Persist the descriptor matrix without the row index.
df_with_200_descriptors.to_csv("cellbased_descriptors.csv", index=False)

# Report (rows, columns).
print(df_with_200_descriptors.shape)


In [4]:
# Show the descriptor matrix dimensions as (rows, columns).
descriptor_shape = df_with_200_descriptors.shape
print(descriptor_shape)

(5385, 208)


In [6]:
# Work on an explicit copy: pd.DataFrame(existing_frame) does not copy by
# default, so the filtering stage would otherwise alias the descriptor table
# it starts from. Copying keeps the raw descriptors intact for re-runs.
df = df_with_200_descriptors.copy()
import re
from sklearn.feature_selection import VarianceThreshold


#-------Check for single value columns ----------

##---DEFINE FUNCTIONS---##

#Function to identify single value columns in data (it only reports them;
#removal is left to the caller).
def find_single_value_columns(dataframe):
    """Return the labels of columns that hold exactly one distinct value.

    Uniqueness is measured with ``Series.nunique()`` at its default settings,
    so missing values are not counted as a value: an all-missing column is
    not reported, while a column with one repeated value plus gaps is.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose columns are inspected.

    Returns
    -------
    list
        Column labels, in the frame's column order.
    """
    constant_columns = []
    for column_label in dataframe:
        distinct_count = dataframe[column_label].nunique()
        if distinct_count == 1:
            constant_columns.append(column_label)
    return constant_columns

#Identify SVC's once; the same list drives both the report and the removal
#(previously the identical scan was run twice).
single_value_columns = find_single_value_columns(df)

#Update user
print("Single-value columns found and removed:")
for column in single_value_columns:
    print(f"{column}")  #Print col name to user thats removed

#Execute removal of SVC's. Reassign instead of dropping in place so the
#step yields a new frame rather than mutating the one it was given.
df = df.drop(columns=single_value_columns)

#------- Remove low-variance columns (threshold = 1) ----------
#NOTE(review): the threshold is applied to raw, unscaled descriptor values,
#so which columns survive depends on each descriptor's units -- confirm
#this is intended.

selector = VarianceThreshold(threshold=1)

#VarianceThreshold needs numeric input, so restrict to numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
transformed_data = selector.fit_transform(numeric_df)

#Recover the labels of the columns the selector kept.
cols = numeric_df.columns[selector.get_support(indices=True)]

#Carry the row index over so rows stay aligned with the source frame.
df_selected = pd.DataFrame(transformed_data, columns=cols, index=numeric_df.index)

#Report the resulting (rows, columns); the cell's recorded output ends with
#this shape, but no statement printed it.
print(df_selected.shape)






Single-value columns found and removed:
SMR_VSA8
SlogP_VSA9
fr_azide
fr_benzodiazepine
fr_diazo
fr_dihydropyridine
fr_isocyan
fr_isothiocyan
fr_lactam
fr_nitro_arom_nonortho
fr_phos_ester
fr_prisulfonamd
fr_thiocyan
fr_unbrch_alkane
(5385, 94)
