In [3]:
import pandas as pd

# Load the cleaned dataset from the CSV file and preview its first rows.
df = pd.read_csv(filepath_or_buffer='cox2_cleaned_data.csv')
df.head()

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Relation,pChEMBL Value,Ligand Efficiency BEI,Ligand Efficiency LE,Ligand Efficiency LLE,Ligand Efficiency SEI,IC50_nM
0,CHEMBL345905,243.35,0.0,2.77,1B,CN1CCC(C[C@H]2Cc3ccccc3C2=O)CC1,'=',8.11,33.34,0.62,5.34,39.95,7.7
1,CHEMBL2448065,467.37,0.0,4.9,15,COc1cc2c(Nc3ccc(Cl)cc3F)ncnc2cc1OCC1CCN(C)CC1.Cl,'=',7.0,,,,,100.0
2,CHEMBL539822,407.72,0.0,4.93,63,Cl.Nc1ccc2c(c1)sc1c(Nc3cccc(Br)c3)ncnc12,'=',9.33,25.12,0.58,4.4,14.61,0.47
3,CHEMBL540082,437.71,1.0,5.26,70,Cl.O=[N+]([O-])c1cccc2c1sc1c(Nc3cccc(Br)c3)ncnc12,'=',6.8,16.95,0.39,1.54,8.4,158.0
4,CHEMBL31118,412.29,0.0,3.95,24,CN(C)CCOc1cc2c(Nc3cccc(Br)c3)c(C#N)cnc2cn1,'>',,,,,,10000.0


In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors

# Create RDKit molecule objects from the SMILES strings.
df['mol'] = df['Smiles'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None (it does not raise) for a SMILES it cannot
# parse. Stop here and report the offending rows rather than failing later
# inside an arbitrary descriptor function with an unrelated-looking error.
unparsed_smiles = df.loc[df['mol'].isna(), 'Smiles']
if not unparsed_smiles.empty:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES; "
        f"first offending rows (index -> SMILES): {unparsed_smiles.head().to_dict()}"
    )

# Descriptor names, in the same order as the (name, function) pairs
# stored in Descriptors._descList.
desc_list = [desc_name for desc_name, _ in Descriptors._descList]

def compute_rdkit_descriptors(mol):
    """Evaluate every function listed in ``Descriptors._descList`` on ``mol``.

    Args:
        mol: The object passed unchanged to each descriptor function
            (an RDKit molecule in this notebook).

    Returns:
        list: One value per entry of ``Descriptors._descList``, in the same
        order as that list.
    """
    values = []
    for _name, descriptor_fn in Descriptors._descList:
        values.append(descriptor_fn(mol))
    return values

# One list of descriptor values per molecule (a Series of lists).
desc_rows = df['mol'].apply(compute_rdkit_descriptors)

# Build the descriptor table on df's own index: pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index would silently misalign (and inject
# NaN rows) whenever df does not carry a default RangeIndex.
desc_df = pd.DataFrame(desc_rows.tolist(), columns=desc_list, index=df.index)

# SMILES and the IC50_nM column first, followed by all descriptor columns.
df_rdkit = pd.concat([df[['Smiles', 'IC50_nM']], desc_df], axis=1)

In [5]:
from rdkit.Chem import AllChem
import numpy as np

def get_morgan_fp(smiles, radius=2, n_bits=2048):
    """Compute a Morgan bit-vector fingerprint for a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan fingerprint radius passed to RDKit.
        n_bits: Length of the bit vector (passed as ``nBits``).

    Returns:
        numpy.ndarray: 1-D float array holding the fingerprint bits as 0.0 / 1.0.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; passing that
        # on to the fingerprint call would fail with an opaque argument error.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    # NOTE(review): recent RDKit releases flag GetMorganFingerprintAsBitVect as
    # deprecated in favour of rdFingerprintGenerator -- confirm against the
    # installed version before migrating.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    # Destination buffer for the bits; allocated at its final length up front.
    arr = np.zeros((n_bits,))
    AllChem.DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# One fingerprint array per molecule (a Series of 1-D numpy arrays).
fp_array = df['Smiles'].apply(get_morgan_fp)

# Keep df's index on the fingerprint table: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would silently misalign the bits with
# their SMILES whenever df does not carry a default RangeIndex.
fp_df = pd.DataFrame(fp_array.tolist(), index=df.index)
fp_df.columns = [f'FP_{i}' for i in range(fp_df.shape[1])]

# SMILES and the IC50_nM column first, followed by one column per bit.
df_fps = pd.concat([df[['Smiles', 'IC50_nM']], fp_df], axis=1)



In [10]:
# Identifier / target columns. They are carried through untouched: running the
# NaN and constant-column filters over them too would drop 'IC50_nM' as soon as
# a single value is missing, and the later drop(columns=...) would then fail
# with a KeyError.
meta_cols = ['Smiles', 'IC50_nM']
# Absolute pairwise correlation above which the later column of a pair is dropped.
corr_threshold = 0.7

# Keep only descriptor columns that have no missing values and are not constant.
descriptor_cols = df_rdkit.drop(columns=meta_cols).dropna(axis=1)
descriptor_cols = descriptor_cols.loc[:, descriptor_cols.nunique() > 1]
df_rdkit_clean = pd.concat([df_rdkit[meta_cols], descriptor_cols], axis=1)

# Absolute correlation between descriptors; only the strict upper triangle is
# kept so that each pair is inspected once and the diagonal is ignored.
corr_matrix = descriptor_cols.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when it correlates above the threshold with any column
# that precedes it in the frame.
to_drop = [column for column in upper.columns if (upper[column] > corr_threshold).any()]
df_rdkit_filtered = df_rdkit_clean.drop(columns=to_drop)

In [11]:
# Write both feature tables to CSV without the row index.
for frame, csv_path in (
    (df_rdkit_filtered, 'cox2_with_rdkit_descriptors.csv'),
    (df_fps, 'cox2_with_morgan_fp.csv'),
):
    frame.to_csv(csv_path, index=False)