In [1]:
%%capture

# Install the third-party cheminformatics packages used below; %%capture hides pip's output.
# NOTE(review): versions are not pinned. The recorded Mordred output further down contains
# "module 'numpy' has no attribute 'float'" in the ABC/ABCGG columns, which suggests the
# installed mordred release does not match the installed NumPy - consider pinning versions.
%pip install rdkit
%pip install mordred
# NOTE(review): padelpy is not imported anywhere in the visible notebook - confirm it is needed.
%pip install padelpy


In [2]:
import pandas as pd
import numpy as np

import mordred

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdFingerprintGenerator, rdMolDescriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from mordred import Calculator, descriptors

from sklearn.feature_selection import VarianceThreshold

# 1. Загружаем csv файл с мини-таска 1


In [4]:
# Load the cleaned dataset saved by mini-task 1; the recorded preview shows `smiles` and `pIC50` columns.
df = pd.read_csv('bace_clean_data.csv')
df.head()

Unnamed: 0,smiles,pIC50
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812


# 2. Рассчитываем дескрипторы



## RDKit


In [7]:
# Build a single RDKit calculator covering every descriptor listed in Descriptors._descList.

desc_names = [entry[0] for entry in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)

# Descriptor calculation for a single molecule.

def compute_descriptors(smiles):
    """Compute every descriptor in ``desc_names`` for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    tuple or list
        The values returned by ``calculator.CalcDescriptors`` (ordered like
        ``desc_names``), or a list of ``None`` of the same length when the
        input is not a string, cannot be parsed, or the calculation fails.
    """
    missing_row = [None] * len(desc_names)

    # Non-string input (e.g. NaN from an empty CSV cell) cannot be parsed;
    # report it as a missing row, like any other unusable molecule.
    if not isinstance(smiles, str):
        return missing_row

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return missing_row

    try:
        return calculator.CalcDescriptors(mol)
    except Exception:
        # Best-effort: a failing molecule yields an empty row instead of
        # aborting the whole run. Catching ``Exception`` rather than using a
        # bare ``except`` lets KeyboardInterrupt / SystemExit propagate.
        return missing_row

# Run the per-molecule descriptor function over every SMILES in the frame.
desc_values = df['smiles'].map(compute_descriptors)
desc_df = pd.DataFrame(list(desc_values), columns=desc_names)

# Place the identifier/target columns and the descriptor table side by side.
df_rdkit = pd.concat([df.loc[:, ['smiles', 'pIC50']], desc_df], axis=1)

In [8]:
df_rdkit.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,14.032746,14.032746,0.017379,-2.195009,0.041154,14.887324,999.085,932.557,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,13.595406,13.595406,0.09781,-1.573089,0.042501,15.460317,893.005,828.493,...,0,0,0,0,0,0,0,0,0,0
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,13.243577,13.243577,0.114162,-1.303772,0.077027,15.384615,751.988,690.5,...,1,0,0,0,0,0,0,0,0,0
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,13.416202,13.416202,0.118038,-1.312338,0.09937,15.962264,737.895,682.455,...,0,0,0,0,0,0,0,0,0,0
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,13.721715,13.721715,0.112353,-1.361064,0.074085,15.966667,828.02,766.532,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df_rdkit.shape

(6945, 219)

In [10]:
# Snapshot of the raw ("dirty") RDKit descriptors, taken before any feature cleaning.
df_rdkit.to_csv('rdkit_desc_dirty.csv', index=False)

## Mordred

In [11]:
def compute_descriptors_mordred(df, smiles_column='smiles'):
    """Compute Mordred 2D descriptors for every SMILES in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; all of its columns are kept in the result.
    smiles_column : str, default 'smiles'
        Name of the column holding the SMILES strings.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by one column per Mordred descriptor,
        row-aligned by position (the index is reset to 0..n-1).

    Notes
    -----
    Descriptors that fail for a molecule are not removed here: the recorded
    notebook output shows error messages stored in the ABC/ABCGG cells, so
    such columns are non-numeric and must be handled downstream.
    """
    # 3D descriptors are switched off (ignore_3D=True).
    calc = Calculator(descriptors, ignore_3D=True)

    # SMILES -> RDKit molecules
    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_column]]

    # Descriptor table, one row per molecule
    desc_df = calc.pandas(mols)

    # The descriptor table is built from a plain list of molecules, so it
    # cannot carry df's index. Reset both indexes so rows are joined by
    # position (as the fingerprint helpers in this notebook do); otherwise a
    # filtered or re-indexed df would be misaligned and padded with NaN.
    df_mordred = pd.concat(
        [df.reset_index(drop=True), desc_df.reset_index(drop=True)],
        axis=1,
    )

    return df_mordred

In [12]:
# Slow step: the recorded progress bar shows about 17.5 minutes for the 6945 molecules.
# NOTE(review): the preview below shows "module 'numpy' has no attribute 'float'" stored in the
# ABC/ABCGG columns, i.e. those descriptors failed for the previewed rows.
df_mordred = compute_descriptors_mordred(df)

  0%|          | 1/6945 [00:05<9:44:34,  5.05s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 4/6945 [00:05<4:23:48,  2.28s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 9/6945 [00:06<54:31,  2.12it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  0%|          | 33/6945 [00:12<24:27,  4.71it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 60/6945 [00:25<1:27:58,  1.30it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 84/6945 [00:28<28:17,  4.04it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 98%|█████████▊| 6839/6945 [17:22<01:35,  1.11it/s]  

  s += (eig.vec[i, eig.max] * eig.vec[j, eig.max]) ** -0.5
  s += (eig.vec[i, eig.max] * eig.vec[j, eig.max]) ** -0.5


100%|██████████| 6945/6945 [17:33<00:00,  6.59it/s]


In [13]:
df_mordred.head()

Unnamed: 0,smiles,pIC50,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,4,1,84.986268,2.357398,4.714795,84.986268,...,10.820278,110.064076,998.459678,7.288027,28854,104,338.0,376.0,30.916667,15.944444
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,3,1,74.01907,2.337816,4.675632,74.01907,...,10.667023,101.362551,892.454199,7.027198,22054,91,296.0,327.0,29.194444,14.166667
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,0,61.659032,2.315876,4.631753,61.659032,...,10.458665,89.340175,751.419,6.649726,13123,74,242.0,267.0,23.388889,11.916667
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,0,65.429866,2.323173,4.646345,65.429866,...,10.531856,90.661269,737.399978,6.827778,13440,75,256.0,285.0,20.027778,11.888889
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,0,74.831685,2.33544,4.67088,74.831685,...,10.699124,98.40789,827.446929,6.838404,18407,87,294.0,330.0,21.5,13.416667


In [14]:
df_mordred.shape

(6945, 1615)

In [15]:
# Snapshot of the raw ("dirty") Mordred descriptors, including the columns that hold error messages.
df_mordred.to_csv('mordred_desc_dirty.csv', index=False)

# 3. Чистим признаки

In [16]:
def cleaning_desc(df, smiles_col='smiles', activity_col='pIC50', corr_threshold=0.7):
    """Filter a descriptor table down to informative, weakly correlated features.

    Steps, in order:
      1. keep numeric descriptor columns only;
      2. drop columns containing NaN or +/-inf;
      3. drop zero-variance columns (``VarianceThreshold(threshold=0.0)``);
      4. for every pair of columns whose absolute correlation exceeds
         ``corr_threshold``, drop the one that appears later in the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``smiles_col``, ``activity_col`` and descriptor columns.
    smiles_col : str, default 'smiles'
        Column with the SMILES strings; passed through unchanged.
    activity_col : str, default 'pIC50'
        Column with the target value; passed through unchanged and not used
        for feature selection.
    corr_threshold : float, default 0.7
        Absolute correlation above which a column is removed.

    Returns
    -------
    pandas.DataFrame
        ``smiles_col``, ``activity_col`` and the surviving descriptors, with a
        fresh 0..n-1 index. Descriptor values come back as floats because they
        pass through ``VarianceThreshold.fit_transform``.
    """
    # Work on the descriptor columns only.
    desc_only = df.drop(columns=[smiles_col, activity_col])

    # Numeric features only (error/object columns are discarded here).
    desc_only = desc_only.select_dtypes(include=[np.number])

    # Treat +/-inf like a missing value: dropna() alone keeps such columns and
    # VarianceThreshold then fails on the non-finite input.
    desc_only = desc_only.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

    # Remove constant features.
    selector = VarianceThreshold(threshold=0.0)
    desc_filtered = pd.DataFrame(
        selector.fit_transform(desc_only),
        columns=desc_only.columns[selector.get_support()]
    )

    # Remove highly correlated features. Only the strict upper triangle is
    # inspected, so each pair is seen once and the earlier column of a pair is
    # the one that is kept.
    corr_matrix = desc_filtered.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = upper.columns[(upper > corr_threshold).any(axis=0)].tolist()
    desc_filtered = desc_filtered.drop(columns=to_drop)

    # Re-attach the identifier and target columns by position.
    cleaned_df = pd.concat([df[[smiles_col, activity_col]].reset_index(drop=True),
                            desc_filtered.reset_index(drop=True)], axis=1)

    return cleaned_df

In [None]:
# df_rdkit.shape

In [17]:
# Clean the RDKit descriptors with the default correlation threshold (0.7); the shape shows what survives.
df_cleaned_rdkit = cleaning_desc(df_rdkit)
df_cleaned_rdkit.shape

(6945, 104)

In [18]:
df_cleaned_rdkit.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,FpDensityMorgan1,AvgIpc,BalabanJ,...,fr_pyridine,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,14.032746,0.017379,-2.195009,0.041154,14.887324,0.492958,3.292489,2.319217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,13.595406,0.09781,-1.573089,0.042501,15.460317,0.603175,3.183525,3.068009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,13.243577,0.114162,-1.303772,0.077027,15.384615,0.769231,3.06055,2.941779,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,13.416202,0.118038,-1.312338,0.09937,15.962264,0.716981,3.118442,1.649423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,13.721715,0.112353,-1.361064,0.074085,15.966667,0.633333,3.217133,1.427263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Persist the cleaned RDKit feature table (smiles, pIC50 and the surviving descriptors).
df_cleaned_rdkit.to_csv('cleaned_rdkit_data.csv', index=False)

In [20]:
df_mordred.shape

(6945, 1615)

In [21]:
df_cleaned_mordred = cleaning_desc(df_mordred)
df_cleaned_mordred.shape

# Many columns were removed (1615 -> 161 per the recorded shapes); hopefully this is expected.
# NOTE(review): part of the drop is presumably the non-numeric error columns (e.g. ABC, ABCGG) - confirm.

(6945, 161)

In [22]:
df_cleaned_mordred.head()

Unnamed: 0,smiles,pIC50,nAcid,nBase,nAromAtom,nAtom,nSpiro,nBridgehead,nB,nN,...,JGI1,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,JGI10,Diameter
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,4.0,1.0,12.0,137.0,0.0,0.0,0.0,8.0,...,0.256944,0.046875,0.043029,0.025808,0.017155,0.014706,0.011409,0.008049,0.007768,31.0
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,3.0,1.0,6.0,127.0,0.0,0.0,0.0,8.0,...,0.277778,0.04739,0.045971,0.025443,0.017007,0.016861,0.011707,0.007993,0.008686,30.0
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,1.0,0.0,6.0,113.0,0.0,0.0,0.0,5.0,...,0.269231,0.052365,0.043731,0.022529,0.02065,0.014091,0.010755,0.010836,0.006276,25.0
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,1.0,0.0,12.0,108.0,0.0,0.0,0.0,5.0,...,0.227273,0.045,0.041616,0.019263,0.019661,0.01221,0.009993,0.01011,0.006269,25.0
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,1.0,0.0,18.0,121.0,0.0,0.0,0.0,5.0,...,0.214286,0.043822,0.04084,0.019583,0.019,0.012768,0.009429,0.009476,0.005748,27.0


In [23]:
# Persist the cleaned Mordred feature table (smiles, pIC50 and the surviving descriptors).
df_cleaned_mordred.to_csv('cleaned_mordred_data.csv', index=False)

# 4. Вычислим фингерпринты

## morgan fp

In [24]:
def compute_morgan_fp(df, smiles_column='smiles', radius=2, n_bits=1024):
    """Append Morgan fingerprint bits to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a SMILES column; all of its columns are kept.
    smiles_column : str, default 'smiles'
        Name of the column with the SMILES strings.
    radius : int, default 2
        Radius passed to the Morgan generator.
    n_bits : int, default 1024
        Fingerprint size; one ``FP_bit_<i>`` column is produced per bit.

    Returns
    -------
    pandas.DataFrame
        ``df`` (index reset) followed by the ``FP_bit_0 .. FP_bit_{n_bits-1}``
        columns.
    """
    # Parse all SMILES first, then set up the generator.
    parsed = list(map(Chem.MolFromSmiles, df[smiles_column]))
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)

    # One row of 0/1 values per molecule.
    bit_matrix = np.array([list(generator.GetFingerprint(mol)) for mol in parsed])

    bit_columns = [f'FP_bit_{i}' for i in range(n_bits)]
    fps_df = pd.DataFrame(bit_matrix, columns=bit_columns)

    # Join by position: both sides get a fresh 0..n-1 index.
    return pd.concat(
        [df.reset_index(drop=True), fps_df.reset_index(drop=True)],
        axis=1,
    )

In [25]:
# Append Morgan bits (defaults: radius 2, 1024 bits) to the cleaned RDKit descriptor table.
df_cleaned_rdkit_morgan = compute_morgan_fp(df_cleaned_rdkit)
df_cleaned_rdkit_morgan.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,FpDensityMorgan1,AvgIpc,BalabanJ,...,FP_bit_1014,FP_bit_1015,FP_bit_1016,FP_bit_1017,FP_bit_1018,FP_bit_1019,FP_bit_1020,FP_bit_1021,FP_bit_1022,FP_bit_1023
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,14.032746,0.017379,-2.195009,0.041154,14.887324,0.492958,3.292489,2.319217,...,0,1,0,0,1,0,0,0,0,0
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,13.595406,0.09781,-1.573089,0.042501,15.460317,0.603175,3.183525,3.068009,...,0,1,0,0,1,0,0,0,0,0
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,13.243577,0.114162,-1.303772,0.077027,15.384615,0.769231,3.06055,2.941779,...,0,1,0,1,0,0,0,0,0,0
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,13.416202,0.118038,-1.312338,0.09937,15.962264,0.716981,3.118442,1.649423,...,0,1,0,1,0,1,0,0,0,0
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,13.721715,0.112353,-1.361064,0.074085,15.966667,0.633333,3.217133,1.427263,...,0,1,0,1,0,1,0,0,0,0


In [26]:
# Persist the cleaned RDKit descriptors combined with the Morgan fingerprint bits.
df_cleaned_rdkit_morgan.to_csv('cleaned_rdkit_morgan_fp.csv', index=False)

In [27]:
# Append Morgan bits (defaults: radius 2, 1024 bits) to the cleaned Mordred descriptor table.
df_cleaned_mordred_morgan = compute_morgan_fp(df_cleaned_mordred)
df_cleaned_mordred_morgan.head()

Unnamed: 0,smiles,pIC50,nAcid,nBase,nAromAtom,nAtom,nSpiro,nBridgehead,nB,nN,...,FP_bit_1014,FP_bit_1015,FP_bit_1016,FP_bit_1017,FP_bit_1018,FP_bit_1019,FP_bit_1020,FP_bit_1021,FP_bit_1022,FP_bit_1023
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,4.0,1.0,12.0,137.0,0.0,0.0,0.0,8.0,...,0,1,0,0,1,0,0,0,0,0
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,3.0,1.0,6.0,127.0,0.0,0.0,0.0,8.0,...,0,1,0,0,1,0,0,0,0,0
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,1.0,0.0,6.0,113.0,0.0,0.0,0.0,5.0,...,0,1,0,1,0,0,0,0,0,0
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,1.0,0.0,12.0,108.0,0.0,0.0,0.0,5.0,...,0,1,0,1,0,1,0,0,0,0
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,1.0,0.0,18.0,121.0,0.0,0.0,0.0,5.0,...,0,1,0,1,0,1,0,0,0,0


In [28]:
# Persist the cleaned Mordred descriptors combined with the Morgan fingerprint bits.
df_cleaned_mordred_morgan.to_csv('cleaned_mordred_morgan_fp.csv', index=False)

## maccs

In [29]:
def compute_maccs_fp(df, smiles_column='smiles'):
    """Append MACCS key bits to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a SMILES column; all of its columns are kept.
    smiles_column : str, default 'smiles'
        Name of the column with the SMILES strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` (index reset) followed by the columns
        ``FP_MACCS_1 .. FP_MACCS_166``.
    """
    # SMILES -> RDKit molecules
    parsed = list(map(Chem.MolFromSmiles, df[smiles_column]))

    # The first character of each bit string is skipped, so the remaining
    # characters map onto the columns numbered 1..166.
    bit_rows = []
    for mol in parsed:
        bit_string = rdMolDescriptors.GetMACCSKeysFingerprint(mol).ToBitString()
        bit_rows.append([int(bit) for bit in bit_string[1:]])

    key_columns = [f'FP_MACCS_{i}' for i in range(1, 167)]
    fps_df = pd.DataFrame(bit_rows, columns=key_columns)

    # Join by position: both sides get a fresh 0..n-1 index.
    return pd.concat(
        [df.reset_index(drop=True), fps_df.reset_index(drop=True)],
        axis=1,
    )

In [30]:

# Append the 166 MACCS key columns to the cleaned RDKit descriptor table.
df_cleaned_rdkit_maccs = compute_maccs_fp(df_cleaned_rdkit)
df_cleaned_rdkit_maccs.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,FpDensityMorgan1,AvgIpc,BalabanJ,...,FP_MACCS_157,FP_MACCS_158,FP_MACCS_159,FP_MACCS_160,FP_MACCS_161,FP_MACCS_162,FP_MACCS_163,FP_MACCS_164,FP_MACCS_165,FP_MACCS_166
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,14.032746,0.017379,-2.195009,0.041154,14.887324,0.492958,3.292489,2.319217,...,1,1,1,1,1,1,1,1,1,0
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,13.595406,0.09781,-1.573089,0.042501,15.460317,0.603175,3.183525,3.068009,...,1,1,1,1,1,1,1,1,1,0
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,13.243577,0.114162,-1.303772,0.077027,15.384615,0.769231,3.06055,2.941779,...,1,1,1,1,1,1,1,1,1,0
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,13.416202,0.118038,-1.312338,0.09937,15.962264,0.716981,3.118442,1.649423,...,1,1,1,1,1,1,1,1,1,0
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,13.721715,0.112353,-1.361064,0.074085,15.966667,0.633333,3.217133,1.427263,...,1,1,1,1,1,1,1,1,1,0


In [31]:
# Append the 166 MACCS key columns to the cleaned Mordred descriptor table.
df_cleaned_mordred_maccs = compute_maccs_fp(df_cleaned_mordred)
df_cleaned_mordred_maccs.head()

Unnamed: 0,smiles,pIC50,nAcid,nBase,nAromAtom,nAtom,nSpiro,nBridgehead,nB,nN,...,FP_MACCS_157,FP_MACCS_158,FP_MACCS_159,FP_MACCS_160,FP_MACCS_161,FP_MACCS_162,FP_MACCS_163,FP_MACCS_164,FP_MACCS_165,FP_MACCS_166
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,4.0,1.0,12.0,137.0,0.0,0.0,0.0,8.0,...,1,1,1,1,1,1,1,1,1,0
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,3.0,1.0,6.0,127.0,0.0,0.0,0.0,8.0,...,1,1,1,1,1,1,1,1,1,0
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,1.0,0.0,6.0,113.0,0.0,0.0,0.0,5.0,...,1,1,1,1,1,1,1,1,1,0
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,1.0,0.0,12.0,108.0,0.0,0.0,0.0,5.0,...,1,1,1,1,1,1,1,1,1,0
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,1.0,0.0,18.0,121.0,0.0,0.0,0.0,5.0,...,1,1,1,1,1,1,1,1,1,0


In [32]:
# Persist both descriptor + MACCS tables.
# NOTE(review): these file names carry a 'df_' prefix, unlike the earlier outputs
# (e.g. 'cleaned_rdkit_morgan_fp.csv') - confirm whether the naming should be unified.
df_cleaned_rdkit_maccs.to_csv('df_cleaned_rdkit_maccs.csv', index=False)
df_cleaned_mordred_maccs.to_csv('df_cleaned_mordred_maccs.csv', index=False)