# Загрузка датасета 

In [1]:
%%capture

%pip install rdkit
%pip install mordred
%pip install chembl_webresource_client


In [3]:
import pandas as pd
import numpy as np

import mordred

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdFingerprintGenerator, rdMolDescriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from mordred import Calculator, descriptors

from sklearn.feature_selection import VarianceThreshold

from rdkit import Chem
from chembl_webresource_client.new_client import new_client


In [5]:
# Load the dataset: query ChEMBL for bioactivity records of the chosen target
activity = new_client.activity

# Target of interest (presumably GSK-3 beta, judging by the CSV file names this notebook writes — confirm)
target_chembl_id = 'CHEMBL262'

# Restrict the query to activities whose standard_type is IC50 or pIC50
data = activity.filter(target_chembl_id=target_chembl_id).filter(standard_type__in=['IC50', 'pIC50'])

# Build a DataFrame from the returned records
df = pd.DataFrame.from_records(data)

print(f'Загружено записей: {len(df)}')




Загружено записей: 4591


In [6]:
df.to_csv('gsk_3_beta_raw_data.csv', index=False)

# Предобработка

In [7]:
df.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')

In [8]:
# Keep only the columns that the rest of the preprocessing uses

columns_to_keep = ['canonical_smiles', 'standard_value', 'standard_units', 'standard_type', 'standard_relation']

df = df[columns_to_keep]
df

Unnamed: 0,canonical_smiles,standard_value,standard_units,standard_type,standard_relation
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,380.0,nM,IC50,=
1,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,380.0,nM,IC50,=
2,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,6920.0,nM,IC50,=
3,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,410.0,nM,IC50,=
4,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,160.0,nM,IC50,=
...,...,...,...,...,...
4586,CCN1CCN(c2ccc(Nc3cc(N(C)C(=O)Nc4c(Cl)c(OC)cc(O...,10000.0,nM,IC50,>
4587,CCC(=O)N1CC[C@H](Nc2ncnc3c2CN(c2cnc(OC)c(C(F)(...,10000.0,nM,IC50,>
4588,CN1Cc2nccc(-c3cn(C)nc3-c3ccc(F)cc3)c2C1=O.Cl,10000.0,nM,IC50,=
4589,CC(=O)c1c(C)c2cnc(Nc3ccc(N4CCNCC4)cn3)nc2n(C2C...,10000.0,nM,IC50,>


In [None]:

# NOTE(review): the recorded count (4473) is smaller than the number of loaded records (4591),
# so some rows appear to carry no unit at all — confirm they are excluded before values are converted.
df['standard_units'].value_counts()  # nM is the only unit reported, so no unit conversion seems necessary

standard_units
nM    4473
Name: count, dtype: int64

In [11]:
df['standard_type'].value_counts()

standard_type
IC50     4588
pIC50       3
Name: count, dtype: int64

In [13]:
# Keep only exact ('=') IC50 measurements that are reported in nM.
# The unit check is explicit because the pIC50 conversion later in this notebook
# multiplies standard_value by 1e-9, i.e. it assumes nanomolar input.
# .copy() makes the result an independent frame, so adding or overwriting columns
# afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[
    (df['standard_type'] == 'IC50') &
    (df['standard_relation'] == '=') &
    (df['standard_units'] == 'nM')
].copy()



In [None]:
df.info()

# standard_value is not a numeric column (dtype object in the recorded summary), which is unexpected

<class 'pandas.core.frame.DataFrame'>
Index: 3674 entries, 0 to 4590
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   canonical_smiles   3674 non-null   object
 1   standard_value     3674 non-null   object
 2   standard_units     3674 non-null   object
 3   standard_type      3674 non-null   object
 4   standard_relation  3674 non-null   object
dtypes: object(5)
memory usage: 172.2+ KB


In [20]:
# Convert standard_value to float; entries that cannot be parsed become NaN (errors='coerce').
# assign() returns a new frame instead of writing a column into a frame that came from
# boolean filtering, which avoids pandas' SettingWithCopyWarning.
df = df.assign(standard_value=pd.to_numeric(df['standard_value'], errors='coerce'))

# Drop rows whose value is missing or non-positive: log10 is undefined for them.
# .copy() keeps the filtered result independent for later column assignments.
df = df[df['standard_value'].notna() & (df['standard_value'] > 0)].copy()

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3670 entries, 0 to 4590
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   canonical_smiles   3670 non-null   object 
 1   standard_value     3670 non-null   float64
 2   standard_units     3670 non-null   object 
 3   standard_type      3670 non-null   object 
 4   standard_relation  3670 non-null   object 
dtypes: float64(1), object(4)
memory usage: 172.0+ KB


In [22]:
# Convert IC50 to pIC50 = -log10(IC50 [M]); standard_value is scaled by 1e-9, i.e. treated as nanomolar.
# Zero and negative values must already be gone at this point (the numeric-conversion cell filters them),
# otherwise log10 would yield inf/NaN.
# assign() builds a new frame rather than writing a column into a filtered frame.
df = df.assign(pIC50=-np.log10(df['standard_value'] * 1e-9))

In [23]:
# Remove rows without a SMILES string, keep one row per unique SMILES
# (drop_duplicates keeps the first occurrence), then drop rows lacking a standard_value.
df = (
    df
    .dropna(subset=['canonical_smiles'])
    .drop_duplicates(subset='canonical_smiles')
    .loc[lambda frame: frame['standard_value'].notna()]
)
df.head()

Unnamed: 0,canonical_smiles,standard_value,standard_units,standard_type,standard_relation,pIC50
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,380.0,nM,IC50,=,6.420216
2,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,6920.0,nM,IC50,=,5.159894
3,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,410.0,nM,IC50,=,6.387216
4,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,160.0,nM,IC50,=,6.79588
6,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,9600.0,nM,IC50,=,5.017729


In [24]:
df.shape

(3109, 6)

In [25]:
# Keep only the structure and the target value; canonical_smiles is renamed to smiles
df_final = df[['canonical_smiles', 'pIC50']].rename(columns={'canonical_smiles': 'smiles'})



In [26]:
df_final.shape #до проверки на валидность

(3109, 2)

In [27]:
# SMILES validity check
def is_valid_smiles(smiles):
    """Return True when RDKit parses `smiles` into a molecule, False when parsing yields None."""
    return Chem.MolFromSmiles(smiles) is not None

# Keep only the rows whose SMILES parses, then report the resulting shape
df_final = df_final[df_final['smiles'].apply(is_valid_smiles)]
print("После проверки валидности SMILES:", df_final.shape)

После проверки валидности SMILES: (3109, 2)


In [29]:
# Every SMILES in the dataset passed the validity check, so save the cleaned dataset to a new file
df_final.to_csv('gsk_3_beta_clean_data.csv', index=False)

# 2. Рассчитываем дескрипторы



## RDKit


In [31]:
# Reload the cleaned dataset from disk; this rebinds df to the table saved from df_final (smiles, pIC50)
df = pd.read_csv('gsk_3_beta_clean_data.csv')
df.head()

Unnamed: 0,smiles,pIC50
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729


In [32]:
# Initialise the RDKit descriptor calculator over every descriptor name listed in Descriptors._descList
# NOTE(review): _descList is an underscore-prefixed attribute, so it may not be a stable public API — confirm.

desc_names = [name for name, _ in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)

# Descriptor computation for a single molecule

def compute_descriptors(smiles):
    """Compute all configured RDKit descriptors for one SMILES string.

    Relies on the notebook-level `calculator` and `desc_names`.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.

    Returns
    -------
    sequence
        The values returned by `calculator.CalcDescriptors(mol)`, or a list of
        `None` with one entry per descriptor name when the SMILES cannot be
        parsed or the calculation raises.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    try:
        return calculator.CalcDescriptors(mol)
    except Exception:
        # Best effort: a failing molecule yields an all-None row. `Exception` (not a bare
        # `except`) lets KeyboardInterrupt/SystemExit still stop a long-running loop.
        return [None] * len(desc_names)

# Compute descriptors for every SMILES in the DataFrame
desc_values = df['smiles'].apply(compute_descriptors)
desc_df = pd.DataFrame(desc_values.tolist(), columns=desc_names)

# Join the descriptor columns onto smiles/pIC50 (a column-wise concat aligns rows on the index)
df_rdkit = pd.concat([df[['smiles', 'pIC50']], desc_df], axis=1)

In [33]:
df_rdkit.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,12.163164,12.163164,0.016875,-0.743819,0.333394,13.8,359.725,349.645,...,0,0,0,0,0,0,0,0,0,0
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,5.787421,5.787421,0.119368,0.119368,0.506107,11.346154,353.35,338.23,...,0,0,0,0,0,0,0,0,0,0
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,12.619437,12.619437,0.04941,-0.505798,0.410217,11.392857,384.404,364.244,...,0,0,0,0,0,0,0,0,0,0
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,13.322081,13.322081,0.003567,-0.759892,0.405842,27.285714,466.541,440.333,...,0,0,0,0,0,0,0,0,0,0
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,11.882601,11.882601,0.073612,-0.520107,0.359439,15.863636,307.318,290.182,...,0,0,0,0,0,0,0,0,0,0


In [34]:
df_rdkit.shape

(3109, 219)

In [35]:
df_rdkit.to_csv('gsk_3_b_rdkit_desc_dirty.csv', index=False)

## Mordred

In [36]:
def compute_descriptors_mordred(df, smiles_column='smiles'):
    """Compute 2D Mordred descriptors for every SMILES in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain `smiles_column`.
    smiles_column : str, default 'smiles'
        Name of the column holding SMILES strings.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by one column per Mordred descriptor,
        with a fresh 0..n-1 index.

    Notes
    -----
    NOTE(review): the recorded output of this notebook shows some descriptor
    cells (ABC, ABCGG) holding the text "module 'numpy' has no attribute
    'float'" instead of numbers — presumably a mordred/numpy version
    incompatibility; confirm and pin compatible versions.
    """
    # Initialise the calculator with 3D descriptors disabled
    calc = Calculator(descriptors, ignore_3D=True)

    # SMILES -> RDKit molecules
    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_column]]

    # Compute the descriptor table (one row per molecule, in input order)
    desc_df = calc.pandas(mols)

    # Join positionally: both indices are reset so that a filtered or re-indexed `df`
    # cannot misalign with the descriptor rows (concat with axis=1 aligns on index labels).
    df_mordred = pd.concat(
        [df.reset_index(drop=True), desc_df.reset_index(drop=True)],
        axis=1,
    )

    return df_mordred

In [37]:
# Compute Mordred descriptors for the whole dataset
# (the recorded progress bar shows roughly 6 minutes for 3109 molecules)
df_mordred = compute_descriptors_mordred(df)

 42%|████▏     | 1317/3109 [02:22<19:42,  1.52it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 44%|████▍     | 1374/3109 [02:29<10:15,  2.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 45%|████▌     | 1413/3109 [02:36<09:35,  2.95it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|███████▉  | 2456/3109 [04:47<04:06,  2.65it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|███████▉  | 2467/3109 [04:50<03:31,  3.03it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|███████▉  | 2471/3109 [04:51<03:00,  3.53it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 3109/3109 [06:20<00:00,  8.17it/s]


In [38]:
df_mordred.head()

Unnamed: 0,smiles,pIC50,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,31.435973,2.489714,4.886395,31.435973,...,10.180551,74.180278,359.030898,10.258026,1436,41,132.0,157.0,9.361111,5.444444
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,34.022088,2.493274,4.804274,34.022088,...,10.149918,77.997626,353.146104,8.61332,1667,38,140.0,167.0,7.5,5.666667
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,36.218851,2.470628,4.787607,36.218851,...,10.048929,78.66702,384.17707,8.003689,2175,40,140.0,163.0,8.888889,6.472222
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,47.715405,2.749242,5.270881,47.715405,...,11.098394,90.994468,466.200491,7.642631,2737,85,220.0,290.0,9.340278,7.347222
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,28.827924,2.4715,4.795347,28.827924,...,9.921278,71.793347,307.150521,7.875654,1011,32,114.0,135.0,6.777778,4.972222


In [39]:
df_mordred.shape

(3109, 1615)

In [40]:
df_mordred.to_csv('gsk_3_b_mordred_desc_dirty.csv', index=False)

# 3. Чистим признаки

In [41]:
def cleaning_desc(df, smiles_col='smiles', activity_col='pIC50', corr_threshold=0.7):
    """Filter a descriptor table down to informative, weakly correlated numeric columns.

    Steps, in order: keep numeric descriptor columns, drop columns containing
    any missing value, drop zero-variance columns, then drop every column whose
    absolute correlation with an earlier column exceeds `corr_threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `smiles_col`, `activity_col` and descriptor columns.
    smiles_col : str, default 'smiles'
        Name of the SMILES column (carried through unchanged).
    activity_col : str, default 'pIC50'
        Name of the target column (carried through unchanged).
    corr_threshold : float, default 0.7
        Absolute correlation above which the later column of a pair is dropped.

    Returns
    -------
    pandas.DataFrame
        `smiles_col`, `activity_col` and the surviving descriptors, with a
        fresh 0..n-1 index.
    """
    id_cols = [smiles_col, activity_col]

    # Numeric descriptor columns that contain no missing values
    features = (
        df.drop(columns=id_cols)
        .select_dtypes(include=[np.number])
        .dropna(axis=1)
    )

    # Zero-variance filter
    variance_filter = VarianceThreshold(threshold=0.0)
    kept_values = variance_filter.fit_transform(features)
    kept_names = features.columns[variance_filter.get_support()]
    features = pd.DataFrame(kept_values, columns=kept_names)

    # Absolute correlations restricted to the strict upper triangle, so each
    # column is compared only with the columns that precede it
    abs_corr = features.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    redundant = []
    for col in upper.columns:
        if any(upper[col] > corr_threshold):
            redundant.append(col)
    features = features.drop(columns=redundant)

    # Re-attach the identifier and target columns
    return pd.concat(
        [df[id_cols].reset_index(drop=True), features.reset_index(drop=True)],
        axis=1,
    )

In [None]:
# df_rdkit.shape

In [42]:
# Clean the RDKit descriptor table and show the resulting shape
df_cleaned_rdkit = cleaning_desc(df_rdkit)
df_cleaned_rdkit.shape

(3109, 118)

In [43]:
df_cleaned_rdkit.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,...,fr_pyridine,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,12.163164,0.016875,-0.743819,0.333394,13.8,0.276975,-0.506393,1.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,5.787421,0.119368,0.119368,0.506107,11.346154,0.243203,-0.377621,1.230769,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,12.619437,0.04941,-0.505798,0.410217,11.392857,0.293398,-0.377621,1.321429,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,13.322081,0.003567,-0.759892,0.405842,27.285714,0.252133,-0.374791,1.057143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,11.882601,0.073612,-0.520107,0.359439,15.863636,0.287211,-0.377621,1.272727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
df_cleaned_rdkit.to_csv('gsk_3_b_cleaned_rdkit_data.csv', index=False)

In [45]:
df_mordred.shape

(3109, 1615)

In [46]:
# Clean the Mordred descriptor table and show the resulting shape
df_cleaned_mordred = cleaning_desc(df_mordred)
df_cleaned_mordred.shape

# A lot of columns were removed (1615 -> 171 in the recorded output); hopefully that is expected

(3109, 171)

In [47]:
df_cleaned_mordred.head()

Unnamed: 0,smiles,pIC50,nAcid,nBase,nAromAtom,nAtom,nSpiro,nBridgehead,nB,nN,...,JGI2,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,JGI10,Diameter
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,0.0,0.0,12.0,35.0,0.0,0.0,0.0,3.0,...,0.08547,0.063516,0.043091,0.029701,0.020963,0.015687,0.012232,0.011792,0.010543,11.0
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,0.0,1.0,21.0,41.0,0.0,0.0,0.0,11.0,...,0.075881,0.040387,0.029661,0.027547,0.017041,0.011237,0.010434,0.008385,0.005089,12.0
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,0.0,1.0,16.0,48.0,0.0,0.0,0.0,10.0,...,0.077778,0.044618,0.030028,0.025508,0.020246,0.010967,0.007091,0.006861,0.006229,13.0
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,0.0,1.0,20.0,61.0,0.0,2.0,0.0,4.0,...,0.068627,0.056944,0.040017,0.026681,0.02398,0.015447,0.011743,0.009199,0.006173,10.0
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,0.0,1.0,10.0,39.0,0.0,0.0,0.0,9.0,...,0.080808,0.044054,0.035035,0.0258,0.016381,0.009954,0.008971,0.009452,0.0,9.0


In [48]:
df_cleaned_mordred.to_csv('gsk_3_b_cleaned_mordred_data.csv', index=False)

# 4. Вычислим фингерпринты

## morgan fp

In [49]:
def compute_morgan_fp(df, smiles_column='smiles', radius=2, n_bits=1024):
    """Append Morgan fingerprint bits to `df` as columns FP_bit_0 .. FP_bit_{n_bits-1}.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain `smiles_column`.
    smiles_column : str, default 'smiles'
        Name of the column holding SMILES strings.
    radius : int, default 2
        Radius passed to the Morgan fingerprint generator.
    n_bits : int, default 1024
        Fingerprint size (`fpSize`), i.e. the number of bit columns added.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by the fingerprint columns, with a fresh
        0..n-1 index.
    """
    # Parse every SMILES first
    molecules = []
    for smi in df[smiles_column]:
        molecules.append(Chem.MolFromSmiles(smi))

    # One generator shared by all molecules
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)

    # One row of bits per molecule
    bit_rows = []
    for mol in molecules:
        bit_rows.append(list(morgan_gen.GetFingerprint(mol)))
    bit_matrix = np.array(bit_rows)

    bit_columns = [f'FP_bit_{i}' for i in range(n_bits)]
    fps_df = pd.DataFrame(bit_matrix, columns=bit_columns)

    # Positional join: both indices are reset before concatenating
    return pd.concat([df.reset_index(drop=True), fps_df.reset_index(drop=True)], axis=1)

In [50]:
# Append Morgan fingerprint bits (default radius and size) to the cleaned RDKit descriptor table
df_cleaned_rdkit_morgan = compute_morgan_fp(df_cleaned_rdkit)
df_cleaned_rdkit_morgan.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,...,FP_bit_1014,FP_bit_1015,FP_bit_1016,FP_bit_1017,FP_bit_1018,FP_bit_1019,FP_bit_1020,FP_bit_1021,FP_bit_1022,FP_bit_1023
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,12.163164,0.016875,-0.743819,0.333394,13.8,0.276975,-0.506393,1.12,...,0,0,0,0,0,0,0,0,0,0
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,5.787421,0.119368,0.119368,0.506107,11.346154,0.243203,-0.377621,1.230769,...,0,0,0,0,0,0,0,0,0,0
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,12.619437,0.04941,-0.505798,0.410217,11.392857,0.293398,-0.377621,1.321429,...,0,0,0,0,0,0,0,0,0,0
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,13.322081,0.003567,-0.759892,0.405842,27.285714,0.252133,-0.374791,1.057143,...,0,0,0,0,0,1,0,0,0,0
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,11.882601,0.073612,-0.520107,0.359439,15.863636,0.287211,-0.377621,1.272727,...,0,0,0,0,0,0,0,0,0,0


In [51]:
df_cleaned_rdkit_morgan.to_csv('gsk_3_b_cleaned_rdkit_morgan_fp.csv', index=False)

In [52]:
# Append Morgan fingerprint bits (default radius and size) to the cleaned Mordred descriptor table
df_cleaned_mordred_morgan = compute_morgan_fp(df_cleaned_mordred)
df_cleaned_mordred_morgan.head()

Unnamed: 0,smiles,pIC50,nAcid,nBase,nAromAtom,nAtom,nSpiro,nBridgehead,nB,nN,...,FP_bit_1014,FP_bit_1015,FP_bit_1016,FP_bit_1017,FP_bit_1018,FP_bit_1019,FP_bit_1020,FP_bit_1021,FP_bit_1022,FP_bit_1023
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,0.0,0.0,12.0,35.0,0.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,0,0
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,0.0,1.0,21.0,41.0,0.0,0.0,0.0,11.0,...,0,0,0,0,0,0,0,0,0,0
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,0.0,1.0,16.0,48.0,0.0,0.0,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,0.0,1.0,20.0,61.0,0.0,2.0,0.0,4.0,...,0,0,0,0,0,1,0,0,0,0
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,0.0,1.0,10.0,39.0,0.0,0.0,0.0,9.0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
df_cleaned_mordred_morgan.to_csv('gsk_3_b_cleaned_mordred_morgan_fp.csv', index=False)

## maccs

In [54]:
def compute_maccs_fp(df, smiles_column='smiles'):
    """Append MACCS key bits to `df` as columns FP_MACCS_1 .. FP_MACCS_166.

    The first character of each fingerprint's bit string is skipped, so the
    remaining 166 bits map onto the column numbers 1..166.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain `smiles_column`.
    smiles_column : str, default 'smiles'
        Name of the column holding SMILES strings.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by the MACCS columns, with a fresh
        0..n-1 index.
    """
    # Parse every SMILES first
    molecules = []
    for smi in df[smiles_column]:
        molecules.append(Chem.MolFromSmiles(smi))

    # MACCS fingerprint per molecule
    fingerprints = []
    for mol in molecules:
        fingerprints.append(rdMolDescriptors.GetMACCSKeysFingerprint(mol))

    # Bit string -> list of 0/1 integers, without the leading bit
    bit_rows = []
    for fp in fingerprints:
        bit_rows.append(list(map(int, fp.ToBitString()[1:])))

    maccs_columns = [f'FP_MACCS_{i}' for i in range(1, 167)]
    fps_df = pd.DataFrame(bit_rows, columns=maccs_columns)

    # Positional join: both indices are reset before concatenating
    return pd.concat([df.reset_index(drop=True), fps_df.reset_index(drop=True)], axis=1)

In [55]:

# Append MACCS key bits to the cleaned RDKit descriptor table
df_cleaned_rdkit_maccs = compute_maccs_fp(df_cleaned_rdkit)
df_cleaned_rdkit_maccs.head()

Unnamed: 0,smiles,pIC50,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,...,FP_MACCS_157,FP_MACCS_158,FP_MACCS_159,FP_MACCS_160,FP_MACCS_161,FP_MACCS_162,FP_MACCS_163,FP_MACCS_164,FP_MACCS_165,FP_MACCS_166
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,12.163164,0.016875,-0.743819,0.333394,13.8,0.276975,-0.506393,1.12,...,1,1,1,0,1,1,1,1,1,0
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,5.787421,0.119368,0.119368,0.506107,11.346154,0.243203,-0.377621,1.230769,...,0,1,0,1,1,1,1,1,1,0
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,12.619437,0.04941,-0.505798,0.410217,11.392857,0.293398,-0.377621,1.321429,...,0,1,1,1,1,1,1,1,1,0
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,13.322081,0.003567,-0.759892,0.405842,27.285714,0.252133,-0.374791,1.057143,...,1,1,1,1,1,1,1,1,1,0
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,11.882601,0.073612,-0.520107,0.359439,15.863636,0.287211,-0.377621,1.272727,...,0,1,1,0,1,1,1,1,1,0


In [56]:
# Append MACCS key bits to the cleaned Mordred descriptor table
df_cleaned_mordred_maccs = compute_maccs_fp(df_cleaned_mordred)
df_cleaned_mordred_maccs.head()

Unnamed: 0,smiles,pIC50,nAcid,nBase,nAromAtom,nAtom,nSpiro,nBridgehead,nB,nN,...,FP_MACCS_157,FP_MACCS_158,FP_MACCS_159,FP_MACCS_160,FP_MACCS_161,FP_MACCS_162,FP_MACCS_163,FP_MACCS_164,FP_MACCS_165,FP_MACCS_166
0,O=C1NC(=O)C(c2ccccc2[N+](=O)[O-])=C1Nc1ccc(O)c...,6.420216,0.0,0.0,12.0,35.0,0.0,0.0,0.0,3.0,...,1,1,1,0,1,1,1,1,1,0
1,CN(C)Cc1c(-c2n[nH]c(-c3ccncc3)n2)nnn1-c1nonc1N,5.159894,0.0,1.0,21.0,41.0,0.0,0.0,0.0,11.0,...,0,1,0,1,1,1,1,1,1,0
2,CCN(CC)Cc1c(C(=O)N/N=C/c2ccncc2)nnn1-c1nonc1N,6.387216,0.0,1.0,16.0,48.0,0.0,0.0,0.0,10.0,...,0,1,1,1,1,1,1,1,1,0
3,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,6.79588,0.0,1.0,20.0,61.0,0.0,2.0,0.0,4.0,...,1,1,1,1,1,1,1,1,1,0
4,NNC(=O)c1nnn(-c2nonc2N)c1CN1CCCCC1,5.017729,0.0,1.0,10.0,39.0,0.0,0.0,0.0,9.0,...,0,1,1,0,1,1,1,1,1,0


In [57]:
# Save both MACCS-augmented tables
# NOTE(review): these file names start with 'df_', unlike the other CSV files written by this notebook — confirm this is intended.
df_cleaned_rdkit_maccs.to_csv('df_gsk_3_b_cleaned_rdkit_maccs.csv', index=False)
df_cleaned_mordred_maccs.to_csv('df_gsk_3_b_cleaned_mordred_maccs.csv', index=False)