# Загрузка датасета

In [1]:
%%capture
# %%capture keeps the (long) pip output out of the notebook.
# %pip is used so that the packages are installed into the kernel's environment.
# TODO(review): consider pinning package versions for reproducibility.

%pip install rdkit
%pip install chembl_webresource_client

In [3]:
# импорт библиотек
import pandas as pd
from rdkit import Chem
from chembl_webresource_client.new_client import new_client

import numpy as np

In [5]:
# Download the dataset: bioactivity records for one ChEMBL target
activity = new_client.activity

# Target of interest (the recorded output lists it as "Beta-secretase 1", Homo sapiens)
target_chembl_id = 'CHEMBL4822'

# Restrict the query to this target and to IC50 / pIC50 measurement types
data = (
    activity
    .filter(target_chembl_id=target_chembl_id)
    .filter(standard_type__in=['IC50', 'pIC50'])
)

# Materialise the query results as a DataFrame
df = pd.DataFrame.from_records(data)

print(f'Загружено записей: {len(df)}')



Загружено записей: 10766


In [6]:
# Persist the raw download so later cells can start from the file
df.to_csv(path_or_buf='bace_raw_data.csv', index=False)


# Предобработка

In [7]:
# Start preprocessing from the saved raw file and preview the first rows
df = pd.read_csv(filepath_or_buffer='bace_raw_data.csv')
df.head(5)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,78857,[],CHEMBL653511,Inhibitory activity against Beta-secretase 1 w...,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,nM,UO_0000065,,413.0
1,,,78857,[],CHEMBL653511,Inhibitory activity against Beta-secretase 1 w...,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,nM,UO_0000065,,413.0
2,,,391560,[],CHEMBL653332,Compound was tested for its inhibitory activit...,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,0.002
3,,,391983,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,0.46
4,,,395858,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,9.0


In [8]:
# List all available columns to decide which ones are needed
df.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')

In [9]:
# Keep only the columns that are needed downstream
columns_to_keep = [
    'canonical_smiles',
    'standard_value',
    'standard_units',
    'standard_type',
    'standard_relation',
]

df = df.loc[:, columns_to_keep]
df

Unnamed: 0,canonical_smiles,standard_value,standard_units,standard_type,standard_relation
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,413.0,nM,IC50,=
1,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,413.0,nM,IC50,=
2,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,2.0,nM,IC50,=
3,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,460.0,nM,IC50,=
4,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,9000.0,nM,IC50,=
...,...,...,...,...,...
10761,CCCN1CCCCCOc2cccc(c2)C[C@@H]([C@H](O)CNC(C)(C)...,6000.0,nM,IC50,=
10762,CCCN1CCOc2cccc(c2)C[C@@H]([C@H](O)CNC(C)(C)c2c...,1850.0,nM,IC50,=
10763,CCCN1CCOCCc2cccc(c2)C[C@@H]([C@H](O)CNC(C)(C)c...,22700.0,nM,IC50,=
10764,CCCN1CCCCc2cccc(c2)C[C@@H]([C@H](O)CNC(C)(C)c2...,3020.0,nM,IC50,=


In [10]:
# Inspect which measurement units occur and how often; it looks like only nM
# needs to be kept. The previous `.unique()` call was removed: its result was
# never displayed (only the last expression of a cell is shown) and the counts
# below already list every distinct unit.
# Note: value_counts() skips missing values by default, so rows without a unit
# are not included in these counts.
df['standard_units'].value_counts()


standard_units
nM         10609
ug.mL-1        2
Name: count, dtype: int64

In [11]:
# In our case a simple combined filter appears to be enough:
# keep only IC50 measurements, reported in nM, with an exact ('=') relation.
is_ic50 = df['standard_type'].eq('IC50')
in_nanomolar = df['standard_units'].eq('nM')
is_exact_value = df['standard_relation'].eq('=')

df = df[is_ic50 & in_nanomolar & is_exact_value]

In [12]:
# Drop rows with zero or negative values: log10 is undefined for them.
# Rows with a missing standard_value are removed as well, because NaN > 0 is False.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding a column below does not raise pandas' SettingWithCopyWarning.
df = df[df['standard_value'] > 0].copy()

# Convert IC50 to pIC50 = -log10(IC50 [M]); values are in nM, and 1 nM = 1e-9 M
df['pIC50'] = -np.log10(df['standard_value'] * 1e-9)

In [13]:
# Remove rows without a structure, duplicate structures and missing values.
# drop_duplicates keeps the first row for each SMILES string (pandas default).
# NOTE(review): if a compound has several measurements, only the first one
# survives — confirm that this is the intended way to handle replicates.
df = (
    df
    .dropna(subset=['canonical_smiles'])
    .drop_duplicates(subset='canonical_smiles')
    .loc[lambda frame: frame['standard_value'].notna()]
)
df.head()

Unnamed: 0,canonical_smiles,standard_value,standard_units,standard_type,standard_relation,pIC50
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,413.0,nM,IC50,=,6.38405
2,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,2.0,nM,IC50,=,8.69897
3,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,460.0,nM,IC50,=,6.337242
4,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,9000.0,nM,IC50,=,5.045757
5,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5600.0,nM,IC50,=,5.251812


In [14]:
# Number of rows and columns left after filtering and de-duplication
df.shape

(6945, 6)

In [15]:
# Keep only the structure and the target value; expose the structure column as 'smiles'
df_final = df.loc[:, ['canonical_smiles', 'pIC50']]
df_final = df_final.rename(columns={'canonical_smiles': 'smiles'})



In [16]:
df_final.shape  # size before the SMILES validity check

(6945, 2)

In [17]:
# SMILES validity check
def is_valid_smiles(smiles):
    """Return True if ``smiles`` is a non-empty string that RDKit can parse.

    Parameters
    ----------
    smiles : object
        Candidate SMILES. Anything that is not a string (``None``, a float NaN
        coming from a missing DataFrame cell, ...) is reported as invalid
        instead of being passed to RDKit, which only accepts strings.

    Returns
    -------
    bool
        ``False`` for non-string, empty or whitespace-only input and for
        strings RDKit fails to parse; ``True`` otherwise.
    """
    # Guard first: missing values and blank strings cannot describe a molecule
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    # MolFromSmiles returns None when the string cannot be parsed
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None

# Keep only the rows whose SMILES passes the validity check, then report the size
valid_mask = df_final['smiles'].apply(is_valid_smiles)
df_final = df_final.loc[valid_mask]
print("После проверки валидности SMILES:", df_final.shape)

После проверки валидности SMILES: (6945, 2)


In [18]:
# Every SMILES in the dataset passed the check, so save the cleaned dataset to a new file
df_final.to_csv(path_or_buf='bace_clean_data.csv', index=False)
