In [93]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
import seaborn as sns
import matplotlib.pyplot as plt

In [94]:
# Load the raw export; its fields are separated by semicolons.
df = pd.read_csv('cox2_raw_data.csv', sep=';')
df.head()

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value
0,CHEMBL345905,,,243.35,0.0,2.77,1B,CN1CCC(C[C@H]2Cc3ccccc3C2=O)CC1,IC50,'=',...,CHEMBL1126158,1,Scientific Literature,J Med Chem,1992.0,,,,,7.7
1,CHEMBL2448065,,,467.37,0.0,4.9,15,COc1cc2c(Nc3ccc(Cl)cc3F)ncnc2cc1OCC1CCN(C)CC1.Cl,IC50,'=',...,CHEMBL1135889,1,Scientific Literature,J Med Chem,2002.0,,,,,0.1
2,CHEMBL539822,,,407.72,0.0,4.93,63,Cl.Nc1ccc2c(c1)sc1c(Nc3cccc(Br)c3)ncnc12,IC50,'=',...,CHEMBL1132555,1,Scientific Literature,J Med Chem,1999.0,,,,,0.47
3,CHEMBL540082,,,437.71,1.0,5.26,70,Cl.O=[N+]([O-])c1cccc2c1sc1c(Nc3cccc(Br)c3)ncnc12,IC50,'=',...,CHEMBL1132555,1,Scientific Literature,J Med Chem,1999.0,,,,,158.0
4,CHEMBL31118,,,412.29,0.0,3.95,24,CN(C)CCOc1cc2c(Nc3cccc(Br)c3)c(C#N)cnc2cn1,IC50,'>',...,CHEMBL1147619,1,Scientific Literature,Bioorg Med Chem Lett,2004.0,,,,,10000.0


In [95]:
# Keep only the measurements whose activity type is IC50.
is_ic50 = df['Standard Type'] == 'IC50'
df = df[is_ic50]

# List the available columns to decide which ones are worth keeping.
df.columns

Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'Action Type', 'Standard Text Value', 'Value'],
      dtype='object')

In [96]:
# Columns retained for the cleaning steps that follow.
columns_to_keep = [
    'Molecule ChEMBL ID',
    'Molecular Weight',
    '#RO5 Violations',
    'AlogP',
    'Compound Key',
    'Smiles',
    'Standard Relation',
    'Standard Value',
    'Standard Units',
    'pChEMBL Value',
    'Uo Units',
    'Ligand Efficiency BEI',
    'Ligand Efficiency LE',
    'Ligand Efficiency LLE',
    'Ligand Efficiency SEI',
    'Target Type',
    'Value',
]
df = df.loc[:, columns_to_keep]
df.head()

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Uo Units,Ligand Efficiency BEI,Ligand Efficiency LE,Ligand Efficiency LLE,Ligand Efficiency SEI,Target Type,Value
0,CHEMBL345905,243.35,0.0,2.77,1B,CN1CCC(C[C@H]2Cc3ccccc3C2=O)CC1,'=',7.7,nM,8.11,UO_0000065,33.34,0.62,5.34,39.95,SINGLE PROTEIN,7.7
1,CHEMBL2448065,467.37,0.0,4.9,15,COc1cc2c(Nc3ccc(Cl)cc3F)ncnc2cc1OCC1CCN(C)CC1.Cl,'=',100.0,nM,7.0,UO_0000065,,,,,SINGLE PROTEIN,0.1
2,CHEMBL539822,407.72,0.0,4.93,63,Cl.Nc1ccc2c(c1)sc1c(Nc3cccc(Br)c3)ncnc12,'=',0.47,nM,9.33,UO_0000065,25.12,0.58,4.4,14.61,SINGLE PROTEIN,0.47
3,CHEMBL540082,437.71,1.0,5.26,70,Cl.O=[N+]([O-])c1cccc2c1sc1c(Nc3cccc(Br)c3)ncnc12,'=',158.0,nM,6.8,UO_0000065,16.95,0.39,1.54,8.4,SINGLE PROTEIN,158.0
4,CHEMBL31118,412.29,0.0,3.95,24,CN(C)CCOc1cc2c(Nc3cccc(Br)c3)c(C#N)cnc2cn1,'>',10000.0,nM,,UO_0000065,,,,,SINGLE PROTEIN,10000.0


In [97]:
# Flag the rows whose 'Standard Value' parses as a number (unparseable -> NaN).
has_numeric_value = pd.to_numeric(df['Standard Value'], errors='coerce').notna()

# Take an explicit copy of the filtered rows before writing the float column:
# assigning into a boolean-filtered slice can raise SettingWithCopyWarning.
df = df[has_numeric_value].copy()
df['Standard Value'] = df['Standard Value'].astype(float)

In [98]:
# Discard rows lacking a structure or a measured value, then keep a single
# row (the first one encountered) for every distinct SMILES string.
required_columns = ['Smiles', 'Standard Value']
df = df.dropna(subset=required_columns).drop_duplicates(subset=['Smiles'])

In [99]:
def is_valid_smiles(smiles):
    """Return True when RDKit can parse `smiles` into a molecule.

    Non-string input (for example a missing value) is reported as invalid
    instead of being handed to RDKit.
    """
    # `Chem` is provided by the import cell at the top of the notebook.
    return isinstance(smiles, str) and Chem.MolFromSmiles(smiles) is not None


# Filter with a boolean mask rather than a temporary 'is_valid' column, so
# nothing is written into `df`, which is itself a filtered frame.
valid_smiles = df['Smiles'].apply(is_valid_smiles).astype(bool)
df = df[valid_smiles]

In [100]:
# Tally how many rows are reported in each unit.
df.loc[:, 'Standard Units'].value_counts()

Standard Units
nM             51380
ug.mL-1          150
/uM                6
ucm                4
10'5pM             3
10^-4microM        2
10'6pM             2
ug                 1
%                  1
Name: count, dtype: int64

In [87]:
# Rows already reported in nanomolar are taken as they are (as an independent copy).
reported_in_nm = df['Standard Units'] == 'nM'
df_clean = df.loc[reported_in_nm].copy()

In [88]:
def _rows_converted_to_nm(frame, unit, nm_per_unit):
    """Select the rows of `frame` reported in `unit` and convert them to nM.

    Parameters
    ----------
    frame : DataFrame with 'Standard Units' and 'Standard Value' columns.
    unit : exact 'Standard Units' label to select.
    nm_per_unit : how many nanomolar one `unit` corresponds to.

    Returns
    -------
    (mask, converted) : the boolean row mask, and a copy of the selected rows
    with an added 'IC50_nM' column equal to 'Standard Value' * `nm_per_unit`.
    """
    mask = frame['Standard Units'] == unit
    converted = frame[mask].copy()
    # The unit label gives the size of ONE unit, so the measured value has to
    # be scaled by it; storing the bare factor would discard the measurement.
    converted['IC50_nM'] = converted['Standard Value'].astype(float) * nm_per_unit
    return mask, converted


# '10^-4microM': 1e-4 uM = 0.1 nM per unit
mask1, df1 = _rows_converted_to_nm(df, "10^-4microM", 0.1)

# "10'6pM": 1e6 pM = 1000 nM per unit
mask2, df2 = _rows_converted_to_nm(df, "10'6pM", 1000)

# "10'5pM": 1e5 pM = 100 nM per unit
mask3, df3 = _rows_converted_to_nm(df, "10'5pM", 100)

In [89]:
# For the rows already in nM the reported value is the IC50 in nM.
ic50_in_nm = df_clean['Standard Value'].astype(float)
df_clean['IC50_nM'] = ic50_in_nm

# Stack the nM rows with the few rows converted from the rare units.
converted_frames = [df_clean, df1, df2, df3]
df_final = pd.concat(converted_frames, ignore_index=True)

In [90]:
# Every value is now expressed in nM (see 'IC50_nM'), so the unit label is
# redundant. errors='ignore' keeps the cell re-runnable: without it a second
# execution raises KeyError because the column is already gone.
df_final = df_final.drop(columns=['Standard Units'], errors='ignore')

In [91]:
# Preview the first five rows of the combined table.
df_final.head(5)

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Relation,Standard Value,pChEMBL Value,Uo Units,Ligand Efficiency BEI,Ligand Efficiency LE,Ligand Efficiency LLE,Ligand Efficiency SEI,Target Type,Value,IC50_nM
0,CHEMBL345905,243.35,0.0,2.77,1B,CN1CCC(C[C@H]2Cc3ccccc3C2=O)CC1,'=',7.7,8.11,UO_0000065,33.34,0.62,5.34,39.95,SINGLE PROTEIN,7.7,7.7
1,CHEMBL2448065,467.37,0.0,4.9,15,COc1cc2c(Nc3ccc(Cl)cc3F)ncnc2cc1OCC1CCN(C)CC1.Cl,'=',100.0,7.0,UO_0000065,,,,,SINGLE PROTEIN,0.1,100.0
2,CHEMBL539822,407.72,0.0,4.93,63,Cl.Nc1ccc2c(c1)sc1c(Nc3cccc(Br)c3)ncnc12,'=',0.47,9.33,UO_0000065,25.12,0.58,4.4,14.61,SINGLE PROTEIN,0.47,0.47
3,CHEMBL540082,437.71,1.0,5.26,70,Cl.O=[N+]([O-])c1cccc2c1sc1c(Nc3cccc(Br)c3)ncnc12,'=',158.0,6.8,UO_0000065,16.95,0.39,1.54,8.4,SINGLE PROTEIN,158.0,158.0
4,CHEMBL31118,412.29,0.0,3.95,24,CN(C)CCOc1cc2c(Nc3cccc(Br)c3)c(C#N)cnc2cn1,'>',10000.0,,UO_0000065,,,,,SINGLE PROTEIN,10000.0,10000.0


In [92]:
# Write the cleaned table to disk; the row index is not written to the file.
output_path = 'cox2_cleaned_data.csv'
df_final.to_csv(output_path, index=False)