In [9]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
from data_loader import load_chembl_activity

# Short target name -> ChEMBL target identifier.
# NOTE(review): the short names appear to match the suffixes of the `ic50_*`
# columns in the multitask table — confirm before renaming any key.
targets = dict(
    bace1="CHEMBL4822",
    gsk3b="CHEMBL262",
    ache="CHEMBL220",
)

In [10]:
# Location of the cleaned multitask CSV, relative to the notebook's working directory.
DATA_PATH = "../data/multitask_clean.csv"

## Preprocessing + verification

In [22]:
# Load the cleaned multitask table from the path configured in the DATA_PATH
# cell (the same file the hard-coded literal used to point at).
df = pd.read_csv(DATA_PATH, low_memory=False)

# Factor applied to the raw IC50 values before taking -log10.
# NOTE(review): presumably converts nM to M — confirm against the data source.
IC50_SCALE = 1e-9

# Snapshot the IC50 column names up front so the loop below is not affected by
# the pIC50 columns it appends.
ic50_cols = [col for col in df.columns if col.startswith("ic50_")]

for col in ic50_cols:
    # Anything that cannot be parsed as a number becomes NaN ...
    values = pd.to_numeric(df[col], errors="coerce")
    # ... and so does every value that is not strictly positive (zero,
    # negative, or already NaN), because log10 is undefined there.
    values = values.where(values > 0)
    df[col] = values

    # "ic50_<target>" -> "pIC50_<target>"
    target = col.split("_", 1)[1]
    df[f"pIC50_{target}"] = -np.log10(values * IC50_SCALE)

# Infinite values anywhere in the frame (not only the pIC50 columns) are
# treated as missing.
df = df.replace([np.inf, -np.inf], np.nan)

# Persist the processed table for downstream notebooks.
df.to_csv('../data/multitask_processed.csv', index=False)

# Verification: row count and number of missing labels per target.
pic50_cols = [c for c in df.columns if c.startswith("pIC50_")]
print(f"Total rows: {df.shape[0]}")
print("Missing values per pIC50 column:")
print(df[pic50_cols].isna().sum())

df.head()


Total rows: 9288
Missing values per pIC50 column:
pIC50_bace1    1829
pIC50_gsk3b    7431
pIC50_ache     9038
dtype: int64


Unnamed: 0,smiles,ic50_bace1,ic50_gsk3b,ic50_ache,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,pIC50_bace1,pIC50_gsk3b,pIC50_ache
0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,413.0,,,0,1,0,0,0,0,...,7.288027,28854,104,338.0,376.0,30.916666666666668,15.944444,6.38405,,
1,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,2.0,,,0,1,0,0,0,0,...,7.027198,22054,91,296.0,327.0,29.194444444444446,14.166667,8.69897,,
2,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,460.0,,,0,1,0,0,0,0,...,6.649726,13123,74,242.0,267.0,23.38888888888889,11.916667,6.337242,,
3,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,9000.0,,,0,1,1,0,0,0,...,6.827778,13440,75,256.0,285.0,20.02777777777778,11.888889,5.045757,,
4,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5600.0,,,0,1,1,0,0,0,...,6.838404,18407,87,294.0,330.0,21.5,13.416667,5.251812,,


In [21]:
# Summary statistics for the numeric columns of `df`, transposed so that each
# column of `df` is shown as one row of the resulting table.
print('Basic statistics:')
df.describe().transpose()

Basic statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ic50_bace1,7459.0,42553.197380,1.097844e+06,0.000200,28.950000,264.000000,3400.000000,8.600000e+07
ic50_gsk3b,1857.0,71808.593722,1.643538e+06,0.013000,62.800000,700.000000,9660.000000,5.000000e+07
ic50_ache,250.0,154926.093792,1.849307e+06,0.090000,26.357500,1143.000000,5180.000000,2.900000e+07
fp_0,9288.0,0.003984,6.299359e-02,0.000000,0.000000,0.000000,0.000000,1.000000e+00
fp_1,9288.0,0.320736,4.667848e-01,0.000000,0.000000,0.000000,1.000000,1.000000e+00
...,...,...,...,...,...,...,...,...
Zagreb2,9288.0,201.597007,5.778160e+01,0.000000,169.000000,199.000000,228.000000,1.277000e+03
mZagreb2,9288.0,6.928390,2.362019e+00,0.000000,5.722222,6.527778,7.562500,4.923611e+01
pIC50_bace1,7459.0,6.475600,1.401509e+00,1.065502,5.468521,6.578396,7.538352,1.269897e+01
pIC50_gsk3b,1857.0,6.201264,1.337022e+00,1.301030,5.015023,6.154902,7.202040,1.088606e+01
