In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdFingerprintGenerator

from tqdm import tqdm
tqdm.pandas()


In [3]:
# The notebook is run from a sub-directory, so the project root is one level up.
PROJECT_ROOT = Path("..").resolve()

# Project directories referenced by the cells below.
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")
RESULTS_DIR = PROJECT_ROOT.joinpath("results")

In [4]:
# Read the cleaned Tox21 table from the processed-data directory.
tox21_path = DATA_PROCESSED / "tox21_clean.csv"
tox21 = pd.read_csv(tox21_path)

# (rows, columns) of the loaded table.
tox21.shape


(3074, 15)

In [5]:
def smiles_to_mol(smiles):
    """Parse a SMILES string with RDKit and return the result.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    The value produced by ``Chem.MolFromSmiles`` for ``smiles``, passed
    through unchanged.
    NOTE(review): presumably ``None`` when the string cannot be parsed —
    confirm against the RDKit documentation.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return parsed

# Parse every SMILES string once and keep the result next to the source row.
tox21["mol"] = tox21["smiles"].map(smiles_to_mol)

# Number of rows whose "mol" entry is missing.
tox21["mol"].isnull().sum()

np.int64(0)

In [6]:
# Descriptor names, taken from the first element of each entry in RDKit's
# descriptor registry and kept in registry order.
descriptor_names = [name for name, _fn in Descriptors._descList]
len(descriptor_names)

217

In [7]:
def compute_descriptors(mol):
    """Evaluate every descriptor in ``Descriptors._descList`` on ``mol``.

    Parameters
    ----------
    mol
        Molecule object passed as the single argument to each descriptor
        function.

    Returns
    -------
    list
        One value per registry entry, in registry order.
    """
    values = []
    for _name, descriptor_fn in Descriptors._descList:
        values.append(descriptor_fn(mol))
    return values

In [8]:
# One list of descriptor values per molecule (with a progress bar).
descriptor_rows = tox21["mol"].progress_apply(compute_descriptors)

# Expand the per-molecule lists into a table with one column per descriptor.
X_desc = pd.DataFrame(descriptor_rows.to_list(), columns=descriptor_names)
X_desc.shape

100%|██████████| 3074/3074 [01:27<00:00, 35.00it/s]


(3074, 217)

In [9]:
def morgan_fp(mol, radius=2, n_bits=2048):
    """Return the Morgan fingerprint of ``mol`` as a fixed-length bit vector.

    Built with RDKit's fingerprint-generator API instead of
    ``AllChem.GetMorganFingerprintAsBitVect``, which RDKit has deprecated.
    NOTE(review): with default generator options this is expected to set the
    same bits as the legacy call — confirm on a few molecules before relying
    on previously saved fingerprints.

    Parameters
    ----------
    mol
        RDKit molecule to fingerprint.
    radius : int, default 2
        Morgan radius passed to the generator.
    n_bits : int, default 2048
        Length of the returned bit vector (the generator's ``fpSize``).

    Returns
    -------
    The bit vector returned by the generator's ``GetFingerprint`` for ``mol``.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(
        radius=radius, fpSize=n_bits
    )
    return generator.GetFingerprint(mol)

# One fingerprint object per molecule (with a progress bar).
fingerprints = tox21["mol"].progress_apply(morgan_fp)

# Convert each fingerprint to an array and stack them into a single 2-D array.
X_fp = np.array(list(map(np.array, fingerprints)))
X_fp.shape


100%|██████████| 3074/3074 [00:00<00:00, 5803.45it/s]


(3074, 2048)

In [10]:
# Assay label columns, listed by name prefix; the order here is the column
# order of the label table selected with this list.
ASSAY_COLUMNS = [
    # NR-* assays
    "NR-AR",
    "NR-AR-LBD",
    "NR-AhR",
    "NR-Aromatase",
    "NR-ER",
    "NR-ER-LBD",
    "NR-PPAR-gamma",
    # SR-* assays
    "SR-ARE",
    "SR-ATAD5",
    "SR-HSE",
    "SR-MMP",
    "SR-p53",
]

# Label table: only the assay columns, in ASSAY_COLUMNS order.
y = tox21.loc[:, ASSAY_COLUMNS]
y.shape


(3074, 12)

In [11]:
# Output locations for the three feature/label artefacts.
descriptors_path = DATA_PROCESSED / "tox21_descriptors.csv"
fingerprints_path = DATA_PROCESSED / "tox21_morgan_fp.npy"
labels_path = DATA_PROCESSED / "tox21_labels.csv"

# The CSVs are written without the index column.
X_desc.to_csv(descriptors_path, index=False)
np.save(fingerprints_path, X_fp)
y.to_csv(labels_path, index=False)
