In [1]:
import os
import sys

import pandas as pd
import torch
from torch_geometric.data import Data, Batch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

sys.path.append("../src/")
from preprocess import get_descriptors_2d, get_descriptors_3d, get_ecfp, MolPreprocessor
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that the vocabulary and the weights are guaranteed to match.
chemberta_checkpoint = "DeepChem/ChemBERTa-77M-MTR"
tokenizer = AutoTokenizer.from_pretrained(chemberta_checkpoint)
model = AutoModel.from_pretrained(chemberta_checkpoint)


def get_chemberta_emb_single(
    tokenizer, model, smiles: str, device="cuda:0", max_len=256
) -> tuple[torch.Tensor, torch.Tensor]:
    """Embed a single SMILES string with a transformer encoder.

    Args:
        tokenizer: Callable tokenizer. It is invoked with ``max_length``,
            ``truncation=True`` and ``return_tensors="pt"``; its result must
            support ``.to(device)`` and ``**`` unpacking into ``model``.
        model: Encoder whose output exposes ``last_hidden_state``. As a side
            effect it is moved to ``device`` and switched to eval mode.
        smiles: SMILES string to embed.
        device: Device on which tokenized inputs and the model are placed.
        max_len: Maximum token length; longer inputs are truncated.

    Returns:
        A pair ``(cls_emb, pooled_atom_emb)``:

        * ``cls_emb`` - hidden state at sequence position 0.
        * ``pooled_atom_emb`` - mean of the hidden states at every position
          except the first and the last.

        Both tensors stay on ``device`` and keep the leading batch dimension
        produced by the tokenizer (presumably 1 for a single string).

    Note:
        If the tokenized sequence has no interior positions, the mean is
        taken over an empty slice and the pooled embedding is NaN.
    """
    # Named ``encoded`` rather than ``input`` to avoid shadowing the builtin.
    encoded = tokenizer(
        smiles,
        max_length=max_len,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    model.to(device)
    model.eval()
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state
        cls_emb = last_hidden[:, 0, :]
        # Drop the first and last positions before averaging over the
        # sequence dimension.
        pooled_atom_emb = last_hidden[:, 1:-1, :].mean(dim=1)

    return cls_emb, pooled_atom_emb

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### SMILES로부터 descriptor, ChemBERTa embedding, graph 특징 추출 및 데이터화

In [2]:
# Load the preprocessed training table and list its columns.
# NOTE: read_pickle unpickles arbitrary objects - only use it on trusted files.
train_pkl_path = "../data/250714_preprocessed_HW_V2.pkl"
df = pd.read_pickle(train_pkl_path)
df.columns

Index(['smiles', 'ic50_nm', 'ic50_nm_imputed', 'is_active(10um)',
       'data_source', 'data_id', 'pvalue', 'pvalue_imputed', 'rdmol',
       'rdmol_confs5', 'medai_id'],
      dtype='object')

In [3]:
tr_save2dir = "../data/input/tr_vl"
# torch.save does not create missing parent directories, so make sure the
# output folder exists before the (long) loop starts.
os.makedirs(tr_save2dir, exist_ok=True)
mp = MolPreprocessor()

# Resolve column positions once so each row can be read with positional .iloc.
smiles_idx = df.columns.get_loc("smiles")
pvalue_imputed_idx = df.columns.get_loc("pvalue_imputed")
rdmol_idx = df.columns.get_loc("rdmol_confs5")
mol_id_idx = df.columns.get_loc("medai_id")

for i in tqdm(range(df.shape[0])):
    smiles = df.iloc[i, smiles_idx]
    pvalue_imputed = df.iloc[i, pvalue_imputed_idx]
    rd_mol = df.iloc[i, rdmol_idx]
    mol_id = df.iloc[i, mol_id_idx]

    # Compute descriptor / fingerprint features; unsqueeze(0) adds a leading
    # dimension to each of them.
    desc_2d = get_descriptors_2d(rd_mol).unsqueeze(0)
    desc_3d = get_descriptors_3d(rd_mol).unsqueeze(0)
    ecfp = get_ecfp(rd_mol, radius=2, fpSize=2048).unsqueeze(0)

    # Collect everything in a PyG Data object.
    data = Data()

    # ChemBERTa embeddings. They are computed on the GPU but moved to the CPU
    # before being stored, so the saved file is not tied to a specific CUDA
    # device and does not mix devices with the other attributes.
    cls_emb, pooled_atom_emb = get_chemberta_emb_single(
        tokenizer=tokenizer, model=model, smiles=smiles, device="cuda:1"
    )
    data.chemberta_cls = cls_emb.cpu()
    data.chemberta_mol = pooled_atom_emb.cpu()

    # Graph features.
    data.x = mp.get_lig_feature(rd_mol, to_tensor=True)
    data.edge_index, data.edge_attr = mp.get_edge_info(rd_mol)
    data.pos = mp.get_atom_position(rd_mol)
    data.z = mp.get_atomic_number(rd_mol)

    # Descriptors and fingerprint.
    data.desc_2d = desc_2d
    data.desc_3d = desc_3d
    data.ecfp = ecfp

    # Label (imputed pvalue) and the source SMILES.
    data.smiles = smiles
    data.label = pvalue_imputed
    # Save one file per molecule, named after its id.
    torch.save(data, f"{tr_save2dir}/{mol_id}.pt")

100%|██████████| 25753/25753 [06:45<00:00, 63.51it/s]


In [4]:
# Load the held-out test set and preview its first rows.
# NOTE: read_pickle unpickles arbitrary objects - only use it on trusted files.
test_pkl_path = "../data/250722_testset.pkl"
test_df = pd.read_pickle(test_pkl_path)
test_df.head()

Unnamed: 0,ID,Smiles,rdmol,rdmol_confs5
0,TEST_000,CC(C)n1cnnc1c2cccc(n2)N3C=Cc4ccc(Br)cc4C3=O,<rdkit.Chem.rdchem.Mol object at 0x786729fb62f0>,<rdkit.Chem.rdchem.Mol object at 0x786728b1c540>
1,TEST_001,C[C@H](CO)n1cnnc1c2cccc(n2)N3C=Cc4ccc(Br)cc4C3=O,<rdkit.Chem.rdchem.Mol object at 0x786729fb6070>,<rdkit.Chem.rdchem.Mol object at 0x786728b1c590>
2,TEST_002,C[C@H](CO)n1cnnc1c2cccc(n2)N3C=Cc4cc(Br)ccc4C3=O,<rdkit.Chem.rdchem.Mol object at 0x786729fb72e0>,<rdkit.Chem.rdchem.Mol object at 0x786728b1c5e0>
3,TEST_003,CC(C)n1cnnc1c2cccc(n2)N3C=NNC3=O,<rdkit.Chem.rdchem.Mol object at 0x786729fb7100>,<rdkit.Chem.rdchem.Mol object at 0x786728b1c680>
4,TEST_004,C[C@H](CO)n1cnnc1c2cccc(n2)N3C=Cc4cc(ccc4C3=O)...,<rdkit.Chem.rdchem.Mol object at 0x786729fb6200>,<rdkit.Chem.rdchem.Mol object at 0x786729f1a3e0>


In [8]:
ts_save2dir = "../data/input/test"
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(ts_save2dir, exist_ok=True)

mp = MolPreprocessor()

# Resolve column positions once so each row can be read with positional .iloc.
smiles_idx = test_df.columns.get_loc("Smiles")
rdmol_idx = test_df.columns.get_loc("rdmol_confs5")
mol_id_idx = test_df.columns.get_loc("ID")

for i in tqdm(range(test_df.shape[0])):
    smiles = test_df.iloc[i, smiles_idx]
    rd_mol = test_df.iloc[i, rdmol_idx]
    mol_id = test_df.iloc[i, mol_id_idx]

    # Compute descriptor / fingerprint features; unsqueeze(0) adds a leading
    # dimension to each of them.
    desc_2d = get_descriptors_2d(rd_mol).unsqueeze(0)
    desc_3d = get_descriptors_3d(rd_mol).unsqueeze(0)
    ecfp = get_ecfp(rd_mol, radius=2, fpSize=2048).unsqueeze(0)

    # Collect everything in a PyG Data object.
    data = Data()

    # ChemBERTa embeddings. Moved to the CPU before being stored so the saved
    # file is not tied to a specific CUDA device.
    # NOTE(review): no device is passed here, so the function default
    # ("cuda:0") is used, while the training cell passes "cuda:1" - confirm
    # whether the two should match.
    cls_emb, pooled_atom_emb = get_chemberta_emb_single(
        tokenizer=tokenizer, model=model, smiles=smiles
    )
    data.chemberta_cls = cls_emb.cpu()
    data.chemberta_mol = pooled_atom_emb.cpu()

    # Graph features.
    data.x = mp.get_lig_feature(rd_mol, to_tensor=True)
    data.edge_index, data.edge_attr = mp.get_edge_info(rd_mol)
    data.pos = mp.get_atom_position(rd_mol)
    data.z = mp.get_atomic_number(rd_mol)

    # Descriptors and fingerprint.
    data.desc_2d = desc_2d
    data.desc_3d = desc_3d
    data.ecfp = ecfp

    # Source SMILES; the test set has no label.
    data.smiles = smiles
    data.label = None
    # Save one file per molecule, named after its id.
    torch.save(data, f"{ts_save2dir}/{mol_id}.pt")

100%|██████████| 127/127 [00:03<00:00, 42.32it/s]
