In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import random
import re
import joblib

In [2]:
# Load the pre-fitted classifier that the training-data cell calls through
# clf.predict() on MACCS-key bit vectors (presumably a scikit-learn SVC,
# judging by the file name -- TODO confirm).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code; only load "SVC.m" when it comes from a trusted source.
clf = joblib.load("SVC.m")

# 生成训练数据 (Generate training data)

In [3]:
# Build the training set: every molecule in n2.csv is labelled by the
# pre-trained classifier `clf`, then up to four templated descriptions are
# emitted for it, each naming one word taken from the molecule's IUPAC name.
# Rows are collected as [cid, iupac, desc, smiles] with cid/iupac set to 0.
df_source = pd.read_csv('../data/n2.csv')

# Seed the sampler so Restart & Run All regenerates the same dataset.
random.seed(42)

# Delimiters used to break an IUPAC name into words. The previous pattern
# "[\s\[\],\(\)-.;]" was a non-raw string (invalid "\s" escape warning) whose
# ")-." part is a character *range* covering ) * + , - . ; the same set is
# spelled out explicitly here so the matching behaviour is unchanged.
name_splitter = re.compile(r"[\s\[\](),.;*+\-]")

posotive = []
negtive = []
for item in df_source.values:
    # Columns are addressed by position; assumes column 3 holds the SMILES
    # string and column 1 the IUPAC name -- TODO confirm against n2.csv.
    smiles = item[3]
    iupac = item[1]

    # Empty CSV cells are read as float NaN, which neither RDKit nor re accepts.
    if not isinstance(smiles, str) or not isinstance(iupac, str):
        continue

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse; skip instead of crashing.
        continue

    # MACCS-key fingerprint -> single-row feature matrix -> scalar class label.
    fp = MACCSkeys.GenMACCSKeys(mol)
    fp_bits = fp.ToList()
    label = clf.predict(np.array(fp_bits).reshape(1, -1)).item()

    # Keep words longer than two characters that do not start with a digit
    # (drops locants such as "2" or "1H" and very short tokens).
    word_list = name_splitter.split(iupac)
    filtered_word_list = [
        word for word in word_list
        if len(word) > 2 and not word[0].isnumeric()
    ]
    if not filtered_word_list:
        continue

    # Four draws with replacement; a repeated draw is skipped, so a molecule
    # contributes between one and four distinct descriptions.
    used_fg = []
    for _ in range(4):
        fg = random.choice(filtered_word_list)

        if fg in used_fg:
            continue

        if label == 1:
            desc = f"The molecule contains {fg}. It can bind to KPCD3."
            posotive.append([0, 0, desc, smiles])
        elif label == 0:
            desc = f"The molecule contains {fg}. It can not bind to KPCD3."
            negtive.append([0, 0, desc, smiles])

        used_fg.append(fg)

In [4]:
# Report how many positive and negative descriptions were generated,
# one count per line (positives first).
print(len(posotive), len(negtive), sep="\n")

1415974
38974


In [5]:
# Training subset: the first 2500 positive rows followed by the first 2500
# negative rows (fewer if a list is shorter than that).
outputs = [*posotive[:2500], *negtive[:2500]]
len(outputs)

5000

In [6]:
# Turn the selected rows into a table and write it out as the training CSV.
# Each row is [cid, iupac, desc, smiles]; the generating cell fills cid and
# iupac with the placeholder 0.
outputs = pd.DataFrame(data=outputs, columns=['cid', 'iupac', 'desc', 'smiles'])
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
outputs.to_csv('../data/KPCD3.csv', index=False)
print('Saved.')

Saved.


# 生成测试数据 (Generate test data)

In [7]:
# Build the evaluation prompts: for words taken from the IUPAC names in
# n1.csv, emit both the "can bind" and the "can not bind" sentence, stopping
# once more than 1000 rows have been collected, then save them as a CSV.
df_source = pd.read_csv('../data/n1.csv')

# Seed the sampler so Restart & Run All regenerates the same file.
random.seed(42)

# Delimiters used to break an IUPAC name into words. The previous pattern
# "[\s\[\],\(\)-.;]" was a non-raw string (invalid "\s" escape warning) whose
# ")-." part is a character *range* covering ) * + , - . ; the same set is
# spelled out explicitly here so the matching behaviour is unchanged.
name_splitter = re.compile(r"[\s\[\](),.;*+\-]")

outputs = []
# Shared across all molecules, so each word is used for at most one pair of
# evaluation sentences in the whole file.
used_fg = []
for item in df_source.values:
    # Addressed by position; assumes column 1 holds the IUPAC name -- TODO confirm.
    iupac = item[1]

    # Empty CSV cells are read as float NaN, which re cannot split.
    if not isinstance(iupac, str):
        continue

    # Keep words longer than three characters that do not start with a digit.
    word_list = name_splitter.split(iupac)
    filtered_word_list = [
        word for word in word_list
        if len(word) > 3 and not word[0].isnumeric()
    ]
    if not filtered_word_list:
        # random.choice raises IndexError on an empty sequence.
        continue

    # Four draws with replacement; words already used (by this or any
    # earlier molecule) are skipped.
    for _ in range(4):
        fg = random.choice(filtered_word_list)

        if fg in used_fg:
            continue

        desc = f"The molecule contains {fg}. It can bind to KPCD3."
        desc2 = f"The molecule contains {fg}. It can not bind to KPCD3."
        used_fg.append(fg)

        # Row layout is [cid, iupac, desc, smiles]; everything except desc is
        # the placeholder 0.
        outputs.append([0, 0, desc, 0])
        outputs.append([0, 0, desc2, 0])

    # Checked once per molecule, so the final size can slightly exceed 1000.
    if len(outputs) > 1000:
        break

outputs = pd.DataFrame(data=outputs, columns=['cid', 'iupac', 'desc', 'smiles'])
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
outputs.to_csv('../data/eval_KPCD3.csv', index=False)
print('Saved.')

Saved.
