# Data reconstruction by DRL
- Encode each molecule as a binary molecular fingerprint, then reconstruct the molecule from that fingerprint with deep reinforcement learning (DRL)
    - Molecular fingerprint: Avalon fingerprint
    - DRL: REINVENT 2.0

In [1]:
import glob
import os
import sys
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem

sys.path.append("../lib")
from fp_func import fp_func, fp_similarity

sys.path.append("../REINVENT/lib/")
from train_agent import train_agent



In [2]:
# Load the 150 randomly chosen molecules taken from Bradley's dataset and
# keep their SMILES strings as a plain Python list.
file_path = "../database/random_smiles.csv"
df = pd.read_csv(file_path)
smiles_list = [*df["SMILES"].values]
# Notebook display: preview the first three entries.
smiles_list[:3]

['c1ccc2c(c1)nc(s2)N',
 'c1ccc(cc1)COC(=O)/C=C/c2ccccc2',
 'O=C1C=C(C)C(=O)C(C)=C1']

In [3]:
# DRL
def reconstruct(smiles):
    """Reconstruct a molecule from its fingerprint with a REINVENT run.

    The query molecule is turned into a bit string with ``fp_func``, an agent
    is trained with the ``fp_similarity`` scoring function against that bit
    string, and the best-ranked SMILES from the run's ``memory`` file is
    returned.

    Args:
        smiles: SMILES string of the molecule to reconstruct.

    Returns:
        The SMILES with the highest ``Score`` (ties broken by the highest
        ``PriorLogP``) in the ``memory`` file of the results directory that
        sorts last by name.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
        FileNotFoundError: If no results directory exists after training.
    """
    # RDKit signals a parse failure by returning None rather than raising.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"could not parse SMILES: {smiles!r}")

    # The training code is run from inside the REINVENT directory.
    # Remember where we started so the working directory is always put back,
    # even when fingerprinting or training fails.
    start_dir = os.getcwd()
    try:
        if os.path.basename(start_dir) != "REINVENT":
            os.chdir('../REINVENT/')

        # Serialize the fingerprint as one string of concatenated bit values.
        fp = "".join(str(bit) for bit in fp_func(mol))

        # settings for DRL
        arg_dict = {
            "scoring_function": "fp_similarity",
            "scoring_function_kwargs": {
                "query_bit": fp,
            },
            "n_steps": 300,
            "verbose": False,
        }

        # run
        train_agent(**arg_dict)
    finally:
        os.chdir(start_dir)

    # analyze
    # NOTE(review): picking the last directory by name presumably selects the
    # most recent run (timestamped directory names) -- confirm against
    # train_agent.
    run_dirs = sorted(glob.glob("../REINVENT/data/results/*"))
    if not run_dirs:
        raise FileNotFoundError(
            "no result directories found under ../REINVENT/data/results/")
    result_path = os.path.join(run_dirs[-1], "memory")

    dqn_df = pd.read_csv(result_path, delimiter=" ")
    ranked = dqn_df.sort_values(["Score", "PriorLogP"], ascending=False)
    best_smiles = ranked["SMILES"].iloc[0]

    return best_smiles

In [4]:
from tqdm.notebook import tqdm
reconst_sm_list = []

# Reconstruct every molecule in turn. A failed run is recorded as "C" so that
# reconst_sm_list keeps one entry per processed input molecule.
for sm1 in tqdm(smiles_list):
    try:
        sm2 = reconstruct(sm1)
    except Exception as err:
        # Catch Exception rather than using a bare except so that a kernel
        # interrupt (KeyboardInterrupt) still stops the run, and report the
        # failure instead of hiding it behind the placeholder.
        print(f"reconstruction failed for {sm1}: {err!r}")
        sm2 = "C"
    reconst_sm_list.append(sm2)

    # Optional checkpoint of the partial results (disabled):
    #joblib.dump(reconst_sm_list,"dqn_smiles_temp.bin")

    # Demo only: stop after the first molecule. Remove this break to process
    # the whole list.
    break

  0%|          | 0/150 [00:00<?, ?it/s]

cuda:1
Model initialized, starting training...

Best score in memory: 0.57

Best score in memory: 0.57

Best score in memory: 0.57

Best score in memory: 0.57

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.70

Best score in memory: 0.72

Best score in memory: 0.72

Best score in memory: 0.72

Best score in memory: 0.72

Best score in memory: 0.72

Best score in memory: 0.72

Best score in memory: 0.72



In [5]:
# Print the last processed input next to the SMILES recovered for it.
print(
    f"original: {sm1}",
    f"reconstructed: {sm2}",
    sep="\n",
)

original: c1ccc2c(c1)nc(s2)N
reconstructed: Nc1nc2ccccc2s1
