In [1]:
import math
from copy import deepcopy
from itertools import product

from rdkit import Chem, DataStructs

import sqlite3
import pandas as pd

import numpy as np
import os
import glob

In [2]:
def read_db_data_in_folder(folder_path: str) -> pd.DataFrame:
    """
    Reads all data from the `results` table of all SQLite databases in a specified folder.

    Databases are discovered with the (non-recursive) glob pattern ``*.db`` and are
    read in sorted path order, so the row order of the combined frame is reproducible.

    Parameters
    ----------
    folder_path: str
        Path to the folder containing sqlite3 database files.

    Returns
    -------
    pd.DataFrame
        A combined dataframe containing rows from the `results` table
        from all found databases in the folder. Databases without a `results`
        table are reported on stdout and skipped. Returns an empty dataframe
        (no rows, no columns) if no database contributed any table.

    Raises
    ------
    sqlite3.OperationalError, pandas.errors.DatabaseError
        If reading a database fails for any reason other than a missing
        `results` table.
    """
    # Sorted so the concatenation order does not depend on filesystem listing order.
    db_files = sorted(glob.glob(os.path.join(folder_path, "*.db")))

    frames = []
    for db_path in db_files:
        conn = sqlite3.connect(db_path)
        try:
            frames.append(pd.read_sql("SELECT * FROM results", conn))
        except (sqlite3.OperationalError, pd.errors.DatabaseError) as e:
            # pandas re-raises driver errors hit while executing the query as its own
            # DatabaseError (the sqlite3 message is embedded in the text), so catching
            # only sqlite3.OperationalError would let a missing table crash the load.
            if "no such table: results" in str(e):
                print(f"The table 'results' does not exist in the database at path: {db_path}")
            else:
                raise
        finally:
            # Always release the connection, including on the re-raise path.
            conn.close()

    if not frames:
        return pd.DataFrame(columns=[])

    return pd.concat(frames, ignore_index=True)


In [3]:
# Load every `results` row from the run's SQLite logs and show how many rows were read.
df = read_db_data_in_folder("logs/mol_eval_seh_frag/final")
df.shape[0]

1024

In [4]:
# Keep a single row per distinct SMILES string.
df = df.drop_duplicates(subset=["smi"])

# Parse each SMILES string into an RDKit molecule and store it next to the raw data.
df["mol"] = df["smi"].apply(Chem.MolFromSmiles)
df

Unnamed: 0,smi,r,fr_0,ci_beta,mol
0,CC(=O)NNC(=O)CCC=C(C)C,0.210392,0.210392,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f974900>
1,O=C(O)N1CCOCC1,0.110476,0.110476,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f974580>
2,N#CCOP(=O)([O-])O,0.031916,0.031916,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f9745f0>
3,CC(=O)P(=O)(O)O,0.022560,0.022560,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f974740>
4,CC(C)(O)C(=O)NO,0.128396,0.128396,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f92e9e0>
...,...,...,...,...,...
1017,CC(C)C1C(C)CC(C#N)N1S,0.124972,0.124972,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f8f1890>
1018,N=C(N)c1c[nH]c2ccccc12,0.287834,0.287834,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f8f1900>
1019,CC(=CNCN1CCOCC1)[PH](=O)[O-],0.223632,0.223632,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f8f1970>
1020,CNn1nc(N)c(S(=O)(=O)[O-])c1-c1cc(-c2cccs2)cc(C...,0.347114,0.347114,1.0,<rdkit.Chem.rdchem.Mol object at 0x7fd47f8f19e0>


In [5]:
def compute_diverse_topk(candidates, k=100, tanimoto_thresh=0.7):
    """
    Average reward of up to `k` greedily selected, mutually dissimilar candidates.

    Candidates are visited from highest to lowest reward. A candidate becomes a
    "mode" when its Tanimoto similarity (on RDKit fingerprints) to every mode
    selected so far is strictly below `tanimoto_thresh`; the best candidate is
    always the first mode. Selection stops once `k` modes have been collected.

    Parameters
    ----------
    candidates : sequence of (reward, mol)
        Item 0 is the reward used for ranking and averaging, item 1 is the
        object passed to `Chem.RDKFingerprint`. The input does not need to be
        pre-sorted; it is ranked here by descending reward (already-sorted
        input keeps its order).
    k : int
        Maximum number of modes to select.
    tanimoto_thresh : float
        Similarity at or above which a candidate is considered a duplicate of
        an existing mode.

    Returns
    -------
    float
        Mean reward over the selected modes (fewer than `k` if the candidates
        run out), or NaN when `candidates` is empty or `k < 1`.
    """
    ranked = sorted(candidates, key=lambda cand: cand[0], reverse=True)
    if not ranked or k < 1:
        # Nothing can be selected; NaN is the mean of an empty selection.
        return float("nan")

    mode_rewards = []
    mode_fps = []
    for cand in ranked:
        # Check the budget *before* considering the next candidate so that
        # k=1 yields exactly one mode instead of overshooting to two.
        if len(mode_fps) >= k:
            break
        fp = Chem.RDKFingerprint(cand[1])
        if mode_fps:
            sims = DataStructs.BulkTanimotoSimilarity(fp, mode_fps)
            if max(sims) >= tanimoto_thresh:
                # Too close to an already selected mode.
                continue
        mode_rewards.append(cand[0])
        mode_fps.append(fp)

    return np.mean(mode_rewards)

In [6]:
def compute_num_of_modes(candidates, reward_thresh=8, tanimoto_thresh=0.7):
    """
    Count the distinct high-reward modes among `candidates`.

    Candidates are ranked by descending reward and those with a reward below
    `reward_thresh` are discarded. The survivors are scanned greedily: the best
    one is always a mode, and each later one becomes a mode only when its
    Tanimoto similarity to *every* existing mode is strictly below
    `tanimoto_thresh`.

    Parameters
    ----------
    candidates : iterable of (reward, mol)
        Item 0 is the reward, item 1 is passed to `Chem.RDKFingerprint`.
    reward_thresh : float
        Minimum reward (inclusive) for a candidate to be considered.
    tanimoto_thresh : float
        Similarity at or above which a candidate is not counted as a new mode.

    Returns
    -------
    tuple of (int, int)
        Number of modes found, and number of candidates at or above
        `reward_thresh`. `(0, 0)` when no candidate passes the reward cut.
    """
    ranked = sorted(candidates, key=lambda entry: entry[0], reverse=True)
    # Discard everything below the reward cut-off.
    eligible = [entry for entry in ranked if entry[0] >= reward_thresh]

    n_eligible = len(eligible)
    if n_eligible == 0:
        return 0, 0

    selected_fps = []
    for entry in eligible:
        fingerprint = Chem.RDKFingerprint(entry[1])
        # The first eligible candidate is unconditionally a mode; afterwards a
        # candidate must be dissimilar to all modes collected so far.
        if selected_fps:
            similarities = DataStructs.BulkTanimotoSimilarity(fingerprint, selected_fps)
            if not (max(similarities) < tanimoto_thresh):
                continue
        selected_fps.append(fingerprint)

    return len(selected_fps), n_eligible

In [7]:
def compute_avg_topk(rewards,k):
    """
    Mean of the `k` largest values in `rewards`.

    Parameters
    ----------
    rewards : iterable of numbers
        Reward values in any order; the input is not modified.
    k : int
        Number of top rewards to average. If fewer than `k` rewards are
        given, all of them are averaged.

    Returns
    -------
    numpy floating scalar
        Result of `np.mean` over the selected rewards.
    """
    ranked = list(rewards)
    ranked.sort(reverse=True)  # highest reward first
    return np.mean(ranked[:k])

In [8]:
def candidates_eval(path, k=100, reward_thresh=8, tanimoto_thresh=0.7):
    """
    Load logged candidates from `path` and compute summary metrics.

    Rows are read with `read_db_data_in_folder`, de-duplicated on the `smi`
    column, and turned into `(reward, mol)` pairs sorted by descending reward
    (`r` column).

    Parameters
    ----------
    path : str
        Folder containing the SQLite result databases.
    k : int
        Number of top candidates / modes used for the averaged metrics.
    reward_thresh : float
        Reward cut-off forwarded to `compute_num_of_modes`.
    tanimoto_thresh : float
        Similarity threshold forwarded to `compute_diverse_topk` and
        `compute_num_of_modes`.

    Returns
    -------
    dict
        Keys: `avg_topk`, `avg_reward_in_topk_modes`, `num_of_modes`,
        `max_reward`, `num_candidates`, `num_candidates_above_reward_thresh`.

    Raises
    ------
    ValueError
        If no rows could be loaded at all, or if there are fewer than `k`
        unique SMILES.
    """
    df = read_db_data_in_folder(path)

    # An empty load has no `smi` column, so the de-duplication below would fail
    # with an opaque KeyError; report the real problem instead.
    if df.empty:
        raise ValueError(f"No rows found in any `results` table under {path!r}")

    unique = df.drop_duplicates(subset=["smi"])

    if len(unique) < k:
        raise ValueError(f"Number of unique SMILES ({len(unique)}) is less than k ({k})")

    rewards = unique["r"].tolist()
    # Build the molecules as a plain list rather than assigning a new column onto
    # the de-duplicated frame, which avoids pandas' chained-assignment warning.
    # NOTE(review): presumably every logged SMILES parses; an unparsable one would
    # likely surface as a failure in the fingerprint step - confirm upstream.
    mols = [Chem.MolFromSmiles(smi) for smi in unique["smi"]]

    avg_topk = compute_avg_topk(rewards, k=k)

    # Highest reward first.
    candidates = sorted(zip(rewards, mols), key=lambda m: m[0], reverse=True)

    avg_reward_in_topk_modes = compute_diverse_topk(
        candidates, k=k, tanimoto_thresh=tanimoto_thresh
    )
    num_of_modes, num_candidates_above_reward_thresh = compute_num_of_modes(
        candidates, reward_thresh=reward_thresh, tanimoto_thresh=tanimoto_thresh
    )
    return {
        "avg_topk": avg_topk,
        "avg_reward_in_topk_modes": avg_reward_in_topk_modes,
        "num_of_modes": num_of_modes,
        "max_reward": max(rewards),
        "num_candidates": len(candidates),
        "num_candidates_above_reward_thresh": num_candidates_above_reward_thresh,
    }


In [9]:
# Evaluation settings for this run.
path = "logs/mol_eval_seh_frag/final"
k = 100
# NOTE(review): the max reward displayed for this run is ~0.82, so a cut-off of 8.0
# selects no candidates - confirm the reward scale this threshold was meant for.
reward_thresh = 8.0
tanimoto_thresh = 0.7
candidates_eval(
    path,
    k=k,
    reward_thresh=reward_thresh,
    tanimoto_thresh=tanimoto_thresh,
)

{'avg_topk': 0.5677922984957695,
 'avg_reward_in_topk_modes': 0.549915187060833,
 'num_of_modes': 0,
 'max_reward': 0.8157103657722473,
 'num_candidates': 942,
 'num_candidates_above_reward_thresh': 0}

In [10]:
# Rebuild the sorted (reward, mol) candidate list at notebook level, using the
# `path`, `k` and `tanimoto_thresh` values set in the settings cell, so the
# individual metrics can be inspected one by one.
df = read_db_data_in_folder(path)
df = df.drop_duplicates(subset=["smi"])

if df.shape[0] < k:
    raise ValueError(f"Number of unique SMILES ({len(df)}) is less than k ({k})")

df["mol"] = df["smi"].apply(Chem.MolFromSmiles)

smiles = df["smi"].tolist()
rewards = df["r"].tolist()
mols = df["mol"].tolist()

# Highest reward first.
candidates = sorted(zip(rewards, mols), key=lambda pair: pair[0], reverse=True)

avg_reward_in_topk_modes = compute_diverse_topk(
    candidates, k=k, tanimoto_thresh=tanimoto_thresh
)
avg_reward_in_topk_modes

0.549915187060833

In [11]:
# Count modes with a reward cut-off of 0.6. Note that the value is a tuple:
# (number of modes, number of candidates at or above the reward cut-off).
num_of_modes = compute_num_of_modes(
    candidates, reward_thresh=0.6, tanimoto_thresh=0.7
)
num_of_modes

(21, 25)