In [57]:
import pandas as pd
import sqlite3
import os
import glob

def read_sqlite_to_dataframe(db_path: str) -> pd.DataFrame:
    """
    Reads an SQLite database file into a pandas DataFrame.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database.

    Returns
    -------
    pd.DataFrame
        DataFrame representation of the 'results' table in the SQLite database.
    """
    # Connect to the database
    conn = sqlite3.connect(db_path)
    
    # Extract the 'results' table into a DataFrame
    df = pd.read_sql_query("SELECT * FROM results", conn)
    
    # Close the database connection
    conn.close()
    
    return df

In [58]:
def read_db_data(db_path: str) -> list:
    """
    Reads all data from the `results` table of the SQLite database located at db_path.

    Parameters
    ----------
    db_path: str
        The sqlite3 database path.

    Returns
    -------
    list
        A list of tuples containing rows from the `results` table or an empty list if the table doesn't exist.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    
    try:
        cur.execute("SELECT * FROM results")
        data = cur.fetchall()
    except sqlite3.OperationalError as e:
        if "no such table: results" in str(e):
            print("The table 'results' does not exist in the database.")
            data = []
        else:
            raise e
    finally:
        cur.close()
        conn.close()
    
    return data

In [59]:
db = read_db_data("logs/debug_run_seh_frag/final/generated_mols_0.db")

In [60]:
df = read_sqlite_to_dataframe("logs/debug_run_seh_frag/final/generated_mols_0.db")

In [61]:
from gflownet.utils.metrics import compute_diverse_top_k

In [71]:
import math
from copy import deepcopy
from itertools import product

import numpy as np
import torch
import torch.nn as nn
from botorch.utils.multi_objective import infer_reference_point, pareto
from botorch.utils.multi_objective.hypervolume import Hypervolume
from rdkit import Chem, DataStructs
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

# Should be calculated per preference
def compute_diverse_top_k(smiles, rewards, k, thresh=0.7):
    # mols is a list of (reward, mol)
    mols = []
    for i in range(len(smiles)):
        mols.append([rewards[i], smiles[i]])
    mols = sorted(mols, reverse=True)
    modes = [mols[0]]
    mode_fps = [Chem.RDKFingerprint(mols[0][1])]
    for i in range(1, len(mols)):
        fp = Chem.RDKFingerprint(mols[i][1])
        sim = DataStructs.BulkTanimotoSimilarity(fp, mode_fps)
        if max(sim) < thresh:
            modes.append(mols[i])
            mode_fps.append(fp)
        if len(modes) >= k:
            # last_idx = i
            break
    return np.mean([i[0] for i in modes])  # return sim

compute_diverse_top_k(df.smi.to_list(), df.r.to_list(), 10, thresh=0.7)

In [78]:
smiles = df["smi"].tolist()
rewards = df["r"].tolist()

mols = []
for i in range(len(smiles)):
    mols.append([rewards[i], smiles[i]])
# mols = sorted(mols, key=lambda m: m[0], reverse=True)
# modes = [mols[0]]
# mode_fps = [Chem.RDKFingerprint(mols[0][1])]

In [81]:
mols = sorted(mols, key=lambda m: m[0], reverse=True)

In [83]:
modes = [mols[0]]

In [84]:
mols[0][1]

'[NH3+]CC1CC(c2nc3c(c(=O)[nH]2)NC(N2CCN(c4cc(C5CC(N6CCOCC6)CN5)c[nH]4)CC2)CN3)C([n+]2cccc(O)c2)O1'

In [86]:
mols[0][1]

'[NH3+]CC1CC(c2nc3c(c(=O)[nH]2)NC(N2CCN(c4cc(C5CC(N6CCOCC6)CN5)c[nH]4)CC2)CN3)C([n+]2cccc(O)c2)O1'

In [91]:
smile = mols[0][1]
molecule = Chem.MolFromSmiles(smile)
Chem.RDKFingerprint(molecule)

fp = Chem.RDKFingerprint(mols[i][1])
sim = DataStructs.BulkTanimotoSimilarity(fp, mode_fps)

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f529a228040>

In [85]:
mode_fps = [Chem.RDKFingerprint(mols[0][1])]

ArgumentError: Python argument types in
    rdkit.Chem.rdmolops.RDKFingerprint(str)
did not match C++ signature:
    RDKFingerprint(RDKit::ROMol mol, unsigned int minPath=1, unsigned int maxPath=7, unsigned int fpSize=2048, unsigned int nBitsPerHash=2, bool useHs=True, double tgtDensity=0.0, unsigned int minSize=128, bool branchedPaths=True, bool useBondOrder=True, boost::python::api::object atomInvariants=0, boost::python::api::object fromAtoms=0, boost::python::api::object atomBits=None, boost::python::api::object bitInfo=None)

In [72]:
def compute_diverse_top_k(smiles, rewards, k, thresh=0.7):
    # mols is a list of (reward, mol)
    mols = []
    for i in range(len(smiles)):
        mols.append([rewards[i].item(), smiles[i]])
    mols = sorted(mols, key=lambda m: m[0], reverse=True)
    modes = [mols[0]]
    mode_fps = [Chem.RDKFingerprint(mols[0][1])]
    for i in range(1, len(mols)):
        fp = Chem.RDKFingerprint(mols[i][1])
        sim = DataStructs.BulkTanimotoSimilarity(fp, mode_fps)
        if max(sim) < thresh:
            modes.append(mols[i])
            mode_fps.append(fp)
        if len(modes) >= k:
            # last_idx = i
            break
    return np.mean([i[0] for i in modes])  # return sim

ArgumentError: Python argument types in
    rdkit.Chem.rdmolops.RDKFingerprint(str)
did not match C++ signature:
    RDKFingerprint(RDKit::ROMol mol, unsigned int minPath=1, unsigned int maxPath=7, unsigned int fpSize=2048, unsigned int nBitsPerHash=2, bool useHs=True, double tgtDensity=0.0, unsigned int minSize=128, bool branchedPaths=True, bool useBondOrder=True, boost::python::api::object atomInvariants=0, boost::python::api::object fromAtoms=0, boost::python::api::object atomBits=None, boost::python::api::object bitInfo=None)

In [1]:
import sqlite3

def read_db_data(*db_paths: str) -> list:
    """
    Reads all data from the `results` table of multiple SQLite databases.

    Parameters
    ----------
    *db_paths: str
        The sqlite3 database paths.

    Returns
    -------
    list
        A combined list of tuples containing rows from the `results` table 
        from all provided databases. Returns an empty list if none of the 
        tables exist or other errors.
    """
    combined_data = []

    for db_path in db_paths:
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        
        try:
            cur.execute("SELECT * FROM results")
            data = cur.fetchall()
            combined_data.extend(data)
        except sqlite3.OperationalError as e:
            if "no such table: results" in str(e):
                print(f"The table 'results' does not exist in the database at path: {db_path}")
            else:
                raise e
        finally:
            cur.close()
            conn.close()
    
    return combined_data

In [None]:
   # Step 1: Identify all .db files in the directory
    db_files = glob.glob(os.path.join(folder_path, "*.db"))

In [2]:
data = read_db_data("logs/debug_run_seh_frag/final/generated_mols_0.db", "logs/debug_run_seh_frag/final/generated_mols_1.db")
print(data)

[('NS(=O)(=O)NCC(=O)O', 0.08959098160266876, 0.08959098905324936, 6.038919925689697), ('OC(F)(F)F', 0.07960197329521179, 0.0796019658446312, 29.778169631958008), ('CC(C)(C)c1c(CS)nc(S(=O)(=O)O)nc1C1C(C(=N)N)CC(c2ccncn2)N1c1nc2c(c(=O)[nH]1)NC(C1CCOCC1)CN2', 0.16765519976615906, 0.16765519976615906, 46.8006706237793), ('CC(O)C1CCN(C(=O)[O-])CC1C(=O)NO', 0.2936750054359436, 0.2936750054359436, 46.84848403930664), ('N=C(N)Br', 9.99999901978299e-05, 9.999999747378752e-05, 62.91029739379883), ('CC(=O)NC(C)=O', 0.04960143193602562, 0.04960142821073532, 59.400081634521484), ('O=C(O)C(=O)O', 9.99999901978299e-05, 9.999999747378752e-05, 28.886520385742188), ('CN(C)c1ccn(P(=O)([O-])O)c(=O)n1', 0.259971559047699, 0.259971559047699, 25.0684871673584), ('NS(=O)(=O)[N+](=O)[O-]', 0.007436044048517942, 0.0074360426515340805, 50.54048156738281), ('CC(=O)NCC1CCC(c2cc3[nH]c4nc(=O)[nH]c(=O)c-4nc3cc2S)N1c1cc(-c2cccc(C=CN3CCN(I)CC3)n2)ccn1', 0.3991965651512146, 0.3991965353488922, 45.961151123046875), ('CC(

In [3]:
import sqlite3
import pandas as pd

def read_db_data(*db_paths: str) -> pd.DataFrame:
    """
    Reads all data from the `results` table of multiple SQLite databases.

    Parameters
    ----------
    *db_paths: str
        The sqlite3 database paths.

    Returns
    -------
    pd.DataFrame
        A combined dataframe containing rows from the `results` table 
        from all provided databases. Returns an empty dataframe if none of the 
        tables exist or other errors.
    """
    combined_data = []
    columns = None

    for db_path in db_paths:
        conn = sqlite3.connect(db_path)
        
        try:
            df = pd.read_sql("SELECT * FROM results", conn)
            
            if columns is None:
                columns = df.columns.tolist()
            
            combined_data.append(df)
        except sqlite3.OperationalError as e:
            if "no such table: results" in str(e):
                print(f"The table 'results' does not exist in the database at path: {db_path}")
            else:
                raise e
        finally:
            conn.close()

    if not combined_data:
        return pd.DataFrame(columns=columns or [])
    
    return pd.concat(combined_data, ignore_index=True)

In [14]:
data = read_db_data("logs/debug_run_seh_frag/final/generated_mols_0.db", "logs/debug_run_seh_frag/final/generated_mols_1.db")
data

Unnamed: 0,smi,r,fr_0,ci_beta
0,NS(=O)(=O)NCC(=O)O,0.089591,0.089591,6.038920
1,OC(F)(F)F,0.079602,0.079602,29.778170
2,CC(C)(C)c1c(CS)nc(S(=O)(=O)O)nc1C1C(C(=N)N)CC(...,0.167655,0.167655,46.800671
3,CC(O)C1CCN(C(=O)[O-])CC1C(=O)NO,0.293675,0.293675,46.848484
4,N=C(N)Br,0.000100,0.000100,62.910297
...,...,...,...,...
315,N#COCn1cnc2c(-c3cc(C4CCCO4)c(C(=O)[O-])c(-c4cc...,0.362217,0.362217,31.835855
316,NN1CCN(C2CC2)CC1,0.204442,0.204442,19.714422
317,CN(C)C1CC(c2nccs2)C(C2CCCN2CNC=N)C(c2cc(C(F)(F...,0.376826,0.376826,57.953789
318,N=C(N)n1cc(CS)c(=O)[nH]c1=O,0.240088,0.240088,40.979397


315