Optional: expand the current library using MolMIM

input: library.txt
output: diffdock_before.txt

Requires an NVIDIA API key for MolMIM, stored as API_KEY in a .env file

In [2]:
# Verify the SMILES in library.txt
import sys
from rdkit import Chem

def is_valid_smiles(smiles: str) -> bool:
    """Report whether RDKit can parse *smiles* into a molecule object."""
    molecule = Chem.MolFromSmiles(smiles)
    # RDKit signals a parse failure by returning None rather than raising.
    if molecule is None:
        return False
    return True

def main(input_path: str, report_path: str = None):
    """Validate every SMILES in a file, write a report, and drop invalid rows.

    Each non-blank, non-comment line is split on whitespace and its first
    token is checked with ``is_valid_smiles``. A per-line report plus a
    summary is written to ``report_path`` (or stdout when it is falsy), and
    ``input_path`` is then overwritten with only the lines whose SMILES was
    valid. Blank lines and ``#`` comment lines are not written back.

    Args:
        input_path: Text file with one entry per line; rewritten in place.
        report_path: Destination for the validation report. When ``None``
            or empty the report goes to ``sys.stdout``.

    Raises:
        OSError: If ``input_path`` cannot be read or either file cannot be
            written (for example ``FileNotFoundError`` for a missing input).
    """
    # Read the input before touching the report so that a missing or
    # unreadable input fails without creating/truncating the report file.
    with open(input_path, 'r') as f:
        lines = f.readlines()

    valid_smiles = []
    invalid_count = 0

    out = open(report_path, 'w') if report_path else sys.stdout
    try:
        for lineno, line in enumerate(lines, 1):
            original_line = line.strip()
            if not original_line or original_line.startswith('#'):
                continue
            smiles = original_line.split()[0]  # Assume first token is SMILES
            valid = is_valid_smiles(smiles)
            status = "VALID" if valid else "INVALID"
            out.write(f"{lineno:4d}  {smiles:20s}  {status:7s}\n")

            if valid:
                # Keep the full line so any trailing columns survive.
                valid_smiles.append(original_line)
            else:
                invalid_count += 1

        # Overwrite input file with valid SMILES only
        with open(input_path, 'w') as f:
            f.writelines(entry + '\n' for entry in valid_smiles)

        valid_count = len(valid_smiles)
        out.write("\nSummary:\n")
        out.write(f"Valid SMILES: {valid_count}\n")
        out.write(f"Invalid SMILES: {invalid_count}\n")
        out.write(f"Total SMILES: {valid_count + invalid_count}\n")
    finally:
        # Close the report even if validation or rewriting raised; never
        # close sys.stdout, which this function does not own.
        if report_path:
            out.close()

    if report_path:
        print(f"Validation report written to {report_path}")
    print(f"{invalid_count} invalid SMILES removed from {input_path}")

if __name__ == "__main__":
    # Validate the seed library in place; the per-line report goes to a
    # separate file.
    inp, rpt = "library.txt", "library_2.txt"
    main(inp, rpt)


Validation report written to library_2.txt
0 invalid SMILES removed from library.txt


In [None]:
# Expand the library via MolMIM
import os
import ast
import requests
import re
from rdkit import Chem
from rdkit.Chem.QED import qed as rdkit_qed
from rdkit.Chem import AllChem
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import TanimotoSimilarity
from dotenv import load_dotenv

# --- Load API key ---
# Load variables from a local .env file, then fail fast when the key is
# missing or empty so no request is sent without credentials.
load_dotenv()
API_KEY = os.environ.get("API_KEY")
if not API_KEY:
    raise ValueError("API_KEY is not set in the .env file.")

# --- API setup ---
invoke_url = "https://health.api.nvidia.com/v1/biology/nvidia/molmim/generate"
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Accept": "application/json",
}
# A single Session object is shared by every request made in this cell.
session = requests.Session()

# --- Tanimoto similarity (optional if you want to sort or filter) ---
def tanimoto_similarity(smiles1, smiles2):
    """Return the Tanimoto similarity of two SMILES strings.

    Both inputs are parsed with RDKit and compared via radius-2, 2048-bit
    Morgan fingerprints. If either string fails to parse the result is 0.0.
    """
    parsed = [Chem.MolFromSmiles(smi) for smi in (smiles1, smiles2)]
    if not all(parsed):
        return 0.0
    fp_first, fp_second = (
        GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in parsed
    )
    return TanimotoSimilarity(fp_first, fp_second)

# --- Process a single SMILES via the API ---
def generate_optimized_smiles(original_smiles, *, min_sims=(0.1, 0.4, 0.7),
                              num_molecules=10, timeout=120):
    """Request QED-optimized analogues of one SMILES from the MolMIM endpoint.

    One CMA-ES generation request is posted per value in ``min_sims``. Every
    returned ``sample`` that RDKit can parse is canonicalized and collected
    in a set, so duplicates across requests collapse. A failed request is
    reported with ``print`` and skipped; the remaining requests still run.

    Args:
        original_smiles: Seed SMILES string.
        min_sims: ``min_similarity`` values to try, one request each.
        num_molecules: ``num_molecules`` value sent with each request.
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection cannot block the run indefinitely.

    Returns:
        Sorted list of unique canonical SMILES. Empty when the seed does not
        parse or when no request produced a parsable sample.
    """
    if not Chem.MolFromSmiles(original_smiles):
        print(f"Invalid SMILES: {original_smiles}")
        return []

    generated_set = set()

    for min_sim in min_sims:
        payload = {
            "smi": original_smiles,
            "algorithm": "CMA-ES",
            "num_molecules": num_molecules,
            "property_name": "QED",
            "minimize": False,
            "min_similarity": min_sim,
            "particles": 20,
            "iterations": 2,
            "scaled_radius": 1,
        }

        try:
            response = session.post(
                invoke_url, headers=headers, json=payload, timeout=timeout
            )
            response.raise_for_status()
            raw = response.json().get('molecules', '[]')
            # The field is handled as a string-encoded list; also accept a
            # value that arrives already decoded instead of failing on it.
            molecules = ast.literal_eval(raw) if isinstance(raw, str) else raw
            for mol in molecules:
                gen = mol.get('sample')
                if not gen:
                    continue
                # Parse once and reuse the molecule for canonicalization.
                parsed = Chem.MolFromSmiles(gen)
                if not parsed:
                    continue
                generated_set.add(Chem.MolToSmiles(parsed, canonical=True))
        except Exception as e:
            # Deliberately best-effort: one failed similarity level (network
            # error, timeout, malformed body) must not abort the others.
            print(f"Error during API request for SMILES '{original_smiles}': {e}")

    # Sorted so the result does not depend on set iteration order.
    return sorted(generated_set)

# --- Main processing block ---
def main(input_file="library.txt", output_file="diffdock_before.txt"):
    """Expand every SMILES in ``input_file`` and write the unique results.

    Blank lines and lines starting with ``#`` are skipped. For each remaining
    line the first whitespace-separated token is taken as the SMILES and
    passed to ``generate_optimized_smiles``. The generated strings are
    de-duplicated, sorted, and written one per line to ``output_file``.

    Args:
        input_file: Path of the seed library, one entry per line.
        output_file: Path that receives the generated SMILES (overwritten).

    Returns:
        None. If ``input_file`` does not exist a message is printed and
        nothing is written.
    """
    if not os.path.exists(input_file):
        print(f"Input file '{input_file}' not found.")
        return

    unique_generated = set()

    with open(input_file, 'r') as infile:
        for line in infile:
            entry = line.strip()
            if not entry or entry.startswith('#'):
                continue
            # Use only the first token, matching the validation step, so
            # trailing columns (e.g. a name) are never sent as the SMILES.
            smiles = entry.split()[0]
            unique_generated.update(generate_optimized_smiles(smiles))

    all_generated = sorted(unique_generated)

    with open(output_file, 'w') as outfile:
        outfile.writelines(f"{smi}\n" for smi in all_generated)

    # Report the de-duplicated count, i.e. the number of lines written.
    print(f"{len(all_generated)} new SMILES strings generated and written to '{output_file}'.")

if __name__ == "__main__":
    # Spell out the default paths so the cell's input and output are visible.
    main(input_file="library.txt", output_file="diffdock_before.txt")


In [None]:
# Verify the SMILES in diffdock_before.txt
import sys
from rdkit import Chem

def is_valid_smiles(smiles: str) -> bool:
    """Report whether RDKit can parse *smiles* into a molecule object."""
    molecule = Chem.MolFromSmiles(smiles)
    # RDKit signals a parse failure by returning None rather than raising.
    if molecule is None:
        return False
    return True

def main(input_path: str, report_path: str = None):
    """Validate every SMILES in a file, write a report, and drop invalid rows.

    Each non-blank, non-comment line is split on whitespace and its first
    token is checked with ``is_valid_smiles``. A per-line report plus a
    summary is written to ``report_path`` (or stdout when it is falsy), and
    ``input_path`` is then overwritten with only the lines whose SMILES was
    valid. Blank lines and ``#`` comment lines are not written back.

    Args:
        input_path: Text file with one entry per line; rewritten in place.
        report_path: Destination for the validation report. When ``None``
            or empty the report goes to ``sys.stdout``.

    Raises:
        OSError: If ``input_path`` cannot be read or either file cannot be
            written (for example ``FileNotFoundError`` for a missing input).
    """
    # Read the input before touching the report so that a missing or
    # unreadable input fails without creating/truncating the report file.
    with open(input_path, 'r') as f:
        lines = f.readlines()

    valid_smiles = []
    invalid_count = 0

    out = open(report_path, 'w') if report_path else sys.stdout
    try:
        for lineno, line in enumerate(lines, 1):
            original_line = line.strip()
            if not original_line or original_line.startswith('#'):
                continue
            smiles = original_line.split()[0]  # Assume first token is SMILES
            valid = is_valid_smiles(smiles)
            status = "VALID" if valid else "INVALID"
            out.write(f"{lineno:4d}  {smiles:20s}  {status:7s}\n")

            if valid:
                # Keep the full line so any trailing columns survive.
                valid_smiles.append(original_line)
            else:
                invalid_count += 1

        # Overwrite input file with valid SMILES only
        with open(input_path, 'w') as f:
            f.writelines(entry + '\n' for entry in valid_smiles)

        valid_count = len(valid_smiles)
        out.write("\nSummary:\n")
        out.write(f"Valid SMILES: {valid_count}\n")
        out.write(f"Invalid SMILES: {invalid_count}\n")
        out.write(f"Total SMILES: {valid_count + invalid_count}\n")
    finally:
        # Close the report even if validation or rewriting raised; never
        # close sys.stdout, which this function does not own.
        if report_path:
            out.close()

    if report_path:
        print(f"Validation report written to {report_path}")
    print(f"{invalid_count} invalid SMILES removed from {input_path}")

if __name__ == "__main__":
    # Validate the generated SMILES in place; the per-line report goes to a
    # separate file.
    inp, rpt = "diffdock_before.txt", "library_4.txt"
    main(inp, rpt)