optional: expand current library using Molmim

input: library.txt
output: diffdock_before.txt

Need: API key from Molmim

In [3]:
#verify the smiles in library.txt
import sys
from rdkit import Chem

def is_valid_smiles(smiles: str) -> bool:
    return Chem.MolFromSmiles(smiles) is not None

def main(input_path: str, report_path: str = None):
    out = open(report_path, 'w') if report_path else sys.stdout
    valid_smiles = []
    valid_count = 0
    invalid_count = 0

    with open(input_path, 'r') as f:
        lines = f.readlines()

    for lineno, line in enumerate(lines, 1):
        original_line = line.strip()
        if not original_line or original_line.startswith('#'):
            continue
        smiles = original_line.split()[0]  # Assume first token is SMILES
        valid = is_valid_smiles(smiles)
        status = "VALID" if valid else "INVALID"
        out.write(f"{lineno:4d}  {smiles:20s}  {status:7s}\n")
        
        if valid:
            valid_smiles.append(original_line)
            valid_count += 1
        else:
            invalid_count += 1

    # Overwrite input file with valid SMILES only
    with open(input_path, 'w') as f:
        for line in valid_smiles:
            f.write(line + '\n')

    out.write(f"\nSummary:\n")
    out.write(f"Valid SMILES: {valid_count}\n")
    out.write(f"Invalid SMILES: {invalid_count}\n")
    out.write(f"Total SMILES: {valid_count + invalid_count}\n")

    if report_path:
        out.close()
        print(f"Validation report written to {report_path}")
    print(f"{invalid_count} invalid SMILES removed from {input_path}")

if __name__ == "__main__":
    inp = "library.txt"
    rpt = "library_2.txt"
    main(inp, rpt)


Validation report written to library_2.txt
0 invalid SMILES removed from library.txt


In [4]:
#expand library via Molmim
import os
import ast
import requests
import re
from rdkit import Chem
from rdkit.Chem.QED import qed as rdkit_qed
from rdkit.Chem import AllChem
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import TanimotoSimilarity
# from dotenv import load_dotenv

# # --- Load API key ---
# load_dotenv()
# API_KEY = os.getenv("API_KEY")
# if not API_KEY:
#     raise ValueError("API_KEY is not set in the .env file.")

# --- Set API key directly ---
API_KEY = "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"
if not API_KEY:
    raise ValueError("API_KEY is not set.")


# --- API setup ---
invoke_url = "https://health.api.nvidia.com/v1/biology/nvidia/molmim/generate"
headers = {"Authorization": f"Bearer {API_KEY}", "Accept": "application/json"}
session = requests.Session()

# --- Tanimoto similarity (optional if you want to sort or filter) ---
def tanimoto_similarity(smiles1, smiles2):
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    if not mol1 or not mol2:
        return 0.0
    fp1 = GetMorganFingerprintAsBitVect(mol1, 2, nBits=2048)
    fp2 = GetMorganFingerprintAsBitVect(mol2, 2, nBits=2048)
    return TanimotoSimilarity(fp1, fp2)

# --- Process a single SMILES via the API ---
def generate_optimized_smiles(original_smiles):
    if not Chem.MolFromSmiles(original_smiles):
        print(f"Invalid SMILES: {original_smiles}")
        return []

    generated_set = set()
    min_sims = [0.1, 0.4, 0.7]

    for min_sim in min_sims:
        payload = {
            "smi": original_smiles,
            "algorithm": "CMA-ES",
            "num_molecules": 10,
            "property_name": "QED",
            "minimize": False,
            "min_similarity": min_sim,
            "particles": 20,
            "iterations": 2,
            "scaled_radius": 1,
        }

        try:
            response = session.post(invoke_url, headers=headers, json=payload)
            response.raise_for_status()
            molecules = ast.literal_eval(response.json().get('molecules', '[]'))
            for mol in molecules:
                gen = mol.get('sample')
                if gen and Chem.MolFromSmiles(gen):
                    canonical = Chem.MolToSmiles(Chem.MolFromSmiles(gen), canonical=True)
                    generated_set.add(canonical)
        except Exception as e:
            print(f"Error during API request for SMILES '{original_smiles}': {e}")
            continue

    return list(generated_set)

# --- Main processing block ---
def main(input_file="library.txt", output_file="diffdock_before.txt"):
    if not os.path.exists(input_file):
        print(f"Input file '{input_file}' not found.")
        return

    total_added = 0
    all_generated = []

    with open(input_file, 'r') as infile:
        for line in infile:
            smiles = line.strip()
            if not smiles or smiles.startswith('#'):
                continue
            new_smiles = generate_optimized_smiles(smiles)
            total_added += len(new_smiles)
            all_generated.extend(new_smiles)

    # Remove duplicates
    all_generated = sorted(set(all_generated))

    with open(output_file, 'w') as outfile:
        for smi in all_generated:
            outfile.write(f"{smi}\n")

    print(f"{total_added} new SMILES strings generated and written to '{output_file}'.")

if __name__ == "__main__":
    main()


1060 new SMILES strings generated and written to 'diffdock_before.txt'.


In [None]:
#verify the smiles in library.txt
import sys
from rdkit import Chem

def is_valid_smiles(smiles: str) -> bool:
    return Chem.MolFromSmiles(smiles) is not None

def main(input_path: str, report_path: str = None):
    out = open(report_path, 'w') if report_path else sys.stdout
    valid_smiles = []
    valid_count = 0
    invalid_count = 0

    with open(input_path, 'r') as f:
        lines = f.readlines()

    for lineno, line in enumerate(lines, 1):
        original_line = line.strip()
        if not original_line or original_line.startswith('#'):
            continue
        smiles = original_line.split()[0]  # Assume first token is SMILES
        valid = is_valid_smiles(smiles)
        status = "VALID" if valid else "INVALID"
        out.write(f"{lineno:4d}  {smiles:20s}  {status:7s}\n")
        
        if valid:
            valid_smiles.append(original_line)
            valid_count += 1
        else:
            invalid_count += 1

    # Overwrite input file with valid SMILES only
    with open(input_path, 'w') as f:
        for line in valid_smiles:
            f.write(line + '\n')

    out.write(f"\nSummary:\n")
    out.write(f"Valid SMILES: {valid_count}\n")
    out.write(f"Invalid SMILES: {invalid_count}\n")
    out.write(f"Total SMILES: {valid_count + invalid_count}\n")

    if report_path:
        out.close()
        print(f"Validation report written to {report_path}")
    print(f"{invalid_count} invalid SMILES removed from {input_path}")

if __name__ == "__main__":
    inp = "diffdock_before.txt"
    rpt = "library_4.txt"
    main(inp, rpt)

Validation report written to library_4.txt
0 invalid SMILES removed from diffdock_before.txt


step 1: Docking (DiffDock)

input: diffdock_before.txt, receptor_clean.pdb

output: Lipinski_before.txt

In [None]:
#convert smiles string to sdf
import os
from rdkit import Chem
from rdkit.Chem import AllChem

# Create output directory
output_dir = "sdf_output"
os.makedirs(output_dir, exist_ok=True)

# Read SMILES from file
with open("diffdock_before.txt", "r") as file:
    smiles_list = [line.strip() for line in file if line.strip()]

# Generate SDF files
for idx, smiles in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Skipping invalid SMILES at line {idx + 1}: {smiles}")
        continue
    mol = Chem.AddHs(mol)
    if AllChem.EmbedMolecule(mol, AllChem.ETKDG()) != 0:
        print(f"Embedding failed for SMILES at line {idx + 1}: {smiles}")
        continue
    AllChem.UFFOptimizeMolecule(mol)
    
    sdf_path = os.path.join(output_dir, f"ligand_{idx}.sdf")
    writer = Chem.SDWriter(sdf_path)
    writer.write(mol)
    writer.close()

print(f"Finished writing {len(smiles_list)} SDF files to '{output_dir}'")

Finished writing 36 SDF files to 'sdf_output'


In [None]:
#run diffdock
import os
import requests
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---- CONFIG ----
input_dir = "sdf_output"
output_dir = "results_output"
receptor_path = "receptor_clean.pdb"

url = "https://health.api.nvidia.com/v1/biology/mit/diffdock"
header_auth = "Bearer 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"

# ---- ASSET UPLOAD FUNCTION ----
def _upload_asset(input_data):
    assets_url = "https://api.nvcf.nvidia.com/v2/nvcf/assets"
    headers = {
        "Authorization": header_auth,
        "Content-Type": "application/json",
        "accept": "application/json",
    }
    s3_headers = {
        "x-amz-meta-nvcf-asset-description": "diffdock-file",
        "content-type": "text/plain",
    }
    payload = {
        "contentType": "text/plain",
        "description": "diffdock-file"
    }

    for attempt in range(5):  # retry up to 5 times
        try:
            response = requests.post(assets_url, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            asset_url = response.json()["uploadUrl"]
            asset_id = response.json()["assetId"]

            response = requests.put(asset_url, data=input_data, headers=s3_headers, timeout=300)
            response.raise_for_status()

            return asset_id
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:
                wait = 2 ** attempt + random.uniform(0, 1)
                print(f"[WARN] Rate limited. Retrying after {wait:.2f}s...")
                time.sleep(wait)
            else:
                raise
    raise RuntimeError("Failed to upload asset after multiple attempts")

# ---- UPLOAD PROTEIN ONCE ----
with open(receptor_path, "rb") as f:
    protein_id = _upload_asset(f.read())
print(f"Protein uploaded: {protein_id}")

# ---- PROCESS ONE LIGAND ----
def process_ligand(idx, sdf_file):
    try:
        ligand_path = os.path.join(input_dir, sdf_file)
        out_folder = os.path.join(output_dir, f"ligand_{idx}")
        os.makedirs(out_folder, exist_ok=True)

        with open(ligand_path, "rb") as f:
            ligand_id = _upload_asset(f.read())

        print(f"Ligand {sdf_file} uploaded: {ligand_id}")

        headers = {
            "Content-Type": "application/json",
            "NVCF-INPUT-ASSET-REFERENCES": f"{protein_id},{ligand_id}",
            "Authorization": header_auth
        }

        payload = {
            "ligand": ligand_id,
            "ligand_file_type": "sdf",
            "protein": protein_id,
            "num_poses": 20,
            "time_divisions": 20,
            "steps": 18,
            "save_trajectory": True,
            "is_staged": True
        }

        for attempt in range(5):  # Retry logic for rate-limited inference
            response = requests.post(url, headers=headers, json=payload)
            if response.status_code != 429:
                break
            wait = 2 ** attempt + random.uniform(0, 1)
            print(f"[WARN] Inference rate-limited. Retrying after {wait:.2f}s...")
            time.sleep(wait)

        with open(os.path.join(out_folder, "response_status.txt"), "w") as f:
            f.write(str(response))

        with open(os.path.join(out_folder, "request_url.txt"), "w") as f:
            f.write(url)

        with open(os.path.join(out_folder, "response_text.txt"), "w") as f:
            f.write(response.text)

        print(f"Completed ligand_{idx}: {response.status_code}")
    except Exception as e:
        print(f"[ERROR] Ligand {idx} failed: {e}")

# ---- MULTITHREADING EXECUTION ----
os.makedirs(output_dir, exist_ok=True)
sdf_files = [f for f in os.listdir(input_dir) if f.endswith(".sdf")]

# Reduce concurrency to avoid 429 errors
max_workers = min(3, len(sdf_files))  # Try 2–3 threads instead of 8

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(process_ligand, idx, sdf_file) for idx, sdf_file in enumerate(sdf_files)]
    for future in as_completed(futures):
        pass  # Ensures we wait for all tasks

Protein uploaded: de5d3702-1f31-4ed0-a87b-2d7f3156ec6c
Ligand ligand_1.sdf uploaded: 567faa38-e86b-48d8-bd15-180f9cc7d9de
Ligand ligand_0.sdf uploaded: 35f0c1e4-8539-4cee-a6ae-58e09806a3da
Ligand ligand_10.sdf uploaded: 04e3ce23-5504-481d-a180-126222e377b6
Completed ligand_0: 200
Completed ligand_2: 200
Completed ligand_1: 200
Ligand ligand_11.sdf uploaded: ca2b1e33-3bcd-407c-9948-64c89d581eb5
Ligand ligand_13.sdf uploaded: 7691c5ae-bd60-4122-b7fa-4df9b3811d5b
Ligand ligand_12.sdf uploaded: d0c5bccf-7808-4952-8f5e-d8a245dde667
Completed ligand_3: 200
Completed ligand_5: 200
Completed ligand_4: 200
Ligand ligand_14.sdf uploaded: 28f1c181-8340-446b-a1fc-e65bacb6d705
Ligand ligand_15.sdf uploaded: a83f8f5c-98d4-4f44-8de3-907a655b884e
Ligand ligand_16.sdf uploaded: b09540a1-4e02-4f70-9291-ed14a02ca14b
Completed ligand_6: 200
Completed ligand_7: 200
Completed ligand_8: 200
Ligand ligand_17.sdf uploaded: 97f51610-1903-498e-ae60-f9bde52f6f4a
Ligand ligand_18.sdf uploaded: 85f251b2-d00d-4808-a

In [None]:
import json
import os

base_path = r"results_output"
ligand_confidences = []


for i in range(37):
    ligand_folder = os.path.join(base_path, f"ligand_{i}")
    input_file = os.path.join(ligand_folder, "response_text.txt")
    output_folder = os.path.join(ligand_folder, "diffdock_actual_outcome")

    if not os.path.exists(input_file):
        print(f"Missing file in {ligand_folder}, skipping.")
        continue

    os.makedirs(output_folder, exist_ok=True)

    with open(input_file, "r") as f:
        data = json.load(f)

    # Write PDB files
    for j, pose in enumerate(data.get("trajectory", []), start=1):
        with open(os.path.join(output_folder, f"pose_{j}.pdb"), "w") as pdb_file:
            pdb_file.write(pose)

    # Write SDF files
    for j, sdf in enumerate(data.get("ligand_positions", []), start=1):
        with open(os.path.join(output_folder, f"ligand_pose_{j}.sdf"), "w") as sdf_file:
            sdf_file.write(sdf)

In [None]:
#reformatting the output, ranking them based on the confidence score, and extracting their smiles string.



# --- Load SMILES from diffdock_before.txt ---
smiles_path = os.path.join(base_path, "..\diffdock_before.txt")
with open(smiles_path, "r") as f:
    smiles_list = [line.strip() for line in f if line.strip()]

# --- Step 1: Collect confidence scores ---
for i in range(len(smiles_list)):  # only process ligands with SMILES
    ligand_folder = os.path.join(base_path, f"ligand_{i}")
    input_file = os.path.join(ligand_folder, "response_text.txt")
    output_folder = os.path.join(ligand_folder, "diffdock_actual_outcome")

    if not os.path.exists(input_file):
        continue

    os.makedirs(output_folder, exist_ok=True)

    with open(input_file, "r") as f:
        data = json.load(f)

    # Write confidence scores
    confidences = data.get("position_confidence", [])
    with open(os.path.join(output_folder, "pose_confidences.txt"), "w") as out_file:
        out_file.write("Rank \t Pose Confidence\n\n")
        for j, conf in enumerate(confidences, start=1):
            out_file.write(f"{j} \t {conf}\n")

    valid_confidences = [c for c in confidences if c is not None]
    if valid_confidences:
        highest = max(valid_confidences)
        ligand_confidences.append((i, highest))

# --- Step 2: Sort by confidence score ---
ligand_confidences.sort(key=lambda x: x[1], reverse=True)

# --- Step 3: Write output with ranking ---
output_file = os.path.join(base_path, "..\Lipinski_before.txt")
with open(output_file, "w") as out:
    out.write("Rank\tLigand Number\tSMILES\tConfidence Score\n")
    for rank, (ligand_num, confidence) in enumerate(ligand_confidences, start=1):
        try:
            smiles = smiles_list[ligand_num]
        except IndexError:
            smiles = "ERROR: No SMILES available"
        out.write(f"{rank}\t{ligand_num}\t{smiles}\t{confidence:.4f}\n")

print("Output written to:")
print(output_file)




Output written to:
results_output\..\Lipinski_before.txt


# "Lipinski's Rule of 5" checking
input: Lipinski_before.txt
output:Lipinski_after.txt

In [None]:
#"Lipinski's Rule of 5" checking
from rdkit import Chem
from rdkit.Chem import Descriptors

def lipinski_violations(mol):
    """Return the count of Lipinski rule violations."""
    mw   = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd  = Descriptors.NumHDonors(mol)
    hba  = Descriptors.NumHAcceptors(mol)

    violations = 0
    if mw   >= 500: violations += 1
    if logp >= 5:   violations += 1
    if hbd  >= 5:   violations += 1
    if hba  >= 10:  violations += 1

    return violations

def filter_lipinski(input_path='Lipinski_before.txt', output_path='Lipinski_after.txt'):
    passed = []

    with open(input_path, 'r') as infile:
        header = infile.readline()  # skip header
        for line in infile:
            parts = line.strip().split('\t')
            if len(parts) != 4:
                continue  # skip malformed lines

            rank, ligand_num, smi, confidence = parts
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                continue

            if lipinski_violations(mol) <= 1:
                passed.append((rank, ligand_num, smi, confidence))

    with open(output_path, 'w') as outfile:
        outfile.write("Rank\tLigand Number\tSMILES\tConfidence Score\n")
        for rank, ligand_num, smi, confidence in passed:
            outfile.write(f"{rank}\t{ligand_num}\t{smi}\t{confidence}\n")

    print(f"Saved {len(passed)} molecules that passed Lipinski’s rule to `{output_path}`")

if __name__ == '__main__':
    filter_lipinski()


Saved 35 molecules that passed Lipinski’s rule to `Lipinski_after.txt`


Check binding pocket manually

# TxGemma Toxicity Predictor
input: Lipinski_after.txt
output: Lipinski_after_toxicity_checked.txt
Need: HF token and Gemini API key

In [23]:
import os, re
import google.generativeai as genai

os.environ["HF_TOKEN"] = ""
genai.configure(api_key="AIzaSyC88Lstivmi4JrPb4znZtmU0l3WqG-DJLY")


In [24]:
import json
from huggingface_hub import hf_hub_download

tdc_prompts_filepath = hf_hub_download(
    repo_id="google/txgemma-27b-predict",
    filename="tdc_prompts.json",
)

with open(tdc_prompts_filepath, "r") as f:
    tdc_prompts_json = json.load(f)

In [25]:
dataset = "ClinTox"  #@param ["AMES", "ClinTox"]

# now use it:
prompt = tdc_prompts_json[dataset]
print(prompt)

Instructions: Answer the following question about drug properties.
Context: Humans are exposed to a variety of chemicals through food, household products, and medicines, some of which can be toxic, leading to over 30% of promising pharmaceuticals failing in human trials due to toxicity. Toxic drugs can be identified from clinical trials that failed due to toxicity, while non-toxic drugs can be identified from FDA approval status or from clinical trials that report no toxicity.
Question: Given a drug SMILES string, predict whether it
(A) is not toxic (B) is toxic
Drug SMILES: {Drug SMILES}
Answer:


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

PREDICT_VARIANT = "9b-predict"  # @param ["2b-predict", "9b-predict", "27b-predict"]
CHAT_VARIANT = "9b-chat" # @param ["9b-chat", "27b-chat"]
USE_CHAT = True # @param {type: "boolean"}

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

predict_tokenizer = AutoTokenizer.from_pretrained(f"google/txgemma-{PREDICT_VARIANT}")
predict_model = AutoModelForCausalLM.from_pretrained(
    f"google/txgemma-{PREDICT_VARIANT}",
    device_map="auto",
    quantization_config=quantization_config,
)

if USE_CHAT:
    chat_tokenizer = AutoTokenizer.from_pretrained(f"google/txgemma-{CHAT_VARIANT}")
    chat_model = AutoModelForCausalLM.from_pretrained(
        f"google/txgemma-{CHAT_VARIANT}",
        device_map="auto",
        quantization_config=quantization_config,
    )

In [27]:
## Example task and input
task_name = "AMES"
smiles = "{Drug SMILES}"
sequence = "{Target amino acid sequence}"
drug_smiles = "C[C@@H]1Cc2c([nH]c3ccccc23)[C@H](N1CC(F)(F)F)c1c(F)ccc(NCCNCCCF)c1F"
AA_sequence = "MTAEKEKKRCSSERRKEKSRDAARCRRSKETEVFYELAHQLPIPHSISSHLDKASIMRLAISFLRTRKLLTSGCVAATETTDVDRLMDSWYLKPLGGFITVVTSDGDMIFLSENINKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKAGMGKKGKELNTERDFFMRMKCTVTNRGRTVNLKSASWKVLHCTGHLKVCNGCPARVLCGFKEPPLTCVVMMCEPIPHPSNIDTPLDSKAFLSRHSMDMKFTYCDDRVTELMGYSPEDLLGRSAYDFYHALDSDNVTKSHQNLCTKGQAVSGQYRMLAKNGGYVWVETQGTVIYNSRNSQPQCIVCVNYVLSDVEEKSMIFSMDQTESLFKPHNLNSFFSPSKRSLGSDQSEALFTKLKEEPEDLTQLAPTPGDTIISLDFGQPQYEEHPMYSKVSSVAPPVSHSIHDGHKASYAGDMPKMAATFSVPQAPPPSSATPSLSSCSTPSSPGDYYTPVDSDLKVELTEKLFSLDTQETKASCNQENDLSDLDLETLAPYIPMDGEDFQLNPICQEEPASEIGGLVTNQQSFSNITSLFQPLGSSSAAHFQPNMSSGGDKKSISGGSVGSWPSIPCSRGPMQMPPYHDPASTPLSSMGGRQNLQWPPDPPLPSKAGMMDPLAAKRSCQTMPANRMPLYLQRPVENFVQNYRDMSPARLALTNGFKRSFTQMTMGESPPTKSQQTLWKRLRNESCAVMDRKSLSTSALSDKGMAHNRGMDHQHRKTQYSGNQTGQAAKCYREQCCNYREFSMQPSSKMDGIASRLIGPSFETYSLPELTRYDCEVNVPLQGNLHLLQGSDLLRALDQST"
TDC_PROMPT = tdc_prompts_json[task_name].replace(smiles, drug_smiles).replace(sequence, AA_sequence)

def txgemma_predict(prompt):
    input_ids = predict_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = predict_model.generate(**input_ids, max_new_tokens=8)
    return predict_tokenizer.decode(outputs[0], skip_special_tokens=True)

def txgemma_chat(prompt):
    input_ids = chat_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = chat_model.generate(**input_ids, max_new_tokens=32)
    return chat_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Prediction model response: {txgemma_predict(TDC_PROMPT)}")
if USE_CHAT: print(f"Chat model response: {txgemma_chat(TDC_PROMPT)}")



Prediction model response: Instructions: Answer the following question about drug properties.
Context: Mutagenicity means the ability of a drug to induce genetic alterations. Drugs that can cause damage to the DNA can result in cell death or other severe adverse effects. Nowadays, the most widely used assay for testing the mutagenicity of compounds is the Ames experiment which was invented by a professor named Ames. The Ames test is a short-term bacterial reverse mutation assay detecting a large number of compounds which can induce genetic damage and frameshift mutations.
Question: Given a drug SMILES string, predict whether it
(A) is not mutagenic (B) is mutagenic
Drug SMILES: C[C@@H]1Cc2c([nH]c3ccccc23)[C@H](N1CC(F)(F)F)c1c(F)ccc(NCCNCCCF)c1F
Answer:B)
Chat model response: Instructions: Answer the following question about drug properties.
Context: Mutagenicity means the ability of a drug to induce genetic alterations. Drugs that can cause damage to the DNA can result in cell death or

# Tool to allow our Agentic-Tx to ask TxGemma therapeutically relevant questions

Making a tool for our agent to use: a chat interface for our llama-based Agentic-Tx and TxGemma-Chat.

In [28]:
# This will allow us to extract content from inside of ticks
def extract_prompt(text, word):
    code_block_pattern = rf"```{word}(.*?)```"
    code_blocks = re.findall(code_block_pattern, text, re.DOTALL)
    extracted_code = "\n".join(code_blocks).strip()
    return extracted_code

# This class will allow us to inferface with TxGemma
class TxGemmaChatTool:
    def __init__(self):
      self.tool_name = "Chat Tool"

    def use_tool(self, question):
        # Here, we are submitting a question to TxGemma
        response = txgemma_chat(question)
        return response

    def tool_is_used(self, query):
        # This just checks to see if the tool call was evoked
        return "```TxGemmaChat" in query

    def process_query(self, query):
        # Here, we clean to query to remove the tool call
        return extract_prompt(query, word="TxGemmaChat")

    def instructions(self):
        # Here, we are **very** descriptively explaining how the tool works to the agent
        # This will be useful later on
        return (
            "=== Therapeutic Chat Tool Instructions ===\n"
            "### What This Tool Does\n"
            "The Therapeutic Chat Tool allows you to chat with a knowledgeable large language model named TxGemma trained on many therapeutics datasets."
            "### When and Why You Should Use It\n"
            "- If you have therapeutics related questions that you would benefit from asking TxGemma from.\n"
            "### How to Use It\n"
            "Format your query with triple backticks (```), and start with `TxGemmaChat`. Then on a new line:\n"
            "1) **Any question you would like to ask**\n\n"
            "Example:\n"
            "```TxGemmaChat\n"
            "What is a common drug used to treat ovarian cancer?\n"
            "```\n")


# Making a TxGemma prediction

AMES Mutagenicity

In [29]:
# This class will allow us to predict binding affinity using TxGemma
class Mutagenicity:
    def __init__(self):
      self.tool_name = "AMES Mutagenicity Prediction"

    def use_tool(self, smiles_string):
        # Here, we are submitting the smiles to TxGemma, and returning the response
        prediction = txgemma_predict(tdc_prompts_json["AMES"].replace("{Drug SMILES}", smiles_string))
        if "(A)" in prediction:   prediction = f"{smiles_string} is not mutagenic!"
        elif "(B)" in prediction: prediction = f"{smiles_string} is mutagenic!"
        return prediction

    def tool_is_used(self, query):
        # This just checks to see if the tool call was evoked
        return "```MutagenicityPred" in query

    def process_query(self, query):
        # Here, we clean to query to remove the tool call
        return extract_prompt(query, word="MutagenicityPred")

    def instructions(self):
        # Here, we are explaining how the tool works to the agent
        return (
        "=== AMES Mutagenicity Prediction Instructions ===\n"
            "The AMES Mutagenicity Prediction Tool is designed to predict potential for mutagenicity for humans in clinicial trials.\n"
            "You can test the mutagenicity of different SMILES strings as they might affect humans.\n"
            "To properly use this tool, follow the format outlined below:\n"
            "1. **Form a AMES Mutagenicity Prediction query**:\n"
            "```MutagenicityPred\n\n```\n"
            "Example: ```MutagenicityPred\nCN(C)C(=N)N=C(N)N\n```\n"
            "- Replace `` with an exact smiles string. "
            "A result will be returned to you describing the AMES Mutagenicity Prediction.\n"
            "**Important Formatting Details**:\n"
            "- Use `MutagenicityPred` as the exact keyword to begin your query.\n"
            "- Place your text after `MutagenicityPred` on a new line.\n"
            "- Enclose the entire query using three backticks (```), as shown in the example above.\n")

ClinTox

In [30]:
class ClinTox:
    def __init__(self):
      self.tool_name = "Clinical Toxicology Prediction"

    def use_tool(self, smiles_string):
        # Here, we are submitting the smiles to TxGemma, and returning the response
        prediction = txgemma_predict(tdc_prompts_json["ClinTox"].replace("{Drug SMILES}", smiles_string))
        if "(A)" in prediction:   prediction = f"{smiles_string} is predicted to be toxic!"
        elif "(B)" in prediction: prediction = f"{smiles_string} is predicted to be not toxic!"
        return prediction

    def tool_is_used(self, query):
        # This just checks to see if the tool call was evoked
        return "```ClinToxPred" in query

    def process_query(self, query):
        # Here, we clean to query to remove the tool call
        return extract_prompt(query, word="ClinToxPred")

    def instructions(self):
        # Here, we are explaining how the tool works to the agent
        return (
        "=== Clinical Toxicology Prediction Instructions ===\n"
            "The Clinical Toxicology Prediction Tool is designed to predict potential for toxicology for humans in clinicial trials.\n"
            "You can test the toxicology of different SMILES strings as they might affect humans.\n"
            "To properly use this tool, follow the format outlined below:\n"
            "1. **Form a Clinical Toxicology Prediction query**:\n"
            "```ClinToxPred\n\n```\n"
            "Example: ```ClinToxPred\nCN(C)C(=N)N=C(N)N\n```\n"
            "- Replace `` with an exact smiles string. "
            "A result will be returned to you describing the Clinical Toxicology Prediction.\n"
            "**Important Formatting Details**:\n"
            "- Use `ClinToxPred` as the exact keyword to begin your query.\n"
            "- Place your text after `ClinToxPred` on a new line.\n"
            "- Enclose the entire query using three backticks (```), as shown in the example above.\n")

# PubMed search tool

In [31]:
! pip install --upgrade --quiet biopython


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
from Bio import Medline, Entrez

# This class will allow us to interface with PubMed
class PubMedSearch:
    def __init__(self):
      self.tool_name = "PubMed Search"

    def tool_is_used(self, query: str):
        # This just checks to see if the tool call was evoked
        return "```PubMedSearch" in query

    def process_query(self, query: str):
        # Here, we clean to query to remove the tool call
        search_text = extract_prompt(query, word="PubMedSearch")
        return search_text.strip()

    def use_tool(self, search_text):
        # Here, we are searching through PubMed and returning relevant articles
        pmids = list()
        handle = Entrez.esearch(db="pubmed", sort="relevance", term=search_text, retmax=3)
        record = Entrez.read(handle)
        pmids = record.get("IdList", [])
        handle.close()

        if not pmids:
            return f"No PubMed articles found for '{search_text}' Please try a simpler search query."

        fetch_handle = Entrez.efetch(db="pubmed", id=",".join(pmids), rettype="medline", retmode="text")
        records = list(Medline.parse(fetch_handle))
        fetch_handle.close()

        result_str = f"=== PubMed Search Results for: '{search_text}' ===\n"
        for i, record in enumerate(records, start=1):
            pmid = record.get("PMID", "N/A")
            title = record.get("TI", "No title available")
            abstract = record.get("AB", "No abstract available")
            journal = record.get("JT", "No journal info")
            pub_date = record.get("DP", "No date info")
            authors = record.get("AU", [])
            authors_str = ", ".join(authors[:3])
            result_str += (
                f"\n--- Article #{i} ---\n"
                f"PMID: {pmid}\n"
                f"Title: {title}\n"
                f"Authors: {authors_str}\n"
                f"Journal: {journal}\n"
                f"Publication Date: {pub_date}\n"
                f"Abstract: {abstract}\n")
        return f"Query: {search_text}\nResults: {result_str}"

    def instructions(self):
        # Here, we are explaining how the tool works to the agent
        return (
            f"{'@' * 10}\n@@@ PubMed Search Tool Instructions @@@\n\n"
            "### What This Tool Does\n"
            "The PubMed Search Tool queries the NCBI Entrez API (PubMed) for a given search phrase, "
            "and retrieves metadata for a few of the top articles (PMID, title, authors, journal, date, abstract).\n\n"
            "### When / Why You Should Use It\n"
            "- To find **scientific literature** references on a specific biomedical topic.\n"
            "- To retrieve **abstracts, titles, authors**, and other metadata.\n\n"
            "### Query Format\n"
            "Wrap your request with triple backticks, starting with `PubMedSearch`. For example:\n\n"
            "```PubMedSearch\ncancer immunotherapy\n```\n\n"
            "### Example\n"
            "```PubMedSearch\nmachine learning in drug discovery\n```\n"
            "- This will search PubMed for articles related to 'machine learning in drug discovery', "
            "fetch up to 3 PMIDs, and return their titles, abstracts, etc.\n\n")

In [33]:
pubmed_tool = PubMedSearch()
search_results = pubmed_tool.use_tool("Is aspirin toxic?")
print(search_results)

            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Query: Is aspirin toxic?
Results: === PubMed Search Results for: 'Is aspirin toxic?' ===

--- Article #1 ---
PMID: 39092921
Title: Pharmacokinetics of aspirin: evaluating shortcomings in the literature.
Authors: Visagie JL, Aruwajoye GS, van der Sluis R
Journal: Expert opinion on drug metabolism & toxicology
Publication Date: 2024 Aug
Abstract: INTRODUCTION: Aspirin is known for its therapeutic benefits in preventing strokes and relieving pain. However, it is toxic to some individuals, and the biological mechanisms causing toxicity are unknown. Limited literature is available on the role of glycine conjugation as the principal pathway in aspirin detoxification. Previous studies have quantified this two-step enzyme reaction as a singular enzymatic process. Consequently, the individual contributions of these enzymes to the kinetics remain unclear. AREAS COVERED: This review summarized the available information on the pharmacokinetics and detoxification of aspirin by the glycine conjugati

# Wrapping it all together

### Creating a tool manager

In [34]:
# The tool manager will hold all of the tools, and provide an interface for the agent
class ToolManager:
    def __init__(self, toolset):
        self.toolset = toolset

    def tool_prompt(self):
        # This will let the agent know what tools it has access to
        tool_names = ", ".join([tool.tool_name for tool in self.toolset])
        return f"You have access to the following tools: {tool_names}\n{self.tool_instructions()}. You can only use one tool at a time. These are the only tools you have access to nothing else."

    def tool_instructions(self):
        # This allows the agent to know how to use the tools
        tool_instr = "\n".join([tool.instructions() for tool in self.toolset])
        return f"The following is a set of instructions on how to use each tool.\n{tool_instr}"

    def use_tool(self, query):
        # This will iterate through all of the tools
        # and find the correct tool that the agent requested
        for tool in self.toolset:
            if tool.tool_is_used(query):
                # use the tool and return the output
                return tool.use_tool(tool.process_query(query))
        return f"No tool match for search: {query}"

if USE_CHAT:
    tools = ToolManager([TxGemmaChatTool(), Mutagenicity(), ClinTox(), PubMedSearch()])
else:
    tools = ToolManager([Mutagenicity(), ClinTox(), PubMedSearch()])

### Creating a gemini inference tool

In [35]:
def inference_gemini(prompt, system_prompt, model_str):
  # Check to see that our model string matches
  if model_str == "gemini-2.5-flash":
    model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20", system_instruction=system_prompt)
    response = model.generate_content(prompt)
    answer = response.text
  return answer

In [36]:
# def inference_llama(prompt: str, system_prompt: str, model_str: str) -> str:
#     if model_str == "llama-3.1-8B":
#         full_prompt = system_prompt.strip() + "\n" + prompt.strip()
#         outputs = pipe(full_prompt, return_full_text=False)
#         return outputs[0]["generated_text"]
#     else:
#         raise ValueError(f"Unsupported model_str: {model_str}")

# Creating a therapeutics agent

In [None]:
import time
import re
# This class defines our Agentic-Tx, wrapping together all of our tools and the orchestrator
class AgenticTx:
    def __init__(self, tool_manager, model_str, num_steps=5):
        self.curr_steps = 0
        self.num_steps = num_steps
        self.model_str = model_str
        self.tool_manager = tool_manager
        self.thoughts = []
        self.actions = []
        self.observations = []

    def reset(self):
        self.curr_steps = 0
        self.thoughts.clear()
        self.actions.clear()
        self.observations.clear()

    def system_prompt(self, use_tools=True):
        role_prompt = "You are an expert therapeutic agent. You answer accurately and thoroughly."
        prev_actions = f"You can perform a maximum of {self.num_steps} actions. You have performed {self.curr_steps} and have {self.num_steps - self.curr_steps - 1} left."
        tool_prompt = ("You can use tools to solve problems and answer questions. " + self.tool_manager.tool_prompt()) if use_tools else "You cannot use any tools right now."
        return f"{role_prompt} {prev_actions} {tool_prompt}"

    def prior_information(self, query):
        info_txt = f"Question: {query}\n" if query else ""
        for _i in range(self.curr_steps):
            info_txt += f"### Thought {_i + 1}: {self.thoughts[_i]}\n"
            info_txt += f"### Action {_i + 1}: {self.actions[_i]}\n"
            info_txt += f"### Observation {_i + 1}: {self.observations[_i]}\n\n"
            info_txt += "@" * 20
        return info_txt

    def step(self, question):
        self.reset()
        for _i in range(self.num_steps):
            if self.curr_steps == self.num_steps - 1:
                return inference_gemini(
                    model_str=self.model_str,
                    prompt=f"{self.prior_information(question)}\nYou must now provide an answer to this question {question}",
                    system_prompt=self.system_prompt(use_tools=False))
            else:
                thought = inference_gemini(
                    model_str=self.model_str,
                    prompt=f"{self.prior_information(question)}\nYou cannot currently use tools but you can think about the problem and what tools you want to use. This was the question, think about plans for how to use tools to answer this {question}. Let's think step by step (respond with only 1-2 sentences).\nThought: ",
                    system_prompt=self.system_prompt(use_tools=False))
                action = inference_gemini(
                    model_str=self.model_str,
                    prompt=f"{self.prior_information(question)}\n{thought}\nNow you must use tools to answer the following user query [{question}], closely following the tool instructions. Tool",
                    system_prompt=self.system_prompt(use_tools=True))
                obs = self.tool_manager.use_tool(action)

                print("Thought:", thought)
                print("Action:", action)
                print("Observation:", obs)

                self.thoughts.append(thought)
                self.actions.append(action)
                self.observations.append(obs)

                self.curr_steps += 1


# Instantiate your agent
agentictx = AgenticTx(tool_manager=tools, model_str="gemini-2.5-flash")

# Process the SMILES list from file
input_file = "Lipinski_after.txt"
output_file = "Lipinski_after_toxicity_checked.txt"

if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file '{input_file}' not found.")

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    count = 0
    for line in infile:
        smiles = line.strip()
        if not smiles or smiles.startswith("#"):
            continue

        question = f"Is this drug toxic {smiles}? If it is toxic, what are the properties that make it toxic? If it is toxic, can you suggest what functional groups I should rreplace the toxic sections with?"
        print(f"\nProcessing SMILES #{count + 1}: {smiles}")
        
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = agentictx.step(question)
                outfile.write(f"SMILES: {smiles}\nResponse: {response}\n{'='*60}\n")
                count += 1
                break
            except Exception as e:
                if "429" in str(e):
                    wait_time = 30
                    print(f"Rate limit hit. Waiting {wait_time} seconds (Attempt {attempt+1}/{max_retries})...")
                    time.sleep(wait_time)
                else:
                    outfile.write(f"SMILES: {smiles}\nError: {e}\n{'='*60}\n")
                    break

print(f"\nToxicity analysis completed. {count} SMILES processed. Results written to '{output_file}'.")


# TxGemma PK Properties Predictor
input: Lipinski_after.txt
output: Lipinski_after_pk_checked.txt
Need: HF token and Gemini API key

In [39]:
import os, re
import google.generativeai as genai

os.environ["HF_TOKEN"] = ""
genai.configure(api_key="AIzaSyCrL36wexWM9S4lQ3rAA9VND13b6MGgH3g")

In [40]:
import json
from huggingface_hub import hf_hub_download

tdc_prompts_filepath = hf_hub_download(
    repo_id="google/txgemma-27b-predict",
    filename="tdc_prompts.json",
)

with open(tdc_prompts_filepath, "r", encoding='utf-8') as f:
    tdc_prompts_json = json.load(f)

In [41]:
## Clearance Hepatocyte AZ: Given a drug SMILES, predict the activity of hepatocyte clearance.
tdc_prompts_json["Clearance_Hepatocyte_AZ"]

'Instructions: Answer the following question about drug properties.\nContext: Drug clearance is defined as the volume of plasma cleared of a drug over a specified time period and it measures the rate at which the active drug is removed from the body.\nQuestion: Given a drug SMILES string, predict its normalized hepatocyte clearance from 000 to 1000, where 000 is minimum hepatocyte clearance and 1000 is maximum hepatocyte clearance.\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [42]:
#### Clearance Microsome AZ: Given a drug SMILES, predict the activity of microsome clearance.
tdc_prompts_json["Clearance_Microsome_AZ"]

'Instructions: Answer the following question about drug properties.\nContext: Drug clearance is defined as the volume of plasma cleared of a drug over a specified time period and it measures the rate at which the active drug is removed from the body.\nQuestion: Given a drug SMILES string, predict its normalized microsome clearance activity from 000 to 1000, where 000 is minimum microsome clearance and 1000 is maximum microsome clearance.\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [43]:
#### Half Life Obach: Given a drug SMILES, predict the half life duration.
tdc_prompts_json["Half_Life_Obach"]

'Instructions: Answer the following question about drug properties.\nContext: Half life of a drug is the duration for the concentration of the drug in the body to be reduced by half. It measures the duration of actions of a drug. \nQuestion: Given a drug SMILES string, predict its normalized half life from 000 to 1000, where 000 is minimum half life and 1000 is maximum half life.\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [44]:
####VDss Lombardo: Given a drug SMILES, predict the volume of distributon.
tdc_prompts_json["VDss_Lombardo"]

"Instructions: Answer the following question about drug properties.\nContext: The volume of distribution at steady state (VDss) measures the degree of a drug's concentration in body tissue compared to concentration in blood. Higher VD indicates a higher distribution in the tissue and usually indicates the drug with high lipid solubility, low plasma protein binding rate.\nQuestion: Given a drug SMILES string, predict its normalized volume of distribution from 000 to 1000, where 000 is minimum volume of distribution and 1000 is maximum volume of distribution.\nDrug SMILES: {Drug SMILES}\nAnswer:"

In [45]:
####Bioavailability Ma : Given a drug SMILES, predict whether it is orally available.
tdc_prompts_json["Bioavailability_Ma"]

'Instructions: Answer the following question about drug properties.\nContext: Oral bioavailability is defined as “the rate and extent to which the active ingredient or active moiety is absorbed from a drug product and becomes available at the site of action”.\n\n\nQuestion: Given a drug SMILES string, predict whether it\n(A) has oral bioavailability < 20% (B) has oral bioavailability ≥ 20%\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

PREDICT_VARIANT = "9b-predict"  # @param ["2b-predict", "9b-predict", "27b-predict"]
CHAT_VARIANT = "9b-chat" # @param ["9b-chat", "27b-chat"]
USE_CHAT = True # @param {type: "boolean"}

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

predict_tokenizer = AutoTokenizer.from_pretrained(f"google/txgemma-{PREDICT_VARIANT}")
predict_model = AutoModelForCausalLM.from_pretrained(
    f"google/txgemma-{PREDICT_VARIANT}",
    device_map="auto",
    quantization_config=quantization_config,
)

if USE_CHAT:
    chat_tokenizer = AutoTokenizer.from_pretrained(f"google/txgemma-{CHAT_VARIANT}")
    chat_model = AutoModelForCausalLM.from_pretrained(
        f"google/txgemma-{CHAT_VARIANT}",
        device_map="auto",
        quantization_config=quantization_config,
    )

In [None]:
## Example task and input
task_name = "Half_Life_Obach"
smiles = "{Drug SMILES}"
drug_smiles = "C[C@@H]1Cc2c([nH]c3ccccc23)[C@H](N1CC(F)(F)F)c1c(F)ccc(NCCNCCCF)c1F"

TDC_PROMPT = tdc_prompts_json[task_name].replace(smiles, drug_smiles)

def txgemma_predict(prompt):
    input_ids = predict_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = predict_model.generate(**input_ids, max_new_tokens=8)
    return predict_tokenizer.decode(outputs[0], skip_special_tokens=True)

def txgemma_chat(prompt):
    input_ids = chat_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = chat_model.generate(**input_ids, max_new_tokens=32)
    return chat_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Prediction model response: {txgemma_predict(TDC_PROMPT)}")
if USE_CHAT: print(f"Chat model response: {txgemma_chat(TDC_PROMPT)}")

# Tool to allow our Agentic-Tx to ask TxGemma therapeutically relevant questions

In [48]:
# This will allow us to extract content from inside of ticks
def extract_prompt(text, word):
    code_block_pattern = rf"```{word}(.*?)```"
    code_blocks = re.findall(code_block_pattern, text, re.DOTALL)
    extracted_code = "\n".join(code_blocks).strip()
    return extracted_code

# This class will allow us to inferface with TxGemma
class TxGemmaChatTool:
    def __init__(self):
      self.tool_name = "Chat Tool"

    def use_tool(self, question):
        # Here, we are submitting a question to TxGemma
        response = txgemma_chat(question)
        return response

    def tool_is_used(self, query):
        # This just checks to see if the tool call was evoked
        return "```TxGemmaChat" in query

    def process_query(self, query):
        # Here, we clean to query to remove the tool call
        return extract_prompt(query, word="TxGemmaChat")

    def instructions(self):
        # Here, we are **very** descriptively explaining how the tool works to the agent
        # This will be useful later on
        return (
            "=== Therapeutic Chat Tool Instructions ===\n"
            "### What This Tool Does\n"
            "The Therapeutic Chat Tool allows you to chat with a knowledgeable large language model named TxGemma trained on many therapeutics datasets."
            "### When and Why You Should Use It\n"
            "- If you have therapeutics related questions that you would benefit from asking TxGemma from.\n"
            "### How to Use It\n"
            "Format your query with triple backticks (```), and start with `TxGemmaChat`. Then on a new line:\n"
            "1) **Any question you would like to ask**\n\n"
            "Example:\n"
            "```TxGemmaChat\n"
            "What is a common drug used to treat ovarian cancer?\n"
            "```\n")
     

# Making a TxGemma prediction

In [49]:
## Bioavailability Ma
class BioavailabilityPred:
    def __init__(self):
        self.tool_name = "Oral Bioavailability Prediction"

    def use_tool(self, smiles_string):
        # Assuming txgemma_predict and tdc_prompts_json have an entry for Bioavailability
        prediction = txgemma_predict(tdc_prompts_json["Bioavailability_Ma"].replace("{Drug SMILES}", smiles_string))
        if "(A)" in prediction:   prediction = f"{smiles_string} is predicted to have oral bioavailability < 20%!"
        elif "(B)" in prediction: prediction = f"{smiles_string} is predicted to have oral bioavailability ≥ 20%!"
        return prediction

    def tool_is_used(self, query):
        # Check for exact keyword in query
        return "```BioavailabilityPred" in query

    def process_query(self, query):
        # Clean query to remove tool call block and extract prompt
        return extract_prompt(query, word="BioavailabilityPred")

    def instructions(self):
        return (
            "=== Oral Bioavailability Prediction Instructions ===\n"
            "This tool predicts whether a small molecule (given as SMILES) is orally bioavailable.\n\n"
            "To use this tool, invoke it exactly like this:\n"
            "```BioavailabilityPred\n"
            "{Drug SMILES}\n"
            "```\n\n"
            "• **Keyword**: `BioavailabilityPred` (must match exactly).\n"
            "• **Line 2**: the SMILES string of your ligand.\n\n"
            "**Example:**\n"
            "```BioavailabilityPred\n"
            "CC(=O)Oc1ccccc1C(=O)O\n"
            "```\n"
            "This will return a prediction on oral bioavailability for that molecule.\n"
        )

In [50]:
bioavailPred = BioavailabilityPred()

# Use only the SMILES string since BioavailabilityPred takes just that
smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
prediction_bioavail = bioavailPred.use_tool(smiles)
print(prediction_bioavail)

COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1 is predicted to have oral bioavailability < 20%!


In [51]:
# class ClearanceHepatocyteAZPred:
#     def __init__(self):
#         self.tool_name = "Clearance Hepatocyte AZ Prediction"

#     def use_tool(self, smiles_string):
#         # Assuming txgemma_predict and tdc_prompts_json have an entry for ClearanceHepatocyteAZ
#         prediction = txgemma_predict(
#             tdc_prompts_json["Clearance_Hepatocyte_AZ"].replace("{Drug SMILES}", smiles_string)
#         )
#         # Example expected output: "Answer: Clearance value: 45.6"
#         match = re.search(r"Answer:\s*(?:Clearance value:\s*)?([0-9]*\.?[0-9]+)", prediction)
        
#         if match:
#             clearance_value = match.group(1)
#             return f"{smiles_string} is predicted to have hepatocyte clearance with a value of {clearance_value}(L/min)."
#         else:
#             return "Prediction output format unrecognized."

#     def tool_is_used(self, query):
#         return "```ClearanceHepatocyteAZPred" in query

#     def process_query(self, query):
#         return extract_prompt(query, word="ClearanceHepatocyteAZPred")

#     def instructions(self):
#         return (
#             "=== Clearance Hepatocyte AZ Prediction Instructions ===\n"
#             "This tool predicts the hepatocyte clearance of a small molecule (given as SMILES).\n\n"
#             "To use this tool, invoke it exactly like this:\n"
#             "```ClearanceHepatocyteAZPred\n"
#             "{Drug SMILES}\n"
#             "```\n\n"
#             "• **Keyword**: `ClearanceHepatocyteAZPred` (must match exactly).\n"
#             "• **Line 2**: the SMILES string of your ligand.\n\n"
#             "**Example:**\n"
#             "```ClearanceHepatocyteAZPred\n"
#             "CC(=O)Oc1ccccc1C(=O)O\n"
#             "```\n"
#             "This will return a predicted hepatocyte clearance value for that molecule.\n"
#         )

In [52]:
# clearancePred = ClearanceHepatocyteAZPred()

# smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
# prediction_clearance = clearancePred.use_tool(smiles)
# print(prediction_clearance)

In [53]:
# class ClearanceMicrosomeAZPred:
#     def __init__(self):
#         self.tool_name = "Clearance Microsome AZ Prediction"

#     def use_tool(self, smiles_string):
#         # Assumes tdc_prompts_json has an entry "ClearanceMicrosomeAZ" for this task
#         prediction = txgemma_predict(
#             tdc_prompts_json["Clearance_Microsome_AZ"].replace("{Drug SMILES}", smiles_string)
#         )
#         # Example output might be: "Answer: Clearance rate: 45.6"
#         match = re.search(r"Answer:*([0-9]*\.?[0-9]+)", prediction)
        
#         clearance_value = float(match.group(1))
#         # You can adjust thresholds or interpretation as needed
#         return f"{smiles_string} has a predicted microsomal clearance rate of {clearance_value} (mL·min⁻¹·g⁻¹)."

#     def tool_is_used(self, query):
#         # Check for exact keyword in query
#         return "```ClearanceMicrosomeAZPred" in query

#     def process_query(self, query):
#         # Clean query to remove tool call block and extract prompt
#         return extract_prompt(query, word="ClearanceMicrosomeAZPred")

#     def instructions(self):
#         return (
#             "=== Clearance Microsome AZ Prediction Instructions ===\n"
#             "This tool predicts the microsomal clearance rate of a small molecule (given as SMILES),\n"
#             "based on AstraZeneca data and models.\n\n"
#             "To use this tool, invoke it exactly like this:\n"
#             "```ClearanceMicrosomeAZPred\n"
#             "{Drug SMILES}\n"
#             "```\n\n"
#             "• **Keyword**: `ClearanceMicrosomeAZPred` (must match exactly).\n"
#             "• **Line 2**: the SMILES string of your molecule.\n\n"
#             "**Example:**\n"
#             "```ClearanceMicrosomeAZPred\n"
#             "CC(=O)Oc1ccccc1C(=O)O\n"
#             "```\n"
#             "This will return a predicted microsomal clearance rate for that molecule.\n"
#         )

In [54]:
# clearance_pred = ClearanceMicrosomeAZPred()

# smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
# prediction_clearance = clearance_pred.use_tool(smiles)
# print(prediction_clearance)

In [55]:
class HalfLifeObachPred:
    def __init__(self):
        self.tool_name = "Half-Life Prediction (Obach)"

    def use_tool(self, smiles_string):
        # Assuming txgemma_predict and tdc_prompts_json have an entry for "HalfLifeObach"
        prediction = txgemma_predict(
            tdc_prompts_json["Half_Life_Obach"].replace("{Drug SMILES}", smiles_string)
        )
        # Example output might be: "Answer: Half-life (hours): 4.2"
        match = re.search(r"Answer:\s*(?:Half-life \(hours\):\s*)?([0-9]*\.?[0-9]+)", prediction)
        
        if match:
            half_life = float(match.group(1))
            return f"{smiles_string} is predicted to have a half-life of {half_life:.2f} hours."
        else:
            return "Prediction output format unrecognized."

    def tool_is_used(self, query):
        # Check for exact keyword in query
        return "```HalfLifeObachPred" in query

    def process_query(self, query):
        # Clean query to remove tool call block and extract prompt
        return extract_prompt(query, word="HalfLifeObachPred")

    def instructions(self):
        return (
            "=== Half-Life Prediction (Obach) Instructions ===\n"
            "This tool predicts the half-life (in hours) of a small molecule (given as SMILES),\n"
            "based on the Obach model.\n\n"
            "To use this tool, invoke it exactly like this:\n"
            "```HalfLifeObachPred\n"
            "{Drug SMILES}\n"
            "```\n\n"
            "• **Keyword**: `HalfLifeObachPred` (must match exactly).\n"
            "• **Line 2**: the SMILES string of your molecule.\n\n"
            "**Example:**\n"
            "```HalfLifeObachPred\n"
            "CC(=O)Oc1ccccc1C(=O)O\n"
            "```\n"
            "This will return a predicted half-life (in hours) for that molecule.\n"
        )

In [56]:
half_life_pred = HalfLifeObachPred()
# Only pass the SMILES string, since AA_sequence is not needed here
smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
prediction_half_life = half_life_pred.use_tool(smiles)
print(prediction_half_life)

COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1 is predicted to have a half-life of 10.00 hours.


In [57]:
class VDssLombardoPred:
    def __init__(self):
        self.tool_name = "VDss Lombardo Prediction"

    def use_tool(self, smiles_string):
        # Assuming txgemma_predict and tdc_prompts_json have an entry for VDss Lombardo
        prediction = txgemma_predict(
            tdc_prompts_json["VDss_Lombardo"].replace("{Drug SMILES}", smiles_string)
        )
        # Example output might be: "Answer: VDss (L/kg): 0.85"
        match = re.search(r"Answer:\s*(?:VDss\s*\(L/kg\):\s*)?([0-9]*\.?[0-9]+)", prediction)

        if match:
            vdss_value = float(match.group(1))
            # Interpret vdss_value as you prefer, e.g.:
            return f"{smiles_string} has a predicted VDss of {vdss_value:.2f}(L/kg)."
        else:
            return "Prediction output format unrecognized."

    def tool_is_used(self, query):
        # Check for exact keyword in query
        return "```VDssLombardoPred" in query

    def process_query(self, query):
        # Clean query to remove tool call block and extract prompt
        return extract_prompt(query, word="VDssLombardoPred")

    def instructions(self):
        return (
            "=== VDss Lombardo Prediction Instructions ===\n"
            "This tool predicts the steady-state volume of distribution (VDss) in L/kg\n"
            "for a small molecule using the Lombardo method.\n\n"
            "To use this tool, invoke it exactly like this:\n"
            "```VDssLombardoPred\n"
            "{Drug SMILES}\n"
            "```\n\n"
            "• **Keyword**: `VDssLombardoPred` (must match exactly).\n"
            "• **Line 2**: the SMILES string of your molecule.\n\n"
            "**Example:**\n"
            "```VDssLombardoPred\n"
            "CC(=O)Oc1ccccc1C(=O)O\n"
            "```\n"
            "This will return the predicted VDss value for that molecule.\n"
        )

In [58]:
# Instantiate the VDss Lombardo predictor
vdssPred = VDssLombardoPred()

# Use only the SMILES string since VDssLombardoPred takes just that
smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
prediction_vdss = vdssPred.use_tool(smiles)
print(prediction_vdss)


COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1 has a predicted VDss of 3.00(L/kg).


# PubMed search tool

In [59]:
! pip install --upgrade --quiet biopython


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [60]:
from Bio import Medline, Entrez

# This class will allow us to interface with PubMed
class PubMedSearch:
    def __init__(self):
      self.tool_name = "PubMed Search"

    def tool_is_used(self, query: str):
        # This just checks to see if the tool call was evoked
        return "```PubMedSearch" in query

    def process_query(self, query: str):
        # Here, we clean to query to remove the tool call
        search_text = extract_prompt(query, word="PubMedSearch")
        return search_text.strip()

    def use_tool(self, search_text):
        # Here, we are searching through PubMed and returning relevant articles
        pmids = list()
        handle = Entrez.esearch(db="pubmed", sort="relevance", term=search_text, retmax=3)
        record = Entrez.read(handle)
        pmids = record.get("IdList", [])
        handle.close()

        if not pmids:
            return f"No PubMed articles found for '{search_text}' Please try a simpler search query."

        fetch_handle = Entrez.efetch(db="pubmed", id=",".join(pmids), rettype="medline", retmode="text")
        records = list(Medline.parse(fetch_handle))
        fetch_handle.close()

        result_str = f"=== PubMed Search Results for: '{search_text}' ===\n"
        for i, record in enumerate(records, start=1):
            pmid = record.get("PMID", "N/A")
            title = record.get("TI", "No title available")
            abstract = record.get("AB", "No abstract available")
            journal = record.get("JT", "No journal info")
            pub_date = record.get("DP", "No date info")
            authors = record.get("AU", [])
            authors_str = ", ".join(authors[:3])
            result_str += (
                f"\n--- Article #{i} ---\n"
                f"PMID: {pmid}\n"
                f"Title: {title}\n"
                f"Authors: {authors_str}\n"
                f"Journal: {journal}\n"
                f"Publication Date: {pub_date}\n"
                f"Abstract: {abstract}\n")
        return f"Query: {search_text}\nResults: {result_str}"

    def instructions(self):
        # Here, we are explaining how the tool works to the agent
        return (
            f"{'@' * 10}\n@@@ PubMed Search Tool Instructions @@@\n\n"
            "### What This Tool Does\n"
            "The PubMed Search Tool queries the NCBI Entrez API (PubMed) for a given search phrase, "
            "and retrieves metadata for a few of the top articles (PMID, title, authors, journal, date, abstract).\n\n"
            "### When / Why You Should Use It\n"
            "- To find **scientific literature** references on a specific biomedical topic.\n"
            "- To retrieve **abstracts, titles, authors**, and other metadata.\n\n"
            "### Query Format\n"
            "Wrap your request with triple backticks, starting with `PubMedSearch`. For example:\n\n"
            "```PubMedSearch\ncancer immunotherapy\n```\n\n"
            "### Example\n"
            "```PubMedSearch\nmachine learning in drug discovery\n```\n"
            "- This will search PubMed for articles related to 'machine learning in drug discovery', "
            "fetch up to 3 PMIDs, and return their titles, abstracts, etc.\n\n")

# Wrapping it all together

### Creating a tool manager

In [61]:
# The tool manager will hold all of the tools, and provide an interface for the agent
class ToolManager:
    def __init__(self, toolset):
        self.toolset = toolset

    def tool_prompt(self):
        # This will let the agent know what tools it has access to
        tool_names = ", ".join([tool.tool_name for tool in self.toolset])
        return f"You have access to the following tools: {tool_names}\n{self.tool_instructions()}. You can only use one tool at a time. These are the only tools you have access to nothing else."

    def tool_instructions(self):
        # This allows the agent to know how to use the tools
        tool_instr = "\n".join([tool.instructions() for tool in self.toolset])
        return f"The following is a set of instructions on how to use each tool.\n{tool_instr}"

    def use_tool(self, query):
        # This will iterate through all of the tools
        # and find the correct tool that the agent requested
        for tool in self.toolset:
            if tool.tool_is_used(query):
                # use the tool and return the output
                return tool.use_tool(tool.process_query(query))
        return f"No tool match for search: {query}"

if USE_CHAT:
    tools = ToolManager([TxGemmaChatTool(), BioavailabilityPred(), HalfLifeObachPred(), VDssLombardoPred(), PubMedSearch()])
else:
    tools = ToolManager([BioavailabilityPred(), HalfLifeObachPred(), VDssLombardoPred(), PubMedSearch()])    

### Creating a Gemini inference tool

In [62]:
import os
import time
import re

def inference_gemini(prompt, system_prompt, model_str):
    if model_str == "gemini-2.5-flash":
        model = genai.GenerativeModel(
            model_name="gemini-2.5-flash-preview-05-20",
            system_instruction=system_prompt
        )
        response = model.generate_content(prompt)
        return response.text
    raise ValueError(f"Unsupported model string: {model_str}")

def safe_inference(prompt, system_prompt, model_str, retries=5):
    for attempt in range(retries):
        try:
            return inference_gemini(prompt, system_prompt, model_str)
        except Exception as e:
            error_msg = str(e)
            print(f"[Attempt {attempt + 1}] Error: {error_msg}")

            delay_match = re.search(r"retry_delay {\s*seconds: (\d+)", error_msg)
            if delay_match:
                wait_time = int(delay_match.group(1))
            elif "ResourceExhausted" in error_msg:
                wait_time = 60
            else:
                raise

            if attempt < retries - 1:
                print(f"Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
            else:
                print("Max retries reached. Raising exception.")
                raise




# Creating a therapeutics agent

In [None]:
class AgenticTx:
    def __init__(self, tool_manager, model_str, num_steps=5):
        self.curr_steps = 0
        self.num_steps = num_steps
        self.model_str = model_str
        self.tool_manager = tool_manager
        self.thoughts = []
        self.actions = []
        self.observations = []

    def reset(self):
        self.curr_steps = 0
        self.thoughts.clear()
        self.actions.clear()
        self.observations.clear()

    def system_prompt(self, use_tools=True):
        role_prompt = "You are an expert therapeutic agent. You answer accurately and thoroughly."
        prev_actions = f"You can perform a maximum of {self.num_steps} actions. You have performed {self.curr_steps} and have {self.num_steps - self.curr_steps - 1} left."
        tool_prompt = ("You can use tools to solve problems and answer questions. " + self.tool_manager.tool_prompt()) if use_tools else "You cannot use any tools right now."
        return f"{role_prompt} {prev_actions} {tool_prompt}"

    def prior_information(self, query):
        info_txt = f"Question: {query}\n" if query else ""
        for i in range(self.curr_steps):
            info_txt += f"### Thought {i + 1}: {self.thoughts[i]}\n"
            info_txt += f"### Action {i + 1}: {self.actions[i]}\n"
            info_txt += f"### Observation {i + 1}: {self.observations[i]}\n\n"
            info_txt += "@" * 20
        return info_txt

    def step(self, question):
        self.reset()
        for _ in range(self.num_steps):
            if self.curr_steps == self.num_steps - 1:
                return safe_inference(
                    prompt=f"{self.prior_information(question)}\nYou must now provide an answer to this question {question}",
                    system_prompt=self.system_prompt(use_tools=False),
                    model_str=self.model_str
                )
            else:
                thought = safe_inference(
                    prompt=f"{self.prior_information(question)}\nYou cannot currently use tools but you can think about the problem and what tools you want to use. This was the question, think about plans for how to use tools to answer this {question}. Let's think step by step (respond with only 1-2 sentences).\nThought: ",
                    system_prompt=self.system_prompt(use_tools=False),
                    model_str=self.model_str
                )
                action = safe_inference(
                    prompt=f"{self.prior_information(question)}\n{thought}\nNow you must use tools to answer the following user query [{question}], closely following the tool instructions. Tool",
                    system_prompt=self.system_prompt(use_tools=True),
                    model_str=self.model_str
                )
                obs = self.tool_manager.use_tool(action)

                print("Thought:", thought)
                print("Action:", action)
                print("Observation:", obs)

                self.thoughts.append(thought)
                self.actions.append(action)
                self.observations.append(obs)
                self.curr_steps += 1

# Initialize agent
agentictx = AgenticTx(tool_manager=tools, model_str="gemini-2.5-flash")

# File paths
input_file = "Lipinski_after.txt"
output_file = "Lipinski_after_pk_checked.txt"

if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file '{input_file}' not found.")

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    count = 0
    for line in infile:
        smiles = line.strip()
        if not smiles or smiles.startswith("#"):
            continue

        print(f"\nProcessing SMILES #{count + 1}: {smiles}")
        question = f"What are the PK properties of this drug {smiles}?"

        try:
            response = agentictx.step(question)
            outfile.write(f"SMILES: {smiles}\nResponse: {response}\n{'=' * 60}\n")
            count += 1
        except Exception as e:
            outfile.write(f"SMILES: {smiles}\nError: {e}\n{'=' * 60}\n")

print(f"\nPK property analysis completed. {count} SMILES processed. Results saved to '{output_file}'.")


# Visualizing the results in 2D
input: Lipinski_after.txt
output: rank_{rank}_ligand_{ligand_number}.png

In [66]:
# Visualizing the results in 2D

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
import os

# Load the combined data file
input_file = r'Lipinski_after.txt'
df = pd.read_csv(input_file, sep='\t')

# Output directory for images
output_dir = r'ligand_images'
os.makedirs(output_dir, exist_ok=True)

# Generate and save images with rank-based filenames
for _, row in df.iterrows():
    rank = row['Rank']
    ligand_number = row['Ligand Number']
    confidence = row['Confidence Score']
    smiles = row['SMILES']

    mol = Chem.MolFromSmiles(smiles)
    if mol:
        img = Draw.MolToImage(
            mol, size=(300, 300),
            legend=f'Rank: {rank}\nLigand: {ligand_number}\nScore: {confidence:.4f}'
        )
        filename = f'rank_{rank}_ligand_{ligand_number}.png'
        img.save(os.path.join(output_dir, filename))

