In [None]:
import os, re
import google.generativeai as genai

os.environ["HF_TOKEN"] = ""
genai.configure(api_key="")

In [27]:
import json
from huggingface_hub import hf_hub_download

tdc_prompts_filepath = hf_hub_download(
    repo_id="google/txgemma-27b-predict",
    filename="tdc_prompts.json",
)

with open(tdc_prompts_filepath, "r", encoding='utf-8') as f:
    tdc_prompts_json = json.load(f)

In [28]:
## Clearance Hepatocyte AZ: Given a drug SMILES, predict the activity of hepatocyte clearance.
tdc_prompts_json["Clearance_Hepatocyte_AZ"]

'Instructions: Answer the following question about drug properties.\nContext: Drug clearance is defined as the volume of plasma cleared of a drug over a specified time period and it measures the rate at which the active drug is removed from the body.\nQuestion: Given a drug SMILES string, predict its normalized hepatocyte clearance from 000 to 1000, where 000 is minimum hepatocyte clearance and 1000 is maximum hepatocyte clearance.\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [29]:
#### Clearance Microsome AZ: Given a drug SMILES, predict the activity of microsome clearance.
tdc_prompts_json["Clearance_Microsome_AZ"]

'Instructions: Answer the following question about drug properties.\nContext: Drug clearance is defined as the volume of plasma cleared of a drug over a specified time period and it measures the rate at which the active drug is removed from the body.\nQuestion: Given a drug SMILES string, predict its normalized microsome clearance activity from 000 to 1000, where 000 is minimum microsome clearance and 1000 is maximum microsome clearance.\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [30]:
#### Half Life Obach: Given a drug SMILES, predict the half life duration.
tdc_prompts_json["Half_Life_Obach"]

'Instructions: Answer the following question about drug properties.\nContext: Half life of a drug is the duration for the concentration of the drug in the body to be reduced by half. It measures the duration of actions of a drug. \nQuestion: Given a drug SMILES string, predict its normalized half life from 000 to 1000, where 000 is minimum half life and 1000 is maximum half life.\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [31]:
####VDss Lombardo: Given a drug SMILES, predict the volume of distributon.
tdc_prompts_json["VDss_Lombardo"]

"Instructions: Answer the following question about drug properties.\nContext: The volume of distribution at steady state (VDss) measures the degree of a drug's concentration in body tissue compared to concentration in blood. Higher VD indicates a higher distribution in the tissue and usually indicates the drug with high lipid solubility, low plasma protein binding rate.\nQuestion: Given a drug SMILES string, predict its normalized volume of distribution from 000 to 1000, where 000 is minimum volume of distribution and 1000 is maximum volume of distribution.\nDrug SMILES: {Drug SMILES}\nAnswer:"

In [32]:
####Bioavailability Ma : Given a drug SMILES, predict whether it is orally available.
tdc_prompts_json["Bioavailability_Ma"]

'Instructions: Answer the following question about drug properties.\nContext: Oral bioavailability is defined as “the rate and extent to which the active ingredient or active moiety is absorbed from a drug product and becomes available at the site of action”.\n\n\nQuestion: Given a drug SMILES string, predict whether it\n(A) has oral bioavailability < 20% (B) has oral bioavailability ≥ 20%\nDrug SMILES: {Drug SMILES}\nAnswer:'

In [33]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

PREDICT_VARIANT = "9b-predict"  # @param ["2b-predict", "9b-predict", "27b-predict"]
CHAT_VARIANT = "9b-chat" # @param ["9b-chat", "27b-chat"]
USE_CHAT = True # @param {type: "boolean"}

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

predict_tokenizer = AutoTokenizer.from_pretrained(f"google/txgemma-{PREDICT_VARIANT}")
predict_model = AutoModelForCausalLM.from_pretrained(
    f"google/txgemma-{PREDICT_VARIANT}",
    device_map="auto",
    quantization_config=quantization_config,
)

if USE_CHAT:
    chat_tokenizer = AutoTokenizer.from_pretrained(f"google/txgemma-{CHAT_VARIANT}")
    chat_model = AutoModelForCausalLM.from_pretrained(
        f"google/txgemma-{CHAT_VARIANT}",
        device_map="auto",
        quantization_config=quantization_config,
    )

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [35]:
## Example task and input
task_name = "Half_Life_Obach"
smiles = "{Drug SMILES}"
drug_smiles = "C[C@@H]1Cc2c([nH]c3ccccc23)[C@H](N1CC(F)(F)F)c1c(F)ccc(NCCNCCCF)c1F"

TDC_PROMPT = tdc_prompts_json[task_name].replace(smiles, drug_smiles)

def txgemma_predict(prompt):
    input_ids = predict_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = predict_model.generate(**input_ids, max_new_tokens=8)
    return predict_tokenizer.decode(outputs[0], skip_special_tokens=True)

def txgemma_chat(prompt):
    input_ids = chat_tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = chat_model.generate(**input_ids, max_new_tokens=32)
    return chat_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Prediction model response: {txgemma_predict(TDC_PROMPT)}")
if USE_CHAT: print(f"Chat model response: {txgemma_chat(TDC_PROMPT)}")

Prediction model response: Instructions: Answer the following question about drug properties.
Context: Half life of a drug is the duration for the concentration of the drug in the body to be reduced by half. It measures the duration of actions of a drug. 
Question: Given a drug SMILES string, predict its normalized half life from 000 to 1000, where 000 is minimum half life and 1000 is maximum half life.
Drug SMILES: C[C@@H]1Cc2c([nH]c3ccccc23)[C@H](N1CC(F)(F)F)c1c(F)ccc(NCCNCCCF)c1F
Answer:1
Chat model response: Instructions: Answer the following question about drug properties.
Context: Half life of a drug is the duration for the concentration of the drug in the body to be reduced by half. It measures the duration of actions of a drug. 
Question: Given a drug SMILES string, predict its normalized half life from 000 to 1000, where 000 is minimum half life and 1000 is maximum half life.
Drug SMILES: C[C@@H]1Cc2c([nH]c3ccccc23)[C@H](N1CC(F)(F)F)c1c(F)ccc(NCCNCCCF)c1F
Answer:10


# Tool to allow our Agentic-Tx to ask TxGemma therapeutically relevant questions

In [36]:
# This will allow us to extract content from inside of ticks
def extract_prompt(text, word):
    code_block_pattern = rf"```{word}(.*?)```"
    code_blocks = re.findall(code_block_pattern, text, re.DOTALL)
    extracted_code = "\n".join(code_blocks).strip()
    return extracted_code

# This class will allow us to inferface with TxGemma
class TxGemmaChatTool:
    def __init__(self):
      self.tool_name = "Chat Tool"

    def use_tool(self, question):
        # Here, we are submitting a question to TxGemma
        response = txgemma_chat(question)
        return response

    def tool_is_used(self, query):
        # This just checks to see if the tool call was evoked
        return "```TxGemmaChat" in query

    def process_query(self, query):
        # Here, we clean to query to remove the tool call
        return extract_prompt(query, word="TxGemmaChat")

    def instructions(self):
        # Here, we are **very** descriptively explaining how the tool works to the agent
        # This will be useful later on
        return (
            "=== Therapeutic Chat Tool Instructions ===\n"
            "### What This Tool Does\n"
            "The Therapeutic Chat Tool allows you to chat with a knowledgeable large language model named TxGemma trained on many therapeutics datasets."
            "### When and Why You Should Use It\n"
            "- If you have therapeutics related questions that you would benefit from asking TxGemma from.\n"
            "### How to Use It\n"
            "Format your query with triple backticks (```), and start with `TxGemmaChat`. Then on a new line:\n"
            "1) **Any question you would like to ask**\n\n"
            "Example:\n"
            "```TxGemmaChat\n"
            "What is a common drug used to treat ovarian cancer?\n"
            "```\n")
     

# Making a TxGemma prediction

In [37]:
## Bioavailability Ma
class BioavailabilityPred:
    def __init__(self):
        self.tool_name = "Oral Bioavailability Prediction"

    def use_tool(self, smiles_string):
        # Assuming txgemma_predict and tdc_prompts_json have an entry for Bioavailability
        prediction = txgemma_predict(tdc_prompts_json["Bioavailability_Ma"].replace("{Drug SMILES}", smiles_string))
        if "(A)" in prediction:   prediction = f"{smiles_string} is predicted to have oral bioavailability < 20%!"
        elif "(B)" in prediction: prediction = f"{smiles_string} is predicted to have oral bioavailability ≥ 20%!"
        return prediction

    def tool_is_used(self, query):
        # Check for exact keyword in query
        return "```BioavailabilityPred" in query

    def process_query(self, query):
        # Clean query to remove tool call block and extract prompt
        return extract_prompt(query, word="BioavailabilityPred")

    def instructions(self):
        return (
            "=== Oral Bioavailability Prediction Instructions ===\n"
            "This tool predicts whether a small molecule (given as SMILES) is orally bioavailable.\n\n"
            "To use this tool, invoke it exactly like this:\n"
            "```BioavailabilityPred\n"
            "{Drug SMILES}\n"
            "```\n\n"
            "• **Keyword**: `BioavailabilityPred` (must match exactly).\n"
            "• **Line 2**: the SMILES string of your ligand.\n\n"
            "**Example:**\n"
            "```BioavailabilityPred\n"
            "CC(=O)Oc1ccccc1C(=O)O\n"
            "```\n"
            "This will return a prediction on oral bioavailability for that molecule.\n"
        )

In [38]:
bioavailPred = BioavailabilityPred()

# Use only the SMILES string since BioavailabilityPred takes just that
smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
prediction_bioavail = bioavailPred.use_tool(smiles)
print(prediction_bioavail)

COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1 is predicted to have oral bioavailability < 20%!


In [39]:
# class ClearanceHepatocyteAZPred:
#     def __init__(self):
#         self.tool_name = "Clearance Hepatocyte AZ Prediction"

#     def use_tool(self, smiles_string):
#         # Assuming txgemma_predict and tdc_prompts_json have an entry for ClearanceHepatocyteAZ
#         prediction = txgemma_predict(
#             tdc_prompts_json["Clearance_Hepatocyte_AZ"].replace("{Drug SMILES}", smiles_string)
#         )
#         # Example expected output: "Answer: Clearance value: 45.6"
#         match = re.search(r"Answer:\s*(?:Clearance value:\s*)?([0-9]*\.?[0-9]+)", prediction)
        
#         if match:
#             clearance_value = match.group(1)
#             return f"{smiles_string} is predicted to have hepatocyte clearance with a value of {clearance_value}(L/min)."
#         else:
#             return "Prediction output format unrecognized."

#     def tool_is_used(self, query):
#         return "```ClearanceHepatocyteAZPred" in query

#     def process_query(self, query):
#         return extract_prompt(query, word="ClearanceHepatocyteAZPred")

#     def instructions(self):
#         return (
#             "=== Clearance Hepatocyte AZ Prediction Instructions ===\n"
#             "This tool predicts the hepatocyte clearance of a small molecule (given as SMILES).\n\n"
#             "To use this tool, invoke it exactly like this:\n"
#             "```ClearanceHepatocyteAZPred\n"
#             "{Drug SMILES}\n"
#             "```\n\n"
#             "• **Keyword**: `ClearanceHepatocyteAZPred` (must match exactly).\n"
#             "• **Line 2**: the SMILES string of your ligand.\n\n"
#             "**Example:**\n"
#             "```ClearanceHepatocyteAZPred\n"
#             "CC(=O)Oc1ccccc1C(=O)O\n"
#             "```\n"
#             "This will return a predicted hepatocyte clearance value for that molecule.\n"
#         )

In [40]:
# clearancePred = ClearanceHepatocyteAZPred()

# smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
# prediction_clearance = clearancePred.use_tool(smiles)
# print(prediction_clearance)

In [41]:
# class ClearanceMicrosomeAZPred:
#     def __init__(self):
#         self.tool_name = "Clearance Microsome AZ Prediction"

#     def use_tool(self, smiles_string):
#         # Assumes tdc_prompts_json has an entry "ClearanceMicrosomeAZ" for this task
#         prediction = txgemma_predict(
#             tdc_prompts_json["Clearance_Microsome_AZ"].replace("{Drug SMILES}", smiles_string)
#         )
#         # Example output might be: "Answer: Clearance rate: 45.6"
#         match = re.search(r"Answer:*([0-9]*\.?[0-9]+)", prediction)
        
#         clearance_value = float(match.group(1))
#         # You can adjust thresholds or interpretation as needed
#         return f"{smiles_string} has a predicted microsomal clearance rate of {clearance_value} (mL·min⁻¹·g⁻¹)."

#     def tool_is_used(self, query):
#         # Check for exact keyword in query
#         return "```ClearanceMicrosomeAZPred" in query

#     def process_query(self, query):
#         # Clean query to remove tool call block and extract prompt
#         return extract_prompt(query, word="ClearanceMicrosomeAZPred")

#     def instructions(self):
#         return (
#             "=== Clearance Microsome AZ Prediction Instructions ===\n"
#             "This tool predicts the microsomal clearance rate of a small molecule (given as SMILES),\n"
#             "based on AstraZeneca data and models.\n\n"
#             "To use this tool, invoke it exactly like this:\n"
#             "```ClearanceMicrosomeAZPred\n"
#             "{Drug SMILES}\n"
#             "```\n\n"
#             "• **Keyword**: `ClearanceMicrosomeAZPred` (must match exactly).\n"
#             "• **Line 2**: the SMILES string of your molecule.\n\n"
#             "**Example:**\n"
#             "```ClearanceMicrosomeAZPred\n"
#             "CC(=O)Oc1ccccc1C(=O)O\n"
#             "```\n"
#             "This will return a predicted microsomal clearance rate for that molecule.\n"
#         )

In [42]:
# clearance_pred = ClearanceMicrosomeAZPred()

# smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
# prediction_clearance = clearance_pred.use_tool(smiles)
# print(prediction_clearance)

In [43]:
class HalfLifeObachPred:
    def __init__(self):
        self.tool_name = "Half-Life Prediction (Obach)"

    def use_tool(self, smiles_string):
        # Assuming txgemma_predict and tdc_prompts_json have an entry for "HalfLifeObach"
        prediction = txgemma_predict(
            tdc_prompts_json["Half_Life_Obach"].replace("{Drug SMILES}", smiles_string)
        )
        # Example output might be: "Answer: Half-life (hours): 4.2"
        match = re.search(r"Answer:\s*(?:Half-life \(hours\):\s*)?([0-9]*\.?[0-9]+)", prediction)
        
        if match:
            half_life = float(match.group(1))
            return f"{smiles_string} is predicted to have a half-life of {half_life:.2f} hours."
        else:
            return "Prediction output format unrecognized."

    def tool_is_used(self, query):
        # Check for exact keyword in query
        return "```HalfLifeObachPred" in query

    def process_query(self, query):
        # Clean query to remove tool call block and extract prompt
        return extract_prompt(query, word="HalfLifeObachPred")

    def instructions(self):
        return (
            "=== Half-Life Prediction (Obach) Instructions ===\n"
            "This tool predicts the half-life (in hours) of a small molecule (given as SMILES),\n"
            "based on the Obach model.\n\n"
            "To use this tool, invoke it exactly like this:\n"
            "```HalfLifeObachPred\n"
            "{Drug SMILES}\n"
            "```\n\n"
            "• **Keyword**: `HalfLifeObachPred` (must match exactly).\n"
            "• **Line 2**: the SMILES string of your molecule.\n\n"
            "**Example:**\n"
            "```HalfLifeObachPred\n"
            "CC(=O)Oc1ccccc1C(=O)O\n"
            "```\n"
            "This will return a predicted half-life (in hours) for that molecule.\n"
        )

In [44]:
half_life_pred = HalfLifeObachPred()
# Only pass the SMILES string, since AA_sequence is not needed here
smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
prediction_half_life = half_life_pred.use_tool(smiles)
print(prediction_half_life)

COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1 is predicted to have a half-life of 10.00 hours.


In [45]:
class VDssLombardoPred:
    def __init__(self):
        self.tool_name = "VDss Lombardo Prediction"

    def use_tool(self, smiles_string):
        # Assuming txgemma_predict and tdc_prompts_json have an entry for VDss Lombardo
        prediction = txgemma_predict(
            tdc_prompts_json["VDss_Lombardo"].replace("{Drug SMILES}", smiles_string)
        )
        # Example output might be: "Answer: VDss (L/kg): 0.85"
        match = re.search(r"Answer:\s*(?:VDss\s*\(L/kg\):\s*)?([0-9]*\.?[0-9]+)", prediction)

        if match:
            vdss_value = float(match.group(1))
            # Interpret vdss_value as you prefer, e.g.:
            return f"{smiles_string} has a predicted VDss of {vdss_value:.2f}(L/kg)."
        else:
            return "Prediction output format unrecognized."

    def tool_is_used(self, query):
        # Check for exact keyword in query
        return "```VDssLombardoPred" in query

    def process_query(self, query):
        # Clean query to remove tool call block and extract prompt
        return extract_prompt(query, word="VDssLombardoPred")

    def instructions(self):
        return (
            "=== VDss Lombardo Prediction Instructions ===\n"
            "This tool predicts the steady-state volume of distribution (VDss) in L/kg\n"
            "for a small molecule using the Lombardo method.\n\n"
            "To use this tool, invoke it exactly like this:\n"
            "```VDssLombardoPred\n"
            "{Drug SMILES}\n"
            "```\n\n"
            "• **Keyword**: `VDssLombardoPred` (must match exactly).\n"
            "• **Line 2**: the SMILES string of your molecule.\n\n"
            "**Example:**\n"
            "```VDssLombardoPred\n"
            "CC(=O)Oc1ccccc1C(=O)O\n"
            "```\n"
            "This will return the predicted VDss value for that molecule.\n"
        )

In [46]:
# Instantiate the VDss Lombardo predictor
vdssPred = VDssLombardoPred()

# Use only the SMILES string since VDssLombardoPred takes just that
smiles = "COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1"
prediction_vdss = vdssPred.use_tool(smiles)
print(prediction_vdss)


COc1c(F)cccc1C1=C(c2ccc(O[C@H]3CCN(CCCF)C3)cc2)c2ccc(O)cc2CCC1 has a predicted VDss of 3.00(L/kg).


# PubMed search tool

In [47]:
! pip install --upgrade --quiet biopython


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [48]:
from Bio import Medline, Entrez

# This class will allow us to interface with PubMed
class PubMedSearch:
    def __init__(self):
      self.tool_name = "PubMed Search"

    def tool_is_used(self, query: str):
        # This just checks to see if the tool call was evoked
        return "```PubMedSearch" in query

    def process_query(self, query: str):
        # Here, we clean to query to remove the tool call
        search_text = extract_prompt(query, word="PubMedSearch")
        return search_text.strip()

    def use_tool(self, search_text):
        # Here, we are searching through PubMed and returning relevant articles
        pmids = list()
        handle = Entrez.esearch(db="pubmed", sort="relevance", term=search_text, retmax=3)
        record = Entrez.read(handle)
        pmids = record.get("IdList", [])
        handle.close()

        if not pmids:
            return f"No PubMed articles found for '{search_text}' Please try a simpler search query."

        fetch_handle = Entrez.efetch(db="pubmed", id=",".join(pmids), rettype="medline", retmode="text")
        records = list(Medline.parse(fetch_handle))
        fetch_handle.close()

        result_str = f"=== PubMed Search Results for: '{search_text}' ===\n"
        for i, record in enumerate(records, start=1):
            pmid = record.get("PMID", "N/A")
            title = record.get("TI", "No title available")
            abstract = record.get("AB", "No abstract available")
            journal = record.get("JT", "No journal info")
            pub_date = record.get("DP", "No date info")
            authors = record.get("AU", [])
            authors_str = ", ".join(authors[:3])
            result_str += (
                f"\n--- Article #{i} ---\n"
                f"PMID: {pmid}\n"
                f"Title: {title}\n"
                f"Authors: {authors_str}\n"
                f"Journal: {journal}\n"
                f"Publication Date: {pub_date}\n"
                f"Abstract: {abstract}\n")
        return f"Query: {search_text}\nResults: {result_str}"

    def instructions(self):
        # Here, we are explaining how the tool works to the agent
        return (
            f"{'@' * 10}\n@@@ PubMed Search Tool Instructions @@@\n\n"
            "### What This Tool Does\n"
            "The PubMed Search Tool queries the NCBI Entrez API (PubMed) for a given search phrase, "
            "and retrieves metadata for a few of the top articles (PMID, title, authors, journal, date, abstract).\n\n"
            "### When / Why You Should Use It\n"
            "- To find **scientific literature** references on a specific biomedical topic.\n"
            "- To retrieve **abstracts, titles, authors**, and other metadata.\n\n"
            "### Query Format\n"
            "Wrap your request with triple backticks, starting with `PubMedSearch`. For example:\n\n"
            "```PubMedSearch\ncancer immunotherapy\n```\n\n"
            "### Example\n"
            "```PubMedSearch\nmachine learning in drug discovery\n```\n"
            "- This will search PubMed for articles related to 'machine learning in drug discovery', "
            "fetch up to 3 PMIDs, and return their titles, abstracts, etc.\n\n")

# Wrapping it all together

### Creating a tool manager

In [None]:
# The tool manager will hold all of the tools, and provide an interface for the agent
class ToolManager:
    def __init__(self, toolset):
        self.toolset = toolset

    def tool_prompt(self):
        # This will let the agent know what tools it has access to
        tool_names = ", ".join([tool.tool_name for tool in self.toolset])
        return f"You have access to the following tools: {tool_names}\n{self.tool_instructions()}. You can only use one tool at a time. These are the only tools you have access to nothing else."

    def tool_instructions(self):
        # This allows the agent to know how to use the tools
        tool_instr = "\n".join([tool.instructions() for tool in self.toolset])
        return f"The following is a set of instructions on how to use each tool.\n{tool_instr}"

    def use_tool(self, query):
        # This will iterate through all of the tools
        # and find the correct tool that the agent requested
        for tool in self.toolset:
            if tool.tool_is_used(query):
                # use the tool and return the output
                return tool.use_tool(tool.process_query(query))
        return f"No tool match for search: {query}"

if USE_CHAT:
    tools = ToolManager([TxGemmaChatTool(), BioavailabilityPred(), HalfLifeObachPred(), VDssLombardoPred(), PubMedSearch()])
else:
    tools = ToolManager([BioavailabilityPred(), HalfLifeObachPred(), VDssLombardoPred(), PubMedSearch()])    

### Creating a Gemini inference tool

In [61]:
import os
import time
import re

def inference_gemini(prompt, system_prompt, model_str):
    if model_str == "gemini-2.5-flash":
        model = genai.GenerativeModel(
            model_name="gemini-2.5-flash-preview-05-20",
            system_instruction=system_prompt
        )
        response = model.generate_content(prompt)
        return response.text
    raise ValueError(f"Unsupported model string: {model_str}")

def safe_inference(prompt, system_prompt, model_str, retries=5):
    for attempt in range(retries):
        try:
            return inference_gemini(prompt, system_prompt, model_str)
        except Exception as e:
            error_msg = str(e)
            print(f"[Attempt {attempt + 1}] Error: {error_msg}")

            delay_match = re.search(r"retry_delay {\s*seconds: (\d+)", error_msg)
            if delay_match:
                wait_time = int(delay_match.group(1))
            elif "ResourceExhausted" in error_msg:
                wait_time = 60
            else:
                raise

            if attempt < retries - 1:
                print(f"Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
            else:
                print("Max retries reached. Raising exception.")
                raise




# Creating a therapeutics agent

In [63]:
class AgenticTx:
    def __init__(self, tool_manager, model_str, num_steps=5):
        self.curr_steps = 0
        self.num_steps = num_steps
        self.model_str = model_str
        self.tool_manager = tool_manager
        self.thoughts = []
        self.actions = []
        self.observations = []

    def reset(self):
        self.curr_steps = 0
        self.thoughts.clear()
        self.actions.clear()
        self.observations.clear()

    def system_prompt(self, use_tools=True):
        role_prompt = "You are an expert therapeutic agent. You answer accurately and thoroughly."
        prev_actions = f"You can perform a maximum of {self.num_steps} actions. You have performed {self.curr_steps} and have {self.num_steps - self.curr_steps - 1} left."
        tool_prompt = ("You can use tools to solve problems and answer questions. " + self.tool_manager.tool_prompt()) if use_tools else "You cannot use any tools right now."
        return f"{role_prompt} {prev_actions} {tool_prompt}"

    def prior_information(self, query):
        info_txt = f"Question: {query}\n" if query else ""
        for i in range(self.curr_steps):
            info_txt += f"### Thought {i + 1}: {self.thoughts[i]}\n"
            info_txt += f"### Action {i + 1}: {self.actions[i]}\n"
            info_txt += f"### Observation {i + 1}: {self.observations[i]}\n\n"
            info_txt += "@" * 20
        return info_txt

    def step(self, question):
        self.reset()
        for _ in range(self.num_steps):
            if self.curr_steps == self.num_steps - 1:
                return safe_inference(
                    prompt=f"{self.prior_information(question)}\nYou must now provide an answer to this question {question}",
                    system_prompt=self.system_prompt(use_tools=False),
                    model_str=self.model_str
                )
            else:
                thought = safe_inference(
                    prompt=f"{self.prior_information(question)}\nYou cannot currently use tools but you can think about the problem and what tools you want to use. This was the question, think about plans for how to use tools to answer this {question}. Let's think step by step (respond with only 1-2 sentences).\nThought: ",
                    system_prompt=self.system_prompt(use_tools=False),
                    model_str=self.model_str
                )
                action = safe_inference(
                    prompt=f"{self.prior_information(question)}\n{thought}\nNow you must use tools to answer the following user query [{question}], closely following the tool instructions. Tool",
                    system_prompt=self.system_prompt(use_tools=True),
                    model_str=self.model_str
                )
                obs = self.tool_manager.use_tool(action)

                print("Thought:", thought)
                print("Action:", action)
                print("Observation:", obs)

                self.thoughts.append(thought)
                self.actions.append(action)
                self.observations.append(obs)
                self.curr_steps += 1

# Initialize agent
agentictx = AgenticTx(tool_manager=tools, model_str="gemini-2.5-flash")

# File paths
input_file = "Lipinski_after.txt"
output_file = "pk_checked.txt"

if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file '{input_file}' not found.")

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    count = 0
    for line in infile:
        smiles = line.strip()
        if not smiles or smiles.startswith("#"):
            continue

        print(f"\nProcessing SMILES #{count + 1}: {smiles}")
        question = f"What are the PK properties of this drug {smiles}?"

        try:
            response = agentictx.step(question)
            outfile.write(f"SMILES: {smiles}\nResponse: {response}\n{'=' * 60}\n")
            count += 1
        except Exception as e:
            outfile.write(f"SMILES: {smiles}\nError: {e}\n{'=' * 60}\n")

print(f"\nPK property analysis completed. {count} SMILES processed. Results saved to '{output_file}'.")



Processing SMILES #1: CC(C)(C)O[C@H]1C[C@@H](NS(=O)(=O)Cc2ccc(F)cc2C#N)C12CCC2
Thought: To accurately determine the PK properties of this drug, I would use cheminformatics tools to calculate its molecular properties (MW, LogP, TPSA, H-bond donors/acceptors) and predict ADME parameters like oral bioavailability, plasma protein binding, and likely metabolic pathways (e.g., CYP-mediated oxidation at the tert-butyl ether, aliphatic carbons, or nitrile metabolism). I would also search drug databases for any existing experimental PK data on this specific compound or structurally similar ones to provide a comprehensive answer.
Action: ```BioavailabilityPred
CC(C)(C)O[C@H]1C[C@@H](NS(=O)(=O)Cc2ccc(F)cc2C#N)C12CCC2
```
Observation: CC(C)(C)O[C@H]1C[C@@H](NS(=O)(=O)Cc2ccc(F)cc2C#N)C12CCC2 is predicted to have oral bioavailability < 20%!
Thought: I need to analyze the drug's molecular descriptors (MW, LogP, HBD/HBA, TPSA, rotatable bonds) to understand its physicochemical properties. Then, I wil