In [None]:
#%%capture --no-stderr
# Execute the companion notebook inside this kernel. Names used further down that this
# notebook never defines or imports (e.g. process_pdfs_in_s3_folder, boto3, Document)
# presumably come from it — TODO confirm.
%run docIndex.ipynb
# Extra packages required by later cells. Versions are not pinned.
# NOTE(review): `!pip` runs whichever pip is first on PATH; `%pip` targets the kernel's environment.
#!pip install --upgrade openai
!pip install bitsandbytes
!pip install accelerate
!pip install rank_bm25
!pip install markdown

In [None]:
import pandas as pd
import numpy as np

import argparse
import openai
from openai import OpenAI

from langchain import PromptTemplate
from botocore.exceptions import ClientError

# import sagemaker
# from sagemaker import get_execution_role

from langchain.llms import BaseLLM
from ipywidgets import Dropdown
# from sagemaker.jumpstart.notebook_utils import list_jumpstart_models

import markdown

from collections import Counter
import re
import os, gc, torch

from datetime import datetime
import pytz
import pickle
import json

import chromadb 
from chromadb import Settings
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer

import requests

# from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
#from langchain_community.vectorstores import Chroma   # instead of langchain_chroma

from transformers import set_seed

# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

from transformers.utils import is_bitsandbytes_available

from langchain.retrievers import BM25Retriever
from langchain.retrievers.ensemble import EnsembleRetriever

In [None]:
# Run this once to process all PDFs under the source folder. It does not need to be run
# again unless new files are added (the next code cell handles only unprocessed PDFs).
# The same bucket and the same prefix are passed as both source and output location.
s3_bucket = 'plfs-han-llm-experiment'
s3_source_folder = 'factsheet-generation/PPInnova/internal/jarvis_docs/'
s3_output_folder = 'factsheet-generation/PPInnova/internal/jarvis_docs/'
process_pdfs_in_s3_folder(s3_bucket, s3_source_folder, s3_bucket, s3_output_folder, model="o4-mini")  #gpt-5

In [None]:
# # Open and save the PDF to potentially fix issues
# doc1 = fitz.open("Euhearing_sharepoint_selected_20250921--01 From company--项目相关资料包-Euhearing_20250817091531.pdf")
# doc1.save("Euhearing_sharepoint_selected_20250921--01 From company--项目相关资料包-Euhearing_20250817091531_repaired.pdf", garbage=4, deflate=True, clean=True)
# doc1.close()

# # # Now try with the repaired version
# doc1 = fitz.open("Euhearing_sharepoint_selected_20250921--01 From company--项目相关资料包-Euhearing_20250817091531_repaired.pdf")
# text1 = doc1[1].get_text()

In [None]:
# If new files have been added, run this cell to process only the PDFs that are still unprocessed.
# NOTE(review): this cell points at the Euhearing folder, while the cell that processes all PDFs
# points at PPInnova; both reassign the same global names — confirm the intended company before running.
s3_bucket = 'plfs-han-llm-experiment'
s3_source_folder = 'factsheet-generation/Euhearing/internal/jarvis_docs/'
s3_output_folder = 'factsheet-generation/Euhearing/internal/jarvis_docs/'
process_unprocessed_pdfs_in_s3_folder(s3_bucket, s3_source_folder, s3_bucket, s3_output_folder, model="o4-mini")  #gpt-5

In [None]:
def copy_files_flattened_and_renamed_within_s3(bucket_name, source_prefix, destination_prefix):
    """
    Copy every object under ``source_prefix`` to ``destination_prefix`` in the same bucket,
    flattening the folder structure.

    Each copy is named ``<first path component>_<original file name>``, where the first path
    component is taken from the key relative to ``source_prefix`` (for an object that sits
    directly under ``source_prefix`` this component is the file name itself). Keys ending in
    '/' are treated as folder placeholders and skipped. One line is printed per copied object,
    or a single notice when nothing is listed under ``source_prefix``.

    Args:
        bucket_name: Bucket holding both the source and the destination keys.
        source_prefix: Key prefix whose objects are copied.
        destination_prefix: Key prefix the flattened copies are written under.

    Returns:
        None.
    """
    # Local import: S3 keys always use '/', so apply POSIX path rules on every platform.
    import posixpath

    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1000 keys, so paginate to cover larger
    # folders. All keys are collected before the first copy so that objects written by this
    # run can never show up in a later listing page.
    paginator = s3.get_paginator('list_objects_v2')
    source_keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=bucket_name, Prefix=source_prefix)
        for obj in page.get('Contents', [])
    ]
    if not source_keys:
        print(f"No files found in {source_prefix}")
        return

    destination_root = destination_prefix.rstrip('/')
    for source_key in source_keys:
        if source_key.endswith('/'):
            continue  # Skip directories

        # First path component below source_prefix becomes the file-name prefix.
        relative_path = posixpath.relpath(source_key, source_prefix)
        top_folder_name = relative_path.split('/')[0]
        original_file_name = posixpath.basename(source_key)

        destination_key = f"{destination_root}/{top_folder_name}_{original_file_name}"

        # Server-side copy within the bucket.
        copy_source = {'Bucket': bucket_name, 'Key': source_key}
        s3.copy_object(CopySource=copy_source, Bucket=bucket_name, Key=destination_key)

        print(f"Copied {source_key} to {destination_key}")


In [None]:
# s3_bucket = 'plfs-han-llm-experiment'
# dir1 = '2pager/Oculis/docs/'
# dir2 = '2pager/Oculis/internal/'
# # dir2 = '2pager/Mythic/internal/jarvis_docs/'
# # dir3 = '2pager/Mythic/internal/jarvis_tables/'
# # copy_files_from_s3_to_s3(s3_bucket, dir2, s3_bucket, dir1)
# # copy_files_from_s3_to_s3(s3_bucket, dir3, s3_bucket, dir1)

# copy_files_flattened_and_renamed_within_s3(s3_bucket, dir2, dir1)

In [None]:
# %%capture --no-stderr
# !pip uninstall -y torch torchvision
# # !pip install torch torchvision --index-url https://download.pytorch.org/whl/cu117
# !pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# import torchvision.transforms as transforms

In [None]:
## RecursiveCharacterTextSplitter
def process_documents_from_s3(s3_bucket: str, s3_folder: str, ignored_files: List[str] = None) -> List[Document]:
    """
    Load documents from S3 and split them into chunks.

    Chunk sizes come from the module-level ``chunk_size`` and ``chunk_overlap`` settings
    (both measured in characters), which must be defined before this function is called.
    After splitting, every chunk's text is prefixed with ``From file: <source>`` taken from
    the chunk's ``source`` metadata (``unknown_file`` when absent).

    Args:
        s3_bucket: Bucket to read from.
        s3_folder: Key prefix to read from.
        ignored_files: Files to skip; forwarded to ``load_documents_from_s3``.
            ``None`` (the default) means nothing is ignored.

    Returns:
        The list of chunks, or an empty list when no documents were loaded.
    """
    # Avoid a shared mutable default argument.
    if ignored_files is None:
        ignored_files = []

    print(f"Loading documents from s3://{s3_bucket}/{s3_folder}")
    documents = load_documents_from_s3(s3_bucket, s3_folder, ignored_files)
    if not documents:
        print("No new documents to load")
        # Return instead of calling exit(): exiting from inside a notebook helper would
        # terminate the session rather than just this step.
        return []
    print(f"Loaded {len(documents)} new documents from s3://{s3_bucket}/{s3_folder}")

    # The chunk_size parameter of RecursiveCharacterTextSplitter is a number of characters.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    # Add the file name to each chunk after splitting, so it travels with the chunk text.
    for chunk in texts:
        filename = chunk.metadata.get('source', 'unknown_file')
        chunk.page_content = f"From file: {filename}\n\n{chunk.page_content}"

    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
    return texts


In [None]:
USE_INT8 = True          # on CUDA, load the embedding model in 8-bit when bitsandbytes is available (`pip install bitsandbytes`)
embeddings_model_name = "Qwen/Qwen3-Embedding-4B"
# RecursiveCharacterTextSplitter settings, measured in characters
chunk_size = 1000
chunk_overlap = 200


class SentenceTransformerEmbeddings(Embeddings):
    """
    LangChain ``Embeddings`` adapter around a ``SentenceTransformer`` model.

    On CUDA the model is loaded in 8-bit when the module-level ``USE_INT8`` flag is set and
    bitsandbytes is available, otherwise in fp16. On any other device no extra model kwargs
    are passed.
    """

    def __init__(self,
                 model_name: str,
                 device: str = "cpu",
                 batch_size: int = 8,
                 max_length: int = 8192,):
        """
        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
            device: Device used for loading and encoding (e.g. "cpu" or "cuda").
            batch_size: Batch size used by ``embed_documents``.
            max_length: Upper bound for the model's maximum sequence length. It is only
                ever used to lower the model's own limit, never to raise it.
        """
        self.device     = device
        self.batch_size = batch_size
        self.max_length = max_length

        # minimise CUDA fragmentation
        os.environ.setdefault(
            "PYTORCH_CUDA_ALLOC_CONF",
            "expandable_segments:True,max_split_size_mb:128"
        )
        gc.collect(); torch.cuda.empty_cache()

        # Decide the precision path once so loading and post-processing cannot disagree.
        use_int8 = device == "cuda" and USE_INT8 and is_bitsandbytes_available()

        # --------------------- build kwargs ------------------------------
        load_kwargs = {"trust_remote_code": True}
        if use_int8:
            load_kwargs["model_kwargs"] = {
                "load_in_8bit": True,
                "device_map": "auto",
            }
        elif device == "cuda":
            load_kwargs["model_kwargs"] = {"torch_dtype": torch.float16}

        # --------------------- load model -------------------------------
        self.model = SentenceTransformer(model_name, **load_kwargs)

        # Apply the requested sequence-length cap (previously accepted but ignored).
        if max_length is not None:
            model_limit = self.model.max_seq_length
            if model_limit is None or max_length < model_limit:
                self.model.max_seq_length = max_length

        # If we took the fp16 path, cast & move once
        if device == "cuda" and not use_int8:
            self.model.half()          # weights → fp16
            self.model.to(device)      # onto GPU

    # --------------------- LangChain hooks ------------------------------
    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        """Encode ``documents`` in batches and return one embedding (list of floats) per text."""
        vecs = self.model.encode(
            documents,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            show_progress_bar=False,
            device=self.device,
        )
        return vecs.tolist()

    def embed_query(self, query: str) -> List[float]:
        """Encode a single query string via ``embed_documents`` and return its embedding."""
        return self.embed_documents([query])[0]


In [None]:
# Shell out to nvidia-smi to inspect the GPU before loading the embedding model.
!nvidia-smi

In [None]:
%%time
# Rebuild the "jarvis_docs" Chroma collection from scratch: load and chunk the S3 documents,
# embed them, and store them in a local persistent ChromaDB directory.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
set_seed(42)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding function used to build the index (normalised embeddings are requested).
embeddings = HuggingFaceEmbeddings(
    model_name=embeddings_model_name,
    model_kwargs={'device': DEVICE},
    encode_kwargs={'batch_size': 8, 'normalize_embeddings': True}
)

# Persistent ChromaDB client stored in ./chroma_db_jarvis_docs under the current working directory.
current_dir = os.getcwd()
persist_directory_jd = os.path.join(current_dir, "chroma_db_jarvis_docs")
os.makedirs(persist_directory_jd, exist_ok=True)
chroma_client_jd = chromadb.PersistentClient(path=persist_directory_jd)

# Drop & recreate the collection: any existing "jarvis_docs" data is deleted here.
# NOTE(review): ValueError is ignored on the assumption that it means "collection does not
# exist"; other chromadb versions may raise a different exception type — TODO confirm.
try:
    chroma_client_jd.delete_collection("jarvis_docs")
except ValueError:
    pass

db_jd = Chroma(
    collection_name="jarvis_docs",
    embedding_function=embeddings,  # Using external embeddings
    client=chroma_client_jd,
    collection_metadata={"hnsw:space": "cosine"},
)


# Source documents to index.
s3_bucket = "plfs-han-llm-experiment"
s3_folder = 'factsheet-generation/PPInnova/internal/jarvis_docs/'
ignored_files = []
texts = process_documents_from_s3(s3_bucket, s3_folder, ignored_files)

# Number of chunks passed to add_documents per call.
CHROMA_BATCH_SIZE = 1000
print(f"Total documents to process: {len(texts)} (batch {CHROMA_BATCH_SIZE})")

for i in range(0, len(texts), CHROMA_BATCH_SIZE):
    batch_texts = texts[i : i + CHROMA_BATCH_SIZE]
    print(f"Adding docs {i}-{i+len(batch_texts)-1}")
    db_jd.add_documents(batch_texts)

    # Housekeeping between batches: release cached GPU memory and collect garbage.
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

print("Vector database creation completed successfully!")
print("Final document count:", chroma_client_jd.get_collection("jarvis_docs").count())

In [None]:
# %%time

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# set_seed(42)

# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# embeddings = SentenceTransformerEmbeddings(
#     model_name = embeddings_model_name,
#     device     = DEVICE,
#     batch_size = 16,
#     max_length = 8192
# )

# # ───────────────────────────  CHROMA SETUP ─────────────────────────
# current_dir            = os.getcwd()
# persist_directory_jd   = os.path.join(current_dir, "chroma_db_jarvis_docs")
# os.makedirs(persist_directory_jd, exist_ok=True)
# chroma_client_jd       = chromadb.PersistentClient(path=persist_directory_jd)

# # drop & recreate collection 
# try:
#     chroma_client_jd.delete_collection("jarvis_docs")
# except ValueError:
#     pass

# db_jd = Chroma(
#     collection_name   = "jarvis_docs",
#     embedding_function= embeddings,
#     client            = chroma_client_jd,
#     collection_metadata={"hnsw:space": "cosine"},
# )

# # ───────────────────────────  LOAD + INGEST ────────────────────────
# s3_bucket  = "plfs-han-llm-experiment"
# s3_folder  = 'factsheet-generation/Sidera/internal/jarvis_docs/'
# ignored_files = []

# texts = process_documents_from_s3(s3_bucket, s3_folder, ignored_files)

# CHROMA_BATCH_SIZE = 10000
# print(f"Total documents to process: {len(texts)}  (batch {CHROMA_BATCH_SIZE})")

# for i in range(0, len(texts), CHROMA_BATCH_SIZE):
#     batch_texts = texts[i : i + CHROMA_BATCH_SIZE]
#     print(f"Adding docs {i}-{i+len(batch_texts)-1}")
#     db_jd.add_documents(batch_texts)

#     # housekeeping
#     if DEVICE == "cuda":
#         torch.cuda.empty_cache()
#     gc.collect()

# print("Vector database creation completed successfully!")
# print("Final document count:", chroma_client_jd.get_collection("jarvis_docs").count())


In [None]:
## Load the saved vector DB
# The embedding function should match the one used to build the DB.
# NOTE(review): the build cell creates HuggingFaceEmbeddings with normalize_embeddings=True,
# whereas this cell creates SentenceTransformerEmbeddings (8-bit on CUDA when USE_INT8 is set,
# no explicit normalisation) — confirm that query embeddings are compatible with the stored index.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 8 if DEVICE == "cuda" else 64

embeddings = SentenceTransformerEmbeddings(
    model_name=embeddings_model_name,
    device=DEVICE,
    batch_size=batch_size,
    max_length=8192
)


# Re-open the persistent store written by the build cell (same directory and collection name).
persist_directory_jd = os.path.join(os.getcwd(), "chroma_db_jarvis_docs")
chroma_client_jd   = chromadb.PersistentClient(path=persist_directory_jd)
db_jd = Chroma(
    client=chroma_client_jd,
    collection_name="jarvis_docs",
    embedding_function=embeddings
)


# persist_directory_jt = os.path.join(os.getcwd(), "chroma_db_jarvis_tables")
# chroma_client_jt   = chromadb.PersistentClient(path=persist_directory_jt)
# db_jt = Chroma(
#     client=chroma_client_jt,
#     collection_name="jarvis_tables",
#     embedding_function=embeddings
# )


In [None]:
def get_page_contents_only(collection, batch_size=1000):
    """
    Page through ``collection`` and return the text of every stored document.

    Pages of ``batch_size`` documents are requested (text only, no metadata or embeddings)
    at increasing offsets until a page comes back empty. Progress is printed per page.

    Args:
        collection: Object exposing ``get(limit=..., offset=..., include=[...])`` that
            returns a mapping with a 'documents' list.
        batch_size: Number of documents requested per page.

    Returns:
        A list with the document texts in the order they were returned.
    """
    collected = []
    cursor = 0
    more_pages = True

    while more_pages:
        print(f"Processing batch starting at offset {cursor}...")

        # Request the text content only.
        page = collection.get(
            limit=batch_size,
            offset=cursor,
            include=["documents"],
        )['documents']

        if page:
            collected.extend(page)
            cursor += batch_size
            print(f"Processed {len(collected)} documents so far...")
        else:
            # An empty page marks the end of the collection.
            more_pages = False

    return collected

# Use the batched approach to read the text of every chunk stored in the "jarvis_docs" collection.
collection = chroma_client_jd.get_collection("jarvis_docs")
all_page_contents = get_page_contents_only(collection, batch_size=1000)  

# Persist the texts to disk; the next cell reloads this pickle to build the BM25 retriever.
with open('all_page_contents_docs.pkl', 'wb') as f:
    pickle.dump(all_page_contents, f)


In [None]:
# Reload the chunk texts saved by the previous cell.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('all_page_contents_docs.pkl', 'rb') as f:
    all_page_contents = pickle.load(f)

# The Documents are built from page_content only, so BM25 results carry no metadata (e.g. no 'source').
all_docs = [Document(page_content=content) for content in all_page_contents]
bm25_retriever = BM25Retriever.from_documents(all_docs)

In [None]:
def get_key(secret_name,region_name):
    """
    Fetch a secret from AWS Secrets Manager and return the value stored under its 'key' field.

    The secret string is parsed as JSON first. If that fails, it is parsed as a Python
    literal, which keeps secrets written in Python-dict syntax (single quotes) working.

    Args:
        secret_name: Secret id passed to ``get_secret_value``.
        region_name: AWS region of the Secrets Manager endpoint.

    Returns:
        The value of the 'key' entry of the parsed secret.

    Raises:
        Any error raised by the Secrets Manager call propagates unchanged.
        ValueError / SyntaxError: if the secret is neither valid JSON nor a Python literal.
        KeyError: if the parsed secret has no 'key' entry.
    """
    # Local imports keep the helper self-contained.
    import ast
    import json

    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    get_secret_value_response = client.get_secret_value(
        SecretId=secret_name
    )

    secret = get_secret_value_response['SecretString']
    try:
        # JSON handles true/false/null, which ast.literal_eval rejects.
        parsed = json.loads(secret)
    except ValueError:
        parsed = ast.literal_eval(secret)

    return parsed['key']

# Fetch the OpenAI API key once when this cell runs.
# NOTE(review): the answer_* helpers below call get_key themselves rather than reading this variable.
openai_api_key=get_key("openai-api-key", "us-west-2")

In [None]:
# from openai import OpenAI

# # initialize client
# client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))

# # call the chat completions endpoint
# response = client.chat.completions.create(
#     model="gpt-4o-search-preview",
#     messages=[
#         {"role": "system", "content": "You are an expert in bioventure investing."},
#         {"role": "user", "content": f"what's the weather in Shanghai today."}
#     ]
# )

# print(response.choices[0].message.content)

In [None]:
# from openai import OpenAI
# client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))

# response = client.responses.create(
#     model="gpt-4.1",
#     tools=[{
#         "type": "web_search_preview",
#         "search_context_size": "low",
#     }],
#     input="What movie won best picture in 2025?",
# )

# print(response.output_text)

In [None]:
# def answer_gpt(prompt):
    
#     from openai import OpenAI
#     client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))
#     response = client.chat.completions.create(
#                   model="gpt-4.1", # or the latest version of GPT, o4-mini, gpt-4o, o3, gpt-4.1
#                   temperature=0,
#                   messages=[
#                       {"role": "system", "content": "You are an expert in bioventure investing."},
#                       {"role": "user", "content": f"answer the following question:{prompt}"}
#                     ]
#                 )
#     return response.choices[0].message.content


def answer_gpt(prompt):
    """
    Send ``prompt`` to gpt-4.1 through the OpenAI Responses API and return the text answer.

    The prompt is wrapped in a fixed bioventure-investing instruction and sent with
    temperature 0. The API key is fetched with ``get_key`` on every call.

    Args:
        prompt: Question text to answer.

    Returns:
        The ``output_text`` of the response.
    """
    from openai import OpenAI

    api_key = get_key("openai-api-key","us-west-2")
    openai_client = OpenAI(api_key=api_key)

    request = {
        "model": "gpt-4.1",  # or the latest version of GPT, o4-mini, gpt-4o, o3, gpt-4.1
        "temperature": 0,
        "input": f"You are an expert in bioventure investing. Answer the following question: {prompt}",
    }
    return openai_client.responses.create(**request).output_text


# def answer_online_search(prompt):
    
#     from openai import OpenAI
#     client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))
#     response = client.chat.completions.create(
#                   model="gpt-4o-search-preview",    #o4-mini, gpt-4o-search-preview
#                   messages=[
#                       {"role": "system", "content": "You are an expert in bioventure investing."},
#                       {"role": "user", "content": f"{prompt}"}
#                     ]
#                 )
#     return response.choices[0].message.content


def answer_online_search(prompt,search_model="o4-mini"):
    """
    Answer ``prompt`` with an OpenAI model that has the web-search preview tool enabled.

    The tool is configured with a "high" search context size. The API key is fetched with
    ``get_key`` on every call.

    Args:
        prompt: Text sent as the model input.
        search_model: Model name to use (e.g. o4-mini, o3).

    Returns:
        The ``output_text`` of the response.
    """
    from openai import OpenAI

    api_key = get_key("openai-api-key","us-west-2")
    openai_client = OpenAI(api_key=api_key)

    web_search_tool = {
        "type": "web_search_preview",
        "search_context_size": "high",
    }
    result = openai_client.responses.create(
        model=search_model,    # o4-mini, o3
        tools=[web_search_tool],
        input=f"{prompt}",
    )
    return result.output_text



def origene_mcp(prompt, search_model="o4-mini"):
    """
    Answer ``prompt`` with an OpenAI model that can call the "origene" MCP server.

    The MCP server URL is read from the ``ORIGENE_MCP_SERVER_URL`` environment variable when
    it is set; otherwise the built-in URL is used. Tool calls never require approval.
    The API key is fetched with ``get_key`` on every call.

    Args:
        prompt: Text sent as the model input.
        search_model: Model name to use.

    Returns:
        The ``output_text`` of the response.
    """
    from openai import OpenAI

    # SECURITY: the fallback URL embeds an access token in source. Supply the URL through
    # ORIGENE_MCP_SERVER_URL instead, then rotate the token and remove the literal below.
    server_url = os.environ.get(
        "ORIGENE_MCP_SERVER_URL",
        "https://origene-uuid1752754854.app-space.dplink.cc/chembl_mcp/mcp/?token=172f53102e0a46acb20f306eceaaf6c4",
    )

    client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))
    response = client.responses.create(
                model=search_model,
                tools=[
                    {"type": "mcp",
                     "server_label": "origene",
                     "server_url": server_url,
                     "require_approval": "never",
                    },],
                input= f"{prompt}"
                )
    return response.output_text

In [None]:
# ###take too long to run
# from openai import OpenAI
# client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))
# response = client.responses.create(
#             model="o4-mini",
#             tools=[
#                 {"type": "mcp",
#                  "server_label": "disease-pocket-molecule-mcp",
#                  "server_url": "https://disease-pocket-molecule-mcp-uuid1751524197.app-space.dplink.cc/sse?token=4218554daa854930947d4986b1fb35e9",
#                  "require_approval": "never",
#                 },],
#             input= "find all targets associated with obesity"
#             )
# print(response.output_text)

In [None]:
# Ad-hoc query: sends a live web-search request through answer_online_search using o4-mini.
print(answer_online_search("Comprehensively list all the biotech companies that are competitors to PPInnova (Peak Perform Innova).","o4-mini"))

In [None]:
# Ad-hoc query: sends a live web-search request through answer_online_search using o4-mini.
print(answer_online_search("Comprehensively list all the biotech companies with active programs targeting ALK7 for indication Obesity, T2D.","o4-mini"))

In [None]:
# print(answer_online_search("tell me about Sidera Bio","o3"))

In [None]:
# print(answer_online_search("For company Trevi Therapeutics, search google finance to provide the following\
#                                     information for the recent past 3, 6, 12 months: The highest and lowest stock prices,\
#                                     along with the corresponding dates. The largest single-day stock price change (either gain\
#                                     or loss) during that period, including the amount and the date it occurred."))

In [None]:
def answer_perplexity_search(prompt, timeout=300):
    """
    Ask the Perplexity chat-completions API (model ``sonar-reasoning-pro``) and return only
    the final answer.

    The system prompt asks the model to finish with a '**Final Answer**' section. Everything
    after the last occurrence of that marker is returned; if the marker is missing, the last
    blank-line-separated paragraph of the response is returned instead.

    Args:
        prompt: Question text to answer.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The extracted answer text, stripped of surrounding whitespace.

    Raises:
        requests.HTTPError: if the API responds with an HTTP error status.
    """
    perplexity_api_key = get_key("perplexity-api-key", "us-west-2")

    # Build the payload for the Perplexity ai API.
    # Each trailing backslash in the system prompt is a line continuation and must be the last
    # character on its line; a stray space after it would leak a literal backslash into the prompt.
    payload = {
        "model": "sonar-reasoning-pro",
        "messages": [
            {"role": "system", "content": """You have extensive expertise in biotech investments.\
            Always format responses with clear '**Final Answer**' section at the end,\
            using bold markdown, with no additional commentary after it.\
            Format responses WITHOUT ANY citations or reference numbers.
            Provide only the final answer. It is important that you do not include any explanation on the steps below.\
            Do not show the intermediate steps information.
            """},
            {"role": "user", "content": f"answer the following question:{prompt}"}
        ],
        "max_tokens": 8000,
        "temperature": 0.2,
        "top_p": 0.9,
        "search_domain_filter": None,
        "return_images": False,
        "return_related_questions": False,
        "stream": False,
        "response_format": None
    }

    headers = {
        "Authorization": f"Bearer {perplexity_api_key}",
        "Content-Type": "application/json"
    }

    # Call the Perplexity ai API.
    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()  # Ensure that an HTTP error raises an exception.

    result_json = response.json()

    # Extract the answer from the API response.
    full_response = result_json['choices'][0]['message']['content']

    # Split response into thinking process and final answer
    if "**Final Answer**" in full_response:
        answer = full_response.split("**Final Answer**")[-1].strip()
    else:  # Fallback if formatting changes
        answer = full_response.split("\n\n")[-1].strip()

    return answer




In [None]:
# ##for testing purpose
# answer_perplexity_search("what's the weather in Shanghai today")

In [None]:
def answer_with_search(question, db_jd, k_jd, priority_order=('perplexity', 'jarvis_docs')):
    """
    Answer ``question`` from the JARVIS document store and/or a Perplexity web search,
    letting the final model resolve conflicts according to ``priority_order``.

    Args:
        question: Question to answer.
        db_jd: Vector store exposing ``similarity_search(question, k)``.
        k_jd: Number of document chunks to retrieve.
        priority_order: Sequence of source names ('perplexity', 'jarvis_docs'), highest
            priority first. A source that is not listed is left out of the prompt; the
            Perplexity search is only executed when 'perplexity' is listed.

    Returns:
        Tuple ``(answer, source_counts, overview_images, perplexity_response)`` where
        ``source_counts`` maps 'jarvis_docs' to a Counter of source file names and
        'external_sources' to a "perplexity:<0|1>" marker, and ``overview_images`` lists the
        distinct 'overview_image' metadata values of the retrieved chunks.

    Raises:
        KeyError: if ``priority_order`` contains an unknown source name.
    """
    # Retrieve documents from docs
    jarvis_docs_docs = db_jd.similarity_search(question, k_jd)

    # Get perplexity response if required
    perplexity_response = answer_perplexity_search(question) if 'perplexity' in priority_order else ""

    # Build the knowledge base from each source
    knowledge_base = {
        'jarvis_docs': "\n\n".join(d.page_content for d in jarvis_docs_docs) if 'jarvis_docs' in priority_order else "",
        'perplexity': perplexity_response if 'perplexity' in priority_order else ""
    }

    # Build prioritized context using the given priority order.
    headings = {
        'jarvis_docs': "JARVIS Docs",
        'perplexity': "External Search",
    }
    priority_context = []
    for idx, source in enumerate(priority_order, 1):
        heading = f"{idx}. {headings[source]}"
        content = knowledge_base[source] or f"No {source} data available"
        priority_context.append(f"{heading}:\n{content}")

    # Count how many retrieved chunks came from each source file. Chunks without 'source'
    # metadata are counted as 'unknown' instead of raising KeyError.
    if knowledge_base['jarvis_docs']:
        docs_counter = Counter(
            os.path.basename(doc.metadata.get('source', 'unknown')) for doc in jarvis_docs_docs
        )
    else:
        docs_counter = Counter()
    source_counts = {
        'jarvis_docs': docs_counter,
        'external_sources': "perplexity:1" if knowledge_base['perplexity'] else "perplexity:0"
    }

    overview_images = list({
        doc.metadata['overview_image'] for doc in jarvis_docs_docs if 'overview_image' in doc.metadata
    })

    # Precompute the joined priority context to avoid issues with backslashes in f-string expressions.
    joined_priority_context = "\n\n".join(priority_context)

    prompt = f"""
**Analysis Directive**: Answer using this priority sequence: {', '.join(priority_order).upper()}

**Knowledge Base**:
{joined_priority_context}

**Conflict Resolution Rules**:
- Follow {priority_order[0].upper()} for numerical disputes
- Resolve conceptual conflicts using {priority_order[0].upper()}
- Use most recent context when dates conflict

**Question**: {question}

**Response Requirements**:
Do not fabricate any information that is not in the given content.
Answer in formal written English. Please provide a response with a concise introductory phrase,
but avoid meaningless fillers like 'ok', 'sure' or 'certainly'. Focus on delivering a direct and informative answer.
Do not include reference filenames in the answer.
"""

    return answer_gpt(prompt), source_counts, overview_images, perplexity_response    #combined_contexts


In [None]:
# def answer_with_search_ensemble(question, bm25_retriever, k_bm, db_jd, k_jd, priority_order=['perplexity', 'jarvis_docs']):
        
#     # Retrieve documents from docs    
#     bm25_retriever.k = k_bm
#     vector_retriever = db_jd.as_retriever(search_kwargs={"k": k_jd})
    
#     ensemble = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever],
#                                  weights=[0.5, 0.5])

#     jarvis_docs_docs = ensemble.get_relevant_documents(question)
    
#     # Get perplexity response if required
#     perplexity_response = answer_perplexity_search(question) if 'perplexity' in priority_order else ""
    
#     # Create source-to-context mapping
#     source_contexts = {
#         'jarvis_docs': [d.page_content for d in jarvis_docs_docs],
#         'perplexity': [perplexity_response] if perplexity_response else []
#     }    
    
# #     combined_contexts = []
# #     for source in priority_order:
# #         if source in source_contexts:
# #             combined_contexts += (source_contexts[source])
    
#     # Build the knowledge base from each source
#     knowledge_base = {
#         'jarvis_docs': "\n\n".join(d.page_content for d in jarvis_docs_docs) if 'jarvis_docs' in priority_order else "",
#         'perplexity': perplexity_response if 'perplexity' in priority_order else ""
#     }
    
#     # Build prioritized context using the given priority order.
#     priority_context = []
#     for idx, source in enumerate(priority_order, 1):
#         heading = {
#             'jarvis_docs': f"{idx}. JARVIS Docs",
#             'perplexity': f"{idx}. External Search"
#         }[source]
        
#         content = knowledge_base[source] or f"No {source} data available"
#         priority_context.append(f"{heading}:\n{content}")
    
#     # Generate source counts for jarvis_tables and jarvis_docs
# #     source_counts = {
# #         'jarvis_tables': Counter(os.path.basename(doc.metadata['source']) for doc in jarvis_tables_docs) if knowledge_base['jarvis_tables'] else Counter(),
# #         'jarvis_docs': Counter(os.path.basename(doc.metadata['source']) for doc in jarvis_docs_docs) if knowledge_base['jarvis_docs'] else Counter(),
# #         'external_sources': "perplexity:1" if knowledge_base['perplexity'] else "perplexity:0"
# #     }
    
# #     overview_images = list({
# #         m['overview_image'] for m in (doc.metadata for doc in jarvis_docs_docs) if 'overview_image' in m
# #     })
    
#     # Precompute the joined priority context to avoid issues with backslashes in f-string expressions.
#     joined_priority_context = "\n\n".join(priority_context)
    
#     prompt = f"""
# **Analysis Directive**: Answer using this priority sequence: {', '.join(priority_order).upper()}

# **Knowledge Base**:
# {joined_priority_context}

# **Conflict Resolution Rules**:
# - Follow {priority_order[0].upper()} for numerical disputes
# - Resolve conceptual conflicts using {priority_order[0].upper()}
# - Use most recent context when dates conflict

# **Question**: {question}

# **Response Requirements**:
# Do not fabricate any information that is not in the given content.
# Answer in formal written English. Please provide a response with a concise introductory phrase,
# but avoid meaningless fillers like 'ok', 'sure' or 'certainly'. Focus on delivering a direct and informative answer.
# Do not include reference filenames in the answer.
# """
    
#     return answer_gpt(prompt), perplexity_response    #combined_contexts


In [None]:
def answer_with_search_ensemble(question, bm25_retriever, k_bm, db_jd, k_jd, search_model="gpt-4.1", priority_order=('online_search', 'jarvis_docs')):
    """
    Answer ``question`` from a BM25 + vector ensemble over the JARVIS documents and/or an
    online search, letting the final model resolve conflicts according to ``priority_order``.

    Args:
        question: Question to answer.
        bm25_retriever: BM25 retriever; its ``k`` attribute is set to ``k_bm`` (mutated in place).
        k_bm: Number of chunks the BM25 retriever should return.
        db_jd: Vector store exposing ``as_retriever(search_kwargs=...)``.
        k_jd: Number of chunks the vector retriever should return.
        search_model: Model name forwarded to ``answer_online_search``.
        priority_order: Sequence of source names ('online_search', 'jarvis_docs'), highest
            priority first. A source that is not listed is left out of the prompt; the
            online search is only executed when 'online_search' is listed.

    Returns:
        Tuple ``(answer, online_search_response)``.

    Raises:
        KeyError: if ``priority_order`` contains an unknown source name.
    """
    # Retrieve documents from docs
    bm25_retriever.k = k_bm
    vector_retriever = db_jd.as_retriever(search_kwargs={"k": k_jd})

    ensemble = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever],
                                 weights=[0.5, 0.5])

    # Prefer the Runnable `invoke` API; `get_relevant_documents` is deprecated in LangChain.
    # Fall back to it only for retrievers that do not offer `invoke`.
    retrieve = getattr(ensemble, "invoke", None) or ensemble.get_relevant_documents
    jarvis_docs_docs = retrieve(question)

    # Get online_search response if required
    online_search_response = answer_online_search(question, search_model) if 'online_search' in priority_order else ""

    # Build the knowledge base from each source
    knowledge_base = {
        'jarvis_docs': "\n\n".join(d.page_content for d in jarvis_docs_docs) if 'jarvis_docs' in priority_order else "",
        'online_search': online_search_response if 'online_search' in priority_order else ""
    }

    # Build prioritized context using the given priority order.
    headings = {
        'jarvis_docs': "JARVIS Docs",
        'online_search': "External Search",
    }
    priority_context = []
    for idx, source in enumerate(priority_order, 1):
        heading = f"{idx}. {headings[source]}"
        content = knowledge_base[source] or f"No {source} data available"
        priority_context.append(f"{heading}:\n{content}")

    # Precompute the joined priority context to avoid issues with backslashes in f-string expressions.
    joined_priority_context = "\n\n".join(priority_context)

    prompt = f"""
**Analysis Directive**: Answer using this priority sequence: {', '.join(priority_order).upper()}

**Knowledge Base**:
{joined_priority_context}

**Conflict Resolution Rules**:
- Follow {priority_order[0].upper()} for numerical disputes
- Resolve conceptual conflicts using {priority_order[0].upper()}
- Use most recent context when dates conflict

**Question**: {question}

**Response Requirements**:
Do not fabricate any information that is not in the given content.
Answer in formal written English, be objectively and factually, avoid subjective adjectives or exaggerations.\
Please provide a response with a concise introductory phrase,
but avoid meaningless fillers like 'ok', 'sure' or 'certainly'. Focus on delivering a direct and informative answer.
Please bold the most important facts or conclusions in your answer to help readers quickly identify key information,\
especially when the response is long.
Do not include reference filenames in the answer.
"""

    return answer_gpt(prompt), online_search_response    #combined_contexts


In [None]:
# question1=f"""Which financing round (e.g. seed, series A, series B, etc) of company {COMPANY_NAME} is currently in."""
# result, source_counts, overview_images = answer_with_image_old(question1, db, 50)
# print(result, f"\n\n{source_counts}", f"\n\nnum of images: {len(overview_images)}")

In [None]:
###test
def answer_perplexity_search_test(prompt, timeout=(10, 300)):
    """Send `prompt` to the Perplexity chat-completions API and return the reply text.

    Test variant: the reply is returned unmodified, with no post-processing.

    Args:
        prompt: Question text; it is embedded in the user message after
            "answer the following question:".
        timeout: Forwarded to `requests.post`. Either a number of seconds or a
            `(connect, read)` tuple. Without a timeout `requests` waits
            indefinitely on a stalled connection, which would hang the kernel.

    Returns:
        The `content` string of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If connecting or reading exceeds `timeout`.
    """
    # `get_key` is defined outside this cell; presumably it fetches the secret
    # by name and region -- confirm against its definition.
    perplexity_api_key = get_key("perplexity-api-key", "us-west-2")

    # Build the payload for the Perplexity ai API.
    payload = {
        "model": "sonar-reasoning-pro",
        "messages": [
            {"role": "system", "content": """You have extensive expertise in biotech investments.\
            At the end of your answer, make sure to include all the online sources' full URLs you used for your think process.
            """},
            {"role": "user", "content": f"answer the following question:{prompt}"}
        ],
        "max_tokens": 8000,
        "temperature": 0.2,
        "top_p": 0.9,
        "search_domain_filter": None,
        "return_images": False,
        "return_related_questions": False,
        "stream": False,
        "response_format": None
    }

    headers = {
        "Authorization": f"Bearer {perplexity_api_key}",
        "Content-Type": "application/json"
    }

    # Call the Perplexity ai API.
    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()  # Ensure that an HTTP error raises an exception.

    # Extract the answer from the API response.
    result_json = response.json()
    return result_json['choices'][0]['message']['content']


In [None]:
def company_helper(company_name):
    """Ask the retrieval pipeline for a JSON description of a company's pipeline.

    Builds an extraction prompt for `company_name` and sends it through
    `answer_with_search_ensemble` with `priority_order=["jarvis_docs"]`.

    Args:
        company_name: Company to describe. It is interpolated into the prompt;
            the notebook-level `COMPANY_NAME` is deliberately not read here, so
            the result depends only on the argument.

    Returns:
        The first value returned by `answer_with_search_ensemble` (the answer
        text). It is expected to contain a JSON object but is not parsed or
        validated here.
    """
    # The last key of the asset template carries no trailing comma so that the
    # exemplar shown to the model is itself well-formed JSON.
    question = f"""
    Extract structured facts about the drug pipelines of company {company_name}.
    Return ONLY a JSON object with company-level information and its key assets.
    Include only assets with a known asset name.
    If no assets are found, return an empty assets list.

    JSON structure:
    {{
      "company name": "{company_name}",
      "has platform": true | false | null,
      "platform name": "<name, else null>",
      "platform is core asset": true | false | null,
      "assets": [
        {{
          "asset name": "<name, else null>",
          "modality": "<name, else null>",
          "targets": ["..."],
          "targeted therapeutic areas": ["..."],
          "targeted indications": ["..."],
          "current development stage": "<name, else null>",
          "brief trial result": "<brief description, else null>",
          "companies with competing asset":["..."]
        }}
      ]
    }}
    """

    # The second return value (online search response) is not needed here.
    result, _ = answer_with_search_ensemble(question, bm25_retriever, 100, db_jd, 100, search_model="o4-mini", priority_order=["jarvis_docs"])

    return result

    

def prompt_format(json_string):
    """Have an OpenAI chat model repair `json_string`, then parse the reply.

    Args:
        json_string: Text expected to contain a (possibly malformed) JSON
            object. It is interpolated into the user message as-is.

    Returns:
        The object produced by `json.loads` on the first-`{`-to-last-`}` span of
        the model reply, or `None` when the reply is empty, contains no such
        span, or the span is not valid JSON. Failures are reported with `print`.
    """
    # `OpenAI` comes from the notebook's import cell; `get_key` is defined
    # outside this cell.
    client = OpenAI(api_key=get_key("openai-api-key", "us-west-2"))

    response = client.chat.completions.create(
        model="gpt-4.1",  # Use the latest GPT-4 model you have access to
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": ("You are an expert in json structure.")
            },
            {
                "role": "user",
                "content": f"""I want you to review the json string and make sure it's properly formatted.\
                Return the correct formatted json string of {json_string} as the output.\
                Please only return the json string, do not add any introductory phrase."""
            }
        ]
    )

    # `message.content` is Optional in the SDK; treat a missing reply as empty
    # text so it takes the "no JSON found" path instead of raising TypeError.
    response_string = response.choices[0].message.content or ""

    # Greedy match with DOTALL: everything from the first '{' to the last '}'.
    json_match = re.search(r'{.*}', response_string, re.DOTALL)
    if json_match is None:
        print("No JSON object found in the input string.")
        return None

    # Parse the JSON content to ensure it is valid
    try:
        return json.loads(json_match.group())
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None


def company_extractor(COMPANY_NAME):
    """Extract a company's pipeline facts and return them as parsed JSON.

    Passes `COMPANY_NAME` to `company_helper` and feeds the resulting text to
    `prompt_format`, whose return value is handed back unchanged.
    """
    return prompt_format(company_helper(COMPANY_NAME))



In [None]:
# COMPANY_NAME = 'Euhearing Therapeutics'
# question1=f"""
#     Extract structured facts about the drug pipelines of company {COMPANY_NAME}.
#     Return ONLY a JSON object with company-level information and its key assets.
#     Include only assets with a known asset name.
#     If no assets are found, return an empty assets list.

#     JSON structure:
#     {{
#       "company name": "{COMPANY_NAME}",
#       "has platform": true | false | null,
#       "platform name": "<name, else null>",
#       "platform is core asset": true | false | null,
#       "assets": [
#         {{
#           "asset name": "<name, else null>",
#           "modality": "<name, else null>",
#           "targets": ["..."],
#           "targeted therapeutic areas": ["..."],  
#           "targeted indications": ["..."],
#           "current development stage": "<name, else null>", 
#           "brief trial result": "<brief description, else null>",
#           "companies with competing asset":["..."],
#         }}
#       ]
#     }}
#     """
# bm25_retriever.k = 50
# vector_retriever = db_jd.as_retriever(search_kwargs={"k": 100})

# ensemble = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever],
#                              weights=[0.5, 0.5])

# jarvis_docs_docs = ensemble.get_relevant_documents(question1)
# [d.page_content for d in jarvis_docs_docs]

In [None]:
# Bind the notebook-level COMPANY_NAME and extract the structured company
# profile; `company_info` is what the human-review cell below edits.
COMPANY_NAME = "PPInnova (Peak Perform Innova)"
company_info = company_extractor(COMPANY_NAME)
company_info    

In [None]:
### human review to update some info
import time  # time.sleep is used below; `time` is not among this notebook's visible imports

company_info["platform is core asset"] = True

# Keep only the assets whose name contains one of these substrings, grouped in
# substring order. An asset that matches several substrings is kept once (by
# identity), so it is not valued or reported twice. Assets with a null name are
# dropped instead of raising on `key in None`.
substrings = ['STAT6', 'RBM39']
selected_assets = []
for key in substrings:
    for asset in company_info['assets']:
        already_selected = any(asset is kept for kept in selected_assets)
        if key in (asset.get('asset name') or '') and not already_selected:
            selected_assets.append(asset)
company_info['assets'] = selected_assets

for asset in company_info["assets"]:
    # Template for manual competitor edits (kept from an earlier review):
#     if "EHT102" in asset["asset name"]:
#         to_remove = ["RRGENE", "Fudan University"]
#         to_add = ["HuidaGene Therapeutics",]
#         asset["companies with competing asset"] = [s for s in asset["companies with competing asset"] if not any(sub in s for sub in to_remove)]

    # One online-search answer per competitor, in the same order as the
    # competitor list.
    asset['competitor_valuation'] = []
    for company in asset['companies with competing asset']:
        print(f"Getting valuation for {company}...")
        query = f"""If {company} is a private company, provide its latest post-money valuation.
                    If {company} was acquired, provide the acquisition deal size.
                    If {company} is public, fetch its latest market cap from Google Finance."""
        result = answer_online_search(query, "o4-mini")
        asset['competitor_valuation'].append(result)
        time.sleep(1)  # pause between consecutive search calls

# for asset in company_info["assets"]:
#     if "EHT102" in asset["asset name"]:
#         asset["targeted indications"] = ["Obesity","T2D"]
#     if "FGF21" in asset["asset name"]:
#         asset["targeted indications"] = ['Chronic Kidney Disease (CKD)', 'Diabetic Kidney Disease (DKD)', 'Metabolic dysfunction-associated steatohepatitis (MASH)']

company_info

In [None]:
# ### human review to update some info
# company_info["platform is core asset"] = False
# company_info['assets'] = [asset for asset in company_info['assets'] if asset['asset name'] in ['EHT102','EHT201']]

# for asset in company_info["assets"]:
#     if "EHT102" in asset["asset name"]:
#         to_remove = ["RRGENE", "Fudan University"]
#         to_add = ["Emaygene"]
#         asset["companies with competing asset"] = [s for s in asset["companies with competing asset"] if not any(sub in s for sub in to_remove)]
#         asset["companies with competing asset"].extend(to_add)
    
#     if "EHT201" in asset["asset name"]:
#         to_add = ["Decibel Therapeutics",
#                   "Otonomy & AGTC (OTO-825)",
#                   "Harvard Medical School - David Corey",
#                   "Southeast University (Chai Renjie) Program",
#                   "Juntendo University - Kazusaku Kamiya"
#                  ]
#         asset["companies with competing asset"].extend(to_add)
    
#     # Get competitor valuations for all assets
#     asset['competitor_valuation'] = []
#     for company in asset['companies with competing asset']:
#         print(f"Getting valuation for {company}...")
#         query = f"""If {company} is a private company, provide its latest post-money valuation.
#                     If {company} was acquired, provide the acquisition deal size.
#                     If {company} is public, fetch its latest market cap from Google Finance."""
#         result = answer_online_search(query, "o4-mini")
#         asset['competitor_valuation'].append(result)
#         time.sleep(1)

# company_info

In [None]:
# Ad-hoc query: deal terms of the current financing round.
# NOTE: rebinds the notebook-level COMPANY_NAME, which the prompt interpolates.
COMPANY_NAME = "Sidera Bio"

question1=f"""
Outline the deal terms associated with company {COMPANY_NAME}'s current financing round.
"""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 25, db_jd, 25, search_model="o4-mini", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Ad-hoc query: patient populations, addressable market, price and peak sales
# for EHT102.
# NOTE: COMPANY_NAME is rebound here, but the prompt hard-codes the company
# name and contains no placeholders, so the f-prefix has no effect.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
For company Euhearing Therapeutics's asset EHT102's targeted indication OTOF-related deafness, DFNB9 congenital hearing loss,
estimate the diagnosed and
total patient populations in the USA, Europe, and globally. 
Based on incidence and prevalence rates,
note whether patient numbers are growing or declining, and define the therapy-eligible population
considering line of therapy, disease stage, or biomarker subgroups. Estimate the global total
addressable market (in U.S. dollars). 
Propose an annual price relative to the standard of care,                    
estimate peak sales at a reasonable market-share penetration.

"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Ad-hoc query: clinical trial process and result of EHT101, using search_model "o3".
# NOTE: COMPANY_NAME is rebound here, but the prompt hard-codes the company
# name and contains no placeholders.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
Describe the clinical trial process and result of asset EHT101 in company Euhearing Therapeutics.
"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o3", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Ad-hoc query: China patient-population estimate for EHT102, with
# priority_order ["online_search"] and search_model "o3".
# The prompt has no placeholders, so the f-prefix has no effect.
question1=f"""
For company Euhearing Therapeutics's asset EHT102's targeted indication OTOF-related deafness, DFNB9 congenital hearing loss,
estimate the diagnosed and prevalent patient populations in China, and the population asset EHT102 can target.
Show your estimation process step by step.
"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o3", priority_order=["online_search"])
print(result)

In [None]:
# Ad-hoc query: same China patient-population question, worded without the
# company name and run with search_model "o4-mini".
# The prompt has no placeholders, so the f-prefix has no effect.
question1=f"""
For asset EHT102's targeted indication OTOF-related deafness, DFNB9 congenital hearing loss,
estimate the diagnosed and prevalent patient populations in China, and the population asset EHT102 can target.
Show your estimation process step by step.
"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Ad-hoc query: China patient-population estimate for EHT102, additionally
# asking for validation against reports and for reference links.
# NOTE: COMPANY_NAME is rebound here but is not interpolated into the prompt.
COMPANY_NAME = "Euhearing"

question1=f"""
For asset EHT102's targeted indication OTOF-related deafness, DFNB9, estimate the diagnosed and
prevalent patient populations in China, and the population asset EHT102 can target in China.
Validate your data assumption with latest reports or literatures.
Show your estimation process step by step and list the reference links you used."""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Ad-hoc query: competitive landscape for EHT102, with a hand-written list of
# competitors, using priority_order ["jarvis_docs", "online_search"].
# NOTE: this binds lowercase `company_name` (a different name from
# COMPANY_NAME); the prompt has no placeholders, so it is not interpolated.
company_name = "Euhearing Therapeutics"

question1=f"""
For asset EHT102, please provide a well‑organized response of its
Competitive Context: comprehensively list all biotech companies with active programs targeting OTOF
for indication OTOF-related deafness, DFNB9, such as competitors Decibel Therapeutics (DB-OTO),
Akouos (AK-OTOF), Sensorion/Pasteur Institute (SENS-501/OTOF-GT), Otovia Therapeutics (OTOV101N+OTOV101C),
HuidaGene Therapeutics (AAV-gOTOF-emxABE), EmayGene (EA0010), Katholieke Universiteit Leuven (WO2025003513A1).
For each competitor, include: program, modality, clinical phase/status, key distinguishing features,
and financial information according to the following rules:        
- If the competitor is a private company, provide its latest post-money valuation.
- If the competitor was acquired, provide the acquisition deal size.
- If the competitor is public, fetch its **current** market cap using Google Finance and include the Google Finance URL. 
Do not use any other data sources for market cap.
Present the information in a structured table, followed by a concise descriptive summary."""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["jarvis_docs","online_search"])
print(result)

In [None]:
# Show the second value returned by the most recently executed answer_with_search_ensemble call.
print(online_search_response)

In [None]:
# Ad-hoc query: China patient-population estimate for EHT102, with the
# modality stated up front in the prompt.
# NOTE: COMPANY_NAME is rebound here but is not interpolated into the prompt.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
Asset EHT102 is targeting OTOF-related deafness via Gene therapy (dual-AAV, protein-level recombination via intein).
For asset EHT102's targeted indication OTOF-related deafness, estimate the diagnosed and
prevalent patient populations in China, and the population asset EHT102 can target in China.\
Validate your data assumption with latest reports or literatures. 
Show your estimation process step by step and list the reference links you used.
"""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Ad-hoc query: proposed price for EHT102 in China, based on standard-of-care
# and other gene-therapy prices.
# NOTE: COMPANY_NAME is rebound here but is not interpolated into the prompt.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
Asset EHT102 is targeting OTOF-related deafness via Gene therapy (dual-AAV, protein-level recombination via intein).
Consider the price of current SoC for OTOF-related deafness and price of gene therapy for other rare diseases in China,\
come up an reasonable price for EHT102 in China.\
Validate your data assumption with latest reports or literatures. 
Show your estimation process step by step and list the reference links you used.
"""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Ad-hoc query: how the company was sourced and its prior financing history.
# NOTE: this binds lowercase `company_name` (a different name from
# COMPANY_NAME) and interpolates it into the prompt. Each trailing backslash
# joins the next line, so that line's leading spaces stay in the prompt text.
company_name = "Euhearing Therapeutics"

question1=f"""
Briefly state how we (Pivotal BioVenture Partners) learn about the company {company_name}.\
                            Then describe the previous financing history of company {company_name} prior to\
                            the current round. What rounds of funding has the company previously completed (size,\
                            investors etc.), and what were the key milestones achieved with each round? For company\
                            {company_name}'s most recent prior financing round, how much was raised and what were\
                            the pre-money and post-money valuation of the round?\
                            Please provide a single, cohesive paragraph to address all the questions.
"""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Show the second value returned by the most recently executed answer_with_search_ensemble call.
print(online_search_response)

In [None]:
# Print source, relevance score and text for the 50 hits db_jd returns for the
# current `question1`.
# Sibling search methods: similarity_search, similarity_search_with_score.
documents = db_jd.similarity_search_with_relevance_scores(question1, 50)
for chunk, score in documents:
    print(chunk.metadata['source'], score, chunk.page_content, sep="\n")
    

In [None]:
# Inspect what the BM25 + db_jd ensemble retrieves for the current `question1`.
# NOTE: this sets `k` on the shared bm25_retriever object in place.
bm25_retriever.k = 50
vector_retriever = db_jd.as_retriever(search_kwargs={"k": 50})

ensemble = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever],
                             weights=[0.5, 0.5])

# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated in LangChain.
jarvis_docs_docs = ensemble.invoke(question1)
[d.page_content for d in jarvis_docs_docs]

In [None]:
# Ad-hoc query: clinical-progress prompt for EHT102, written out in full with
# the asset, company and stage hard-coded.
# NOTE(review): looks like a rendered copy of the clinical-stage prompt built
# in generate_questions_with_assets -- confirm.
# NOTE: COMPANY_NAME is rebound here but is not interpolated into the prompt.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
The asset EHT102 in company Euhearing Therapeutics's pipeline is in development stage Phase 1/2 (IND filed, first injection Dec 2024).                            If the asset EHT102 is in clinical stage, depending on the trial phase, provide the following:                            If the asset EHT102 is in Phase 1, describe the study                            design (including randomization, blinding, and control arms if used), participant type and number                            (healthy volunteers or patients), primary endpoints, and summarize quantitative results                            for safety, dose-limiting toxicities, and tolerability; additionally, provide any                            exploratory changes of biomarker data or preliminary signals of efficacy with concrete numberic results.                             If the asset EHT102 is in Phase 2, detail the study design (e.g., dose-finding, proof-of-concept), patient                            population and inclusion/exclusion criteria, primary and secondary endpoints (often initial                            efficacy and extended safety), and present efficacy and safety results using concrete quantitative                            metrics (response rates, changes in relevant biomarkers, mean differences, etc.), p-values, and                            interpret whether the results support advancement to pivotal trials.                            
If the asset EHT102 is in Phase 3, provide a comprehensive summary of the pivotal study design (randomization, control arms,                            multicenter participation), number of patients, detailed definitions of all primary and secondary                            endpoints, and full clinical results, including efficacy, safety, effect sizes, risk reductions,                            confidence intervals, and p-values, clearly assessing if the trial met its primary objectives and                            is likely to support regulatory approval or label expansion.                            For the trial, specify if the outcome was positive, negative, or inconclusive based on primary endpoints,                            and interpret the significance of the findings in light of current clinical standards                            and patient population needs. Note any missing data or information gaps.                            If the asset EHT102 is not in clinical stage, simply state that it's not in clinical stage."""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Print source, relevance score and text for the 100 hits db_jt returns for the
# current `question1`.
# Sibling search methods: similarity_search, similarity_search_with_score.
documents = db_jt.similarity_search_with_relevance_scores(question1, 100)
for chunk, score in documents:
    print(chunk.metadata['source'], score, chunk.page_content, sep="\n")

In [None]:
def generate_questions_with_assets(company_name, company_info):
    """
    Generate the full questions JSON with dynamic asset-specific competitive landscape questions.
    
    Args:
        company_name: Name of the company
        company_extractor: Function to extract company data
    
    Returns:
        JSON string with questions
    """
    # Generate mechanism and pre-clinical questions for each asset    
    mechanism_questions = []
    for asset in company_info['assets']:
        asset_name = asset['asset name']
        targets = ', '.join(asset['targets'])
        indications = ', '.join(asset['targeted indications'])
        mechanism_question_text = f"""For the asset {asset_name} in company {company_name}'s pipeline, please address the following:
                    1. Mechanism-of-Action Evidence: Summarize the data supporting its mechanism of action in the\
                    targeted indication (e.g., biochemical assays, cell-based studies, animal models or clinical\
                    results); If available, describe any genetic or biomarker evidence linking the target {targets} or mechanism\
                    to the disease ({indications}) pathology. Describe the target's normal physiological function and any relevant\
                    biological pathways; Detail the target {targets}'s the tissue distribution (RNA or protein) and expression timing\
                    (e.g., fetal vs. adult, aging). Explain how the asset {asset_name} interacts with its direct biomolecular target\
                    (e.g., receptor binding); Specify the asset {asset_name}'s affinity, specificity, or selectivity for its target,\
                    and discuss how these properties may influence efficacy or safety.
                    2. Pre-clinical Experimental Data: If company {company_name} has conducted any preclinical experiments for\
                    the asset {asset_name}, summarize the key preclinical experiments, covering pharmacodynamics, safety\
                    pharmacology, pharmacokinetics, and toxicology. Include detailed numerical data where possible, such as CMC\
                    attributes (physical, chemical, formulation, drug substance and product specifications), nonclinical\
                    study readouts (e.g., PD effect sizes, PK parameters like half-life and AUC, toxicology dose levels).\
                    If animal models were used to demonstrate the asset {asset_name}'s efficacy and safety, briefly describe\
                    the study design, current status or progress, and any remaining preclinical studies. If company\
                    {company_name} hasn't conducted any preclinical experiments for the asset {asset_name}, simply state that\
                    it hasn't conducted any preclinical experiments for asset {asset_name} yet."""
        
        mechanism_questions.append({
            "question": mechanism_question_text,
            "top_k_jd": 100,
            "top_k_bm": 100,
            "include_image": False,
            "priority_order": ["jarvis_docs"]
        })
        

    # Generate clinical progress questions for each asset    
    clinical_questions = []
    for asset in company_info['assets']:
        asset_name = asset['asset name']
        stage = asset['current development stage']
        clinical_question_text = f"""The asset {asset_name} in company {COMPANY_NAME}'s pipeline is in development stage {stage}.\
                            If the asset {asset_name} is in clinical stage, depending on the trial phase, provide the following:\
                            If the asset {asset_name} is in Phase 1, describe the study\
                            design (including randomization, blinding, and control arms if used), participant type and number\
                            (healthy volunteers or patients), primary endpoints, and summarize quantitative results\
                            for safety, dose-limiting toxicities, and tolerability; additionally, provide any\
                            exploratory changes of biomarker data or preliminary signals of efficacy with concrete numberic results. \
                            If the asset {asset_name} is in Phase 2, detail the study design (e.g., dose-finding, proof-of-concept), patient\
                            population and inclusion/exclusion criteria, primary and secondary endpoints (often initial\
                            efficacy and extended safety), and present efficacy and safety results using concrete quantitative\
                            metrics (response rates, changes in relevant biomarkers, mean differences, etc.), p-values, and\
                            interpret whether the results support advancement to pivotal trials.\
                            If the asset {asset_name} is in Phase 3, provide a comprehensive summary of the pivotal study design (randomization, control arms,\
                            multicenter participation), number of patients, detailed definitions of all primary and secondary\
                            endpoints, and full clinical results, including efficacy, safety, effect sizes, risk reductions,\
                            confidence intervals, and p-values, clearly assessing if the trial met its primary objectives and\
                            is likely to support regulatory approval or label expansion.\
                            For the trial, specify if the outcome was positive, negative, or inconclusive based on primary endpoints,\
                            and interpret the significance of the findings in light of current clinical standards\
                            and patient population needs. Note any missing data or information gaps.\
                            If the asset {asset_name} is not in clinical stage, simply state that it's not in clinical stage."""
        
        clinical_questions.append({
            "question": clinical_question_text,
            "top_k_jd": 100,
            "top_k_bm": 100,
            "include_image": False,
            "priority_order": ["jarvis_docs"]
        })
        
        
    # Generate market and commercial opportunity questions for each asset    
    market_questions = []
    for asset in company_info['assets']:
        asset_name = asset['asset name']
        indications = ', '.join(asset['targeted indications'])
        market_question_text = f"""For company {company_name}'s asset {asset_name}'s targeted indication {indications}, estimate the diagnosed and\
                    total patient populations in the USA, Europe, and globally. Based on incidence and prevalence rates,\
                    note whether patient numbers are growing or declining, and define the therapy-eligible population\
                    considering line of therapy, disease stage, or biomarker subgroups. Estimate the global total\
                    addressable market (in U.S. dollars). Propose an annual price relative to the standard of care,\
                    estimate peak sales at a reasonable market-share penetration.\
                    Note that do not use the numbers provided by company {company_name}, use content from our Internal investment process docs."""
        
        market_questions.append({
            "question": market_question_text,
            "top_k_jd": 100,
            "top_k_bm": 100,
            "include_image": False,
            "priority_order": ["jarvis_docs"]
        })

        
    # Generate asset-specific competitive landscape questions    
    competitive_questions=[]
    for asset in company_info['assets']:
        asset_name = asset['asset name']
        targets = ', '.join(asset['targets'])
        indications = ', '.join(asset['targeted indications'])
        therapeutic_areas = ", ".join(asset['targeted therapeutic areas'])
        modality = asset['modality']
        competitors = ', '.join(asset['companies with competing asset'])
        competitor_valuation = '; '.join(asset['competitor_valuation'])
        
        competitive_landscape_text = f"""For asset {asset_name}, please provide a well‑organized response of its\
        Competitive Context: comprehensively list all biotech companies with active programs targeting {targets}\
        for indication {indications}, such as competitors {competitors}.\
        For each competitor, include: program, modality, clinical phase/status, key distinguishing features,\
        and financial information according to the following rules:\
        - If the competitor is a private company, provide its latest post-money valuation.\
        - If the competitor was acquired, provide the acquisition deal size.\
        - If the competitor is public, get its market cap via Google Finance.\
        Additional information: {competitor_valuation}.\
        Present the information in a structured table."""
        
        strategic_interest_text = f"""For asset {asset_name}, please provide a well‑organized response of its Potential\
        Strategic Interest, where you identify large pharma companies, with pipelines in the same therapeutic areas\
        {therapeutic_areas}, that may be motivated to license or acquire the asset, especially those whose targets or modalities\
        differ from {targets} or {modality}. Structure your answer in a table and followed by a concise description."""
        
        competitive_questions.append({
            "question": competitive_landscape_text,
            "top_k_jd": 100,
            "top_k_bm": 100,
            "search_model": "o4-mini",
            "include_image": False,
            "priority_order": ["jarvis_docs", "online_search"]
        })
        competitive_questions.append({
            "question": strategic_interest_text,
            "top_k_jd": 100,
            "top_k_bm": 100,
            "search_model": "o4-mini",
            "include_image": False,
            "priority_order": ["jarvis_docs", "online_search"]
        }) 
        
        
    # Generate asset-specific market questions in appendix
    append_market_questions=[]
    for asset in company_info['assets']:
        asset_name = asset['asset name']
        targets = ', '.join(asset['targets'])
        indications = ', '.join(asset['targeted indications'])
        therapeutic_areas = ", ".join(asset['targeted therapeutic areas'])        
        
        China_market_text = f"""For asset {asset_name}'s targeted indication {indications}, estimate the diagnosed and\
        prevalent patient populations in China, and the population asset {asset_name} can target in China.\        
        Validate your data assumption with latest reports or literatures.\
        Show your estimation process step by step and list the reference links you used."""
        
        USA_market_text = f"""For asset {asset_name}'s targeted indication {indications}, estimate the diagnosed and\
        prevalent patient populations in the USA, and the population asset {asset_name} can target in the USA.\
        Validate your data assumption with latest reports or literatures.\
        Show your estimation process step by step and list the reference links you used."""

        Global_market_text = f"""For asset {asset_name}'s targeted indication {indications}, estimate the diagnosed and\
        prevalent patient populations globally, and the population asset {asset_name} can target globally.\
        Validate your data assumption with latest reports or literatures.\
        Show your estimation process step by step and list the reference links you used."""
        
        append_market_questions.append({
            "question": China_market_text,
            "top_k_jd": 5,
            "top_k_bm": 5,
            "search_model": "o4-mini",
            "include_image": False,
            "priority_order": ["online_search"]
        })
        append_market_questions.append({
            "question": USA_market_text,
            "top_k_jd": 5,
            "top_k_bm": 5,
            "search_model": "o4-mini",
            "include_image": False,
            "priority_order": ["online_search"]
        })
        append_market_questions.append({
            "question": Global_market_text,
            "top_k_jd": 5,
            "top_k_bm": 5,
            "search_model": "o4-mini",
            "include_image": False,
            "priority_order": ["online_search"]
        }) 
        

    # Generate asset-specific investment thesis questions    
    invest_thesis_questions=[]
    for asset in company_info['assets']:
        asset_name = asset['asset name']
        targets = ', '.join(asset['targets'])
        indications = ', '.join(asset['targeted indications'])
        modality = asset['modality']
        stage = asset['current development stage']
        brief_trial_result = asset['brief trial result']
        
        
        invest_thesis_text = f"""Company {company_name}'s key asset {asset_name} is targeting indication {indications},\
        via {modality}, the asset {asset_name} is currently in development stage {stage}, and a brief trial result of asset\
        {asset_name} is as follows: {brief_trial_result}.\
        Research and provide 3 most compelling reasons as well as 3 key risks to invest in company {company_name}."""
                
        # Create the question
        invest_thesis_questions.append({
            "question": invest_thesis_text,
            "top_k_jd": 5,
            "top_k_bm": 5,
            "search_model": "gpt-5",
            "include_image": False,
            "priority_order": ["online_search"]
        })
        
        
    # Build the full JSON structure
    json_questions_string = f"""
{{
    "sections": [
        {{
            "section_title": "SECTION 1. Overview",
            "content": [
                {{
                    "questions": [
                        {{
                            "question": "A brief description of the company {company_name}: Include information on when it was\
                            founded, its founders, (if it was spun out of any university or company, mention it as well),\
                            the core product (e.g., platform, top 2 leading drugs in pipeline), the\
                            stage of the key asset(s), the disease area(s) and indication(s) the company is targeting.\
                            Please provide a single, cohesive paragraph to address all the questions.",
                            "top_k_jd": 50, "top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }},
                        {{
                            "question": "Write a brief summary recommendation explaining whether company {company_name}\
                            is an attractive investment opportunity for the current financing round.\
                            Please provide a single, cohesive paragraph to address all the questions.",
                            "top_k_jd": 100,"top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }}
                    ]
                }}
            ]
        }}, 
        {{
            "section_title": "SECTION 2. Deal dynamics",
            "content": [
                {{
                    "questions": [
                        {{
                            "question": "Briefly state how we (Pivotal BioVenture Partners) learn about the company {company_name}.\
                            Then describe the previous financing history of company {company_name} prior to\
                            the current round. What rounds of funding has the company previously completed (size,\
                            investors etc.), and what were the key milestones achieved with each round? For company\
                            {company_name}'s most recent prior financing round, how much was raised and what were\
                            the pre-money and post-money valuation of the round?\
                            Please provide a single, cohesive paragraph to address all the questions.",
                            "top_k_jd": 100, "top_k_bm": 100,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }},
                        {{
                            "question": "Outline the deal terms associated with company {company_name}'s current\
                            financing round.",
                            "top_k_jd": 50,"top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }}                       
                    ]
                }}
            ]
        }}, 
        {{
            "section_title": "SECTION 3. Platform",
            "content": [
                {{
                    "questions": [
                        {{
                            "question": "If company {company_name} is a Platform-Based company or has a technology\
                            platform underpinning its assets or discovery efforts, describe the platform. If the\
                            platform solves or addresses a problem primarily in any of the following areas, please\
                            concretely describe (avoid generic description): drug discovery, target discovery,\
                            target biology, drug-target biology and/or binding, preclinical experiment prediction,\
                            preclinical experiment efficiencies, clinical trial prediction, clinical trial\
                            efficiencies, patient selection. If the platform does not solve or address a problem in\
                            any of the aforementioned areas, what unique or novel insight is unlocked by the\
                            platform? If the company is not a Platform-Based company, skip this question and simply\
                            return the answer as 'Company {company_name} is not a Platform-Based company or does\
                            not have a platform'.",
                            "top_k_jd": 100, "top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }}""" + (f""",
                        {{
                            "question": "Give a table of company {company_name} and its main competitor companies.\
                            Include information such as the name of the company, the year the company was founded,\
                            the company's core technology, the furthest drug discovery or development\
                            stage the company has reached, the key scientist(s) and their affiliation of the company,\
                            pipeline highlights of the company, key investors, total capital raised to date of the company,\
                            most recent round's post-money valuation (or current market cap for public company).",
                            "top_k_jd": 50,"top_k_bm": 50,
                            "search_model": "o4-mini",
                            "include_image": false,
                            "priority_order": ["jarvis_docs","online_search"]
                        }}""" if company_info.get('platform is core asset', False) else "") + f"""                       
                    ]
                }}
            ]
        }},                             
        {{
            "section_title": "SECTION 4. Product summary and pipeline analysis",
            "content": [
                {{
                    "title": "Pipeline overview",
                    "questions": [
                        {{
                            "question": "Concisely describe the drug pipeline of company {company_name}: include the name(s) of\
                            the program(s), the specific molecular target, the modality (e.g., small molecule, antibody, etc.),\
                            route of administration, the proposed indication(s) and current development status (e.g.,\
                            preclinical, phase 1, phase 2, etc.).\
                            Please provide a single, cohesive paragraph to address all the questions.",
                            "top_k_jd": 50,"top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }},                    
                        {{
                            "question": "If the key asset(s) in company {company_name} was/were in-licensed: Provide a brief statement\
                            on where it was licensed from, the previous naming of the drug, when it was licensed, and describe\
                            the results/outcomes from previous preclinical and/or clinical studies (including max phase/trial,\
                            number of patients in the max trial, efficacy results with p-values, and safety results/concerns)\
                            by the originator.\
                            If the key asset(s) was/were not licensed, skip this question and simply return the key asset(s)\
                            in company {company_name} was/were not in-licensed as the answer.\
                            Please provide a single, cohesive paragraph to address all the questions.",
                            "top_k_jd": 50, "top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }}
                    ]
                }},  
                {{
                    "title": "Mechanism and pre-clinical result",
                    "questions": {json.dumps(mechanism_questions)}
                }},
                {{
                    "title": "Clinical progress",
                    "questions": {json.dumps(clinical_questions)}                 
                }},
                {{
                    "title": "Market and commercial opportunity",
                    "questions": {json.dumps(market_questions)}
                }},                        
                {{
                    "title": "Competitive landscape and potential strategic interest",
                    "questions": {json.dumps(competitive_questions)}
                }}                                                                   
            ]
        }}, 
        {{
            "section_title": "SECTION 5. Company key milestones",
            "content": [
                {{
                    "questions": [
                        {{
                            "question": "Describe {company_name}'s planned milestones, with a timeline detailed by year and\
                            quarter. Then provide the cash runway timing that will be achieved with the current financing.\
                            Please provide a single, cohesive paragraph to address all the questions.",
                            "top_k_jd": 100, "top_k_bm": 100,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }}
                    ]
                }}
            ]
        }},                                     
        {{
            "section_title": "SECTION 6. Recommendations and Next Steps",
            "content": [
                {{
                    "questions": [
                        {{
                            "question": "What specific due diligence steps would you recommend next for evaluating the\
                            opportunity to invest in {company_name}? Please give concrete, well-reasoned suggestions based on\
                            the provided content, and avoid any vague or general descriptions.\
                            Provide your answer as a single, cohesive paragraph.",
                            "top_k_jd": 100, "top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs","online_search"]
                        }}
                    ]
                }}
            ]
        }},               
        {{
            "section_title": "Appendix",
            "content": [
                {{
                    "title": "Investment thesis and risks by AI",
                    "questions": {json.dumps(invest_thesis_questions)}
                }},                                            
                {{
                    "title": "Management team",
                    "questions": [
                        {{
                            "question": "Provide a section for each senior team member of company {company_name} (Include any C-level executives\
                            or founders, members of the board of directors, and members of the clinical/scientific advisory board,\
                            and, if needed, any Senior Vice President level executives.) answering the following\
                            questions. Provide the answers together a whole paragraph as a text piece, not bullet form.\
                            Education and Credentials: What are the educational backgrounds and professional credentials\
                            of the senior team member?\
                            Experience: What is the professional background and experience of each senior team member?\
                            Track Record: What notable achievements or successes have the senior team members had in\
                            their previous roles?\
                            Relevant Expertise: How does the expertise of each team member align with the company's\
                            strategic goals and needs?\
                            Leadership Skills: What are the demonstrated leadership qualities of each senior team\
                            member?\
                            If any of the senior team members mentioned above are new or incoming hires,\
                            please list them separately.",
                            "top_k_jd": 50, "top_k_bm": 50,
                            "include_image": false,
                            "priority_order": ["jarvis_docs"]
                        }}                  
                    ]
                }}
            ]
        }}
    ]
}}
"""
    
    return json_questions_string


#                 {{
#                     "title": "Market analysis by AI",
#                     "questions": {json.dumps(append_market_questions)}
#                 }},                


In [None]:
# test
COMPANY_NAME = "PPInnova (Peak Perform Innova)"
json_questions_string = generate_questions_with_assets(COMPANY_NAME, company_info)

# Parse to verify it's valid JSON
questions_dict = json.loads(json_questions_string)

# Print just the competitive landscape subsection to verify.
# The pipeline-analysis section is emitted as "SECTION 4. ..." in the generated JSON,
# so the filter must use that exact title or nothing is printed.
for section in questions_dict['sections']:
    if section['section_title'] == 'SECTION 4. Product summary and pipeline analysis':
        for content in section['content']:
            if isinstance(content, dict) and content.get('title') == 'Competitive landscape and potential strategic interest':
                print(content)

In [None]:
# Build the JSON question string for the company under review
# (same call as in the test cell above, without the verification steps).
COMPANY_NAME = "PPInnova (Peak Perform Innova)"
json_questions_string = generate_questions_with_assets(COMPANY_NAME, company_info)

In [None]:
# Persist the generated question string, stamped with today's date (Asia/Shanghai).
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# Explicit UTF-8: the company name is interpolated verbatim into the string and may
# contain non-ASCII characters; this also matches the encoding used by write_file.
with open(f"json_questions_string_factsheet_{now}.txt", 'w', encoding="utf-8") as file:
    file.write(json_questions_string)

In [None]:
# with open("json_questions_string_2025-05-15.txt", 'r') as file:
#     json_questions_string = file.read()
# json_questions = json.loads(json_questions_string)

In [None]:
# Parse the generated question string; the bare name on the last line
# displays the parsed structure as the cell output.
json_questions = json.loads(json_questions_string)
json_questions

## Regenerate outputs so investors can easily see the prompts for each section

In [None]:
def _answer_question_batch(questions, db_jd, bm25_retriever):
    """
    Answer a list of question specs and aggregate the results.

    Args:
        questions (list): Question dicts. Each must provide 'question', 'top_k_jd',
            'top_k_bm', 'include_image' and 'priority_order'; 'search_model' is
            optional and defaults to 'o4-mini'
        db_jd (object): Passed through to answer_with_search_ensemble
        bm25_retriever (object): Passed through to answer_with_search_ensemble

    Returns:
        tuple: (text_with_prompts, text_without_prompts, source_counts,
                online_search_responses)
    """
    combined_with_prompt = ""
    combined_without_prompt = ""
    source_counts = {}
    online_search_responses = []

    for question in questions:
        detailed_question = question['question']
        k_jd = question['top_k_jd']
        k_bm = question['top_k_bm']
        search_model = question.get('search_model', 'o4-mini')
        # Required key: read it up front so a malformed question fails before any
        # retrieval/LLM call is made. Image collection itself is not implemented yet.
        _ = question['include_image']
        priority_order = question['priority_order']

        # Process the response using the ensemble function
        response, online_search_response = answer_with_search_ensemble(
            detailed_question,
            bm25_retriever,
            k_bm,
            db_jd,
            k_jd,
            search_model,
            priority_order
        )

        # Keep the raw online search response next to the question that produced it
        online_search_responses.append({
            "question": detailed_question,
            "online_search_response": online_search_response
        })

        # One variant echoes the question (prompt) before the answer, one does not
        combined_with_prompt += f"**Q: {detailed_question}**\n" + response + "\n\n"
        combined_without_prompt += response + "\n\n"

        # Track which sources were used (simplified): every question that lists
        # 'jarvis_docs' counts once; 'online_search' counts only when a search
        # response actually came back, and is reported as 'external_sources'.
        for source in priority_order:
            if source == 'jarvis_docs':
                source_counts[source] = source_counts.get(source, 0) + 1
            elif source == 'online_search' and online_search_response:
                source_counts['external_sources'] = source_counts.get('external_sources', 0) + 1

    return combined_with_prompt, combined_without_prompt, source_counts, online_search_responses


def gather_responses(content, db_jd, bm25_retriever):
    """
    Process content items and gather responses for questions and subsections.

    Args:
        content (list): List of content items containing questions and subsections
        db_jd (object): Database for Jarvis documents (vector store)
        bm25_retriever (object): BM25 retriever for keyword-based search

    Returns:
        list: One 8-tuple per content item:
            (title, response_with_prompts, response_without_prompts,
             subsection_responses, source_counts, images, include_image,
             online_search_responses)
        where subsection_responses is a list of 7-tuples:
            (title, response_with_prompts, response_without_prompts,
             source_counts, images, include_image, online_search_responses)
        The image lists are currently always empty (image handling is a placeholder).
    """
    section_responses = []

    for item in content:
        subtitle = item.get('title')
        questions = item.get('questions', [])
        subsections = item.get('subsections', [])

        # Section-level questions are answered first, then each subsection in order
        (section_with_prompt,
         section_without_prompt,
         section_source_counts,
         section_online_search_responses) = _answer_question_batch(questions or [], db_jd, bm25_retriever)

        subsection_responses = []
        for subsection in subsections or []:
            subsubtitle = subsection['title']
            subquestions = subsection['questions']

            (sub_with_prompt,
             sub_without_prompt,
             sub_source_counts,
             sub_online_search_responses) = _answer_question_batch(subquestions, db_jd, bm25_retriever)

            subsection_responses.append((
                subsubtitle,
                sub_with_prompt,
                sub_without_prompt,
                sub_source_counts,
                [],  # images: placeholder, nothing is collected yet
                any(q['include_image'] for q in subquestions),
                sub_online_search_responses
            ))

        section_responses.append((
            subtitle,
            section_with_prompt,
            section_without_prompt,
            subsection_responses,
            section_source_counts,
            [],  # images: placeholder, nothing is collected yet
            any(q['include_image'] for q in questions) if questions else False,
            section_online_search_responses
        ))

    return section_responses


def gather_all_responses(json_questions_string, db_jd, bm25_retriever):
    """
    Answer every section described in a JSON question string.

    Args:
        json_questions_string (str): JSON text with a top-level "sections" list; each
            section carries a "section_title" and a "content" list
        db_jd (object): Database for Jarvis documents (vector store)
        bm25_retriever (object): BM25 retriever for keyword-based search

    Returns:
        list: One dict per section, in input order, with keys "section_title" and
            "content" (the output of gather_responses for that section)
    """
    parsed_questions = json.loads(json_questions_string)

    # Sections are handled strictly in document order; each one is delegated to
    # gather_responses and paired with its title.
    return [
        {
            "section_title": section["section_title"],
            "content": gather_responses(section["content"], db_jd, bm25_retriever),
        }
        for section in parsed_questions["sections"]
    ]

In [None]:
def save_image(image_data, image_name, output_dir=None):
    """
    Decode a base64-encoded image and write it to disk.

    Args:
        image_data (str or bytes): Base64-encoded image content
        image_name (str): Desired file name; every character outside
            [a-zA-Z0-9_.-] is replaced with an underscore
        output_dir (str, optional): Directory to write into. Defaults to the
            notebook-level ``image_dir`` variable, which must then be defined
            before this function is called

    Returns:
        str: Path of the written image file
    """
    # Imported locally because base64 is not part of the notebook's import cell.
    import base64

    if output_dir is None:
        output_dir = image_dir

    sanitized_image_name = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', image_name)
    image_path = os.path.join(output_dir, sanitized_image_name)
    os.makedirs(os.path.dirname(image_path), exist_ok=True)
    with open(image_path, "wb") as img_file:
        img_file.write(base64.b64decode(image_data))
    return image_path

In [None]:
# Answer every question in the generated JSON (this is the expensive step).
all_sections_responses = gather_all_responses(json_questions_string, db_jd, bm25_retriever)

# Date stamp (Asia/Shanghai) used in the output file name.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
#responses_file_path = f"all_sections_responses_o4mini_{now}.pkl"
responses_file_path = f"all_sections_responses_gpt41_factsheet_{now}.pkl"
#responses_file_path = f"all_sections_responses_o3_{now}.pkl"

# Save the all_sections_responses object to a pickle file so it can be reloaded later
with open(responses_file_path, "wb") as file:
    pickle.dump(all_sections_responses, file)    

In [None]:
# with open('all_sections_responses_gpt41_factsheet_2025-08-28.pkl', "rb") as file:
#     all_sections_responses = pickle.load(file)

In [None]:
def update_any_section_or_subsection(json_questions_string, existing_responses_path,
                                    section_title, subsection_title, 
                                    db_jd, bm25_retriever):
    """
    Universal function to update either a section or a subsection within a section.

    Loads previously pickled responses, regenerates the answers for the requested
    section (or one titled content item inside it) and replaces them in the loaded
    structure. The updated structure is returned; nothing is written back to disk.

    Args:
        json_questions_string: JSON string with the question definitions
        existing_responses_path: Path to the pickle file holding earlier responses
        section_title: The main section title (e.g., "SECTION 1. Overview" or "Appendix")
        subsection_title: The subsection title if applicable (e.g., "Market analysis by AI"),
                         or None if updating a section without subsections
        db_jd: Database for Jarvis documents, passed to gather_responses
        bm25_retriever: BM25 retriever, passed to gather_responses

    Returns:
        list: The loaded responses, updated in place when the target was found in both
            the question JSON and the stored responses; otherwise unchanged.
    """
    # Load existing responses.
    # NOTE: pickle.load can execute arbitrary code; only use files produced by this notebook.
    with open(existing_responses_path, "rb") as file:
        all_sections_responses = pickle.load(file)

    # Parse the questions JSON
    json_questions = json.loads(json_questions_string)

    # Find the section in questions
    target_section = next(
        (section for section in json_questions["sections"]
         if section["section_title"] == section_title),
        None
    )
    if target_section is None:
        print(f"Could not find section: {section_title}")
        return all_sections_responses

    # Find the section in the stored responses *before* generating anything, so that
    # no retrieval/LLM calls are spent on answers that would have nowhere to go.
    stored_section = next(
        (section for section in all_sections_responses
         if section["section_title"] == section_title),
        None
    )
    if stored_section is None:
        print(f"Section '{section_title}' is not present in the existing responses; nothing updated")
        return all_sections_responses

    if subsection_title is None:
        # Update entire section (no subsection specified)
        stored_section["content"] = gather_responses(target_section["content"], db_jd, bm25_retriever)
        print(f"Updated entire section: {section_title}")
        return all_sections_responses

    # Update specific subsection within a section: find its questions first
    target_questions = None
    for item in target_section["content"]:
        if item.get("title") == subsection_title:
            target_questions = item.get("questions", [])
            break

    if target_questions is None:
        print(f"Could not find subsection '{subsection_title}' in section '{section_title}'")
        return all_sections_responses

    # Locate the stored entry to replace (entries are tuples whose first element is the title)
    stored_index = next(
        (i for i, content_item in enumerate(stored_section["content"])
         if content_item[0] == subsection_title),
        None
    )
    if stored_index is None:
        print(f"Subsection '{subsection_title}' is not present in the existing responses "
              f"for section '{section_title}'; nothing updated")
        return all_sections_responses

    # Gather new responses for just this subsection and swap them in
    temp_content = [{
        "title": subsection_title,
        "questions": target_questions
    }]
    new_responses = gather_responses(temp_content, db_jd, bm25_retriever)
    stored_section["content"][stored_index] = new_responses[0]
    print(f"Updated subsection '{subsection_title}' in section '{section_title}'")

    return all_sections_responses


In [None]:
def _write_source_counts(f, source_counts):
    """Write the 'Document Source and Counts' block for one response."""
    f.write("\n\n#### _Document Source and Counts_\n")
    for source, count in source_counts.items():
        f.write(f"<p style='margin: 2px; font-size:small;'><i>- {source}: {count} occurrence(s)</i></p>\n")


def _write_images(f, file_path, images, title):
    """Save each image via save_image and write a markdown link relative to the report file."""
    f.write("\n\n#### _Related Images_\n")
    for idx, image_data in enumerate(images):
        image_name = f"{title.replace(' ', '_')}_{idx}.png"
        image_path = save_image(image_data, image_name)
        relative_image_path = os.path.relpath(image_path, start=os.path.dirname(file_path))
        f.write(f"![Image]({relative_image_path})\n")


def write_file(file_path, include_sources, include_prompts, all_sections_responses, company_name=None):
    """
    Render the gathered responses as a markdown fact sheet.

    Args:
        file_path (str): Destination markdown file (overwritten if it exists)
        include_sources (bool): Append the source-count block after each response
        include_prompts (bool): Write the variant that echoes each question before
            its answer instead of the answers-only variant
        all_sections_responses (list): Output of gather_all_responses: dicts with
            'section_title' and 'content', where content is a list of 8-tuples
            whose fourth element is a list of subsection tuples
        company_name (str, optional): Name used in the document title. Defaults to
            the notebook-level COMPANY_NAME variable
    """
    if company_name is None:
        company_name = COMPANY_NAME

    with open(file_path, "w", encoding="utf-8") as f:

        f.write("<style>\n")
        font_settings = {"size": "11px",
                         "family": "Arial",  # Alternative: "Calibri"
                         "weight": "500"
                        }
        f.write(f"body {{ font-size: {font_settings['size']}; font-family: {font_settings['family']}; font-weight: {font_settings['weight']};}}\n")
        f.write("</style>\n\n")

        f.write(f"# Fact Sheet -- **{company_name}**\n\n")

        # Generate Table of Contents
        f.write("# Table of Contents\n\n")
        for i, section in enumerate(all_sections_responses):
            section_title = section['section_title']
            section_anchor = f"section-{i+1}"
            f.write(f"- [{section_title}](#{section_anchor})\n")
            section_content = section['content']

            # Add subsections to the TOC only if valid titles exist
            for j, (subtitle, _, _, subsection_responses, _, _, _, _) in enumerate(section_content):
                if subtitle:  # Add valid subtitles
                    subsection_anchor = f"{section_anchor}-subsection-{j+1}"
                    f.write(f"  - [{subtitle}](#{subsection_anchor})\n")
                    # Subsection entries carry a trailing online-search element, so
                    # index the title instead of unpacking a fixed-length tuple.
                    for k, subsection_entry in enumerate(subsection_responses):
                        subsubsection_anchor = f"{subsection_anchor}-subsubsection-{k+1}"
                        f.write(f"    - [{subsection_entry[0]}](#{subsubsection_anchor})\n")
        f.write("\n")

        # Write each section and subsection
        for i, section in enumerate(all_sections_responses):
            section_title = section['section_title']
            section_content = section['content']
            section_anchor = f"section-{i+1}"

            # Section title
            f.write(f"## <a id='{section_anchor}'></a>**{section_title}**\n\n")

            for j, (subtitle, section_response_with_prompt, section_response_without_prompt, subsection_responses, section_combined_source_counts, section_images, include_image, online_search_responses) in enumerate(section_content):
                subsection_anchor = f"{section_anchor}-subsection-{j+1}"

                # Add subtitle if it exists
                if subtitle:
                    f.write(f"### <a id='{subsection_anchor}'></a>**{subtitle}**\n\n")

                # Select whether to include prompts or not
                if include_prompts:
                    f.write(f"{section_response_with_prompt}\n")
                else:
                    f.write(f"{section_response_without_prompt}\n")

                # Add sources **right after the specific response**, not at section level
                if include_sources and section_combined_source_counts:
                    _write_source_counts(f, section_combined_source_counts)

                f.write("\n")  # Ensure spacing before the next response

                # Add images if requested; untitled content items fall back to the
                # section title for the image file name.
                if include_image and section_images:
                    _write_images(f, file_path, section_images, subtitle or section_title)
                f.write("\n")

                # Process subsection responses. Only the first six fields are rendered;
                # slicing tolerates the extra online-search element that
                # gather_responses appends to every subsection entry.
                for k, subsection_entry in enumerate(subsection_responses):
                    (subsubtitle, subresponse_with_prompt, subresponse_without_prompt,
                     subsource_counts, subimages, subinclude_image) = subsection_entry[:6]
                    subsubsection_anchor = f"{subsection_anchor}-subsubsection-{k+1}"
                    f.write(f"#### <a id='{subsubsection_anchor}'></a>**{subsubtitle}**\n\n")

                    # Select whether to include prompts or not
                    if include_prompts:
                        f.write(f"{subresponse_with_prompt}\n\n")
                    else:
                        f.write(f"{subresponse_without_prompt}\n\n")

                    # Add sources if requested
                    if include_sources and subsource_counts:
                        _write_source_counts(f, subsource_counts)

                    # Add images if requested
                    if subinclude_image and subimages:
                        _write_images(f, file_path, subimages, subsubtitle)
                    f.write("\n")


In [None]:
# Short company name used for the report title and the output file names.
COMPANY_NAME = "PPInnova"
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
#now = '2025-08-25'

# One output file per (prompts, sources) combination.
OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_with_source_gpt41_{now}.md"
OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_without_source_gpt41_{now}.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_source_gpt41_{now}.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_without_source_gpt41_{now}.md"

# Directory read by save_image when it writes extracted images.
image_dir = f"{os.getcwd()}/images_for_factsheet_generation"

# (output path, include_sources, include_prompts), written in this order.
_factsheet_variants = (
    (OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE, True, True),
    (OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE, False, True),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE, True, False),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE, False, False),
)
for _variant_path, _variant_sources, _variant_prompts in _factsheet_variants:
    write_file(
        _variant_path,
        include_sources=_variant_sources,
        include_prompts=_variant_prompts,
        all_sections_responses=all_sections_responses,
    )

In [None]:
# COMPANY_NAME = "RiboX"
# now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# #now = '2025-05-15'
# OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_with_source_o4mini_{now}.md"
# OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_without_source_o4mini_{now}.md"
# OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_source_o4mini_{now}.md"
# OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_without_source_o4mini_{now}.md"

# image_dir = f"{os.getcwd()}/images_for_factsheet_generation"    

# write_file(OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE, include_sources=True, include_prompts=True, all_sections_responses=all_sections_responses)
# write_file(OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE, include_sources=False, include_prompts=True, all_sections_responses=all_sections_responses)
# write_file(OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE, include_sources=True, include_prompts=False, all_sections_responses=all_sections_responses)
# write_file(OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE, include_sources=False, include_prompts=False, all_sections_responses=all_sections_responses)

In [None]:
# COMPANY_NAME = "RiboX"
# now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# #now = '2025-05-15'

# OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_with_source_o3_{now}.md"
# OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_without_source_o3_{now}.md"
# OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_source_o3_{now}.md"
# OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_without_source_o3_{now}.md"

# image_dir = f"{os.getcwd()}/images_for_factsheet_generation"    

# write_file(OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE, include_sources=True, include_prompts=True, all_sections_responses=all_sections_responses)
# write_file(OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE, include_sources=False, include_prompts=True, all_sections_responses=all_sections_responses)
# write_file(OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE, include_sources=True, include_prompts=False, all_sections_responses=all_sections_responses)
# write_file(OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE, include_sources=False, include_prompts=False, all_sections_responses=all_sections_responses)

In [None]:
def write_prompts(file_path, json_questions_string, company_name=None):
    """
    Generates a Markdown file listing all the questions by section from the JSON string.

    Parameters:
      - file_path (str): The output file path for the Markdown file.
      - json_questions_string (str): A JSON string containing the sections and questions.
      - company_name (str, optional): Name shown in the document header. When omitted,
        the notebook-level COMPANY_NAME global is used, as before.

    Raises:
      - json.JSONDecodeError: if json_questions_string is not valid JSON.
      - NameError: if company_name is omitted and COMPANY_NAME is not defined.
    """
    import json  # Ensure json is imported.
    data = json.loads(json_questions_string)

    # Resolve the header name before the output file is opened, so a missing
    # global cannot leave a truncated, empty file behind.
    if company_name is None:
        company_name = COMPANY_NAME

    def write_questions(f, questions):
        """Helper function to write questions from a list."""
        for question in questions:
            # A JSON null (or missing) question is written as an empty prompt
            # instead of failing on None.strip().
            prompt = str(question.get('question') or '').strip()
            f.write(f"**Question**: {prompt}\n\n")
            # Write out the priority order for the question if available.
            priority_order = question.get('priority_order')
            if priority_order:
                if isinstance(priority_order, list):
                    # Entries may be numbers in the JSON; str.join accepts only strings.
                    order_str = ", ".join(str(entry) for entry in priority_order)
                else:
                    order_str = str(priority_order)
                f.write(f"**Priority Order**: {order_str}\n\n")

    with open(file_path, "w", encoding="utf-8") as f:
        # Write a header that includes the company name.
        f.write(f"# Fact Sheet -- **{company_name}**\n\n")

        # Iterate through each section
        for i, section in enumerate(data.get('sections', []), start=1):
            section_title = section.get('section_title', f"Section {i}")
            section_anchor = f"section-{i}"
            f.write(f"## <a id='{section_anchor}'></a> {section_title}\n\n")

            # Each section has a list under "content"; these items contain the questions.
            for j, content_item in enumerate(section.get('content', []), start=1):
                # Computed for every item (titled or not) because subsection
                # anchors below are nested under it.
                content_anchor = f"{section_anchor}-content-{j}"

                # Optionally write a subtitle if available.
                subtitle = content_item.get('title')
                if subtitle:
                    f.write(f"### <a id='{content_anchor}'></a> {subtitle}\n\n")

                # Write out any questions found in this content block.
                if 'questions' in content_item:
                    write_questions(f, content_item['questions'])

                # If there are nested subsections, write them as well.
                for k, sub_item in enumerate(content_item.get('subsections', []), start=1):
                    sub_title = sub_item.get('title', f"Subsection {k}")
                    # Include the content index so ids stay unique when several
                    # content items in one section each have subsections.
                    sub_anchor = f"{content_anchor}-subsection-{k}"
                    f.write(f"#### <a id='{sub_anchor}'></a> {sub_title}\n\n")

                    if 'questions' in sub_item:
                        write_questions(f, sub_item['questions'])


In [None]:
# Write the prompting-question listing for Euhearing Therapeutics.
COMPANY_NAME = "Euhearing Therapeutics"
# Date stamp (Asia/Shanghai) embedded in the output filename.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# now = '2025-05-15'   # uncomment to pin the stamp to a fixed date

OUTPUT_PATH_PROMPTS = f"{COMPANY_NAME}_factsheet_prompting_questions_{now}.md"

write_prompts(
    file_path=OUTPUT_PATH_PROMPTS,
    json_questions_string=json_questions_string,
)


In [None]:
# Regenerate "SECTION 1. Overview" and write the refreshed responses back to disk.
company_name = "PPInnova (Peak Perform Innova)"

# Today's date (Asia/Shanghai) names the responses pickle; the same file is
# passed in as the existing responses and then overwritten with the result.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
existing_responses_path = updated_file_path = f"all_sections_responses_gpt41_factsheet_{now}.pkl"

# subsection_title=None: no specific subsection is named for this update.
_section_update_request = {
    "json_questions_string": json_questions_string,
    "existing_responses_path": existing_responses_path,
    "section_title": "SECTION 1. Overview",
    "subsection_title": None,
    "db_jd": db_jd,
    "bm25_retriever": bm25_retriever,
}
updated_responses = update_any_section_or_subsection(**_section_update_request)

# Persist the updated responses.
with open(updated_file_path, "wb") as file:
    pickle.dump(updated_responses, file)
    

In [None]:
# Regenerate "SECTION 2. Deal dynamics" and write the refreshed responses back to disk.
company_name = "Euhearing Therapeutics"

# Today's date (Asia/Shanghai) names the responses pickle; the same file is
# passed in as the existing responses and then overwritten with the result.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
existing_responses_path = updated_file_path = f"all_sections_responses_gpt41_factsheet_{now}.pkl"

# subsection_title=None: no specific subsection is named for this update.
_section_update_request = {
    "json_questions_string": json_questions_string,
    "existing_responses_path": existing_responses_path,
    "section_title": "SECTION 2. Deal dynamics",
    "subsection_title": None,
    "db_jd": db_jd,
    "bm25_retriever": bm25_retriever,
}
updated_responses = update_any_section_or_subsection(**_section_update_request)

# Persist the updated responses.
with open(updated_file_path, "wb") as file:
    pickle.dump(updated_responses, file)
    

In [None]:
# Regenerate one subsection of Section 4 and write the refreshed responses back to disk.
company_name = "Euhearing Therapeutics"

# Today's date (Asia/Shanghai) names the responses pickle; the same file is
# passed in as the existing responses and then overwritten with the result.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
existing_responses_path = updated_file_path = f"all_sections_responses_gpt41_factsheet_{now}.pkl"

_section_update_request = {
    "json_questions_string": json_questions_string,
    "existing_responses_path": existing_responses_path,
    "section_title": "SECTION 4. Product summary and pipeline analysis",
    "subsection_title": "Competitive landscape and potential strategic interest",
    "db_jd": db_jd,
    "bm25_retriever": bm25_retriever,
}
updated_responses = update_any_section_or_subsection(**_section_update_request)

# Persist the updated responses.
with open(updated_file_path, "wb") as file:
    pickle.dump(updated_responses, file)
    

In [None]:
# To update a section's response
company_name = "Euhearing Therapeutics"

# Compute the date stamp in this cell: it names the output pickle below.
# Previously the assignment was commented out, so the cell depended on a
# `now` left behind by an earlier cell and raised NameError when run alone.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()

# Path to your existing responses (pinned to the 2025-08-26 snapshot).
# NOTE(review): input is the fixed snapshot while output goes to today's file,
# so updates other cells saved to today's file are not carried over -- confirm intended.
existing_responses_path = "all_sections_responses_gpt41_factsheet_2025-08-26.pkl"

updated_responses = update_any_section_or_subsection(
    json_questions_string=json_questions_string,
    existing_responses_path=existing_responses_path,
    section_title="Appendix",
    subsection_title="Market analysis by AI",
    db_jd=db_jd,
    bm25_retriever=bm25_retriever
)

# Save the updated responses
updated_file_path = f"all_sections_responses_gpt41_factsheet_{now}.pkl"
with open(updated_file_path, "wb") as file:
    pickle.dump(updated_responses, file)
    

In [None]:
# To update a section's response
company_name = "Euhearing Therapeutics"

# Compute the date stamp in this cell: it names the output pickle below.
# Previously the assignment was commented out, so the cell depended on a
# `now` left behind by an earlier cell and raised NameError when run alone.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()

# Path to your existing responses (pinned to the 2025-08-26 snapshot).
# NOTE(review): input is the fixed snapshot while output goes to today's file,
# so updates other cells saved to today's file are not carried over -- confirm intended.
existing_responses_path = "all_sections_responses_gpt41_factsheet_2025-08-26.pkl"

updated_responses = update_any_section_or_subsection(
    json_questions_string=json_questions_string,
    existing_responses_path=existing_responses_path,
    section_title="SECTION 3. Platform",
    subsection_title=None,
    db_jd=db_jd,
    bm25_retriever=bm25_retriever
)

# Save the updated responses
updated_file_path = f"all_sections_responses_gpt41_factsheet_{now}.pkl"
with open(updated_file_path, "wb") as file:
    pickle.dump(updated_responses, file)
    

In [None]:
# Export the PPInnova fact sheet from `updated_responses` in all four prompt/source combinations.
COMPANY_NAME = "PPInnova"
# Date stamp (Asia/Shanghai) embedded in every output filename.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# now = '2025-05-15'   # uncomment to pin the stamp to a fixed date

OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_with_source_gpt41_{now}.md"
OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_without_source_gpt41_{now}.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_source_gpt41_{now}.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_without_source_gpt41_{now}.md"

# NOTE(review): presumably consumed by save_image when write_file stores images -- confirm.
image_dir = f"{os.getcwd()}/images_for_factsheet_generation"

# (output path, include_sources, include_prompts), written in this order.
for _output_path, _with_sources, _with_prompts in (
    (OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE, True, True),
    (OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE, False, True),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE, True, False),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE, False, False),
):
    write_file(
        _output_path,
        include_sources=_with_sources,
        include_prompts=_with_prompts,
        all_sections_responses=updated_responses,
    )

In [None]:
# Export the Euhearing fact sheet from `updated_responses`; filenames carry an "_updated" suffix.
COMPANY_NAME = "Euhearing Therapeutics"
# Date stamp (Asia/Shanghai) embedded in every output filename.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# now = '2025-05-15'   # uncomment to pin the stamp to a fixed date

OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_with_source_gpt41_{now}_updated.md"
OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_without_source_gpt41_{now}_updated.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_source_gpt41_{now}_updated.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_without_source_gpt41_{now}_updated.md"

# NOTE(review): presumably consumed by save_image when write_file stores images -- confirm.
image_dir = f"{os.getcwd()}/images_for_factsheet_generation"

# (output path, include_sources, include_prompts), written in this order.
for _output_path, _with_sources, _with_prompts in (
    (OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE, True, True),
    (OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE, False, True),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE, True, False),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE, False, False),
):
    write_file(
        _output_path,
        include_sources=_with_sources,
        include_prompts=_with_prompts,
        all_sections_responses=updated_responses,
    )

In [None]:
# Write the prompting-question listing for Euhearing Therapeutics.
COMPANY_NAME = "Euhearing Therapeutics"
# Date stamp (Asia/Shanghai) embedded in the output filename.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# now = '2025-05-15'   # uncomment to pin the stamp to a fixed date

OUTPUT_PATH_PROMPTS = f"{COMPANY_NAME}_factsheet_prompting_questions_{now}.md"

write_prompts(
    file_path=OUTPUT_PATH_PROMPTS,
    json_questions_string=json_questions_string,
)


In [None]:
def update_only_direct_questions(json_questions_string, existing_responses_path,
                                section_title, db_jd, bm25_retriever):
    """
    Update only the direct questions in a mixed section, preserving all subsections.

    "Direct questions" are content items of the section that have a "questions"
    key but no "title" key. They are re-run through gather_responses and the
    first returned tuple replaces the section's stored content, except for the
    subsections element (index 3), which is carried over unchanged.

    Parameters:
      - json_questions_string (str): JSON string with a top-level "sections" list.
      - existing_responses_path (str): Path of the pickled list of section responses.
      - section_title (str): Exact "section_title" of the section to refresh.
      - db_jd, bm25_retriever: Passed through unchanged to gather_responses.

    Returns:
      The list loaded from the pickle, with the target section updated in place.
      It is returned unchanged (after printing a message) when the section has
      no direct questions, when the section is absent from the stored responses,
      or when gather_responses yields nothing.
    """
    # Load existing responses.
    # NOTE: pickle.load can execute arbitrary code; only use files this notebook produced.
    with open(existing_responses_path, "rb") as file:
        all_sections_responses = pickle.load(file)

    # Parse questions to find direct questions only
    json_questions = json.loads(json_questions_string)

    # Extract ONLY the direct questions from this section
    direct_questions_only = []
    for section in json_questions["sections"]:
        if section["section_title"] == section_title:
            # Only items with questions but no title (direct questions)
            direct_questions_only = [
                item for item in section["content"]
                if "questions" in item and "title" not in item
            ]
            break

    if not direct_questions_only:
        print("No direct questions found")
        return all_sections_responses

    # Locate the stored section BEFORE re-running the questions, so a missing
    # section is reported instead of silently discarding the regenerated answers.
    target_section = next(
        (stored for stored in all_sections_responses
         if stored["section_title"] == section_title),
        None,
    )
    if target_section is None:
        print(f"Section '{section_title}' not found in existing responses; nothing updated")
        return all_sections_responses

    # Re-run ONLY the direct questions
    new_direct_responses = gather_responses(direct_questions_only, db_jd, bm25_retriever)
    if not new_direct_responses:
        # Avoid an IndexError below when nothing came back.
        print("No responses were generated for the direct questions; nothing updated")
        return all_sections_responses

    # The main content tuple; index 3 holds the existing subsections.
    existing_content = target_section["content"][0]
    preserved_subsections = existing_content[3]

    # New direct-question results combined with the old subsections.
    refreshed = new_direct_responses[0]
    updated_content = (
        refreshed[0],           # title (usually None)
        refreshed[1],           # new direct questions response with prompt
        refreshed[2],           # new direct questions response without prompt
        preserved_subsections,  # KEEP EXISTING SUBSECTIONS UNCHANGED
        refreshed[4],           # source counts
        refreshed[5],           # images
        refreshed[6],           # include_image flag
        refreshed[7],           # online search responses
    )

    target_section["content"] = [updated_content]
    print(f"Updated direct questions, preserved {len(preserved_subsections)} subsections")

    return all_sections_responses



In [None]:
# Refresh only the direct questions of Section 4, starting from the 2025-08-26 snapshot.
existing_responses_path = "all_sections_responses_gpt41_factsheet_2025-08-26.pkl"
updated_responses = update_only_direct_questions(
    json_questions_string=json_questions_string,
    existing_responses_path=existing_responses_path,
    section_title="SECTION 4. Product summary and pipeline analysis",
    db_jd=db_jd,
    bm25_retriever=bm25_retriever,
)

# Persist the result under a fixed filename, separate from the dated snapshots.
with open("updated_responses.pkl", "wb") as f:
    pickle.dump(updated_responses, f)

In [None]:
# Export the Euhearing fact sheet from `updated_responses` in all four prompt/source combinations.
COMPANY_NAME = "Euhearing Therapeutics"
# Date stamp (Asia/Shanghai) embedded in every output filename.
now = datetime.now(pytz.timezone('Asia/Shanghai')).date()
# now = '2025-05-15'   # uncomment to pin the stamp to a fixed date

OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_with_source_gpt41_{now}.md"
OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_with_prompts_without_source_gpt41_{now}.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE = f"{COMPANY_NAME}_factsheet_with_source_gpt41_{now}.md"
OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE = f"{COMPANY_NAME}_factsheet_without_source_gpt41_{now}.md"

# NOTE(review): presumably consumed by save_image when write_file stores images -- confirm.
image_dir = f"{os.getcwd()}/images_for_factsheet_generation"

# (output path, include_sources, include_prompts), written in this order.
for _output_path, _with_sources, _with_prompts in (
    (OUTPUT_PATH_WITH_PROMPTS_WITH_SOURCE, True, True),
    (OUTPUT_PATH_WITH_PROMPTS_WITHOUT_SOURCE, False, True),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITH_SOURCE, True, False),
    (OUTPUT_PATH_WITHOUT_PROMPTS_WITHOUT_SOURCE, False, False),
):
    write_file(
        _output_path,
        include_sources=_with_sources,
        include_prompts=_with_prompts,
        all_sections_responses=updated_responses,
    )