In [None]:
#%%capture --no-stderr
# Execute the companion notebook first.
# NOTE(review): several names used later are neither defined nor imported in this
# notebook (e.g. process_pdfs_in_s3_folder, load_documents_from_s3, boto3, ast, time,
# List, Document, RecursiveCharacterTextSplitter); presumably they are provided by
# docIndex.ipynb through this %run -- confirm.
%run docIndex.ipynb
#!pip install --upgrade openai
# Extra packages installed into the environment; versions are not pinned.
!pip install bitsandbytes
!pip install accelerate
!pip install rank_bm25
!pip install markdown

In [None]:
import pandas as pd
import numpy as np

import argparse
import openai
from openai import OpenAI

from langchain import PromptTemplate
from botocore.exceptions import ClientError

# import sagemaker
# from sagemaker import get_execution_role

from langchain.llms import BaseLLM
from ipywidgets import Dropdown
# from sagemaker.jumpstart.notebook_utils import list_jumpstart_models

import markdown

from collections import Counter
import re
import os, gc, torch

from datetime import datetime
import pytz
import pickle
import json

import chromadb 
from chromadb import Settings
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer

import requests

# from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
#from langchain_community.vectorstores import Chroma   # instead of langchain_chroma

from transformers import set_seed

# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

from transformers.utils import is_bitsandbytes_available

from langchain.retrievers import BM25Retriever
from langchain.retrievers.ensemble import EnsembleRetriever

In [None]:
# Run this once to process the PDFs in the folder; it only needs to be re-run when new files are added.
# Source and output use the same bucket and the same key prefix.
s3_bucket = 'plfs-han-llm-experiment'
s3_source_folder = 'factsheet-generation/PPInnova/internal/jarvis_docs/'
s3_output_folder = 'factsheet-generation/PPInnova/internal/jarvis_docs/'
# process_pdfs_in_s3_folder is not defined in this notebook (presumably from docIndex.ipynb -- confirm).
process_pdfs_in_s3_folder(s3_bucket, s3_source_folder, s3_bucket, s3_output_folder, model="o4-mini")  #gpt-5

In [None]:
# # Open and save the PDF to potentially fix issues
# doc1 = fitz.open("Euhearing_sharepoint_selected_20250921--01 From company--项目相关资料包-Euhearing_20250817091531.pdf")
# doc1.save("Euhearing_sharepoint_selected_20250921--01 From company--项目相关资料包-Euhearing_20250817091531_repaired.pdf", garbage=4, deflate=True, clean=True)
# doc1.close()

# # # Now try with the repaired version
# doc1 = fitz.open("Euhearing_sharepoint_selected_20250921--01 From company--项目相关资料包-Euhearing_20250817091531_repaired.pdf")
# text1 = doc1[1].get_text()

In [None]:
# If new files were added, run this to process only the PDFs that have not been processed yet.
# NOTE(review): this cell points at the Euhearing folder while the other processing and
# indexing cells in this notebook point at the PPInnova folder -- confirm this is intended.
s3_bucket = 'plfs-han-llm-experiment'
s3_source_folder = 'factsheet-generation/Euhearing/internal/jarvis_docs/'
s3_output_folder = 'factsheet-generation/Euhearing/internal/jarvis_docs/'
# process_unprocessed_pdfs_in_s3_folder is not defined in this notebook (presumably from docIndex.ipynb -- confirm).
process_unprocessed_pdfs_in_s3_folder(s3_bucket, s3_source_folder, s3_bucket, s3_output_folder, model="o4-mini")  #gpt-5

In [None]:
def copy_files_flattened_and_renamed_within_s3(bucket_name, source_prefix, destination_prefix):
    """
    Copy every object under ``source_prefix`` into ``destination_prefix`` as a flat list.

    Each copy is named ``<top-level folder>_<original file name>``, where the
    top-level folder is the first path component below ``source_prefix``.
    Objects that sit directly under ``source_prefix`` (no sub-folder) keep their
    original file name. Keys ending in ``/`` (folder placeholders) are skipped.

    Args:
        bucket_name: Bucket that holds both the source and the destination keys.
        source_prefix: Key prefix to read from.
        destination_prefix: Key prefix to write to; a trailing ``/`` is optional.

    Returns:
        None. One line is printed per copied object, or a single
        "No files found" line when the listing is empty.

    Note:
        Files with the same name in different sub-folders of the same top-level
        folder map to the same destination key; the later copy overwrites the
        earlier one.
    """
    s3 = boto3.client('s3')

    # list_objects_v2 returns at most 1000 keys per call, so walk every page.
    # The full key list is materialised before copying so that objects written
    # by this function are never picked up as new sources when the destination
    # lies under the source prefix.
    paginator = s3.get_paginator('list_objects_v2')
    source_keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=bucket_name, Prefix=source_prefix)
        for obj in page.get('Contents', [])
    ]
    if not source_keys:
        print(f"No files found in {source_prefix}")
        return

    destination_root = destination_prefix.rstrip('/')

    for source_key in source_keys:
        if source_key.endswith('/'):
            continue  # Skip directories

        # S3 keys always use '/' regardless of the local OS, so plain string
        # operations are used instead of os.path.
        relative_path = source_key[len(source_prefix):].lstrip('/')
        original_file_name = source_key.rsplit('/', 1)[-1]
        top_folder_name, separator, _ = relative_path.partition('/')

        if separator:
            new_file_name = f"{top_folder_name}_{original_file_name}"
        else:
            # No sub-folder: avoid producing "<file>_<file>".
            new_file_name = original_file_name

        destination_key = f"{destination_root}/{new_file_name}"

        copy_source = {'Bucket': bucket_name, 'Key': source_key}
        s3.copy_object(CopySource=copy_source, Bucket=bucket_name, Key=destination_key)

        print(f"Copied {source_key} to {destination_key}")


In [None]:
## RecursiveCharacterTextSplitter
def process_documents_from_s3(s3_bucket: str, s3_folder: str, ignored_files: List[str] = None) -> List[Document]:
    """
    Load documents from S3 and split them into chunks.

    The splitter is configured from the module-level ``chunk_size`` and
    ``chunk_overlap`` values, both measured in characters. After splitting,
    every chunk is prefixed with a ``From file: <source>`` header, so the final
    chunk text is longer than ``chunk_size``.

    Args:
        s3_bucket: Bucket to read from.
        s3_folder: Key prefix to read from.
        ignored_files: Files to skip; forwarded to ``load_documents_from_s3``.
            Defaults to an empty list.

    Returns:
        The list of chunks, or an empty list when no documents were loaded.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if ignored_files is None:
        ignored_files = []

    print(f"Loading documents from s3://{s3_bucket}/{s3_folder}")
    documents = load_documents_from_s3(s3_bucket, s3_folder, ignored_files)
    if not documents:
        # Return instead of exit(0): in a notebook, exit() shuts the kernel down.
        print("No new documents to load")
        return []
    print(f"Loaded {len(documents)} new documents from s3://{s3_bucket}/{s3_folder}")

    # The chunk_size parameter of RecursiveCharacterTextSplitter is a number of characters.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    # Add filename to each chunk after splitting
    for chunk in texts:
        filename = chunk.metadata.get('source', 'unknown_file')
        chunk.page_content = f"From file: {filename}\n\n{chunk.page_content}"

    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
    return texts


In [None]:
# Request 8-bit loading of the embedding model. SentenceTransformerEmbeddings only
# honours this on CUDA and only when bitsandbytes is available.
USE_INT8 = True
embeddings_model_name = "Qwen/Qwen3-Embedding-4B"
# RecursiveCharacterTextSplitter settings, both measured in characters.
chunk_size = 1000
chunk_overlap = 200


class SentenceTransformerEmbeddings(Embeddings):
    """
    LangChain ``Embeddings`` adapter around a ``SentenceTransformer`` model.

    On CUDA the model is loaded in 8-bit when the module-level ``USE_INT8`` flag
    is set and bitsandbytes is available; otherwise it is loaded in fp16. On any
    other device the library defaults are used.
    """

    def __init__(self,
                 model_name: str,
                 device: str = "cpu",
                 batch_size: int = 8,
                 max_length: int = 8192,):
        """
        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            device: Device passed to ``encode`` (and used to pick the load path).
            batch_size: Batch size passed to ``encode``.
            max_length: Maximum sequence length applied to the loaded model;
                ``None`` keeps the model's own setting.
        """
        self.device     = device
        self.batch_size = batch_size
        self.max_length = max_length

        # minimise CUDA fragmentation (setdefault: an existing value wins)
        os.environ.setdefault(
            "PYTORCH_CUDA_ALLOC_CONF",
            "expandable_segments:True,max_split_size_mb:128"
        )
        gc.collect(); torch.cuda.empty_cache()

        # Decide the load path once so the kwargs and the post-load cast agree.
        use_int8 = device == "cuda" and USE_INT8 and is_bitsandbytes_available()

        # --------------------- build kwargs ------------------------------
        load_kwargs = {"trust_remote_code": True}
        if use_int8:
            load_kwargs["model_kwargs"] = {
                "load_in_8bit": True,
                "device_map": "auto",
            }
        elif device == "cuda":
            load_kwargs["model_kwargs"] = {"torch_dtype": torch.float16}

        # --------------------- load model -------------------------------
        self.model = SentenceTransformer(model_name, **load_kwargs)

        # Previously max_length was accepted but silently ignored.
        if max_length is not None:
            self.model.max_seq_length = max_length

        # If we took the fp16 path, cast & move once
        if device == "cuda" and not use_int8:
            self.model.half()          # weights -> fp16
            self.model.to(device)      # onto GPU

    # --------------------- LangChain hooks ------------------------------
    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        """Encode ``documents`` and return one embedding (list of floats) per input."""
        vecs = self.model.encode(
            documents,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            show_progress_bar=False,
            device=self.device,
        )
        return vecs.tolist()

    def embed_query(self, query: str) -> List[float]:
        """Encode a single query through the same path as documents."""
        return self.embed_documents([query])[0]


In [None]:
# Print the GPU status before the cells below load the embedding model.
!nvidia-smi

In [None]:
%%time
# Rebuild the "jarvis_docs" Chroma collection from scratch: load and chunk the S3
# documents, embed them, and add them in batches. DESTRUCTIVE: the existing
# collection is deleted before any documents are loaded.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
set_seed(42)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the index is built with HuggingFaceEmbeddings (normalised vectors),
# while the "load saved vectordb" cell queries it through SentenceTransformerEmbeddings
# (no normalisation, optional int8) -- confirm the two produce compatible vectors.
embeddings = HuggingFaceEmbeddings(
    model_name=embeddings_model_name,
    model_kwargs={'device': DEVICE},
    encode_kwargs={'batch_size': 8, 'normalize_embeddings': True}
)

# ChromaDB setup: persistent store in ./chroma_db_jarvis_docs under the current working directory
current_dir = os.getcwd()
persist_directory_jd = os.path.join(current_dir, "chroma_db_jarvis_docs")
os.makedirs(persist_directory_jd, exist_ok=True)
chroma_client_jd = chromadb.PersistentClient(path=persist_directory_jd)

# Drop & recreate collection 
# NOTE(review): the exception raised for a missing collection may differ between
# chromadb versions -- confirm ValueError matches the installed version.
try:
    chroma_client_jd.delete_collection("jarvis_docs")
except ValueError:
    pass

db_jd = Chroma(
    collection_name="jarvis_docs",
    embedding_function=embeddings,  # Using external embeddings
    client=chroma_client_jd,
    collection_metadata={"hnsw:space": "cosine"},
)


s3_bucket = "plfs-han-llm-experiment"
s3_folder = 'factsheet-generation/PPInnova/internal/jarvis_docs/'
ignored_files = []
texts = process_documents_from_s3(s3_bucket, s3_folder, ignored_files)

# Number of chunks handed to add_documents per call.
CHROMA_BATCH_SIZE = 1000
print(f"Total documents to process: {len(texts)} (batch {CHROMA_BATCH_SIZE})")

for i in range(0, len(texts), CHROMA_BATCH_SIZE):
    batch_texts = texts[i : i + CHROMA_BATCH_SIZE]
    print(f"Adding docs {i}-{i+len(batch_texts)-1}")
    db_jd.add_documents(batch_texts)
    
    # Release cached GPU memory and Python garbage between batches.
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

print("Vector database creation completed successfully!")
print("Final document count:", chroma_client_jd.get_collection("jarvis_docs").count())

In [None]:
## Load the saved vector DB from ./chroma_db_jarvis_docs
# NOTE(review): the build cell embeds with HuggingFaceEmbeddings
# (normalize_embeddings=True); this cell uses SentenceTransformerEmbeddings instead,
# so it is not literally the same embedding function -- confirm query vectors match.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 8 if DEVICE == "cuda" else 64

embeddings = SentenceTransformerEmbeddings(
    model_name=embeddings_model_name,
    device=DEVICE,
    batch_size=batch_size,
    max_length=8192
)


persist_directory_jd = os.path.join(os.getcwd(), "chroma_db_jarvis_docs")
chroma_client_jd   = chromadb.PersistentClient(path=persist_directory_jd)
db_jd = Chroma(
    client=chroma_client_jd,
    collection_name="jarvis_docs",
    embedding_function=embeddings
)


# persist_directory_jt = os.path.join(os.getcwd(), "chroma_db_jarvis_tables")
# chroma_client_jt   = chromadb.PersistentClient(path=persist_directory_jt)
# db_jt = Chroma(
#     client=chroma_client_jt,
#     collection_name="jarvis_tables",
#     embedding_function=embeddings
# )


In [None]:
def get_page_contents_only(collection, batch_size=1000):
    """
    Read the text of every document in ``collection``, page by page.

    Pages of ``batch_size`` documents are requested with increasing offsets
    until a request comes back with no documents. Only the document text is
    requested (no metadata, no embeddings). Progress is printed per page.

    Returns:
        A list with the text of all documents, in the order they were returned.
    """
    contents = []
    start = 0
    has_more = True

    while has_more:
        print(f"Processing batch starting at offset {start}...")

        # Only the text content is requested for this page.
        page = collection.get(
            limit=batch_size,
            offset=start,
            include=["documents"],
        )['documents']

        has_more = bool(page)
        if has_more:
            contents += page
            start += batch_size
            print(f"Processed {len(contents)} documents so far...")

    return contents

# Read every chunk's text from the "jarvis_docs" collection in batches and cache it
# on disk, so the BM25 index can be rebuilt later without reading the collection again.
collection = chroma_client_jd.get_collection("jarvis_docs")
all_page_contents = get_page_contents_only(collection, batch_size=1000)  

# Written to the current working directory.
with open('all_page_contents_docs.pkl', 'wb') as f:
    pickle.dump(all_page_contents, f)


In [None]:
# Rebuild the BM25 retriever from the cached chunk texts.
# pickle.load executes arbitrary code from the file; only load a cache this notebook wrote.
with open('all_page_contents_docs.pkl', 'rb') as f:
    all_page_contents = pickle.load(f)

# Only the text was cached, so these Documents carry no metadata (no 'source').
all_docs = [Document(page_content=content) for content in all_page_contents]
bm25_retriever = BM25Retriever.from_documents(all_docs)

In [None]:
def get_key(secret_name,region_name):
    """
    Fetch a secret from AWS Secrets Manager and return its ``key`` field.

    Args:
        secret_name: Secret id passed to ``get_secret_value``.
        region_name: AWS region of the Secrets Manager endpoint.

    Returns:
        The value stored under ``'key'`` in the secret string.

    Raises:
        Any error raised by ``get_secret_value`` propagates unchanged.
        KeyError if the secret has no ``'key'`` entry.
    """
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    secret = client.get_secret_value(SecretId=secret_name)['SecretString']

    # Try JSON first: literal_eval rejects JSON's true/false/null. Fall back to
    # literal_eval so secrets stored as Python-literal dicts still work.
    try:
        payload = json.loads(secret)
    except ValueError:
        payload = ast.literal_eval(secret)

    return payload['key']

# Fetch the OpenAI API key at notebook level.
# NOTE(review): the answer_* helpers in this notebook call get_key themselves rather
# than reading this variable; it looks unused in the visible cells -- confirm.
openai_api_key=get_key("openai-api-key", "us-west-2")

In [None]:

def answer_gpt(prompt):
    """
    Ask gpt-4.1 (temperature 0) to answer ``prompt`` as a bioventure-investing expert.

    The API key is fetched from Secrets Manager on every call.

    Returns:
        The response's ``output_text``.
    """
    from openai import OpenAI

    client = OpenAI(api_key=get_key("openai-api-key","us-west-2"))
    request = {
        # or another GPT model: o4-mini, gpt-4o, o3, gpt-4.1
        "model": "gpt-4.1",
        "temperature": 0,
        "input": f"You are an expert in bioventure investing. Answer the following question: {prompt}",
    }
    return client.responses.create(**request).output_text


def answer_online_search(prompt,search_model="o4-mini"):
    """
    Answer ``prompt`` with ``search_model`` and the web-search tool enabled.

    The tool is requested with ``search_context_size`` set to ``"high"``.
    The API key is fetched from Secrets Manager on every call.

    Returns:
        The response's ``output_text``.
    """
    from openai import OpenAI

    web_search_tool = {
        "type": "web_search_preview",
        "search_context_size": "high",
    }
    client = OpenAI(api_key=get_key("openai-api-key","us-west-2"))
    reply = client.responses.create(
        model=search_model,    # e.g. o4-mini, o3
        tools=[web_search_tool],
        input=f"{prompt}",
    )
    return reply.output_text



def origene_mcp(prompt, search_model="o4-mini", server_url=None):
    """
    Answer ``prompt`` with ``search_model`` and the "origene" MCP server attached as a tool.

    Args:
        prompt: Text sent as the model input.
        search_model: Model name passed to the Responses API.
        server_url: MCP server URL including its access token. When omitted,
            the ``ORIGENE_MCP_URL`` environment variable is used, then the
            legacy built-in URL.

    Returns:
        The response's ``output_text``.
    """
    from openai import OpenAI

    if server_url is None:
        # SECURITY: the fallback below embeds an access token in the notebook.
        # Rotate that token, supply the URL through ORIGENE_MCP_URL (or the
        # server_url argument), and then delete this literal.
        server_url = os.environ.get(
            "ORIGENE_MCP_URL",
            "https://origene-uuid1752754854.app-space.dplink.cc/chembl_mcp/mcp/?token=172f53102e0a46acb20f306eceaaf6c4",
        )

    client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))
    response = client.responses.create(
                model=search_model,
                tools=[
                    {"type": "mcp",
                     "server_label": "origene",
                     "server_url": server_url,
                     # Tool calls run without an approval step.
                     "require_approval": "never",
                    },],
                input= f"{prompt}"
                )
    return response.output_text

In [None]:
# ### takes too long to run
# from openai import OpenAI
# client = OpenAI(api_key = get_key("openai-api-key","us-west-2"))
# response = client.responses.create(
#             model="o4-mini",
#             tools=[
#                 {"type": "mcp",
#                  "server_label": "disease-pocket-molecule-mcp",
#                  "server_url": "https://disease-pocket-molecule-mcp-uuid1751524197.app-space.dplink.cc/sse?token=4218554daa854930947d4986b1fb35e9",
#                  "require_approval": "never",
#                 },],
#             input= "find all targets associated with obesity"
#             )
# print(response.output_text)

In [None]:
# Ad-hoc web-search query (o4-mini): competitors of PPInnova.
print(answer_online_search("Comprehensively list all the biotech companies that are competitors to PPInnova (Peak Perform Innova).","o4-mini"))

In [None]:
# Ad-hoc web-search query (o4-mini): companies with active ALK7 programs in obesity / T2D.
print(answer_online_search("Comprehensively list all the biotech companies with active programs targeting ALK7 for indication Obesity, T2D.","o4-mini"))

In [None]:
# print(answer_online_search("tell me about Sidera Bio","o3"))

In [None]:
# print(answer_online_search("For company Trevi Therapeutics, search google finance to provide the following\
#                                     information for the recent past 3, 6, 12 months: The highest and lowest stock prices,\
#                                     along with the corresponding dates. The largest single-day stock price change (either gain\
#                                     or loss) during that period, including the amount and the date it occurred."))

In [None]:
def answer_with_search_ensemble(question, bm25_retriever, k_bm, db_jd, k_jd, search_model="gpt-4.1", priority_order=['online_search', 'jarvis_docs']):
    """
    Answer ``question`` from internal documents and/or a web search, in priority order.

    Internal documents are retrieved with an equally weighted BM25 + vector
    ensemble; the web answer comes from ``answer_online_search``. Only the
    sources named in ``priority_order`` are queried. The collected context is
    placed in a prompt and answered by ``answer_gpt``.

    Args:
        question: The question to answer.
        bm25_retriever: BM25 retriever; its ``k`` attribute is set to ``k_bm``.
        k_bm: Number of BM25 hits.
        db_jd: Vector store exposing ``as_retriever``; only used when
            ``'jarvis_docs'`` is requested.
        k_jd: Number of vector-search hits.
        search_model: Model used for the web search.
        priority_order: Sources to use, highest priority first. Allowed values
            are ``'jarvis_docs'`` and ``'online_search'``. The first entry
            decides conflicts in the prompt.

    Returns:
        ``(answer, online_search_response)``; the second item is ``""`` when
        ``'online_search'`` was not requested.

    Raises:
        KeyError: if ``priority_order`` names an unknown source (raised before
            any retrieval or API call is made).
    """
    headings = {
        'jarvis_docs': "JARVIS Docs",
        'online_search': "External Search",
    }
    unknown_sources = [source for source in priority_order if source not in headings]
    if unknown_sources:
        raise KeyError(f"Unknown source(s) in priority_order: {unknown_sources}")

    bm25_retriever.k = k_bm

    # Retrieve internal documents only when they are actually requested; this
    # skips the BM25 + embedding work for web-only questions.
    jarvis_docs_text = ""
    if 'jarvis_docs' in priority_order:
        vector_retriever = db_jd.as_retriever(search_kwargs={"k": k_jd})
        ensemble = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever],
                                     weights=[0.5, 0.5])
        jarvis_docs_docs = ensemble.get_relevant_documents(question)
        jarvis_docs_text = "\n\n".join(d.page_content for d in jarvis_docs_docs)

    # Get online_search response if required
    online_search_response = answer_online_search(question, search_model) if 'online_search' in priority_order else ""

    knowledge_base = {
        'jarvis_docs': jarvis_docs_text,
        'online_search': online_search_response,
    }

    # Build prioritized context using the given priority order.
    priority_context = []
    for idx, source in enumerate(priority_order, 1):
        content = knowledge_base[source] or f"No {source} data available"
        priority_context.append(f"{idx}. {headings[source]}:\n{content}")

    # Precompute the joined priority context to avoid issues with backslashes in f-string expressions.
    joined_priority_context = "\n\n".join(priority_context)

    prompt = f"""
**Analysis Directive**: Answer using this priority sequence: {', '.join(priority_order).upper()}

**Knowledge Base**:
{joined_priority_context}

**Conflict Resolution Rules**:
- Follow {priority_order[0].upper()} for numerical disputes
- Resolve conceptual conflicts using {priority_order[0].upper()}
- Use most recent context when dates conflict

**Question**: {question}

**Response Requirements**:
Do not fabricate any information that is not in the given content.
Answer in formal written English, be objectively and factually, avoid subjective adjectives or exaggerations.\
Please provide a response with a concise introductory phrase,
but avoid meaningless fillers like 'ok', 'sure' or 'certainly'. Focus on delivering a direct and informative answer.
Please bold the most important facts or conclusions in your answer to help readers quickly identify key information,\
especially when the response is long.
Do not include reference filenames in the answer.
"""

    return answer_gpt(prompt), online_search_response


In [None]:
# question1=f"""Which financing round (e.g. seed, series A, series B, etc) of company {COMPANY_NAME} is currently in."""
# result, source_counts, overview_images = answer_with_image_old(question1, db, 50)
# print(result, f"\n\n{source_counts}", f"\n\nnum of images: {len(overview_images)}")

In [None]:
###test
def answer_perplexity_search_test(prompt, timeout=600):
    """
    Send ``prompt`` to the Perplexity chat-completions API (model ``sonar-reasoning-pro``).

    The system message asks the model to list the full URLs of the online
    sources it used at the end of its answer.

    Args:
        prompt: The question to ask.
        timeout: Seconds to wait for the HTTP call before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The full message content of the first choice, unmodified.

    Raises:
        Any ``requests`` error raised by the call, including the HTTP error
        raised by ``raise_for_status`` for a non-success status.
    """
    perplexity_api_key = get_key("perplexity-api-key", "us-west-2")

    # Build the payload for the Perplexity ai API.
    payload = {
        "model": "sonar-reasoning-pro",
        "messages": [
            {"role": "system", "content": """You have extensive expertise in biotech investments.\
            At the end of your answer, make sure to include all the online sources' full URLs you used for your think process.
            """},
            {"role": "user", "content": f"answer the following question:{prompt}"}
        ],
        "max_tokens": 8000,
        "temperature": 0.2,
        "top_p": 0.9,
        "search_domain_filter": None,
        "return_images": False,
        "return_related_questions": False,
        "stream": False,
        "response_format": None
    }

    headers = {
        "Authorization": f"Bearer {perplexity_api_key}",
        "Content-Type": "application/json"
    }

    # Call the Perplexity ai API.
    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()  # Ensure that an HTTP error raises an exception.

    result_json = response.json()

    # Extract the answer from the API response.
    return result_json['choices'][0]['message']['content']


In [None]:
def company_helper(company_name):
    """
    Ask the document-grounded QA pipeline for a JSON summary of a company's pipeline.

    The question is answered from internal documents only
    (``priority_order=["jarvis_docs"]``) with 100 BM25 hits and 100 vector hits,
    using the module-level ``bm25_retriever`` and ``db_jd``.

    Args:
        company_name: Company to describe. Previously this argument was ignored
            and the global ``COMPANY_NAME`` was used instead.

    Returns:
        The raw model answer (a string expected to contain a JSON object).
    """
    # The template must itself be valid JSON: the trailing comma after the last
    # asset field was removed so the model is not shown a malformed example.
    question = f"""
    Extract structured facts about the drug pipelines of company {company_name}.
    Return ONLY a JSON object with company-level information and its key assets.
    Include only assets with a known asset name.
    If no assets are found, return an empty assets list.

    JSON structure:
    {{
      "company name": "{company_name}",
      "has platform": true | false | null,
      "platform name": "<name, else null>",
      "platform is core asset": true | false | null,
      "assets": [
        {{
          "asset name": "<name, else null>",
          "modality": "<name, else null>",
          "targets": ["..."],
          "targeted therapeutic areas": ["..."],
          "targeted indications": ["..."],
          "current development stage": "<name, else null>",
          "brief trial result": "<brief description, else null>",
          "companies with competing asset":["..."]
        }}
      ]
    }}
    """

    # The second return value is the web-search text, which is empty here.
    result, _ = answer_with_search_ensemble(question, bm25_retriever, 100, db_jd, 100, search_model="o4-mini", priority_order=["jarvis_docs"])

    return result

    

def _extract_json_object(text):
    """Parse the outermost ``{...}`` span of ``text``; raise ValueError if absent or invalid."""
    json_match = re.search(r'{.*}', text, re.DOTALL)
    if not json_match:
        raise ValueError("No JSON object found in the input string.")
    # json.JSONDecodeError is a ValueError subclass.
    return json.loads(json_match.group())


def prompt_format(json_string):
    """
    Turn a string that should contain a JSON object into a parsed object.

    If the input already contains a valid JSON object it is parsed and returned
    directly. Only otherwise is gpt-4.1 asked to repair the formatting, and its
    reply is then parsed.

    Args:
        json_string: Text expected to contain one JSON object.

    Returns:
        The parsed object, or ``None`` when the repaired reply contains no JSON
        object or still fails to parse (a message is printed in both cases).
    """
    # Fast path: skip the API round-trip (and any chance of the model altering
    # the content) when the input already parses.
    if isinstance(json_string, str):
        try:
            return _extract_json_object(json_string)
        except ValueError:
            pass  # malformed or missing: fall through to the model-based repair

    from openai import OpenAI
    client = OpenAI(api_key=get_key("openai-api-key", "us-west-2"))

    response = client.chat.completions.create(
        model="gpt-4.1",
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": ("You are an expert in json structure.")
            },
            {
                "role": "user",
                "content": f"""I want you to review the json string and make sure it's properly formatted.\
                Return the correct formatted json string of {json_string} as the output.\
                Please only return the json string, do not add any introductory phrase."""
            }
        ]
    )

    response_string=response.choices[0].message.content
    try:
        return _extract_json_object(response_string)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    except ValueError:
        print("No JSON object found in the input string.")
    return None


def company_extractor(COMPANY_NAME):
    """
    Run ``company_helper`` for the company and parse its answer with ``prompt_format``.

    Returns:
        Whatever ``prompt_format`` returns for the helper's raw answer.
    """
    return prompt_format(company_helper(COMPANY_NAME))



In [None]:
# Extract the structured company / asset summary for the target company.
# prompt_format returns None when no valid JSON could be obtained, so company_info
# may be None here; the review cell below assumes a dict.
COMPANY_NAME = "PPInnova (Peak Perform Innova)"
company_info = company_extractor(COMPANY_NAME)
company_info    

In [None]:
### human review to update some info
company_info["platform is core asset"] = True
substrings = ['STAT6', 'RBM39']
company_info['assets'] = [asset for key in substrings for asset in company_info['assets'] if key in asset['asset name']]
for asset in company_info["assets"]:
#     if "EHT102" in asset["asset name"]:
#         to_remove = ["RRGENE", "Fudan University"]
#         to_add = ["HuidaGene Therapeutics",]
#         asset["companies with competing asset"] = [s for s in asset["companies with competing asset"] if not any(sub in s for sub in to_remove)]
    
    asset['competitor_valuation'] = []   
    for company in asset['companies with competing asset']:
        print(f"Getting valuation for {company}...")        
        query = f"""If {company} is a private company, provide its latest post-money valuation.
                    If {company} was acquired, provide the acquisition deal size.
                    If {company} is public, fetch its latest market cap from Google Finance."""
        result = answer_online_search(query, "o4-mini")
        asset['competitor_valuation'].append(result)
        time.sleep(1)
        
# for asset in company_info["assets"]:
#     if "EHT102" in asset["asset name"]:
#         asset["targeted indications"] = ["Obesity","T2D"]
#     if "FGF21" in asset["asset name"]:
#         asset["targeted indications"] = ['Chronic Kidney Disease (CKD)', 'Diabetic Kidney Disease (DKD)', 'Metabolic dysfunction-associated steatohepatitis (MASH)']
        
company_info        

In [None]:
# ### human review to update some info
# company_info["platform is core asset"] = False
# company_info['assets'] = [asset for asset in company_info['assets'] if asset['asset name'] in ['EHT102','EHT201']]

# for asset in company_info["assets"]:
#     if "EHT102" in asset["asset name"]:
#         to_remove = ["RRGENE", "Fudan University"]
#         to_add = ["Emaygene"]
#         asset["companies with competing asset"] = [s for s in asset["companies with competing asset"] if not any(sub in s for sub in to_remove)]
#         asset["companies with competing asset"].extend(to_add)
    
#     if "EHT201" in asset["asset name"]:
#         to_add = ["Decibel Therapeutics",
#                   "Otonomy & AGTC (OTO-825)",
#                   "Harvard Medical School - David Corey",
#                   "Southeast University (Chai Renjie) Program",
#                   "Juntendo University - Kazusaku Kamiya"
#                  ]
#         asset["companies with competing asset"].extend(to_add)
    
#     # Get competitor valuations for all assets
#     asset['competitor_valuation'] = []
#     for company in asset['companies with competing asset']:
#         print(f"Getting valuation for {company}...")
#         query = f"""If {company} is a private company, provide its latest post-money valuation.
#                     If {company} was acquired, provide the acquisition deal size.
#                     If {company} is public, fetch its latest market cap from Google Finance."""
#         result = answer_online_search(query, "o4-mini")
#         asset['competitor_valuation'].append(result)
#         time.sleep(1)

# company_info

In [None]:
# Ad-hoc question answered from internal documents only: with
# priority_order=["jarvis_docs"] no web search is made, so search_model is unused
# and online_search_response is "".
# NOTE(review): the index-building cell in this notebook loads the PPInnova folder --
# confirm the loaded collection actually contains Sidera Bio documents.
COMPANY_NAME = "Sidera Bio"

question1=f"""
Outline the deal terms associated with company {COMPANY_NAME}'s current financing round.
"""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 25, db_jd, 25, search_model="o4-mini", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Market-sizing question for EHT102, answered from internal documents only (no web search).
# COMPANY_NAME is assigned but the question text hard-codes the company name, so the
# f-string has no placeholders.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
For company Euhearing Therapeutics's asset EHT102's targeted indication OTOF-related deafness, DFNB9 congenital hearing loss,
estimate the diagnosed and
total patient populations in the USA, Europe, and globally. 
Based on incidence and prevalence rates,
note whether patient numbers are growing or declining, and define the therapy-eligible population
considering line of therapy, disease stage, or biomarker subgroups. Estimate the global total
addressable market (in U.S. dollars). 
Propose an annual price relative to the standard of care,                    
estimate peak sales at a reasonable market-share penetration.

"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Clinical-trial question for EHT101, answered from internal documents only;
# search_model is unused because no web search is requested.
# COMPANY_NAME is assigned but not interpolated into the question.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
Describe the clinical trial process and result of asset EHT101 in company Euhearing Therapeutics.
"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o3", priority_order=["jarvis_docs"])
print(result)

In [None]:
# China patient-population estimate for EHT102, answered from the web search only
# (o3 performs the search; the final answer is still written by answer_gpt).
question1=f"""
For company Euhearing Therapeutics's asset EHT102's targeted indication OTOF-related deafness, DFNB9 congenital hearing loss,
estimate the diagnosed and prevalent patient populations in China, and the population asset EHT102 can target.
Show your estimation process step by step.
"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o3", priority_order=["online_search"])
print(result)

In [None]:
# Variant of the previous question: the company name is dropped from the wording and
# the web search uses o4-mini instead of o3.
question1=f"""
For asset EHT102's targeted indication OTOF-related deafness, DFNB9 congenital hearing loss,
estimate the diagnosed and prevalent patient populations in China, and the population asset EHT102 can target.
Show your estimation process step by step.
"""

result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Variant that also asks for validation against recent literature and for reference
# links; web search only. COMPANY_NAME is assigned but not used in the question.
COMPANY_NAME = "Euhearing"

question1=f"""
For asset EHT102's targeted indication OTOF-related deafness, DFNB9, estimate the diagnosed and
prevalent patient populations in China, and the population asset EHT102 can target in China.
Validate your data assumption with latest reports or literatures.
Show your estimation process step by step and list the reference links you used."""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Competitive-landscape question for EHT102 using both sources; internal documents
# take priority over the web search when they conflict.
# Note the lowercase company_name here (other cells use COMPANY_NAME); it is not
# interpolated into the question.
company_name = "Euhearing Therapeutics"

question1=f"""
For asset EHT102, please provide a well‑organized response of its
Competitive Context: comprehensively list all biotech companies with active programs targeting OTOF
for indication OTOF-related deafness, DFNB9, such as competitors Decibel Therapeutics (DB-OTO),
Akouos (AK-OTOF), Sensorion/Pasteur Institute (SENS-501/OTOF-GT), Otovia Therapeutics (OTOV101N+OTOV101C),
HuidaGene Therapeutics (AAV-gOTOF-emxABE), EmayGene (EA0010), Katholieke Universiteit Leuven (WO2025003513A1).
For each competitor, include: program, modality, clinical phase/status, key distinguishing features,
and financial information according to the following rules:        
- If the competitor is a private company, provide its latest post-money valuation.
- If the competitor was acquired, provide the acquisition deal size.
- If the competitor is public, fetch its **current** market cap using Google Finance and include the Google Finance URL. 
Do not use any other data sources for market cap.
Present the information in a structured table, followed by a concise descriptive summary."""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["jarvis_docs","online_search"])
print(result)

In [None]:
# Show the raw web-search text returned by the most recent answer_with_search_ensemble call.
print(online_search_response)

In [None]:
# China patient-population estimate with the asset's modality stated up front; web
# search only. COMPANY_NAME is assigned but not used in the question.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
Asset EHT102 is targeting OTOF-related deafness via Gene therapy (dual-AAV, protein-level recombination via intein).
For asset EHT102's targeted indication OTOF-related deafness, estimate the diagnosed and
prevalent patient populations in China, and the population asset EHT102 can target in China.\
Validate your data assumption with latest reports or literatures. 
Show your estimation process step by step and list the reference links you used.
"""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Pricing question for EHT102 in China; web search only.
# COMPANY_NAME is assigned but not used in the question.
COMPANY_NAME = "Euhearing Therapeutics"

question1=f"""
Asset EHT102 is targeting OTOF-related deafness via Gene therapy (dual-AAV, protein-level recombination via intein).
Consider the price of current SoC for OTOF-related deafness and price of gene therapy for other rare diseases in China,\
come up an reasonable price for EHT102 in China.\
Validate your data assumption with latest reports or literatures. 
Show your estimation process step by step and list the reference links you used.
"""
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["online_search"])
print(result)

In [None]:
# Financing history: how the firm learned about the company, its earlier funding
# rounds, and the valuation of the most recent prior round.
company_name = "Euhearing Therapeutics"

# The backslash continuations join the lines below into one long line; the
# indentation that follows each continuation stays inside the prompt text.
question1=f"""
Briefly state how we (Pivotal BioVenture Partners) learn about the company {company_name}.\
                            Then describe the previous financing history of company {company_name} prior to\
                            the current round. What rounds of funding has the company previously completed (size,\
                            investors etc.), and what were the key milestones achieved with each round? For company\
                            {company_name}'s most recent prior financing round, how much was raised and what were\
                            the pre-money and post-money valuation of the round?\
                            Please provide a single, cohesive paragraph to address all the questions.
"""
result, online_search_response = answer_with_search_ensemble(
    question1,
    bm25_retriever,
    50,
    db_jd,
    50,
    search_model="o4-mini",
    # only "jarvis_docs" is listed; presumably the answer relies on the indexed documents alone -- confirm
    priority_order=["jarvis_docs"],
)
print(result)

In [None]:
# Show the second value returned by the most recent answer_with_search_ensemble call.
# NOTE(review): the call in the cell above lists only "jarvis_docs" in priority_order,
# so this value may be empty -- confirm what the helper returns in that case.
print(online_search_response)

In [None]:
# Debug view: dump the 50 hits db_jd returns for question1, one field per line
# (source, relevance score, chunk text).
# Alternative query methods: similarity_search, similarity_search_with_score.
documents = db_jd.similarity_search_with_relevance_scores(question1, 50)
for hit, relevance in documents:
    print(hit.metadata['source'], relevance, hit.page_content, sep="\n")
    

In [None]:
# Hybrid retrieval check: run question1 through an equal-weight ensemble of the
# BM25 retriever and a retriever built from the db_jd vector store.
bm25_retriever.k = 50  # mutates the shared retriever; later cells see k == 50
vector_retriever = db_jd.as_retriever(search_kwargs={"k": 50})

ensemble = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever],
                             weights=[0.5, 0.5])

# invoke() is the supported retriever entry point; get_relevant_documents()
# is deprecated in LangChain and emits a deprecation warning.
jarvis_docs_docs = ensemble.invoke(question1)
[d.page_content for d in jarvis_docs_docs]

In [None]:
# Clinical-stage deep dive for asset EHT102: the prompt carries separate
# instructions for Phase 1, Phase 2 and Phase 3, plus a fallback sentence for a
# non-clinical asset.
# NOTE(review): COMPANY_NAME is assigned but never interpolated -- the f-string
# below has no placeholders and the company name is hard-coded in the text.
COMPANY_NAME = "Euhearing Therapeutics"

# The long runs of spaces are part of the string literal and are sent with the prompt.
question1=f"""
The asset EHT102 in company Euhearing Therapeutics's pipeline is in development stage Phase 1/2 (IND filed, first injection Dec 2024).                            If the asset EHT102 is in clinical stage, depending on the trial phase, provide the following:                            If the asset EHT102 is in Phase 1, describe the study                            design (including randomization, blinding, and control arms if used), participant type and number                            (healthy volunteers or patients), primary endpoints, and summarize quantitative results                            for safety, dose-limiting toxicities, and tolerability; additionally, provide any                            exploratory changes of biomarker data or preliminary signals of efficacy with concrete numberic results.                             If the asset EHT102 is in Phase 2, detail the study design (e.g., dose-finding, proof-of-concept), patient                            population and inclusion/exclusion criteria, primary and secondary endpoints (often initial                            efficacy and extended safety), and present efficacy and safety results using concrete quantitative                            metrics (response rates, changes in relevant biomarkers, mean differences, etc.), p-values, and                            interpret whether the results support advancement to pivotal trials.                            
If the asset EHT102 is in Phase 3, provide a comprehensive summary of the pivotal study design (randomization, control arms,                            multicenter participation), number of patients, detailed definitions of all primary and secondary                            endpoints, and full clinical results, including efficacy, safety, effect sizes, risk reductions,                            confidence intervals, and p-values, clearly assessing if the trial met its primary objectives and                            is likely to support regulatory approval or label expansion.                            For the trial, specify if the outcome was positive, negative, or inconclusive based on primary endpoints,                            and interpret the significance of the findings in light of current clinical standards                            and patient population needs. Note any missing data or information gaps.                            If the asset EHT102 is not in clinical stage, simply state that it's not in clinical stage."""
# Only "jarvis_docs" is listed in priority_order; presumably the answer relies on
# the indexed documents alone -- confirm against answer_with_search_ensemble.
result, online_search_response = answer_with_search_ensemble(question1, bm25_retriever, 50, db_jd, 50, search_model="o4-mini", priority_order=["jarvis_docs"])
print(result)

In [None]:
# Debug view: dump the 100 hits db_jt returns for question1 -- source, relevance
# score and chunk text, each on its own line.
# NOTE(review): this cell queries db_jt, while the other cells in this section use
# db_jd -- confirm db_jt is the intended store and not a typo.
documents=db_jt.similarity_search_with_relevance_scores(question1, 100)   #similarity_search, similarity_search_with_score, similarity_search_with_relevance_scores
for chunk, score in documents:
    print(chunk.metadata['source'])
    print(score)
    print(chunk.page_content)