In [2]:
from openai import OpenAI
import httpx
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
import os
import numpy as np
from ollama import Client
from langchain_text_splitters import RecursiveCharacterTextSplitter

os.environ["BASE_URL"] = "http://localhost:11434/v1"
os.environ["MODEL_NAME"] ='kamekichi128/qwen3-4b-instruct-2507'
os.environ["OPENAI_API_KEY"] = 'hey'

from trialmind.pubmed import pmid2papers, PubmedAPIWrapper, pmid2biocxml, parse_bioc_xml
from trialmind.api import StudyCharacteristicsExtraction, ScreeningCriteriaGeneration,\
                            LiteratureScreening, ScreeningCriteriaCTGeneration,\
                            CTScreening
from trialmind.retrievers import split_text_into_chunks
import extract

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
client = OpenAI(
        base_url=os.getenv("BASE_URL"),
        api_key=os.getenv("OPENAI_API_KEY"),
        http_client=httpx.Client(verify=False)
    )

In [4]:
response = client.chat.completions.create(
        model='MedAIBase/MedGemma1.5:4b',#'kamekichi128/qwen3-4b-instruct-2507',
        messages=[{'role':'user','content':'hey'}],
        temperature=0,
    )
print(response.choices[0].message.content)

Hello! How can I help you today?


In [5]:
fin_condition = 'Colorectal cancer'
treatements_eng =  ['Crizotinib','Lorlatinib',]
pubmed_api_key= '6892c4129cef143ff92d11533848d2e0d908'#os.getenv("PUBMED_API_KEY", '')

In [6]:
'''
search_api = PubmedAPIWrapper()
# page_size is the max number of records to return!!!! not pages!
tmp_inputs = {
        "page_size": 20,
        "keyword_map": {'conditions':[fin_condition], 
                        'treatments':[treatements_eng[0]]
                       },
        "keywords": {
            "OPERATOR": 'AND'
        }
}

response = search_api.build_search_query_and_get_pmid(tmp_inputs, 
                                                      api_key=pubmed_api_key)
'''
df_papers = pmid2papers(pmid_list=['37543570',
                                     '37773318',
                                     '38711893',
                                     '36053834',
                                     '38212428',
                                     '40369167',
                                     '36316649',
                                     '40140597',
                                     '40211189'],
                        #pmid_list=response[0], 
                        api_key=pubmed_api_key)
papers = df_papers[0]["Title"] + ": " + df_papers[0]["Abstract"].fillna("") # important to fillna
papers = papers.tolist()
papers[:1]

['Epithelial cell adhesion molecule (EpCAM) regulates HGFR signaling to promote colon cancer progression and metastasis.: Epithelial cell adhesion molecule (EpCAM) is known to highly expression and promotes cancer progression in many cancer types, including colorectal cancer. While metastasis is one of the main causes of cancer treatment failure, the involvement of EpCAM signaling in metastatic processes is unclear. We propose the potential crosstalk of EpCAM signaling with the HGFR signaling in order to govern metastatic activity in colorectal cancer.\nImmunoprecipitation (IP), enzyme-linked immunosorbent assay (ELISA), and fluorescence resonance energy transfer (FRET) was conducted to explore the extracellular domain of EpCAM (EpEX) and HGFR interaction. Western blotting was taken to determine the expression of proteins in colorectal cancer (CRC) cell lines. The functions of EpEX in CRC were investigated by proliferation, migration, and invasion analysis. The combined therapy was val

In [6]:
len(papers)

9

In [76]:

api = LiteratureScreening()
ec_predP = api.run(
    population = f"Patients with {fin_condition} undergoing treatment with {treatements_eng[0]}",
    intervention = f"{treatements_eng[0]}",
    comparator = "",
    outcome = "",
    llm = os.getenv("MODEL_NAME"),
    criteria = ["Does the content mention 'colorectal cancer'?",
                 "Does content mention 'Crizotinib'?", 
                'Is there a description of treatement results?'],#title_criteria + content_criteria,
    papers = papers, # make for the top-100 for demo
)
ec_predP

1

parsed_results in asynch [PaperEvaluation(evaluations=['YES', 'YES', 'YES'], rationale=['The paper explicitly discusses colorectal cancer (CRC) and its progression and metastasis.', 'The paper mentions crizotinib as part of a combined therapy with an anti-EpCAM antibody.', 'The paper describes treatment results, including the significant inhibition of tumor progression and prolonged survival in animal models when combining an anti-EpCAM antibody with crizotinib.'])]

parsed_results in asynch [PaperEvaluation(evaluations=['YES', 'YES', 'YES'], rationale=['The paper explicitly mentions colorectal cancer as one of the cancer types where ALK fusions/rearrangements have been reported and where ALK inhibitors have shown activity.', 'The paper clearly references crizotinib as one of the FDA-approved ALK inhibitors and discusses its use in ALK-aberrant tumors, including colorectal cancer.', 'The paper describes reports of ALK inhibitor activity in colorectal cancer with ALK fusions/rearrang

[PaperEvaluation(evaluations=['YES', 'YES', 'YES'], rationale=['The paper explicitly discusses colorectal cancer (CRC) and its progression and metastasis.', 'The paper mentions crizotinib as part of a combined therapy with an anti-EpCAM antibody.', 'The paper describes treatment results, including the significant inhibition of tumor progression and prolonged survival in animal models when combining an anti-EpCAM antibody with crizotinib.']),
 PaperEvaluation(evaluations=['YES', 'YES', 'YES'], rationale=['The paper explicitly mentions colorectal cancer as one of the cancer types where ALK fusions/rearrangements have been reported and where ALK inhibitors have shown activity.', 'The paper clearly references crizotinib as one of the FDA-approved ALK inhibitors and discusses its use in ALK-aberrant tumors, including colorectal cancer.', 'The paper describes reports of ALK inhibitor activity in colorectal cancer with ALK fusions/rearrangements, indicating treatment outcomes such as response

In [77]:
evalsP = [i.evaluations for i in ec_predP]

word2int = {"YES": 1, 
            "UNCERTAIN": 0,
            "NO": -1
           }
#rev_subs = { v:k for k, v in word2int.items()} # subs.iteritems() In Python 3
new_evalsP = []
for one_e in evalsP:
    new_evalsP.append([word2int.get(item, 0) for item in one_e ])
new_evalsP = np.array(new_evalsP)    
print(new_evalsP.sum(axis=1))
df_papers[0]['screen_eval'] = -5

df_p_e = df_papers[0]#.iloc[:5].copy()
df_p_e['screen_eval'] = new_evalsP.sum(axis=1)
df_p_e.head(3)

[ 3  3  1  2  1  2 -1  3  3  1  3  0  3  3  2  2  3  3  2  2]


Unnamed: 0,PMID,Journal,Year,Month,Day,Title,Publication Type,Authors,Abstract,screen_eval
0,37543570,Journal of translational medicine,2023,Aug,5,Epithelial cell adhesion molecule (EpCAM) regu...,"Journal Article, Research Support, Non-U.S. Gov't","Chi-Chiu Lee, Chia-Jui Yu, Sushree Shankar Pan...",Epithelial cell adhesion molecule (EpCAM) is k...,3
1,37773318,NPJ precision oncology,2023,Sep,29,ALK fusions in the pan-cancer setting: another...,"Journal Article, Review","Aditya Shreenivas, Filip Janku, Mohamed A Goud...",Anaplastic lymphoma kinase (ALK) alterations (...,3
2,36801912,NPJ precision oncology,2023,Feb,18,BRAF v600E-mutant cancers treated with vemuraf...,Journal Article,"Blessie Elizabeth Nelson, Jason Roszik, Filip ...",Combined BRAF‚Äâ+‚ÄâMEK inhibition is FDA approved...,1


In [7]:
df_p_e = df_papers[0]
df_p_e['screen_eval'] = 3

In [8]:
total_ev = 3
pmid_list = df_p_e[df_p_e.screen_eval>=total_ev
                        ].PMID.values.tolist()
papers_ch = df_p_e[df_p_e.screen_eval>=total_ev].Abstract.values
papers_ch_ta = df_p_e[df_p_e.screen_eval>=total_ev].Title.values +': ' + df_p_e[df_p_e.screen_eval>=total_ev].Abstract.values
docs =  [Document(page_content=i, 
                  metadata={"source": j}
                 ) for i,j in zip(papers_ch_ta,pmid_list)]
print(papers_ch.shape)
docs[:1]

(9,)


[Document(metadata={'source': '37543570'}, page_content='Epithelial cell adhesion molecule (EpCAM) regulates HGFR signaling to promote colon cancer progression and metastasis.: Epithelial cell adhesion molecule (EpCAM) is known to highly expression and promotes cancer progression in many cancer types, including colorectal cancer. While metastasis is one of the main causes of cancer treatment failure, the involvement of EpCAM signaling in metastatic processes is unclear. We propose the potential crosstalk of EpCAM signaling with the HGFR signaling in order to govern metastatic activity in colorectal cancer.\nImmunoprecipitation (IP), enzyme-linked immunosorbent assay (ELISA), and fluorescence resonance energy transfer (FRET) was conducted to explore the extracellular domain of EpCAM (EpEX) and HGFR interaction. Western blotting was taken to determine the expression of proteins in colorectal cancer (CRC) cell lines. The functions of EpEX in CRC were investigated by proliferation, migrati

In [11]:
vector_store._client.delete_collection(vector_store._collection.name)

In [12]:
embeddings = OllamaEmbeddings(model="all-minilm")

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

text_splitter=RecursiveCharacterTextSplitter(chunk_size=256, 
                                             chunk_overlap=50)

In [13]:
all_splits = text_splitter.split_documents(docs)
document_ids = vector_store.add_documents(documents=all_splits)

In [10]:
pmid_list

['37543570',
 '37773318',
 '38711893',
 '36053834',
 '38212428',
 '40369167',
 '36316649',
 '40140597',
 '40211189']

In [14]:
vector_store.similarity_search(f"the effectiveness of treating {fin_condition} with {treatements_eng[0]}",
                               k=4,
                               filter={"source": "37543570"}
)

[Document(id='0c40220b-0ae2-4971-bf54-036f137184f9', metadata={'source': '37543570'}, page_content='we show that the combined treatment of an anti-EpCAM neutralizing antibody (EpAb2-6) and an HGFR inhibitor (crizotinib) significantly inhibits tumor progression and prolongs survival in metastatic and orthotopic animal models of colon cancer.'),
 Document(id='b3480818-2ac1-4af1-89ca-c46f7dfd1f02', metadata={'source': '37543570'}, page_content='Our findings illuminate the molecular mechanisms underlying EpCAM signaling promotion of colon cancer metastasis, further suggesting that the combination of EpAb2-6 and crizotinib may be an effective strategy for treating cancer patients with high EpCAM'),
 Document(id='07337d48-a670-48ec-9968-932829d20216', metadata={'source': '37543570'}, page_content='Western blotting was taken to determine the expression of proteins in colorectal cancer (CRC) cell lines. The functions of EpEX in CRC were investigated by proliferation, migration, and invasion an

In [15]:
len(pmid_list)

9

In [15]:
os.environ["MODEL_NAME"] ='medllama2'

In [20]:
f'Crizotinib effectiveness, string, the specific percentages and numbers as outcomes of treating {fin_condition} with {treatements_eng[0]} in the analyzed paper'
           

'Crizotinib effectiveness, string, the specific percentages and numbers as outcomes of treating Colorectal cancer with Crizotinib in the analyzed paper'

In [18]:
ii =len(pmid_list) #4

api = StudyCharacteristicsExtraction()
extracted = api.run(
    papers_inp=[pmid_list[:ii],papers_ch[:ii]],
    #fields=[f'The effectiveness of treating {fin_condition} with {treatements_eng[0]}',
    #       ],
    fields=[f'Crizotinib effectiveness, string, the specific percentages and numbers as outcomes of treating {fin_condition} with {treatements_eng[0]} in the analyzed paper'
           ],
    llm=os.getenv("MODEL_NAME"),
    chunk_size=0,
    chunk_overlap=0,
    thinking=False,
    vector_store = vector_store,
)

['37543570', '37773318', '38711893', '36053834', '38212428', '40369167', '36316649', '40140597', '40211189']
37543570
we show that the combined treatment of an anti-EpCAM neutralizing antibody (EpAb2-6) and an HGFR inhibitor (crizotinib) significantly inhibits tumor progression and prolongs survival in metastatic and orthotopic animal models of colon cancer.
Western blotting was taken to determine the expression of proteins in colorectal cancer (CRC) cell lines. The functions of EpEX in CRC were investigated by proliferation, migration, and invasion analysis. The combined therapy was validated via a tail vein
Our findings illuminate the molecular mechanisms underlying EpCAM signaling promotion of colon cancer metastasis, further suggesting that the combination of EpAb2-6 and crizotinib may be an effective strategy for treating cancer patients with high EpCAM
combined therapy was validated via a tail vein injection method for the metastasis and orthotopic colon cancer models.
37773318
a

In [19]:
for one_extr in extracted:
    for one_filed_res in one_extr.fieldresult:
        if one_filed_res.value!="NP":
            print(one_filed_res.value)
            print(one_filed_res._cited_blocks)
            print()

significantly inhibits tumor progression and prolongs survival in metastatic and orthotopic animal models of colon cancer.
['combined therapy was validated via a tail vein injection method for the metastasis and orthotopic colon cancer models.']

showed significant activity in colorectal cancer patients bearing ALK fusions/rearrangements (~30% response rate).
['lung cancer (NSCLC), a rate that may be below the viability threshold of even large-scale treatment trials. Five ALK inhibitors -alectinib, brigatinib, ceritinb, crizotinib, and lorlatinib-are FDA approved for ALK-aberrant NSCLCs, and crizotinib is also']

Partial response to crizotinib + regorafenib + PD-1 inhibitor in a metastatic Colorectal cancer (CRC) with the Raf murine sarcoma viral oncogene homolog B (Raf/B)
['A 49-year-old Chinese male was diagnosed with ascending colon adenocarcinoma (cT3N+?M1) with liver metastases. The patient performed next-generation sequencing (NGS) using tissue and circulating tumor DNA (ctDNA), 

In [None]:
os.environ["MODEL_NAME"] ='qwen3:8b'#'kamekichi128/qwen3-4b-instruct-2507'

ii =len(pmid_list) #4

api = StudyCharacteristicsExtraction()
extracted0 = api.run(
    papers_inp=[pmid_list[:ii],papers_ch[:ii]],
    #fields=[f'The effectiveness of treating {fin_condition} with {treatements_eng[0]}',
    #       ],
    fields=[f'Crizotinib effectiveness, string, the specific percentages and numbers as outcomes of treating {fin_condition} with {treatements_eng[0]} in the analyzed paper'
           ],
    llm=os.getenv("MODEL_NAME"),
    chunk_size=0,
    chunk_overlap=0,
    thinking=False,
    vector_store = vector_store,
)

['37543570', '37773318', '38711893', '36053834', '38212428', '40369167', '36316649', '40140597', '40211189']
9
You are now the following python function: ```
def extract_fields_from_input_study(inputs: Dict[str, Any]) -> str:
    """
    This function is tasked with analyzing clinical trial study reports or papers to extract specific information as structured data and provide citations for the extracted information.
    The user will provide a list of fields they are interested in, along with a natural language description for each field to guide you on what content to look for and from which parts of the report to extract it.

    IMPORTANT:
    For each field described by the user, you need to:
    1. Identify and extract the relevant information from the report, based on the provided description.
       The answer should be under 200 characters. Summarize the information without losing clarity.
    2. Structure the extracted information into a standard format whenever possible (e.g.

In [None]:
for one_extr in extracted0:
    for one_filed_res in one_extr.fieldresult:
        if one_filed_res.value!="NP":
            print(one_filed_res.value)
            print(one_filed_res._cited_blocks)
            print()

In [42]:
len('Crizotinib-IR808@BSA NPs exhibited synergistic chemophototherapy effects on tumors. In conclusion, this innovative imaging-mediated multifunctionalÁªÑÂêàt')

150

In [126]:
extracted[0].fieldresult[0]

FieldResult(name='Crizotinib Results', value='Crizotinib and cabozantinib selectively inhibited ', source_id=[0])

In [96]:
extracted[0].fieldresult[0]._cited_blocks

['colorectal cancer (CRC) cells induce synthetic lethality when treated with inhibitors of c-MET receptor tyrosine kinase. c-MET specific inhibitor PHA-665752 as well as two other FDA-approved drugs, crizotinib and cabozantinib, selectively inhibited the growth of ARID1A-deficient CRC cells in vitro and in xenograft tumor models. Mechanistically, we identified a tripartite functional association among ARID1A, c-MET, and NRF2, where ARID1A and c-MET pathways converge on the NRF2 transcription']

In [19]:
from pydantic import BaseModel, validator, Field, conlist  # This is the new version
from typing import Dict, Literal
class FieldResult(BaseModel):
    name: str = Field(description='Field name that accurately represents the content of the field based on its description.',
                     max_length=25)
    value: str = Field(description='Extracted information from the text based on the field description.',
                      max_length=200)
    source_id: conlist(int,min_length=1, max_length=3) = Field(description='Cited document IDs.')
class Results(BaseModel):
    fieldresult: list[FieldResult]#FieldResult


In [14]:
retrieved_docs = vector_store.similarity_search(f'The effectiveness of treating {fin_condition} with {treatements_eng[0]}', 
        k=2,filter={"source": '37418240'})
[i.page_content for i in retrieved_docs]

['Crizotinib-IR808@BSA NPs exhibited synergistic chemophototherapy effects on tumors. In conclusion, this innovative imaging-mediated multifunctional combination therapy strategy with good c-Met targeting ability may provide a new approach for colorectal cancer treatment.',
 'in advanced stages of CRC and is considered to be a potent tumor biomarker. Herein, based on the well-targeted inhibitory effect of Crizotinib on c-Met positive tumor cells, the dye IR808 was covalently combined with the drug molecule Crizotinib, resulting in the synthesis of a NIR fluorescent probe Crizotinib-IR808 targeting c-Met positive tumor cells. Then, water-insoluble Crizotinib-IR808 was fabricated by using bovine serum albumin (BSA)']

In [21]:
from trialmind.prompts.extraction import STUDY_FIELDS_EXTRACTION_3

In [23]:
docs_content = '\n\n'.join([i.page_content for i in retrieved_docs])
fields=[f'Crizotinib Results, string, the effectiveness of treating {fin_condition} with {treatements_eng[0]}'
           ]
user_m = f'''
            paper_content = \"\"\"{docs_content}\"\"\"
            fields = \"\"\"{fields}\"\"\"
            '''

prompt = f'''
You are a helpful assistant. Use the following context in your response:
'''

response = client.chat.completions.parse(
        model='kamekichi128/qwen3-4b-instruct-2507',
        messages=[{'role':'system', 
                   'content':STUDY_FIELDS_EXTRACTION_3},
                  {'role':'user','content':user_m}],
        temperature=0,
        response_format = Results
    )
print(response.choices[0].message.parsed)

fieldresult=[FieldResult(name='Crizotinib Effectiveness', value='Crizotinib-IR808 NPs showed synergistic chemophototherapy effects and good c-Met targeting ability, indicating potential for colorectal cancer treatment.', source_id=[1])]


## stuff

In [73]:
hh = Results(fieldresult=[FieldResult(name='Crizotinib Results', value='Crizotinib selectively inhibited the growth of ARD', source_id=[1]), FieldResult(name='Participants', value='NP', source_id=[1])])

In [83]:
i._c = ''

In [106]:
retrieved_docs = vector_store.similarity_search('Crizotinib results', k=2)
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
docs_content


'PD-0325901/crizotinib can be given together at pharmacologically-active doses. The MTD for PD-0325901/crizotinib was 8\u2009mg B.D (days 1-21) and 200\u2009mg B.D continuously in a 28-days cycle. The combination was further explored with an alternate MEK1/2 inhibitor in RASMT CRC patients.\n2014-000463-40.\n\nTwenty-five patients were recruited in 4 cohorts up to doses of crizotinib 200\u2009mg B.D continuously with PD-0325901 8\u2009mg B.D, days 1-21 every 28 days. One in six patients exhibited a dose-limiting toxicity at this dose level. Drug-related adverse events were in keeping with single-agent toxicity profiles. The best clinical response was stable disease in seven patients (29%).'

In [141]:
Results

__main__.Results

In [142]:
docs_content = 'drugs. In this study, we report that ARID1A-deficient colorectal cancer (CRC) cells induce synthetic lethality when treated with inhibitors of c-MET receptor tyrosine kinase. c-MET specific inhibitor PHA-665752 as well as two other FDA-approved drugs, crizotinib and cabozantinib, selectively inhibited the growth of ARID1A-deficient CRC cells in vitro and in xenograft tumor models. Mechanistically, we identified a tripartite functional'

print(docs_content)


prompt = f'''
You are a helpful assistant. Use the following context in your response:
'''

response = client.chat.completions.parse(
        model='kamekichi128/qwen3-4b-instruct-2507',
        messages=[{'role':'system', 'content':prompt+docs_content},
                  {'role':'user','content':'Crizotinib results'}],
        temperature=0,
        response_format = Results
    )
print(response.choices[0].message.parsed)

drugs. In this study, we report that ARID1A-deficient colorectal cancer (CRC) cells induce synthetic lethality when treated with inhibitors of c-MET receptor tyrosine kinase. c-MET specific inhibitor PHA-665752 as well as two other FDA-approved drugs, crizotinib and cabozantinib, selectively inhibited the growth of ARID1A-deficient CRC cells in vitro and in xenograft tumor models. Mechanistically, we identified a tripartite functional
fieldresult=[FieldResult(name='crizotinib', value='Crizotinib results in the selective inhibition of ', source_id=[-1, -1, -1]), FieldResult(name='crizotinib', value='Crizotinib, a FDA-approved c-MET inhibitor, wasÊúâÊïàÁöÑ', source_id=[-1, -1, -1]), FieldResult(name='crizotinib', value='Crizotinib effectively suppresses the growth of AR', source_id=[-1, -1, -1])]


In [143]:
response.choices[0].message.parsed

Results(fieldresult=[FieldResult(name='crizotinib', value='Crizotinib results in the selective inhibition of ', source_id=[-1, -1, -1]), FieldResult(name='crizotinib', value='Crizotinib, a FDA-approved c-MET inhibitor, wasÊúâÊïàÁöÑ', source_id=[-1, -1, -1]), FieldResult(name='crizotinib', value='Crizotinib effectively suppresses the growth of AR', source_id=[-1, -1, -1])])

In [16]:
%%time
response = client.chat.completions.create(
        model='kamekichi128/qwen3-4b-instruct-2507',
        messages=[{'role':'user','content':'hey'}],
        temperature=0,
    )
print(response.choices[0].message.content)

Hi there! üòä How can I help you today?
CPU times: total: 0 ns
Wall time: 3.04 s


In [4]:
data = ['''


Introduction: In the phase 3 study entitled ALK in Lung cancer Trial of brigAtinib in 1st Line (ALTA-1L), which is a study of brigatinib in ALK inhibitor-naive advanced ALK-positive NSCLC, brigatinib exhibited superior progression-free survival (PFS) versus crizotinib in the two planned interim analyses. Here, we report the final efficacy, safety, and exploratory results.

Methods: Patients were randomized to brigatinib 180 mg once daily (7-d lead-in at 90 mg once daily) or crizotinib 250 mg twice daily. The primary end point was a blinded independent review committee-assessed PFS. Genetic alterations in plasma cell-free DNA were assessed in relation to clinical efficacy.

Results: A total of 275 patients were enrolled (brigatinib, n = 137; crizotinib, n = 138). At study end, (brigatinib median follow-up = 40.4 mo), the 3-year PFS by blinded independent review committee was 43% (brigatinib) versus 19% (crizotinib; median = 24.0 versus 11.1 mo, hazard ratio [HR] = 0.48, 95% confidence interval [CI]: 0.35-0.66). The median overall survival was not reached in either group (HR = 0.81, 95% CI: 0.53-1.22). Posthoc analyses suggested an overall survival benefit for brigatinib in patients with baseline brain metastases (HR = 0.43, 95% CI: 0.21-0.89). Detectable baseline EML4-ALK fusion variant 3 and TP53 mutation in plasma were associated with poor PFS. Brigatinib exhibited superior efficacy compared with crizotinib regardless of EML4-ALK variant and TP53 mutation. Emerging secondary ALK mutations were rare in patients progressing on brigatinib. No new safety signals were observed.

Conclusions: In the ALTA-1L final analysis, with longer follow-up, brigatinib continued to exhibit superior efficacy and tolerability versus crizotinib in patients with or without poor prognostic biomarkers. The suggested survival benefit with brigatinib in patients with brain metastases warrants future study.

''']

In [5]:
#from langchain.docstore.document import Document

doc =  Document(page_content=data[0], metadata={"source": "local"})
doc

Document(metadata={'source': 'local'}, page_content='\n\n\nIntroduction: In the phase 3 study entitled ALK in Lung cancer Trial of brigAtinib in 1st Line (ALTA-1L), which is a study of brigatinib in ALK inhibitor-naive advanced ALK-positive NSCLC, brigatinib exhibited superior progression-free survival (PFS) versus crizotinib in the two planned interim analyses. Here, we report the final efficacy, safety, and exploratory results.\n\nMethods: Patients were randomized to brigatinib 180 mg once daily (7-d lead-in at 90 mg once daily) or crizotinib 250 mg twice daily. The primary end point was a blinded independent review committee-assessed PFS. Genetic alterations in plasma cell-free DNA were assessed in relation to clinical efficacy.\n\nResults: A total of 275 patients were enrolled (brigatinib, n = 137; crizotinib, n = 138). At study end, (brigatinib median follow-up = 40.4 mo), the 3-year PFS by blinded independent review committee was 43% (brigatinib) versus 19% (crizotinib; median = 

6

OllamaEmbeddings(model='all-minilm', validate_model_on_init=False, base_url=None, client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter=RecursiveCharacterTextSplitter(chunk_size=400, 
                                             chunk_overlap=100)
all_splits = text_splitter.split_documents([doc])
len(all_splits)

embeddings = OllamaEmbeddings(model="all-minilm")

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

document_ids = vector_store.add_documents(documents=all_splits)

retrieved_docs = vector_store.similarity_search('Crizotinib results', k=2)
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
print(docs_content)


prompt = f'''
You are a helpful assistant. Use the following context in your response:
'''

response = client.chat.completions.create(
        model='kamekichi128/qwen3-4b-instruct-2507',
        messages=[{'role':'system', 'content':prompt+docs_content},
                  {'role':'user','content':'Crizotinib results'}],
        temperature=0,
    )
print(response.choices[0].message.content)

['e2281722-8819-4760-8ddc-a030fef6426e',
 'f72a5bd2-2df8-4f93-8054-35ccfbda5e8b',
 'fa0fe031-bf16-46eb-93b7-b55b9e203665',
 '651ffb0f-de2d-4e66-ae78-5551a5df75ea',
 'b688449b-2d03-4574-ada3-17fb1b9fd208',
 'dc0bc29f-f933-402a-8d47-451cfcc00b3e']

Results: A total of 275 patients were enrolled (brigatinib, n = 137; crizotinib, n = 138). At study end, (brigatinib median follow-up = 40.4 mo), the 3-year PFS by blinded independent review committee was 43% (brigatinib) versus 19% (crizotinib; median = 24.0 versus 11.1 mo, hazard ratio [HR] = 0.48, 95% confidence interval [CI]: 0.35-0.66). The median overall survival was not reached in either

Results: A total of 275 patients were enrolled (brigatinib, n = 137; crizotinib, n = 138). At study end, (brigatinib median follow-up = 40.4 mo), the 3-year PFS by blinded independent review committee was 43% (brigatinib) versus 19% (crizotinib; median = 24.0 versus 11.1 mo, hazard ratio [HR] = 0.48, 95% confidence interval [CI]: 0.35-0.66). The median overall survival was not reached in either


In [13]:
prompt = f'''
You are a helpful assistant. Use the following context in your response:
'''

response = client.chat.completions.create(
        model='kamekichi128/qwen3-4b-instruct-2507',
        messages=[{'role':'system', 'content':prompt+docs_content},
                  {'role':'user','content':'Crizotinib results'}],
        temperature=0,
    )
print(response.choices[0].message.content)

The crizotinib results from the study show the following:

- **Progression-Free Survival (PFS):**  
  At study end, the 3-year PFS by blinded independent review committee was **19%** for patients treated with crizotinib.  
  The median PFS duration was **11.1 months**.

- **Hazard Ratio (HR):**  
  Compared to brigatinib, crizotinib had a hazard ratio of **0.48 (95% CI: 0.35‚Äì0.66)** for progression or death, indicating that crizotinib was less effective in delaying disease progression than brigatinib.

- **Overall Survival (OS):**  
  The median overall survival was **not reached** in either treatment group, suggesting that the study was not able to provide a definitive OS benefit for either drug.

In summary, crizotinib demonstrated a lower progression-free survival rate and shorter median PFS compared to brigatinib, with a statistically significant improvement in PFS observed with brigatinib. However, overall survival was not reached in either group.


In [None]:
ollama.embeddings(model='all-minilm', 
                  prompt='The sky is blue because of Rayleigh scattering')


Hello! How can I assist you today? üòä


In [30]:
PROMPT_RES_EXTRACTION  = '''
You are a clinical specialist analyzing clinical trial study reports. 
Your task is to to extract specific information as structured data.

# Reply Format: 
Return the information in the following JSON-format.
```json
{{        
    [
        {{
            "population": n,
            "time_frame": "time_frame",
            "outcomes":
                [
                    {{
                        "category_name": "category1",
                        "outcome": k1
                    }},
                    {{
                        "category_name": "category2",
                        "outcome": k2
                    }},
                    ...
                ]
         }},
        ...
    ]
}}
```
You MUST return ONLY valid JSON, Do NOT include any explanations, comments, or extra text.
"""
'''

In [29]:
to_work = [{'type': 'PRIMARY',
  'title': 'Clinical Response to Binimetinib Combined With PF-02341066',
  'description': 'To investigate response to treatment with RPII dose of Binimetinib with Crizotinib (PF-02341066), in patients with a) RASMT CRC or b) RASWT/cMET mut amplified CRC or c) RASWT/c-MET over-expressed CRC, as defined by stable, partially or completely responding disease, per Response Evaluation Criteria In Solid Tumors Criteria (RECIST v1.1) for target lesions and assessed by CT: Complete Response (CR), Disappearance of all target lesions; Partial Response (PR), \\>=30% decrease in the sum of the longest diameter of target lesions; Stable Disease (SD), Neither sufficient shrinkage to qualify for PR nor sufficient increase (\\>=20%) to qualify for Progressive Disease; Overall Response (OR) = CR + PR + SD',
  'populationDescription': 'Evaluable patients for the primary outcome are those patients who complete a response assessment after cycle 1 of treatment, or who progress early on treatment. 30 of the 36 recruited patients had a response assessment after cycle 1 or progressed early on treatment, and hence these 30 patients are evaluable for the primary analysis.',
  'reportingStatus': 'POSTED',
  'paramType': 'COUNT_OF_PARTICIPANTS',
  'unitOfMeasure': 'Participants',
  'timeFrame': 'Dose Expansion phase: change from baseline and up to 12 months.',
  'groups': [{'id': 'OG000',
    'title': 'Dose Expansion Phase',
    'description': 'Binimetinib 30mg BD interval dose administration Days 1-21 every 28 days PF-02341066 (Crizotinib) 250mg OD Days 1-28 continuously Dosage determined following the recommended Phase II dose identification in the dose escalation phase.\n\nPF-02341066: PF-02341066 (Crizotinib) 250mg OD or 200mg BD or 250mg BD Days 1-28 continuously\n\nBinimetinib: Binimetinib 30mg or 45 mg BD either continuously or Days 1-21 every 28 days'}],
  'denoms': [{'units': 'Participants',
    'counts': [{'groupId': 'OG000', 'value': '30'}]}],
  'classes': [{'categories': [{'title': 'Stable Disease',
      'measurements': [{'groupId': 'OG000', 'value': '7'}]},
     {'title': 'Progressive Disease',
      'measurements': [{'groupId': 'OG000', 'value': '22'}]},
     {'title': 'Early death from malignant disease',
      'measurements': [{'groupId': 'OG000', 'value': '1'}]}]}]}]
to_work

[{'type': 'PRIMARY',
  'title': 'Clinical Response to Binimetinib Combined With PF-02341066',
  'description': 'To investigate response to treatment with RPII dose of Binimetinib with Crizotinib (PF-02341066), in patients with a) RASMT CRC or b) RASWT/cMET mut amplified CRC or c) RASWT/c-MET over-expressed CRC, as defined by stable, partially or completely responding disease, per Response Evaluation Criteria In Solid Tumors Criteria (RECIST v1.1) for target lesions and assessed by CT: Complete Response (CR), Disappearance of all target lesions; Partial Response (PR), \\>=30% decrease in the sum of the longest diameter of target lesions; Stable Disease (SD), Neither sufficient shrinkage to qualify for PR nor sufficient increase (\\>=20%) to qualify for Progressive Disease; Overall Response (OR) = CR + PR + SD',
  'populationDescription': 'Evaluable patients for the primary outcome are those patients who complete a response assessment after cycle 1 of treatment, or who progress early on 

In [34]:
from enum import Enum
from pydantic import BaseModel, Field

class Outcome(BaseModel):
    category_name: str = Field(description='Short description of a category')
    outcome: int = Field(description='Percent of participants')

class ClinicalResult(BaseModel):
    population: int = Field(description='Total number of participants.')
    time_frame: str = Field(description='Time frame')
    outcomes: list[Outcome]

In [35]:
messages = [{'role':'system', 'content':PROMPT_RES_EXTRACTION+' \no_think'},
                {'role':'user', 'content':f"{to_work[0]}"}]

In [38]:
response = client.chat.completions.create(
                model='qwen3:8b',
                messages=messages,
                temperature=0,
                response_format={
                    "type": "json_object",
                    "json_schema": {
                        "name": "schema",
                        "schema": ClinicalResult.model_json_schema()
                    },
                },
            )
fin = response.choices[0].message.content
answer = fin.strip('<think>\n\n</think>\n\n')
print(answer)

{
    "population": 30,
    "time_frame": "Dose Expansion phase: change from baseline and up to 12 months.",
    "outcomes": [
        {
            "category_name": "Stable Disease",
            "outcome": 7
        },
        {
            "category_name": "Progressive Disease",
            "outcome": 22
        },
        {
            "category_name": "Early death from malignant disease",
            "outcome": 1
        }
    ]
}


In [42]:
response = client.chat.completions.create(
                model='qwen3:8b',
                messages=messages,
                temperature=0,
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "schema",
                        "schema": ClinicalResult.model_json_schema()
                    },
                },
            )
fin = response.choices[0].message.content
answer = fin.strip('<think>\n\n</think>\n\n')
print(answer)

{
    "population": 30,
    "time_frame": "Dose Expansion phase: change from baseline and up to 12 months.",
    "outcomes": [
        {
            "category_name": "Stable Disease",
            "outcome": 7
        },
        {
            "category_name": "Progressive Disease",
            "outcome": 22
        },
        {
            "category_name": "Early death from malignant disease",
            "outcome": 1
        }
    ]
}


In [45]:
response = client.chat.completions.create(
                model='qwen3:8b',
                messages=messages,
                temperature=0,
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "schema",
                        "schema": ClinicalResult.model_json_schema(),
                        'strict':True,
                    },
                    
                },
            )
fin = response.choices[0].message.content
answer = fin.strip('<think>\n\n</think>\n\n')
print(answer)

{
    "population": 30,
    "time_frame": "Dose Expansion phase: change from baseline and up to 12 months",
    "outcomes": [
        {
            "category_name": "Stable Disease",
            "outcome": 7
        },
        {
            "category_name": "Progressive Disease",
            "outcome": 22
        },
        {
            "category_name": "Early death from malignant disease",
            "outcome": 1
        }
    ]
}


In [51]:
response = client.chat.completions.parse(
    model='qwen3:8b',
    messages=messages,
    response_format=ClinicalResult,
)

fin = response.choices[0].message.content
answer = fin.strip('<think>\n\n</think>\n\n')
print(answer)

{
    "population": 30,
    "time_frame": "Dose Expansion phase: change from baseline and up to 12 months",
    "outcomes": [
        {
            "category_name": "Stable Disease",
            "outcome": 7
        },
        {
            "category_name": "Progressive Disease",
            "outcome": 22
        },
        {
            "category_name": "Early death from malignant disease",
            "outcome": 1
        }
    ]
}


In [56]:
hh = response.choices[0].message.parsed
hh

ClinicalResult(population=30, time_frame='Dose Expansion phase: change from baseline and up to 12 months', outcomes=[Outcome(category_name='Stable Disease', outcome=7), Outcome(category_name='Progressive Disease', outcome=22), Outcome(category_name='Early death from malignant disease', outcome=1)])

In [58]:
client.vector_stores

<openai.resources.vector_stores.vector_stores.VectorStores at 0x22ff3ca13d0>

In [40]:
ClinicalResult.model_validate_json(answer)

ClinicalResult(population=30, time_frame='Dose Expansion phase: change from baseline and up to 12 months.', outcomes=[Outcome(category_name='Stable Disease', outcome=7), Outcome(category_name='Progressive Disease', outcome=22), Outcome(category_name='Early death from malignant disease', outcome=1)])

In [41]:
import ollama

response = ollama.embed(
    model='qwen3-embedding',
    input='The sky is blue because of Rayleigh scattering',
)
print(response.embeddings)

ResponseError: model "qwen3-embedding" not found, try pulling it first (status code: 404)