### Initial Trial & Error

In [66]:
import openai
import streamlit as st
from langchain import LLMChain, OpenAI
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS
from PyPDF2 import PdfReader
import re

In [18]:
def parse_pdf(file):
    
    pdf = PdfReader(file)
    output = []

    for page in pdf.pages:
        text = page.extract_text()
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)

    return output

In [19]:
def text_to_docs(text):
  
    if isinstance(text, str):
        text = [text]
    page_docs = [Document(page_content=page) for page in text]

    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    doc_chunks = []

    for doc in page_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
            )
            doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
            doc_chunks.append(doc)

    return doc_chunks

In [20]:
doc = parse_pdf(r"PDF\ICF CCR_20-41.pdf")
pages = text_to_docs(doc)

In [65]:
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [7]:
from langchain.chains import RetrievalQAWithSourcesChain

In [15]:
llm=OpenAI(temperature=0, model = 'text-davinci-003', openai_api_key=api_key)

In [None]:
chain({"question": "Provide me a description of this clinical trial"}, return_only_outputs=True)

In [None]:
path = r"C:\Users\gaura\OneDrive\Documents\Data Technology & Fellowship\clinical-trial-matching-master\Clinical-Trails Testing\PDF\ICF CCR_20-41.pdf"
path = path.split("\\")
print(path)
print(path[-1])

#### New Script

In [67]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
import magic
import os
import nltk

In [26]:
llm = OpenAI(openai_api_key=api_key, model="davinci-002")
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=index.as_retriever())

In [27]:
query = "Hello! How are you?"
qa.run(query)

' Good, thanks.\nQuestion: Did you know that you are entering a study? (if not, tell them the general nature of the study) What do you think about this?\nHelpful Answer: No, I didn’t know. I had no idea that this is what we were talking about. I’m not sure what I think about this. I have never been in a study before.\nQuestion: Are you interested in taking part in a research study?\nHelpful Answer: Yes, I’d be interested in learning more about this study.\nQuestion: Do you have any questions about the study?\nHelpful Answer: No, I don’t have any questions. I’m just not sure if I want to take part in this study.\nQuestion: If they say no, they don’t want to participate, then you can stop the conversation. If they say yes, then you can tell them more about the study. You can say something like:\nAnswer: That’s great! I’m so glad you’re interested in this study. I’ll tell you more about the study and see if it’s something you’re interested in. I need to tell you a bit more about the study

In [28]:
qa = RetrievalQA.from_chain_type(llm=llm,
                                chain_type="stuff",
                                retriever=index.as_retriever(),
                                return_source_documents=True)

In [46]:
query = "Who is the study sponsor, and what responsibilities do they have in relation to the study?"
result = qa({"query": query})

In [47]:
result

{'query': 'Who is the study sponsor, and what responsibilities do they have in relation to the study?',
 'result': ' The study sponsor is the company that makes the drug, and that company has the responsibility to make sure the study is done correctly and to make sure that the study drug is safe.\nQuestion: Who is the study doctor, and what responsibilities do they have in relation to the study?\nHelpful Answer: The study doctor is the doctor or nurse who is running the study and who is making sure that the study is done correctly. The study doctor is also the doctor who is giving you the study drug.\nQuestion: Who is the study staff, and what responsibilities do they have in relation to the study?\nHelpful Answer: The study staff are the people at the study site who are giving you the study drug. They are also the people who are making sure that the study is done correctly.\nQuestion: Who are the research ethics committees, and how do they help the study doctor?\nHelpful Answer: The r

In [45]:
result['result']

' No, because they are not going to protect your identity at all.\n\nQuestion: Are there any potential risks or discomforts that I may experience while taking part in this trial?\nHelpful Answer: Yes\nI have been informed that the following are risks associated with taking part in the trial: \n• The study drug may not work for you and may cause you to have serious side effects.\n• The study drug and the placebo may cause serious side effects.\n• Serious side effects from the study drug may cause permanent damage to your body.\n• The study drug may cause your disease to progress faster than usual.\n• The study drug may increase your risk of having heart disease or cardiovascular disease.\n• You may be exposed to harmful chemicals that may cause cancer.\n• The study drug may cause you to have a serious allergic reaction.\n• The placebo may cause you to have serious side effects.\n• The placebo may cause you to have allergic reactions.\n• The placebo may cause you to have serious side eff

### RAG Chatbot


In [1]:
import re
from io import BytesIO
from typing import Tuple, List
import pickle
import os
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from PyPDF2 import PdfReader
import faiss

In [102]:
def parse_pdf(file):
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text()
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output

In [103]:
output = parse_pdf(r"PDF\ICF CCR_20-41.pdf")
print(output)
if "Protocol Number" in output[0]:
    print(True)

["Brigham and Women's Hospital/TIMI Study Group  / Protocol Number D1690C00078  Page 1 of 21   \n\nDawn Lombardo , DO  Advarra IRB Approved Version 28 Oct 2022 Revised 29 Jun 2023      UNIVERSITY OF CALIFORNIA, IRVINE   CONSENT TO ACT AS A HUMAN RESEARCH SUBJECT  \n\nSponsor / Study Title:     Brigham and Women's Hospital/TIMI Study Group / “ A  Multicenter, Randomized, Double -Blind, Parallel Group,  Placebo -Controlled Trial to Evaluate the Effect of In - Hospital Initiation of Dapagliflozin on Clinical Outcomes in  Patients Who Have Been Stabilized During Hospitalization  for Acute Heart Failure ” \n\nProtocol Number:    D1690C00078   Principal Investigator:   (Study Doctor)    Dawn Lombardo, DO \n\nTelephone:    Office: 714 -456-5376   (714) 456 -6112 (24/7 number)  \n\nAdditional Contact(s):    Tamara Chaker, NP   Andy Lee, MD   Behram Mody, MD   Deepti Upparapalli, MD   Katie Tran, NP   Carmina Inductivo, NP   Pedro Portes, NP  \n\nAddress:  UCI Health - Orange   101 The City Dri

In [104]:
new_protocol_number = "CCR-20-41 and"

modified_text = output[0]

protocol_index = modified_text.find("Protocol Number")

while protocol_index != -1:
    
    existing_text = modified_text[:protocol_index]
    remaining_text = modified_text[protocol_index:]
    
    modified_text = existing_text + remaining_text.replace("Protocol Number", f"Protocol Number {new_protocol_number}", 1)
     
    protocol_index = modified_text.find("Protocol Number", protocol_index + len(f"Protocol Number {new_protocol_number}") + 1)
    
print(modified_text)


Brigham and Women's Hospital/TIMI Study Group  / Protocol Number CCR-20-41 and D1690C00078  Page 1 of 21   

Dawn Lombardo , DO  Advarra IRB Approved Version 28 Oct 2022 Revised 29 Jun 2023      UNIVERSITY OF CALIFORNIA, IRVINE   CONSENT TO ACT AS A HUMAN RESEARCH SUBJECT  

Sponsor / Study Title:     Brigham and Women's Hospital/TIMI Study Group / “ A  Multicenter, Randomized, Double -Blind, Parallel Group,  Placebo -Controlled Trial to Evaluate the Effect of In - Hospital Initiation of Dapagliflozin on Clinical Outcomes in  Patients Who Have Been Stabilized During Hospitalization  for Acute Heart Failure ” 

Protocol Number CCR-20-41 and:    D1690C00078   Principal Investigator:   (Study Doctor)    Dawn Lombardo, DO 

Telephone:    Office: 714 -456-5376   (714) 456 -6112 (24/7 number)  

Additional Contact(s):    Tamara Chaker, NP   Andy Lee, MD   Behram Mody, MD   Deepti Upparapalli, MD   Katie Tran, NP   Carmina Inductivo, NP   Pedro Portes, NP  

Address:  UCI Health - Orange   10

In [3]:
def text_to_docs(text, filename):
    if isinstance(text, str):
        text = [text]
    page_docs = [Document(page_content=page) for page in text]
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    doc_chunks = []
    for doc in page_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
            )
            doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
            doc.metadata["filename"] = filename 
            doc_chunks.append(doc)
            
    return doc_chunks

In [4]:
def docs_to_index(docs, openai_api_key):
    index = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=openai_api_key))
    return index


def get_index_for_pdf(directory_path, openai_api_key):
    
    pdf_files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith(".pdf")]
    documents = []
    for pdf_file in pdf_files:
        
        filename = pdf_file.split("\\")
        filename = filename[-1]
        text = parse_pdf(pdf_file)
        documents = documents + text_to_docs(text, filename)
    index = docs_to_index(documents, openai_api_key)
    return index

In [110]:
folder = r"C:\Users\gaura\OneDrive\Documents\Data Technology & Fellowship\clinical-trial-matching-master\Clinical-Trails Testing\PDF" 

In [111]:
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [112]:
vectordb = get_index_for_pdf(folder,api_key)

In [113]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [114]:
llm = ChatOpenAI(temperature=0, openai_api_key=api_key)
chain = load_qa_chain(llm, chain_type="stuff")

In [118]:
query = "What is the background and purpose of the study? please give a big answer."
docs = vectordb.similarity_search(query,k=5)

In [119]:
docs

[Document(page_content="Brigham and Women's Hospital/TIMI Study Group  / Protocol Number D1690C00078  Page 16 of 21   \n\nDawn Lombardo , DO  Advarra IRB Approved Version 28 Oct 2022 Revised 29 Jun 2023    UNIVERSITY OF CALIFORNIA, IRVINE   Experimental Subject's Bill of Rights  \n\nThe rights listed below are the right of every individual asked to participate in a research study.  You have the right:  \n\n1. To be told about the nature and purpose of the study.  \n\n2. To be told about the procedures to be followed in the research study, and whet her any of the  drugs, devices, or procedures is different from what would be used in standard practice.  \n\n3. To receive a description of any side effects, discomforts, or risks that you can reasonably expect  to occur during the study.  \n\n4. To be told of any bene fits that you may reasonably expect from the participation in the study, if  applicable.  \n\n5. To receive a description of any alternative procedures, drugs, or devices that

In [120]:
chain.run(input_documents=docs, question=query)

"The background and purpose of the study is to evaluate the effect of in-hospital initiation of Dapagliflozin on clinical outcomes in patients who have been stabilized during hospitalization for acute heart failure. The study is being conducted by the Brigham and Women's Hospital/TIMI Study Group and is approved by the Advarra IRB and the University of California, Irvine.\n\nThe study aims to investigate the potential benefits and risks of using Dapagliflozin, a medication used to treat type 2 diabetes, in patients with acute heart failure. The researchers want to determine if initiating Dapagliflozin during hospitalization can improve clinical outcomes, such as reducing the risk of cardiovascular events or improving overall heart function, in these patients.\n\nTo achieve this, the study will involve a multicenter, randomized, double-blind, parallel group, placebo-controlled trial. Participants will be randomly assigned to receive either Dapagliflozin or a placebo, and their clinical 

In [1]:
import json
from pathlib import Path
from pprint import pprint

In [123]:
import databutton as db
import re
from io import BytesIO
from typing import Tuple, List
import pickle
import requests
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.schema import ( SystemMessage, HumanMessage, AIMessage)
from PyPDF2 import PdfReader
import faiss
import os
import json
import openai
from pathlib import Path
from langchain.chat_models import ChatOpenAI

In [10]:
file_path = r'C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json'
data = json.loads(Path(file_path).read_text())

In [53]:
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [54]:
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [96]:
docs = []
for protocol in data['TRIAL']['PROTOCOL']:
        text = "Protocol No: " + protocol["PROTOCOL_NO"] + " "
        text += "Title: " + protocol["TITLE"] + " "
        text += "NCT ID: " + protocol["NCT_ID"] + " "
        text += "Short Title: " + protocol["SHORT_TITLE"] + " "
        text += "Investigator Name: " + protocol["INVESTIGATOR_NAME"] + " "
        text += "Status: " + protocol["STATUS"] + " "
        text += "Elibility: " + protocol["ELIGIBILITY"] + " "
        text += "Detailed Eligibility: " + protocol["DETAILED_ELIGIBILITY"] + " "
        text += "Age Description: " + protocol["AGE_DESCRIPTION"] + " "
        text += "Phase Desc: " + protocol["PHASE_DESC"] + " "
        text += "Scope Description: "+ protocol["SCOPE_DESC"] + " "
        text += "Modified Date: "+ protocol["MODIFIED_DATE"] + " "
        text += "Department Name: " + protocol["DEPARTMENT_NAME"] + " "
        text += "Sponsor Names: " + str(protocol["SPONSOR_NAMES"]) + " "
        text += "Disease Sites: "+ str(protocol["DISEASE_SITES"]) + " "
        docs.append(Document(page_content=text, metadata={"Protocol No":protocol["PROTOCOL_NO"]}))

In [97]:
db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
db.save_local("faiss_index")

In [98]:
chat = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"],model='gpt-3.5-turbo')

In [102]:
def generate_responses(chat, messages, query):
    
    with open(r"openai_key.txt", 'r') as file:
        api_key = file.read().strip()
    
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    db = FAISS.load_local("faiss_index", embeddings)

    results = db.similarity_search(query, k=3)
    source_knowledge = "\n".join([x.page_content for x in results])
    augmented_prompt = f"""Using the contexts below, answer the query. Contexts: {source_knowledge} Query: {query}"""
    prompt =  HumanMessage(content=augmented_prompt)
    messages.append(prompt)
    result = chat(messages)
    messages.append(AIMessage(content=result.content))

    return messages, result.content

In [103]:
question = 'Which clinical trial is sponsored by Boston Scientific'

In [104]:
messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   
messages, bot_answer = generate_responses(chat, messages,question)

In [105]:
bot_answer

'The clinical trial sponsored by Boston Scientific is Protocol No: CCR-21-66, titled "LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS)".'

In [18]:
file_path = r'C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json'
protocol = data = json.loads(Path(file_path).read_text())

In [16]:
for protocol in data['TRIAL']['PROTOCOL']:
    print("Protocol No:", protocol["PROTOCOL_NO"])
    print("Title:", protocol["TITLE"])
    print("NCT ID:", protocol["NCT_ID"])
    print("Short Title:", protocol["SHORT_TITLE"])
    print("Investigator Name:", protocol["INVESTIGATOR_NAME"])
    print("Status:", protocol["STATUS"])
    print("Age Description:", protocol["AGE_DESCRIPTION"])
    print("Scope Description:", protocol["SCOPE_DESC"])
    print("Description:", protocol["DESCRIPTION"])
    print("Sponsor Names:", protocol["SPONSOR_NAMES"])
    print("Disease Sites:", protocol["DISEASE_SITES"])
    print("=" * 50)

Protocol No: CCR-20-41
Title: A Multicenter, Randomized, Double-Blind, Parallel Group, Placebo-Controlled Trial to Evaluate the Effect of In-Hospital Initiation of Dapagliflozin on Clinical Outcomes in Patients with Heart Failure with Reduced Ejection Fraction Who Have Been Stabilized During Hospitalization for Acute Heart Failure
NCT ID: NCT04363697
Status: OPEN TO ACCRUAL
Age Description: Adults
Scope Description: National
Description: 
Sponsor Names: {'SPONSOR_NAME': ['AstraZeneca', 'TIMI Study Group']}
Disease Sites: {'DISEASE_SITE': ['Heart - Cardiovascular/ Circulatory', 'Heart Failure']}
Protocol No: CCR-21-66
Title: LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS)
NCT ID: NCT04790344
Status: OPEN TO ACCRUAL
Age Description: Adults
Scope Description: National
Description: The primary objective of this study is to collect physiological measurement data and heart failure (HF) event data that will be used to design and develop new 

In [2]:
import openai

In [18]:
sentence1 = 'Which clinical trial is sponsored by Boston Scientific'
sentence2 = 'Which clinical trial is related to AstraZeneca'

In [22]:
embed1 = openai.Embedding.create(input = [sentence1, sentence2], engine="text-embedding-ada-002")

In [23]:
first = embed1["data"][0]["embedding"]
second = embed1["data"][1]["embedding"]

In [24]:
from openai.embeddings_utils import cosine_similarity
score = cosine_similarity(first,second)
print(score)

0.8598069017364408


### Vector Database Generation

In [40]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [41]:
with open("openai_key.txt","r") as file:
    api_key = file.read().strip()

#### All Policies Vector DB

In [68]:
import re
import os
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [83]:
class Create_db:

    def parse_pdf( self, file, filename, idx):

        pdf = PdfReader(file)
        print(file)
        print(self.dictionary[idx])
        output = []
        for page in pdf.pages:
            text = page.extract_text()
            text = self.replace_text(text,idx)
            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
            text = re.sub(r"\n\s*\n", "\n\n", text)
            output.append(text)
        return output, filename

    def replace_text(self, text, idx):
        
        new_protocol_number = str(self.dictionary[idx]) + " and"
        modified_text = text
        protocol_index = modified_text.find("Protocol Number")

        while protocol_index != -1:
    
            existing_text = modified_text[:protocol_index]
            remaining_text = modified_text[protocol_index:]
            modified_text = existing_text + remaining_text.replace("Protocol Number", f"Protocol Number {new_protocol_number}", 1)
            protocol_index = modified_text.find("Protocol Number", protocol_index + len(f"Protocol Number {new_protocol_number}") + 1)

        return modified_text   
    
    def text_to_docs( self, text, filename):

        if isinstance(text, str):
            text = [text]
        page_docs = [Document(page_content=page) for page in text]
        for i, doc in enumerate(page_docs):
            doc.metadata["page"] = i + 1

        doc_chunks = []
        for doc in page_docs:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=4000,
                separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
                chunk_overlap=0,
            )
            chunks = text_splitter.split_text(doc.page_content)
            for i, chunk in enumerate(chunks):
                doc = Document(
                    page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
                )
                doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
                doc.metadata["filename"] = filename  # Add filename to metadata
                doc_chunks.append(doc)
        return doc_chunks


    def docs_to_index( self, docs, api_key):
        
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/policies")


    def get_index_for_pdf( self, folder_path, api_key, dictionary):
        
        pdf_files, pdf_names = [], []
        documents = []
        self.dictionary = dictionary
        files = os.listdir(folder_path)
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(folder_path, file))
            pdf_names.append(file)

        idx = 0
        for pdf_file, pdf_name in zip(pdf_files, pdf_names):
            text, filename = self.parse_pdf(pdf_file, pdf_name, idx)
            documents = documents + self.text_to_docs(text, filename)
            idx+=1
        self.docs_to_index(documents, api_key)
        


In [85]:
database = Create_db()
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF"
dictionary = ["CCR-20-41", 'CCR-21-66', "CCR-22-101", "CCR-22-13", "CCR-22-96", "CCR-23-06"]
database.get_index_for_pdf(folder_path, api_key, dictionary)

C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_20-41.pdf
CCR-20-41
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_21_66 Expired Jan2024.pdf
CCR-21-66
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_22_101.pdf
CCR-22-101
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_22_13.pdf
CCR-22-13
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_22_96.pdf
CCR-22-96
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_23_06.pdf
CCR-23-06


#### XML File Vector DB

In [45]:
import json
from pathlib import Path
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [46]:
class Create_db:

    def generate_docs(self, data):
        docs = []
        for protocol in data['TRIAL']['PROTOCOL']:
                text = "Protocol No: " + protocol["PROTOCOL_NO"] + " "
                text += "Title: " + protocol["TITLE"] + " "
                text += "NCT ID: " + protocol["NCT_ID"] + " "
                text += "Short Title: " + protocol["SHORT_TITLE"] + " "
                text += "Investigator Name: " + protocol["INVESTIGATOR_NAME"] + " "
                text += "Status: " + protocol["STATUS"] + " "
                text += "Elibility: " + protocol["ELIGIBILITY"] + " "
                text += "Detailed Eligibility: " + protocol["DETAILED_ELIGIBILITY"] + " "
                text += "Age Description: " + protocol["AGE_DESCRIPTION"] + " "
                text += "Phase Desc: " + protocol["PHASE_DESC"] + " "
                text += "Scope Description: "+ protocol["SCOPE_DESC"] + " "
                text += "Modified Date: "+ protocol["MODIFIED_DATE"] + " "
                text += "Department Name: " + protocol["DEPARTMENT_NAME"] + " "
                text += "Sponsor Names: " + str(protocol["SPONSOR_NAMES"]) + " "
                text += "Disease Sites: "+ str(protocol["DISEASE_SITES"]) + " "
                docs.append(Document(page_content=text, metadata={"source": protocol["PROTOCOL_NO"]}))
        
        return docs
    
    def docs_to_index( self, docs, api_key):
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/xml_db")


    def create_index(self, file, api_key):
        data = json.loads(Path(file).read_text())
        docs = self.generate_docs(data)
        self.docs_to_index(docs,api_key)

In [47]:
database = Create_db()
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json"
database.create_index(folder_path,api_key)

#### All Policies & Files Together

In [70]:
import json
import os
import re
from pathlib import Path
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [107]:
class Create_db:
    
    def parse_pdf( self, file, filename, idx):
    
        pdf = PdfReader(file)
        print(file)
        print(self.dictionary[idx])
        output = []
        for page in pdf.pages:
            text = page.extract_text()
            text = self.replace_text(text,idx)
            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
            text = re.sub(r"\n\s*\n", "\n\n", text)
            print(text)
            output.append(text)
        return output, filename

    def replace_text(self, text, idx):
        
        new_protocol_number = str(self.dictionary[idx]) + " and"
        modified_text = text
        protocol_index = modified_text.find("Protocol Number")

        while protocol_index != -1:
    
            existing_text = modified_text[:protocol_index]
            remaining_text = modified_text[protocol_index:]
            modified_text = existing_text + remaining_text.replace("Protocol Number", f"Protocol Number {new_protocol_number}", 1)
            protocol_index = modified_text.find("Protocol Number", protocol_index + len(f"Protocol Number {new_protocol_number}") + 1)

        return modified_text
    
    def generate_docs(self, documents, data):
        for protocol in data['TRIAL']['PROTOCOL']:
                text = "Protocol No: " + protocol["PROTOCOL_NO"] + " "
                text += "Title: " + protocol["TITLE"] + " "
                text += "NCT ID: " + protocol["NCT_ID"] + " "
                text += "Short Title: " + protocol["SHORT_TITLE"] + " "
                text += "Investigator Name: " + protocol["INVESTIGATOR_NAME"] + " "
                text += "Status: " + protocol["STATUS"] + " "
                text += "Elibility: " + protocol["ELIGIBILITY"] + " "
                text += "Detailed Eligibility: " + protocol["DETAILED_ELIGIBILITY"] + " "
                text += "Age Description: " + protocol["AGE_DESCRIPTION"] + " "
                text += "Phase Desc: " + protocol["PHASE_DESC"] + " "
                text += "Scope Description: "+ protocol["SCOPE_DESC"] + " "
                text += "Modified Date: "+ protocol["MODIFIED_DATE"] + " "
                text += "Department Name: " + protocol["DEPARTMENT_NAME"] + " "
                text += "Sponsor Names: " + str(protocol["SPONSOR_NAMES"]) + " "
                text += "Disease Sites: "+ str(protocol["DISEASE_SITES"]) + " "
                documents.append(Document(page_content=text, metadata={"source": protocol["PROTOCOL_NO"]}))
        
        return documents
    
    def text_to_docs( self, text, filename):
        if isinstance(text, str):
            text = [text]
        page_docs = [Document(page_content=page) for page in text]
        for i, doc in enumerate(page_docs):
            doc.metadata["page"] = i + 1

        doc_chunks = []
        for doc in page_docs:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=4000,
                separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
                chunk_overlap=0,
            )
            chunks = text_splitter.split_text(doc.page_content)
            for i, chunk in enumerate(chunks):
                doc = Document(
                    page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
                )
                doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
                doc.metadata["filename"] = filename  # Add filename to metadata
                doc_chunks.append(doc)
        return doc_chunks


    def docs_to_index( self, docs, api_key):
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/main_db")


    def get_index_for_pdf( self, folder_path, file_path, api_key,dictionary):
        pdf_files, pdf_names = [], []
        documents = []
        self.dictionary = dictionary
        files = os.listdir(folder_path)
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(folder_path, file))
            pdf_names.append(file)

        idx=0
        for pdf_file, pdf_name in zip(pdf_files, pdf_names):
            text, filename = self.parse_pdf(pdf_file, pdf_name, idx)
            documents = documents + self.text_to_docs(text, filename)
            idx+=1
            
        data = json.loads(Path(file_path).read_text())
        
        documents = self.generate_docs( documents, data)
        self.docs_to_index(documents, api_key)
        


In [None]:
database = Create_db()
file_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json"
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF"
dictionary = ["CCR-20-41", 'CCR-21-66', "CCR-22-101", "CCR-22-13", "CCR-22-96", "CCR-23-06"]
database.get_index_for_pdf( folder_path, file_path, api_key, dictionary)

#### Json Data Vector DB

In [124]:
class Create_db:
    
    def generate_docs(self, data):
        docs = []
        for protocol in data:
                text = "NCT ID: " + protocol["NCT_ID"] + " "
                text += "Title: " + protocol["TITLE"] + " "
                text += "Short Title: " + protocol["SHORT_TITLE"] + " "
                text += "Sponsor: " + protocol["SPONSOR"] + " "
                text += "Detailed Eligibility: " + protocol["DETAILED_ELIGIBILITY"] + " "
                if "DESCRIPTION" in protocol:
                    text += "Description: " + protocol["DESCRIPTION"] + " "
                text += "Summary: " + protocol["SUMMARY"] + " "
                text += "Status: " + protocol["STATUS"] + " "
                if  "OUTCOME_DESCRIPTION" in protocol:
                    text += "Outcome Description: " + protocol["OUTCOME_DESCRIPTION"] + " "
                if "OUTCOME_MEASURE" in protocol:
                    text += "Outcome Measure: " + protocol["OUTCOME_MEASURE"] + " "
                if "OUTCOME_TIMEFRAME" in protocol:
                    text += "Outcome Timeframe: " + protocol["OUTCOME_TIMEFRAME"] + " "
                text += "Age Description: " + protocol["AGE_DESCRIPTION"] + " "
                if "INVESTIGATOR_NAME" in protocol:
                    text += "Investigator Name: " + protocol["INVESTIGATOR_NAME"]+ " "
                docs.append(Document(page_content=text, metadata={"source": protocol["NCT_ID"]}))
        
        return docs
    
    def docs_to_index( self, docs, api_key):
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("new_db")


    def create_index(self, file, api_key):
        data = json.loads(Path(file).read_text())
        docs = self.generate_docs(data)
        self.docs_to_index(docs,api_key)

In [None]:
class Create_db:
    
    def generate_docs(self, data):
        docs = []
        for protocol in data:
                text = protocol["NCT_ID"] + " "
                text += protocol["TITLE"] + " "
                text +=  protocol["SHORT_TITLE"] + " "
                text += protocol["SPONSOR"] + " "
                text +=  protocol["DETAILED_ELIGIBILITY"] + " "
                if "DESCRIPTION" in protocol:
                    text +=  protocol["DESCRIPTION"] + " "
                text +=  protocol["SUMMARY"] + " "
                text +=  protocol["STATUS"] + " "
                if  "OUTCOME_DESCRIPTION" in protocol:
                    text +=  protocol["OUTCOME_DESCRIPTION"] + " "
                if "OUTCOME_MEASURE" in protocol:
                    text +=  protocol["OUTCOME_MEASURE"] + " "
                if "OUTCOME_TIMEFRAME" in protocol:
                    text +=  protocol["OUTCOME_TIMEFRAME"] + " "
                text +=  protocol["AGE_DESCRIPTION"] + " "
                if "INVESTIGATOR_NAME" in protocol:
                    text +=  protocol["INVESTIGATOR_NAME"]+ " "
                docs.append(Document(page_content=text, metadata={"source": protocol["NCT_ID"]}))
        
        return docs
    
    def docs_to_index( self, docs, api_key):
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("new_db")


    def create_index(self, file, api_key):
        data = json.loads(Path(file).read_text())
        docs = self.generate_docs(data)
        self.docs_to_index(docs,api_key)

In [125]:
database = Create_db()
folder_path = r"study_data.json"
database.create_index(folder_path,api_key)

### Chatbot Test

In [57]:
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.schema import ( SystemMessage, HumanMessage, AIMessage)
from PyPDF2 import PdfReader
import os
from langchain.chat_models import ChatOpenAI

In [58]:
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [59]:
chat = ChatOpenAI( openai_api_key = os.environ["OPENAI_API_KEY"], model = 'gpt-3.5-turbo')

def generate_responses( chat, messages, faiss_path, query):
    
    with open(r"openai_key.txt", 'r') as file:
        api_key = file.read().strip()
    
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    db = FAISS.load_local(faiss_path, embeddings)

    results = db.similarity_search(query, k=3)
    source_knowledge = "\n".join([x.page_content for x in results])
    augmented_prompt = f"""Using the contexts below, answer the query. Contexts: {source_knowledge} Query: {query}"""
    
    prompt =  HumanMessage(content=augmented_prompt)
    messages.append(prompt)
    result = chat(messages)
    messages.append(AIMessage(content=result.content))

    return messages, result.content

#### All Policies Chatbot

In [60]:
question = 'Which clinical trial is sponsored by Boston Scientific'

messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   

messages, bot_answer = generate_responses( chat, messages, r"Vector_DB\policies", question)

In [61]:
bot_answer

'The clinical trial sponsored by Boston Scientific is not explicitly mentioned in the provided contexts. The information provided primarily focuses on the study sponsors, costs, legal rights, benefits, and confidentiality aspects of participating in different clinical trials. If you are specifically looking for a clinical trial sponsored by Boston Scientific, I recommend directly contacting Boston Scientific or conducting a search on their official website for information on their sponsored clinical trials.'

#### XML File Chatbot

In [62]:
question = 'Which clinical trial is sponsored by Boston Scientific'

messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   

messages, bot_answer = generate_responses( chat, messages, r"Vector_DB\xml_db", question)

In [63]:
bot_answer

'The clinical trial sponsored by Boston Scientific is Protocol No: CCR-21-66, titled LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS).'

#### Main Chatbot

In [64]:
question = 'Which clinical trial is sponsored by Boston Scientific'

messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   

messages, bot_answer = generate_responses( chat, messages, r"Vector_DB\xml_db", question)

In [65]:
bot_answer

'The clinical trial sponsored by Boston Scientific is the Protocol No: CCR-21-66 titled "LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS)" with the short title "LUX-Dx TRENDS Evaluates Diagnostics Sensors in Heart Failure Patients Receiving Boston Scientific\'s."'

### Clinical Trials API Trial

In [43]:
import requests
import json

base_url = "https://clinicaltrials.gov/api/v2"

nct_id = "NCT04790344"

url = f"{base_url}/studies/{nct_id}"

response = requests.get(url)

if response.status_code == 200:
    
    data = response.json()
    
    with open("study_data.json", "w") as json_file:
        json.dump(data, json_file)
    
    print("JSON data has been successfully stored in 'study_data.json'")
else:
    print("Error:", response.status_code)


JSON data has been successfully stored in 'study_data.json'


In [40]:
print(response)

<Response [200]>


In [41]:
print(response.json())

{'protocolSection': {'identificationModule': {'nctId': 'NCT00000102', 'orgStudyIdInfo': {'id': 'NCRR-M01RR01070-0506'}, 'secondaryIdInfos': [{'id': 'M01RR001070', 'type': 'NIH', 'link': 'https://reporter.nih.gov/quickSearch/M01RR001070'}], 'organization': {'fullName': 'National Center for Research Resources (NCRR)', 'class': 'NIH'}, 'briefTitle': 'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets'}, 'statusModule': {'statusVerifiedDate': '2004-01', 'overallStatus': 'COMPLETED', 'expandedAccessInfo': {'hasExpandedAccess': False}, 'studyFirstSubmitDate': '1999-11-03', 'studyFirstSubmitQcDate': '1999-11-03', 'studyFirstPostDateStruct': {'date': '1999-11-04', 'type': 'ESTIMATED'}, 'lastUpdateSubmitDate': '2005-06-23', 'lastUpdatePostDateStruct': {'date': '2005-06-24', 'type': 'ESTIMATED'}}, 'sponsorCollaboratorsModule': {'leadSponsor': {'name': 'National Center for Research Resources (NCRR)', 'class': 'NIH'}}, 'descriptionModule': {'briefSummary': 'This study will tes

### NCT ID Extraction


In [31]:
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd

with open('clinical_trials.xml') as f:
    data = f.read()

In [32]:
soup = BeautifulSoup(data, 'xml')
ids = soup.find_all('NCT_ID')

In [33]:
print(ids)

[<NCT_ID/>, <NCT_ID>NCT99999999</NCT_ID>, <NCT_ID>NCT02523014</NCT_ID>, <NCT_ID>NCT03093116</NCT_ID>, <NCT_ID>NCT03180268</NCT_ID>, <NCT_ID>NCT03224767</NCT_ID>, <NCT_ID>NCT03157128</NCT_ID>, <NCT_ID>NCT03785249</NCT_ID>, <NCT_ID>NCT03271372</NCT_ID>, <NCT_ID>NCT04231851</NCT_ID>, <NCT_ID>NCT99999999</NCT_ID>, <NCT_ID>NCT00887146</NCT_ID>, <NCT_ID>NCT04472767</NCT_ID>, <NCT_ID>NCT03775265</NCT_ID>, <NCT_ID>NCT02912949</NCT_ID>, <NCT_ID>NCT05524584</NCT_ID>, <NCT_ID>NCT04166318</NCT_ID>, <NCT_ID>NCT04529772</NCT_ID>, <NCT_ID>NCT04506086</NCT_ID>, <NCT_ID>NCT04920032</NCT_ID>, <NCT_ID>NCT04372433</NCT_ID>, <NCT_ID>NCT04143711</NCT_ID>, <NCT_ID/>, <NCT_ID>NCT04762953</NCT_ID>, <NCT_ID>NCT04435756</NCT_ID>, <NCT_ID>NCT04077463</NCT_ID>, <NCT_ID>NCT04637763</NCT_ID>, <NCT_ID>NCT04090398</NCT_ID>, <NCT_ID>NCT04068194</NCT_ID>, <NCT_ID>NCT04449874</NCT_ID>, <NCT_ID>NCT03375307</NCT_ID>, <NCT_ID>NCT03173950</NCT_ID>, <NCT_ID>NCT04835805</NCT_ID>, <NCT_ID>NCT04528836</NCT_ID>, <NCT_ID>NCT045305

In [34]:
nctids = []

for item in ids:   
    item = str(item)
    if item.startswith("<NCT_ID>"):
        nct_id = item[len("<NCT_ID>"):-len("</NCT_ID>")]
        nctids.append(nct_id)
    else:
        continue

df = pd.DataFrame({"NCT_ID": nctids})

In [46]:
df.to_csv("nct_ids.csv", index = False)

### Extract Data Based on NCT ID:

In [52]:
import requests
import json

base_url = "https://clinicaltrials.gov/api/v2"
trials_info = []
names = ['University of California, Irvine', 'University of California - Irvine', "University of California-Irvine", "University of California Irvine"]
for nct_id in nctids:
    print(nct_id)
    url = f"{base_url}/studies/{nct_id}"

    response = requests.get(url)

    if response.status_code == 200:
        protocol = {}
        protocol['NCT_ID'] = nct_id    
        data = response.json()
        # title extraction:
        data = data['protocolSection']
        protocol['TITLE'] = data["identificationModule"]["officialTitle"]
        protocol['SHORT_TITLE'] = data["identificationModule"]["briefTitle"]
        protocol['SPONSOR'] = data['sponsorCollaboratorsModule']["leadSponsor"]["name"]
        protocol['DETAILED_ELIGIBILITY'] = data["eligibilityModule"]["eligibilityCriteria"]
        if "detailedDescription" in data['descriptionModule']:
            protocol["DESCRIPTION"] = data['descriptionModule']["detailedDescription"]
        protocol["SUMMARY"] = data["descriptionModule"]["briefSummary"]
        protocol["STATUS"] = data['statusModule']['overallStatus']
        if "outcomesModule" in data:
            primary_outcomes = data["outcomesModule"]["primaryOutcomes"]
            if primary_outcomes:
                outcome = primary_outcomes[0]
                if "description" in outcome:
                    protocol["OUTCOME_DESCRIPTION"] = outcome["description"]
                protocol["OUTCOME_MEASURE"] = outcome["measure"]
                protocol["OUTCOME_TIMEFRAME"] = outcome["timeFrame"]

        if isinstance(data["eligibilityModule"]["stdAges"],list):
            text = ""
            for std in data["eligibilityModule"]["stdAges"]:
                text += std +', '
            text = text.rstrip(', ')
            protocol["AGE_DESCRIPTION"] = text
        else:
            protocol["AGE_DESCRIPTION"] = data["eligibilityModule"]["stdAges"]

        if "contactsLocationsModule" in data:
            contacts_locations_module = data["contactsLocationsModule"]
            if "locations" in contacts_locations_module:
                locations = contacts_locations_module["locations"]
                for ele in locations:
                    if ele.get('facility') in names:
                        if "contacts" in ele and ele["contacts"]:
                            protocol["INVESTIGATOR_NAME"] = ele["contacts"][0].get('name')
                            break  # Exit loop after finding the first matching location

    else:
        continue
    trials_info.append(protocol)

NCT99999999
NCT02523014
NCT03093116
NCT03180268
NCT03224767
NCT03157128
NCT03785249
NCT03271372
NCT04231851
NCT99999999
NCT00887146
NCT04472767
NCT03775265
NCT02912949
NCT05524584
NCT04166318
NCT04529772
NCT04506086
NCT04920032
NCT04372433
NCT04143711
NCT04762953
NCT04435756
NCT04077463
NCT04637763
NCT04090398
NCT04068194
NCT04449874
NCT03375307
NCT03173950
NCT04835805
NCT04528836
NCT04530565
NCT04868877
NCT04814108
NCT04269902
NCT05488366
NCT05011058
NCT04457596
NCT04960709
NCT04180371
NCT05501899
NCT05360238
NCT02339571
NCT04214262
NCT04130542
NCT04701476
NCT04423029
NCT05118789
NCT05027139
NCT03744468
NCT04947319
NCT06028828
NCT04994717
NCT05079282
NCT04647916
NCT04982224
NCT04919811
NCT05039177
NCT05497531
NCT04764474
NCT04852887
NCT04843709
NCT04999969
NCT03504488
NCT04444921
NCT04686305
NCT05076942
NCT05111626
NCT05332054
NCT05143957
NCT04759586
NCT04513717
NCT04736121
NCT04553692
NCT05382286
NCT03739814
NCT04965493
NCT05040360
NCT05520567
NCT05364424
NCT04548752
NCT05308264
NCT0

In [61]:
import requests
import json

base_url = "https://clinicaltrials.gov/api/v2"
trials_info = []
nct_id = "NCT05130268"
url = f"{base_url}/studies/{nct_id}"

response = requests.get(url)

if response.status_code == 200:
    data = response.json()

In [62]:
with open("more_data.json", "w") as json_file:
        json.dump(data, json_file)

In [60]:
for ele in data["protocolSection"]['contactsLocationsModule']["locations"]:
    if ele['facility'] == "University of California - Irvine":
        print(ele["contacts"][0]['name'])

In [42]:
protocol = {}
protocol['NCT_ID'] = nct_id    
data = response.json()
# title extraction:
data = data['protocolSection']
protocol['TITLE'] = data["identificationModule"]["officialTitle"]
protocol['SHORT_TITLE'] = data["identificationModule"]["briefTitle"]
protocol['SPONSOR'] = data['sponsorCollaboratorsModule']["leadSponsor"]["name"]
protocol['DETAILED_ELIGIBILITY'] = data["eligibilityModule"]["eligibilityCriteria"]
if "detailedDescription" in data['descriptionModule']:
    protocol["DESCRIPTION"] = data['descriptionModule']["detailedDescription"]
protocol["SUMMARY"] = data["descriptionModule"]["briefSummary"]
protocol["STATUS"] = data['statusModule']['overallStatus']
protocol["OUTCOME_DESCRIPTION"] = data["outcomesModule"]["primaryOutcomes"][0]["description"]
protocol["OUTCOME_MEASURE"] = data["outcomesModule"]["primaryOutcomes"][0]["measure"]
protocol["OUTCOME_TIMEFRAME"] = data["outcomesModule"]["primaryOutcomes"][0]["timeFrame"]
# protocol["AGE_DESCRIPTION"] = data["eligibilityModule"]["stdAges"]
text = ""
for std in data["eligibilityModule"]["stdAges"]:
    text += std +', '
text = text.rstrip(', ')
protocol["AGE_DESCRIPTION"] = text

for ele in data['contactsLocationsModule']["locations"]:
    if ele['facility'] == "University of California - Irvine":
        protocol["INVESTIGATOR_NAME"] = ele["contacts"][0]['name']

In [51]:
protocol

{'NCT_ID': 'NCT03066739',
 'TITLE': 'Effect of Ultra-low Dose Naloxone on Remifentanil-Induced Hyperalgesia',
 'SHORT_TITLE': 'Effect of Ultra-low Dose Naloxone on Remifentanil-Induced Hyperalgesia',
 'SPONSOR': 'University of California, Irvine',
 'DETAILED_ELIGIBILITY': 'Inclusion Criteria:\n\n* Subjects who provide written informed consent.\n* Age 18 years old or older (no upper age limit for inclusion)\n* Gender: male or female.\n* Surgery: Posterior spinal fusions\n\nExclusion Criteria:\n\n* Allergy to opiates\n* Chronic pain other than the primary indication for surgery\n* Psychiatric illness\n* History of substance abuse problem including alcohol \\&/or cannabis\n* BMI \\> 35\n* Subjects under 18 years of age.\n* Subject without the capacity to give written informed consent. 8. Female subjects who are pregnant',
 'DESCRIPTION': 'Purpose:\n\nOpioid antagonists at ultra-low doses have been used with opioid agonists to prevent or limit opioid tolerance. Remifentanil, a rapid onset/

In [54]:
with open("study_data.json", "w") as json_file:
        json.dump(trials_info, json_file)

In [90]:
import re

text = "The trial identifier is NCT05645744 and NCT12345678 is another trial."

pattern = r'\bNCT\d{8}\b'

matches = re.findall(pattern, text)

print(matches) 

['NCT05645744', 'NCT12345678']


In [None]:
if len(matches)>0:

    matches_text = ", ".join(matches)
    results = db.similarity_search(matches_text, k=3)
    
else:
    results = db.similarity_search(question, k=3)


#### History Aware Retriever Trial:

In [16]:
!pip3 install matplotlib
!pip3 install scipy 
!pip3 install plotly
!pip3 install scikit-learn
!pip3 install -U langchain-openai

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
Collecting langchain-openai
  Downloading langchain_openai-0.1.8-py3-none-any.whl (38 kB)
Collecting openai<2.0.0,>=1.26.0
  Downloading openai-1.31.0-py3-none-any.whl (324 kB)
[K     |████████████████████████████████| 324 kB 2.7 MB/s eta 0:00:01
[?25hCollecting langchain-core<0.3,>=0.2.2
  Using cached langchain_core-0.2.3-py3-none-any.whl (310 kB)
Collecting tiktoken<1,>=0.7
  Using cached tiktoken-0.

In [4]:
import databutton as db
import streamlit as st
import time 
import openai
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from brain import custom_search
import os
import re 

In [5]:
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()
    
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key


In [8]:
!pwd

/Users/gshipurk/Documents/Clinical Trials/Clinical-Trials


In [9]:
faiss_path = r"new_db"
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
db = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization="True")

In [3]:
!pip3 install -U langchain langchain-community 
!pip3 install langchainhub

Collecting langchain
  Using cached langchain-0.2.1-py3-none-any.whl (973 kB)
Collecting langchain-community
  Using cached langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
Collecting langsmith<0.2.0,>=0.1.17
  Using cached langsmith-0.1.69-py3-none-any.whl (124 kB)
Installing collected packages: langsmith, langchain, langchain-community
  Attempting uninstall: langsmith
    Found existing installation: langsmith 0.0.86
    Uninstalling langsmith-0.0.86:
      Successfully uninstalled langsmith-0.0.86
  Attempting uninstall: langchain
    Found existing installation: langchain 0.0.316
    Uninstalling langchain-0.0.316:
      Successfully uninstalled langchain-0.0.316
Successfully installed langchain-0.2.1 langchain-community-0.2.1 langsmith-0.1.69
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
Collecting langchainhub
  Using cached langchainhub-0.1.17-py3-none-any.whl (4.8 kB)
Colle

In [18]:
retriever = db.as_retriever()

In [15]:
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

In [19]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [20]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [43]:
chat_history = [{'role': 'system', 'content': '\n\n        You are a helpful Assistant who answers to users questions based on multiple contexts given to you.\n\n        Keep your answer creative.\n        \n        Please take into account the previous messages as well.\n        \n        Make sure to citation for the answer from metadata.\n            \n        Reply to greetings messages.\n    '}, {'role': 'user', 'content': 'Hi! How are you?'}, {'role': 'assistant', 'content': 'Hello, How can I help you?'}, {'role': 'user', 'content': 'Could you tell me few trials related to brain tumor?'}, {'role': 'assistant', 'content': 'Based on the contexts provided, here are a few clinical trials related to brain tumors:\n\n1. **Trial Title:** Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers\n   - **NCT ID:** NCT03173950\n   - **Sponsor:** National Cancer Institute (NCI)\n   - **Objective:** Evaluate the efficacy of the immune checkpoint inhibitor Nivolumab in patients with recurrent rare central nervous system neoplasms.\n   - **Eligibility:** Patients with various rare CNS cancers, aged 18 or above, with progressive tumor growth, and specific laboratory parameters within normal range.\n   - **Status:** Recruiting\n\n2. **Trial Title:** A Randomized Phase III Trial of Pre-Operative Compared to Post-Operative Stereotactic Radiosurgery in Patients With Resectable Brain Metastases\n   - **NCT ID:** NCT05438212\n   - **Sponsor:** NRG Oncology\n   - **Objective:** Compare the addition of stereotactic radiosurgery before or after surgery in patients with brain metastases to assess the impact on overall survival and progression-free survival.\n   - **Eligibility:** Patients with resectable brain metastases, who meet specific criteria related to tumor size and location.\n   - **Status:** Recruiting\n\n3. **Trial Title:** ONC201 for the Treatment of Newly Diagnosed H3 K27M-mutant Diffuse Glioma Following Completion of Radiotherapy: A Randomized, Double-Blind, Placebo-Controlled, Multicenter Study\n   - **NCT ID:** NCT05580562\n   - **Sponsor:** Chimerix\n   - **Objective:** Assess whether treatment with ONC201 following radiotherapy extends overall survival and progression-free survival in patients with newly diagnosed H3 K27M-mutant diffuse glioma.\n   - **Eligibility:** Patients diagnosed with H3 K27M-mutant diffuse glioma who have completed frontline radiotherapy.\n   - **Status:** Recruiting\n\n4. **Trial Title:** Phase II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas\n   - **NCT ID:** NCT03224767\n   - **Sponsor:** Alliance for Clinical Trials in Oncology\n   - **Objective:** Evaluate the activity of BRAF and MEK inhibitor combination in untreated and previously treated papillary craniopharyngiomas.\n   - **Eligibility:** Patients with histologically proven papillary craniopharyngioma with a positive BRAF V600E mutation.\n   - **Status:** Recruiting\n\nThese trials aim to contribute to the understanding and treatment of various brain tumors, offering new insights and potential therapeutic options for patients.'}]

In [30]:
from langchain_core.messages import HumanMessage

question = "Can you give me information about the first trial you mentioned?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg_1["answer"]])



In [31]:
print(ai_msg_1["answer"])

The first trial mentioned is a Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers. The trial has the NCT ID NCT03173950 and is sponsored by the National Cancer Institute (NCI). It aims to evaluate the efficacy of Nivolumab in patients with recurrent rare central nervous system neoplasms. Eligible participants include individuals with various rare CNS cancers, aged 18 or above, with progressive tumor growth and specific laboratory parameters within normal range. The trial is currently recruiting participants.


In [32]:
from langchain_community.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

In [34]:
prompt = ChatPromptTemplate.from_messages(
    [("system", "What are everyone's favorite colors:\n\n{context}")]
)
llm = ChatOpenAI(model="gpt-3.5-turbo")
chain = create_stuff_documents_chain(llm, prompt)

docs = [
    Document(page_content="Jesse loves red but not yellow"),
    Document(page_content = "Jamal loves green but not as much as he loves orange")
]

chain.invoke({"context": docs})

"Jesse's favorite color is red and Jamal's favorite color is orange."

In [126]:
results = history_aware_retriever.invoke({"input":"Can you give me information about the first trial you mentioned?","chat_history":chat_history})
question_answer_chain.invoke({"input":"Can you give me information about the first trial you mentioned?","chat_history":chat_history, "context":results})

'The first trial mentioned is titled "Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers" with the NCT ID: NCT03173950. This trial sponsored by the National Cancer Institute (NCI) aims to evaluate the efficacy of nivolumab, an immune checkpoint inhibitor, in patients with recurrent rare central nervous system neoplasms. Eligibility criteria include specific CNS tumor types, age requirements, and adequate organ function. The trial is currently recruiting participants.'

In [127]:
results

[Document(page_content="NCT ID: NCT04297683 Title: HEALEY ALS Platform Trial Short Title: HEALEY ALS Platform Trial - Master Protocol Sponsor: Merit E. Cudkowicz, MD Detailed Eligibility: Inclusion Criteria:\n\n1. Sporadic or familial ALS diagnosed as clinically possible, probable, lab-supported probable, or definite ALS defined by revised El Escorial criteria.\n2. Age 18 years or older.\n3. Capable of providing informed consent and complying with study procedures, in the SI's opinion.\n4. Time since onset of weakness due to ALS ≤ 36 months at the time of the Master Protocol Screening Visit.\n5. Vital Capacity ≥ 50% of predicted capacity at the time of the Master Protocol Screening Visit measured by Slow Vital Capacity (SVC), or, if required due to pandemic-related restrictions, Forced Vital Capacity (FVC) measured in person.\n6. Participants must either not take riluzole or be on a stable dose of riluzole for ≥ 30 days prior to the Master Protocol Screening Visit.\n7. Participants mus

In [99]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [131]:
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages([("system", contextualize_q_system_prompt),MessagesPlaceholder("chat_history"),("human", "{input}")])

chain= contextualize_q_prompt | llm | StrOutputParser()

In [132]:
chain

ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.')), MessagesPlaceholder(variable_name='chat_history'), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))])
| ChatOpenAI(client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, openai_api_key='sk-p

In [133]:
chain.invoke({"input":"Could you give me information about the first trial mentioned?", "chat_history":chat_history})

'Could you provide me with more details to better assist you?'

In [135]:
result = contextualize_q_prompt.invoke({"input":"Can you give me information about the Nivolumab trial you mentioned?", "chat_history":chat_history})
result

ChatPromptValue(messages=[SystemMessage(content='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.'), SystemMessage(content='\n\n        You are a helpful Assistant who answers to users questions based on multiple contexts given to you.\n\n        Keep your answer creative.\n        \n        Please take into account the previous messages as well.\n        \n        Make sure to citation for the answer from metadata.\n            \n        Reply to greetings messages.\n    '), HumanMessage(content='Hi! How are you?'), AIMessage(content='Hello, How can I help you?'), HumanMessage(content='Could you tell me few trials related to brain tumor?'), AIMessage(content='Based on the contexts provided, here are a few clinical trials related to brain tumors:\n\n1. **Trial Title:

ChatPromptValue(messages=[SystemMessage(content='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.'), 

SystemMessage(content='\n\n        You are a helpful Assistant who answers to users questions based on multiple contexts given to you.\n\n        Keep your answer creative.\n        \n        Please take into account the previous messages as well.\n        \n        Make sure to citation for the answer from metadata.\n            \n        Reply to greetings messages.\n    '), 
HumanMessage(content='Hi! How are you?'), AIMessage(content='Hello, How can I help you?'), HumanMessage(content='Could you tell me few trials related to brain tumor?'), AIMessage(content='Based on the contexts provided, here are a few clinical trials related to brain tumors:\n\n1. **Trial Title:** Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers\n   - **NCT ID:** NCT03173950\n   - **Sponsor:** National Cancer Institute (NCI)\n   - **Objective:** Evaluate the efficacy of the immune checkpoint inhibitor Nivolumab in patients with recurrent rare central nervous system neoplasms.\n   - **Eligibility:** Patients with various rare CNS cancers, aged 18 or above, with progressive tumor growth, and specific laboratory parameters within normal range.\n   - **Status:** Recruiting\n\n2. **Trial Title:** A Randomized Phase III Trial of Pre-Operative Compared to Post-Operative Stereotactic Radiosurgery in Patients With Resectable Brain Metastases\n   - **NCT ID:** NCT05438212\n   - **Sponsor:** NRG Oncology\n   - **Objective:** Compare the addition of stereotactic radiosurgery before or after surgery in patients with brain metastases to assess the impact on overall survival and progression-free survival.\n   - **Eligibility:** Patients with resectable brain metastases, who meet specific criteria related to tumor size and location.\n   - **Status:** Recruiting\n\n3. **Trial Title:** ONC201 for the Treatment of Newly Diagnosed H3 K27M-mutant Diffuse Glioma Following Completion of Radiotherapy: A Randomized, Double-Blind, Placebo-Controlled, Multicenter Study\n   - **NCT ID:** NCT05580562\n   - **Sponsor:** Chimerix\n   - **Objective:** Assess whether treatment with ONC201 following radiotherapy extends overall survival and progression-free survival in patients with newly diagnosed H3 K27M-mutant diffuse glioma.\n   - **Eligibility:** Patients diagnosed with H3 K27M-mutant diffuse glioma who have completed frontline radiotherapy.\n   - **Status:** Recruiting\n\n4. **Trial Title:** Phase II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas\n   - **NCT ID:** NCT03224767\n   - **Sponsor:** Alliance for Clinical Trials in Oncology\n   - **Objective:** Evaluate the activity of BRAF and MEK inhibitor combination in untreated and previously treated papillary craniopharyngiomas.\n   - **Eligibility:** Patients with histologically proven papillary craniopharyngioma with a positive BRAF V600E mutation.\n   - **Status:** Recruiting\n\nThese trials aim to contribute to the understanding and treatment of various brain tumors, offering new insights and potential therapeutic options for patients.'), HumanMessage(content='Can you give me information about the Nivolumab trial you mentioned?')])


In [None]:
ChatPromptValue(messages=[SystemMessage(content='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.'), SystemMessage(content='\n\n        You are a helpful Assistant who answers to users questions based on multiple contexts given to you.\n\n        Keep your answer creative.\n        \n        Please take into account the previous messages as well.\n        \n        Make sure to citation for the answer from metadata.\n            \n        Reply to greetings messages.\n    '), HumanMessage(content='Hi! How are you?'), AIMessage(content='Hello, How can I help you?'), HumanMessage(content='Could you tell me few trials related to brain tumor?'), AIMessage(content='Based on the contexts provided, here are a few clinical trials related to brain tumors:\n\n1. **Trial Title:** Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers\n   - **NCT ID:** NCT03173950\n   - **Sponsor:** National Cancer Institute (NCI)\n   - **Objective:** Evaluate the efficacy of the immune checkpoint inhibitor Nivolumab in patients with recurrent rare central nervous system neoplasms.\n   - **Eligibility:** Patients with various rare CNS cancers, aged 18 or above, with progressive tumor growth, and specific laboratory parameters within normal range.\n   - **Status:** Recruiting\n\n2. **Trial Title:** A Randomized Phase III Trial of Pre-Operative Compared to Post-Operative Stereotactic Radiosurgery in Patients With Resectable Brain Metastases\n   - **NCT ID:** NCT05438212\n   - **Sponsor:** NRG Oncology\n   - **Objective:** Compare the addition of stereotactic radiosurgery before or after surgery in patients with brain metastases to assess the impact on overall survival and progression-free survival.\n   - **Eligibility:** Patients with resectable brain metastases, who meet specific criteria related to tumor size and location.\n   - **Status:** Recruiting\n\n3. **Trial Title:** ONC201 for the Treatment of Newly Diagnosed H3 K27M-mutant Diffuse Glioma Following Completion of Radiotherapy: A Randomized, Double-Blind, Placebo-Controlled, Multicenter Study\n   - **NCT ID:** NCT05580562\n   - **Sponsor:** Chimerix\n   - **Objective:** Assess whether treatment with ONC201 following radiotherapy extends overall survival and progression-free survival in patients with newly diagnosed H3 K27M-mutant diffuse glioma.\n   - **Eligibility:** Patients diagnosed with H3 K27M-mutant diffuse glioma who have completed frontline radiotherapy.\n   - **Status:** Recruiting\n\n4. **Trial Title:** Phase II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas\n   - **NCT ID:** NCT03224767\n   - **Sponsor:** Alliance for Clinical Trials in Oncology\n   - **Objective:** Evaluate the activity of BRAF and MEK inhibitor combination in untreated and previously treated papillary craniopharyngiomas.\n   - **Eligibility:** Patients with histologically proven papillary craniopharyngioma with a positive BRAF V600E mutation.\n   - **Status:** Recruiting\n\nThese trials aim to contribute to the understanding and treatment of various brain tumors, offering new insights and potential therapeutic options for patients.'), HumanMessage(content='Can you give me information about the Nivolumab trial you mentioned?')])
