### Initial Trial & Error

In [7]:
import openai
import streamlit as st
from langchain import LLMChain, OpenAI
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS
from PyPDF2 import PdfReader
import re

In [None]:
def parse_pdf(file):
    """Read a PDF and return its text, one cleaned string per page.

    Args:
        file: Passed straight to ``PdfReader``.

    Returns:
        A list of page texts in page order.
    """
    reader = PdfReader(file)
    cleaned_pages = []

    for pdf_page in reader.pages:
        raw = pdf_page.extract_text()
        # Re-join words that were hyphenated across a line break.
        joined = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw)
        # Replace line breaks that are not part of a blank-line gap with a space.
        flowed = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", joined.strip())
        # Normalise every blank-line gap to exactly one empty line.
        cleaned_pages.append(re.sub(r"\n\s*\n", "\n\n", flowed))

    return cleaned_pages

In [None]:
def text_to_docs(text):
    """Split page texts into chunk-level ``Document`` objects.

    Args:
        text: A single string (treated as one page) or a list of page strings.

    Returns:
        A flat list of ``Document`` chunks. Each chunk's metadata holds the
        1-based ``page`` number, the 0-based ``chunk`` index within that page
        and a ``source`` string of the form ``"<page>-<chunk>"``.
    """
    if isinstance(text, str):
        text = [text]

    # The configuration is the same for every page, so build the splitter
    # once instead of once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    # Distinct names for page and chunk avoid rebinding the page variable
    # while its number is still being read.
    for page_number, page_text in enumerate(text, start=1):
        for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                    },
                )
            )

    return doc_chunks

In [11]:
with open(r"../openai-key/openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain

In [None]:
llm=OpenAI(temperature=0, model = 'text-davinci-003', openai_api_key=api_key)

In [None]:
# NOTE(review): no assignment to `chain` is visible before this cell (a chain
# is only built later, in the "RAG Chatbot" section) — confirm which chain
# this call was meant to use, or the intended run order.
chain({"question": "Provide me a description of this clinical trial"}, return_only_outputs=True)

In [None]:
# Break a Windows path into its backslash-separated components and show both
# the full component list and the trailing file name.
path = r"C:\Users\gaura\OneDrive\Documents\Data Technology & Fellowship\clinical-trial-matching-master\Clinical-Trails Testing\PDF\ICF CCR_20-41.pdf".split("\\")
print(path, path[-1], sep="\n")

#### New Script

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
import magic
import os
import nltk

In [None]:
# Build a "stuff" RetrievalQA chain on top of a completion model.
# NOTE(review): `index` is not assigned at module level anywhere in the visible
# notebook — the vector store behind this retriever has to be created first.
llm = OpenAI(openai_api_key=api_key, model="davinci-002")
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=index.as_retriever())

In [None]:
query = "Hello! How are you?"
qa.run(query)

In [None]:
# Same chain as before, but also returning the retrieved source documents.
# NOTE(review): relies on `llm` from an earlier cell and on `index`, for which
# no module-level assignment is visible in this notebook.
qa = RetrievalQA.from_chain_type(llm=llm,
                                chain_type="stuff",
                                retriever=index.as_retriever(),
                                return_source_documents=True)

In [None]:
query = "Who is the study sponsor, and what responsibilities do they have in relation to the study?"
result = qa({"query": query})

In [None]:
result

In [None]:
result['result']

### RAG Chatbot


In [None]:
import re
from io import BytesIO
from typing import Tuple, List
import pickle
import os
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from PyPDF2 import PdfReader
import faiss

In [None]:
def parse_pdf(file):
    """Return the cleaned text of each page of a PDF.

    Args:
        file: Passed straight to ``PdfReader``.

    Returns:
        A list with one string per page, in page order.
    """

    def _clean(raw):
        # Undo hyphenation across line breaks, flatten the remaining line
        # breaks that are not part of a blank-line gap, then normalise each
        # blank-line gap to a single empty line.
        dehyphenated = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw)
        flattened = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", dehyphenated.strip())
        return re.sub(r"\n\s*\n", "\n\n", flattened)

    return [_clean(pdf_page.extract_text()) for pdf_page in PdfReader(file).pages]

In [None]:
# Parse one sample consent form and check whether its first page carries a
# "Protocol Number" label (the anchor text rewritten in the next cell).
output = parse_pdf(r"PDF\ICF CCR_20-41.pdf")
print(output)
if "Protocol Number" in output[0]:
    print(True)

In [None]:
# Prototype: append the trial's protocol id after every "Protocol Number"
# label on the first page so the id ends up inside the indexed text.
new_protocol_number = "CCR-20-41 and"
replacement = f"Protocol Number {new_protocol_number}"

modified_text = output[0]

protocol_index = modified_text.find("Protocol Number")

while protocol_index != -1:

    existing_text = modified_text[:protocol_index]
    remaining_text = modified_text[protocol_index:]

    # Only the first label at/after protocol_index is rewritten per iteration.
    modified_text = existing_text + remaining_text.replace("Protocol Number", replacement, 1)

    # Resume immediately after the text just inserted. The earlier version
    # skipped one extra character and so missed a label that directly
    # followed the replacement.
    protocol_index = modified_text.find("Protocol Number", protocol_index + len(replacement))

print(modified_text)


In [None]:
def text_to_docs(text, filename):
    """Split page texts into chunk-level ``Document`` objects tagged with a file name.

    Args:
        text: A single string (treated as one page) or a list of page strings.
        filename: Stored verbatim under the ``filename`` metadata key of
            every chunk.

    Returns:
        A flat list of ``Document`` chunks whose metadata holds the 1-based
        ``page``, the 0-based ``chunk`` index within the page, a ``source``
        string ``"<page>-<chunk>"`` and ``filename``.
    """
    if isinstance(text, str):
        text = [text]

    # The configuration is the same for every page, so build the splitter
    # once instead of once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    # Distinct names for page and chunk avoid rebinding the page variable
    # while its number is still being read.
    for page_number, page_text in enumerate(text, start=1):
        for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                        "filename": filename,
                    },
                )
            )

    return doc_chunks

In [None]:
def docs_to_index(docs, openai_api_key):
    """Embed ``docs`` with OpenAI embeddings and return the FAISS store built from them."""
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(docs, embedder)


def get_index_for_pdf(directory_path, openai_api_key):
    """Build one FAISS index covering every PDF directly inside a directory.

    Args:
        directory_path: Directory to scan (sub-directories are not visited).
        openai_api_key: Key forwarded to the embedding client.

    Returns:
        Whatever ``docs_to_index`` returns for the chunks of all PDFs found.
    """
    documents = []
    for name in os.listdir(directory_path):
        # Case-insensitive so that e.g. "REPORT.PDF" is indexed as well,
        # matching the check used by the Create_db classes in this notebook.
        if not name.lower().endswith(".pdf"):
            continue
        # `name` already is the bare file name; recovering it by splitting the
        # joined path on "\\" only worked with Windows path separators.
        pages = parse_pdf(os.path.join(directory_path, name))
        documents.extend(text_to_docs(pages, name))
    return docs_to_index(documents, openai_api_key)

In [None]:
folder = r"C:\Users\gaura\OneDrive\Documents\Data Technology & Fellowship\clinical-trial-matching-master\Clinical-Trails Testing\PDF" 

In [None]:
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [None]:
vectordb = get_index_for_pdf(folder,api_key)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [None]:
llm = ChatOpenAI(temperature=0, openai_api_key=api_key)
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
query = "What is the background and purpose of the study? please give a big answer."
docs = vectordb.similarity_search(query,k=5)

In [None]:
docs

In [None]:
chain.run(input_documents=docs, question=query)

In [None]:
import json
from pathlib import Path
from pprint import pprint

In [None]:
import databutton as db
import re
from io import BytesIO
from typing import Tuple, List
import pickle
import requests
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.schema import ( SystemMessage, HumanMessage, AIMessage)
from PyPDF2 import PdfReader
import faiss
import os
import json
import openai
from pathlib import Path
from langchain.chat_models import ChatOpenAI

In [None]:
file_path = r'C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json'
data = json.loads(Path(file_path).read_text())

In [None]:
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [None]:
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [None]:
# Flatten every protocol record of the loaded JSON export into one labelled
# text blob and wrap it in a Document keyed by its protocol number.
# Fields joined with plain `+` must already be strings; only SPONSOR_NAMES and
# DISEASE_SITES are passed through str() first.
# NOTE(review): the "Elibility: " label below is misspelled in the text that
# gets embedded.
docs = []
for protocol in data['TRIAL']['PROTOCOL']:
        text = "Protocol No: " + protocol["PROTOCOL_NO"] + " "
        text += "Title: " + protocol["TITLE"] + " "
        text += "NCT ID: " + protocol["NCT_ID"] + " "
        text += "Short Title: " + protocol["SHORT_TITLE"] + " "
        text += "Investigator Name: " + protocol["INVESTIGATOR_NAME"] + " "
        text += "Status: " + protocol["STATUS"] + " "
        text += "Elibility: " + protocol["ELIGIBILITY"] + " "
        text += "Detailed Eligibility: " + protocol["DETAILED_ELIGIBILITY"] + " "
        text += "Age Description: " + protocol["AGE_DESCRIPTION"] + " "
        text += "Phase Desc: " + protocol["PHASE_DESC"] + " "
        text += "Scope Description: "+ protocol["SCOPE_DESC"] + " "
        text += "Modified Date: "+ protocol["MODIFIED_DATE"] + " "
        text += "Department Name: " + protocol["DEPARTMENT_NAME"] + " "
        text += "Sponsor Names: " + str(protocol["SPONSOR_NAMES"]) + " "
        text += "Disease Sites: "+ str(protocol["DISEASE_SITES"]) + " "
        docs.append(Document(page_content=text, metadata={"Protocol No":protocol["PROTOCOL_NO"]}))

In [None]:
# Embed the protocol documents and persist the index under ./faiss_index.
# NOTE(review): this rebinds `db`, which an earlier cell imported as the alias
# of the `databutton` module.
db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
db.save_local("faiss_index")

In [None]:
chat = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"],model='gpt-3.5-turbo')

In [None]:
def generate_responses(chat, messages, query):
    """Answer ``query`` using context retrieved from the saved "faiss_index".

    The API key is read from ``openai_key.txt`` and the index is loaded from
    disk on every call. The three best-matching documents are pasted into a
    prompt, which is appended to ``messages`` together with the model's reply.

    Args:
        chat: Callable chat model; called with the message list.
        messages: Conversation history; extended in place.
        query: The user's question.

    Returns:
        ``(messages, answer)`` — the same list object plus the reply text.
    """
    with open(r"openai_key.txt", 'r') as key_file:
        key = key_file.read().strip()

    store = FAISS.load_local("faiss_index", OpenAIEmbeddings(openai_api_key=key))

    hits = store.similarity_search(query, k=3)
    source_knowledge = "\n".join(hit.page_content for hit in hits)
    augmented_prompt = f"""Using the contexts below, answer the query. Contexts: {source_knowledge} Query: {query}"""

    messages.append(HumanMessage(content=augmented_prompt))
    answer = chat(messages).content
    messages.append(AIMessage(content=answer))

    return messages, answer

In [None]:
question = 'Which clinical trial is sponsored by Boston Scientific'

In [None]:
messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   
messages, bot_answer = generate_responses(chat, messages,question)

In [None]:
bot_answer

In [None]:
# Reload the protocol export from disk.
# NOTE(review): the chained assignment also binds `protocol` to the whole
# parsed document; the loop in the following cell rebinds it to each entry.
file_path = r'C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json'
protocol = data = json.loads(Path(file_path).read_text())

In [None]:
# Dump a selection of fields for every protocol record, separated by a rule of
# 50 "=" characters, to eyeball the contents of the export.
for protocol in data['TRIAL']['PROTOCOL']:
    print("Protocol No:", protocol["PROTOCOL_NO"])
    print("Title:", protocol["TITLE"])
    print("NCT ID:", protocol["NCT_ID"])
    print("Short Title:", protocol["SHORT_TITLE"])
    print("Investigator Name:", protocol["INVESTIGATOR_NAME"])
    print("Status:", protocol["STATUS"])
    print("Age Description:", protocol["AGE_DESCRIPTION"])
    print("Scope Description:", protocol["SCOPE_DESC"])
    print("Description:", protocol["DESCRIPTION"])
    print("Sponsor Names:", protocol["SPONSOR_NAMES"])
    print("Disease Sites:", protocol["DISEASE_SITES"])
    print("=" * 50)

In [None]:
import openai

In [None]:
sentence1 = 'Which clinical trial is sponsored by Boston Scientific'
sentence2 = 'Which clinical trial is related to AstraZeneca'

In [None]:
embed1 = openai.Embedding.create(input = [sentence1, sentence2], engine="text-embedding-ada-002")

In [None]:
first = embed1["data"][0]["embedding"]
second = embed1["data"][1]["embedding"]

In [None]:
from openai.embeddings_utils import cosine_similarity
score = cosine_similarity(first,second)
print(score)

### Vector Database Generation

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [None]:
with open("openai_key.txt","r") as file:
    api_key = file.read().strip()

#### All Policies Vector DB

In [None]:
import re
import os
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [None]:
class Create_db:
    """Build and persist a FAISS index over a folder of policy PDFs.

    ``get_index_for_pdf`` is the entry point. It stores the per-file protocol
    numbers on ``self.dictionary``, which ``parse_pdf`` and ``replace_text``
    read by position.
    """

    def parse_pdf(self, file, filename, idx):
        """Extract the cleaned text of each page of one PDF.

        Args:
            file: Passed straight to ``PdfReader``.
            filename: Returned unchanged so the caller can tag the chunks.
            idx: Position in ``self.dictionary`` of this file's protocol number.

        Returns:
            ``(pages, filename)`` where ``pages`` holds one string per page.
        """
        pdf = PdfReader(file)
        print(file)
        print(self.dictionary[idx])
        output = []
        for page in pdf.pages:
            text = page.extract_text()
            # Inject the protocol number before whitespace normalisation.
            text = self.replace_text(text, idx)
            # Re-join words hyphenated across a line break.
            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
            # Replace line breaks that are not part of a blank-line gap.
            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
            # Normalise every blank-line gap to one empty line.
            text = re.sub(r"\n\s*\n", "\n\n", text)
            output.append(text)
        return output, filename

    def replace_text(self, text, idx):
        """Append the protocol number after every "Protocol Number" label.

        Each occurrence of ``"Protocol Number"`` in ``text`` becomes
        ``"Protocol Number <self.dictionary[idx]> and"``.

        Args:
            text: Page text to rewrite.
            idx: Position of the protocol number in ``self.dictionary``.

        Returns:
            The rewritten text (unchanged when the label does not occur).
        """
        label = "Protocol Number"
        # str.replace rewrites every non-overlapping occurrence in one pass.
        # The earlier hand-rolled scan resumed one character too far and
        # missed a label that directly followed a replacement.
        return text.replace(label, f"{label} {self.dictionary[idx]!s} and")

    def text_to_docs(self, text, filename):
        """Split page texts into chunk-level ``Document`` objects.

        Args:
            text: A single string (one page) or a list of page strings.
            filename: Stored under the ``filename`` metadata key of each chunk.

        Returns:
            A flat list of chunks whose metadata holds the 1-based ``page``,
            the 0-based ``chunk`` index, ``source`` (``"<page>-<chunk>"``) and
            ``filename``.
        """
        if isinstance(text, str):
            text = [text]

        # Same configuration for every page: build the splitter once.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )

        doc_chunks = []
        for page_number, page_text in enumerate(text, start=1):
            for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
                doc_chunks.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "page": page_number,
                            "chunk": chunk_index,
                            "source": f"{page_number}-{chunk_index}",
                            "filename": filename,
                        },
                    )
                )
        return doc_chunks

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` and save the FAISS index to ``Vector_DB/policies``."""
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/policies")

    def get_index_for_pdf(self, folder_path, api_key, dictionary):
        """Index every PDF in ``folder_path`` and persist the result.

        Args:
            folder_path: Directory containing the PDFs (not searched recursively).
            api_key: OpenAI key used for the embeddings.
            dictionary: Protocol numbers, one per PDF, in the order of the
                PDFs' sorted file names.
        """
        self.dictionary = dictionary

        # Only PDFs take part in the pairing: previously every directory
        # entry was added to the name list, so a single non-PDF file shifted
        # the names against the paths. Sorting makes the position-based
        # file -> protocol-number pairing reproducible, since os.listdir
        # returns entries in arbitrary order.
        pdf_names = sorted(
            name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
        )

        documents = []
        for idx, pdf_name in enumerate(pdf_names):
            text, filename = self.parse_pdf(os.path.join(folder_path, pdf_name), pdf_name, idx)
            documents.extend(self.text_to_docs(text, filename))
        self.docs_to_index(documents, api_key)
        


In [None]:
# Build the policies index.
# `dictionary` is consumed by position: get_index_for_pdf hands entry `idx` to
# the idx-th PDF it processes, so the list order has to match the order in
# which the folder's PDFs are visited.
database = Create_db()
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF"
dictionary = ["CCR-20-41", 'CCR-21-66', "CCR-22-101", "CCR-22-13", "CCR-22-96", "CCR-23-06"]
database.get_index_for_pdf(folder_path, api_key, dictionary)

#### XML File Vector DB

In [None]:
import json
from pathlib import Path
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [None]:
class Create_db:
    """Build and persist a FAISS index from a protocol JSON export."""

    # (label, key, stringify) in the order the fields appear in the document
    # text. Fields with stringify=False must already be strings.
    _FIELDS = (
        ("Protocol No: ", "PROTOCOL_NO", False),
        ("Title: ", "TITLE", False),
        ("NCT ID: ", "NCT_ID", False),
        ("Short Title: ", "SHORT_TITLE", False),
        ("Investigator Name: ", "INVESTIGATOR_NAME", False),
        ("Status: ", "STATUS", False),
        # Label was previously misspelled "Elibility" in the embedded text.
        ("Eligibility: ", "ELIGIBILITY", False),
        ("Detailed Eligibility: ", "DETAILED_ELIGIBILITY", False),
        ("Age Description: ", "AGE_DESCRIPTION", False),
        ("Phase Desc: ", "PHASE_DESC", False),
        ("Scope Description: ", "SCOPE_DESC", False),
        ("Modified Date: ", "MODIFIED_DATE", False),
        ("Department Name: ", "DEPARTMENT_NAME", False),
        ("Sponsor Names: ", "SPONSOR_NAMES", True),
        ("Disease Sites: ", "DISEASE_SITES", True),
    )

    def generate_docs(self, data):
        """Turn every protocol record into one labelled-text ``Document``.

        Args:
            data: Parsed export; records are read from
                ``data['TRIAL']['PROTOCOL']``.

        Returns:
            A list of ``Document`` objects whose ``source`` metadata is the
            record's ``PROTOCOL_NO``.

        Raises:
            KeyError: If a record lacks one of the expected fields.
        """
        docs = []
        for protocol in data['TRIAL']['PROTOCOL']:
            parts = []
            for label, key, stringify in self._FIELDS:
                value = str(protocol[key]) if stringify else protocol[key]
                parts.append(label + value + " ")
            docs.append(
                Document(page_content="".join(parts), metadata={"source": protocol["PROTOCOL_NO"]})
            )
        return docs

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` and save the FAISS index to ``Vector_DB/xml_db``."""
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/xml_db")

    def create_index(self, file, api_key):
        """Load the JSON export at ``file``, build the documents and persist the index."""
        # Decode explicitly as UTF-8 rather than with the platform's default
        # text encoding.
        data = json.loads(Path(file).read_text(encoding="utf-8"))
        docs = self.generate_docs(data)
        self.docs_to_index(docs, api_key)

In [None]:
database = Create_db()
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json"
database.create_index(folder_path,api_key)

#### All Policies & Files Together

In [None]:
import json
import os
import re
from pathlib import Path
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [None]:
class Create_db:
    """Build one FAISS index over the policy PDFs plus the protocol JSON export.

    ``get_index_for_pdf`` is the entry point. It stores the per-file protocol
    numbers on ``self.dictionary``, which ``parse_pdf`` and ``replace_text``
    read by position.
    """

    # (label, key, stringify) in the order the fields appear in the document
    # text. Fields with stringify=False must already be strings.
    _FIELDS = (
        ("Protocol No: ", "PROTOCOL_NO", False),
        ("Title: ", "TITLE", False),
        ("NCT ID: ", "NCT_ID", False),
        ("Short Title: ", "SHORT_TITLE", False),
        ("Investigator Name: ", "INVESTIGATOR_NAME", False),
        ("Status: ", "STATUS", False),
        # Label was previously misspelled "Elibility" in the embedded text.
        ("Eligibility: ", "ELIGIBILITY", False),
        ("Detailed Eligibility: ", "DETAILED_ELIGIBILITY", False),
        ("Age Description: ", "AGE_DESCRIPTION", False),
        ("Phase Desc: ", "PHASE_DESC", False),
        ("Scope Description: ", "SCOPE_DESC", False),
        ("Modified Date: ", "MODIFIED_DATE", False),
        ("Department Name: ", "DEPARTMENT_NAME", False),
        ("Sponsor Names: ", "SPONSOR_NAMES", True),
        ("Disease Sites: ", "DISEASE_SITES", True),
    )

    def parse_pdf(self, file, filename, idx):
        """Extract the cleaned text of each page of one PDF.

        Args:
            file: Passed straight to ``PdfReader``.
            filename: Returned unchanged so the caller can tag the chunks.
            idx: Position in ``self.dictionary`` of this file's protocol number.

        Returns:
            ``(pages, filename)`` where ``pages`` holds one string per page.
        """
        pdf = PdfReader(file)
        print(file)
        print(self.dictionary[idx])
        output = []
        for page in pdf.pages:
            text = page.extract_text()
            # Inject the protocol number before whitespace normalisation.
            text = self.replace_text(text, idx)
            # Re-join words hyphenated across a line break.
            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
            # Replace line breaks that are not part of a blank-line gap.
            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
            # Normalise every blank-line gap to one empty line.
            text = re.sub(r"\n\s*\n", "\n\n", text)
            print(text)
            output.append(text)
        return output, filename

    def replace_text(self, text, idx):
        """Append the protocol number after every "Protocol Number" label.

        Each occurrence of ``"Protocol Number"`` in ``text`` becomes
        ``"Protocol Number <self.dictionary[idx]> and"``.

        Args:
            text: Page text to rewrite.
            idx: Position of the protocol number in ``self.dictionary``.

        Returns:
            The rewritten text (unchanged when the label does not occur).
        """
        label = "Protocol Number"
        # str.replace rewrites every non-overlapping occurrence in one pass.
        # The earlier hand-rolled scan resumed one character too far and
        # missed a label that directly followed a replacement.
        return text.replace(label, f"{label} {self.dictionary[idx]!s} and")

    def generate_docs(self, documents, data):
        """Append one labelled-text ``Document`` per protocol record.

        Args:
            documents: List to extend; it is modified in place.
            data: Parsed export; records are read from
                ``data['TRIAL']['PROTOCOL']``.

        Returns:
            The same ``documents`` list, now including the protocol records.

        Raises:
            KeyError: If a record lacks one of the expected fields.
        """
        for protocol in data['TRIAL']['PROTOCOL']:
            parts = []
            for label, key, stringify in self._FIELDS:
                value = str(protocol[key]) if stringify else protocol[key]
                parts.append(label + value + " ")
            documents.append(
                Document(page_content="".join(parts), metadata={"source": protocol["PROTOCOL_NO"]})
            )
        return documents

    def text_to_docs(self, text, filename):
        """Split page texts into chunk-level ``Document`` objects.

        Args:
            text: A single string (one page) or a list of page strings.
            filename: Stored under the ``filename`` metadata key of each chunk.

        Returns:
            A flat list of chunks whose metadata holds the 1-based ``page``,
            the 0-based ``chunk`` index, ``source`` (``"<page>-<chunk>"``) and
            ``filename``.
        """
        if isinstance(text, str):
            text = [text]

        # Same configuration for every page: build the splitter once.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )

        doc_chunks = []
        for page_number, page_text in enumerate(text, start=1):
            for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
                doc_chunks.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "page": page_number,
                            "chunk": chunk_index,
                            "source": f"{page_number}-{chunk_index}",
                            "filename": filename,
                        },
                    )
                )
        return doc_chunks

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` and save the FAISS index to ``Vector_DB/main_db``."""
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/main_db")

    def get_index_for_pdf(self, folder_path, file_path, api_key, dictionary):
        """Index the PDFs in ``folder_path`` plus the JSON export at ``file_path``.

        Args:
            folder_path: Directory containing the PDFs (not searched recursively).
            file_path: Path of the protocol JSON export.
            api_key: OpenAI key used for the embeddings.
            dictionary: Protocol numbers, one per PDF, in the order of the
                PDFs' sorted file names.
        """
        self.dictionary = dictionary

        # Only PDFs take part in the pairing: previously every directory
        # entry was added to the name list, so a single non-PDF file shifted
        # the names against the paths. Sorting makes the position-based
        # file -> protocol-number pairing reproducible, since os.listdir
        # returns entries in arbitrary order.
        pdf_names = sorted(
            name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
        )

        documents = []
        for idx, pdf_name in enumerate(pdf_names):
            text, filename = self.parse_pdf(os.path.join(folder_path, pdf_name), pdf_name, idx)
            documents.extend(self.text_to_docs(text, filename))

        # Decode explicitly as UTF-8 rather than with the platform's default
        # text encoding.
        data = json.loads(Path(file_path).read_text(encoding="utf-8"))

        documents = self.generate_docs(documents, data)
        self.docs_to_index(documents, api_key)
        


In [None]:
# Build the combined index from the policy PDFs and the protocol JSON export.
# `dictionary` is consumed by position: entry `idx` is applied to the idx-th
# PDF that get_index_for_pdf processes, so its order must match that sequence.
database = Create_db()
file_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json"
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF"
dictionary = ["CCR-20-41", 'CCR-21-66', "CCR-22-101", "CCR-22-13", "CCR-22-96", "CCR-23-06"]
database.get_index_for_pdf( folder_path, file_path, api_key, dictionary)

#### Json Data Vector DB

In [None]:
from pathlib import Path

In [None]:
class Create_db:
    """Build and persist a FAISS index from a list of trial records."""

    # (label, key, required) in output order. Required keys raise KeyError
    # when absent; optional ones are skipped.
    _FIELDS = (
        ("NCT ID: ", "NCT_ID", True),
        ("Title: ", "TITLE", True),
        ("Short Title: ", "SHORT_TITLE", True),
        ("Sponsor: ", "SPONSOR", True),
        ("Detailed Eligibility: ", "DETAILED_ELIGIBILITY", True),
        ("Description: ", "DESCRIPTION", False),
        ("Summary: ", "SUMMARY", True),
        ("Status: ", "STATUS", True),
        ("Outcome Description: ", "OUTCOME_DESCRIPTION", False),
        ("Outcome Measure: ", "OUTCOME_MEASURE", False),
        ("Outcome Timeframe: ", "OUTCOME_TIMEFRAME", False),
        ("Age Description: ", "AGE_DESCRIPTION", True),
        ("Investigator Name: ", "INVESTIGATOR_NAME", False),
    )

    def generate_docs(self, data):
        """Return one labelled-text ``Document`` per record in ``data``.

        Each document's ``source`` metadata is the record's ``NCT_ID``.
        """
        docs = []
        for protocol in data:
            pieces = []
            for label, key, required in self._FIELDS:
                if required or key in protocol:
                    pieces.append(label + protocol[key] + " ")
            docs.append(
                Document(page_content="".join(pieces), metadata={"source": protocol["NCT_ID"]})
            )
        return docs

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` and save the FAISS index to ``vectorDB/mainDB``."""
        embedder = OpenAIEmbeddings(openai_api_key=api_key)
        FAISS.from_documents(docs, embedder).save_local("vectorDB/mainDB")

    def create_index(self, file, api_key):
        """Load the JSON file at ``file``, build the documents and persist the index."""
        records = json.loads(Path(file).read_text())
        self.docs_to_index(self.generate_docs(records), api_key)

In [59]:
from pathlib import Path

In [None]:
class Create_db:
    """Build and persist a FAISS index from a list of trial records."""

    # (label, key, required, stringify) in output order. Required keys raise
    # KeyError when absent; optional ones are skipped. The outcome labels now
    # name the fields they actually carry; they previously reused the
    # "Outcome Description/Measure/Timeframe" labels of an older schema.
    _FIELDS = (
        ("", "NCT_ID", True, False),
        ("Title: ", "TITLE", False, False),
        ("Short Title: ", "SHORT_TITLE", True, False),
        ("Sponsor: ", "SPONSOR", True, False),
        ("Detailed Eligibility: ", "DETAILED_ELIGIBILITY", True, False),
        ("Description: ", "DESCRIPTION", False, False),
        ("Summary: ", "SUMMARY", True, False),
        ("Status: ", "STATUS", True, False),
        ("Primary Outcomes: ", "PRIMARY_OUTCOMES", False, False),
        ("Secondary Outcomes: ", "SECONDARY_OUTCOMES", False, False),
        ("Other Outcomes: ", "OTHER_OUTCOMES", False, False),
        ("Age Description: ", "AGE_DESCRIPTION", True, False),
        ("Conditions: ", "CONDITIONS", False, True),
        ("Overall Officials: ", "OVERALL_OFFICIALS", False, False),
        # Stringified like CONDITIONS, as the later revision of this class
        # does; plain concatenation fails when the value is not a string.
        ("Locations: ", "LOCATIONS", False, True),
    )

    def generate_docs(self, data):
        """Return one labelled-text ``Document`` per record in ``data``.

        Args:
            data: Iterable of record mappings.

        Returns:
            A list of ``Document`` objects whose ``source`` metadata is the
            record's ``NCT_ID``.

        Raises:
            KeyError: If a record lacks a required field.
        """
        docs = []
        for protocol in data:
            parts = []
            for label, key, required, stringify in self._FIELDS:
                if not required and key not in protocol:
                    continue
                value = str(protocol[key]) if stringify else protocol[key]
                parts.append(label + value + " ")
            docs.append(
                Document(page_content="".join(parts), metadata={"source": protocol["NCT_ID"]})
            )
        return docs

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` and save the FAISS index to ``vectorDB/primaryDB``."""
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("vectorDB/primaryDB")

    def create_index(self, file, api_key):
        """Load the JSON file at ``file``, build the documents and persist the index."""
        # Decode explicitly as UTF-8 rather than with the platform's default
        # text encoding.
        data = json.loads(Path(file).read_text(encoding="utf-8"))
        docs = self.generate_docs(data)
        self.docs_to_index(docs, api_key)

In [71]:
class Create_db:
    """Build and persist a FAISS index from unlabelled trial-record text."""

    # (key, required, stringify) in output order. Required keys raise KeyError
    # when absent; optional ones are skipped.
    _FIELDS = (
        ("NCT_ID", True, False),
        ("TITLE", False, False),
        ("SHORT_TITLE", True, False),
        ("SPONSOR", True, False),
        ("ORGANIZATION", False, False),
        ("DETAILED_ELIGIBILITY", True, False),
        ("DESCRIPTION", False, False),
        ("SUMMARY", True, False),
        ("STATUS", True, False),
        ("PRIMARY_OUTCOMES", False, False),
        ("SECONDARY_OUTCOMES", False, False),
        ("OTHER_OUTCOMES", False, False),
        ("AGE_DESCRIPTION", True, False),
        ("CONDITIONS", False, True),
        ("OVERALL_OFFICIALS", False, False),
        ("LOCATIONS", False, True),
    )

    def generate_docs(self, data):
        """Return one ``Document`` per record: its field values joined by spaces.

        Each document's ``source`` metadata is the record's ``NCT_ID``.
        """
        docs = []
        for protocol in data:
            pieces = []
            for key, required, stringify in self._FIELDS:
                if not (required or key in protocol):
                    continue
                value = protocol[key]
                if stringify:
                    value = str(value)
                pieces.append(value + " ")
            docs.append(
                Document(page_content="".join(pieces), metadata={"source": protocol["NCT_ID"]})
            )
        return docs

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` and save the FAISS index to ``vectorDB/UCDB1``."""
        embedder = OpenAIEmbeddings(openai_api_key=api_key)
        FAISS.from_documents(docs, embedder).save_local("vectorDB/UCDB1")

    def create_index(self, file, api_key):
        """Load the JSON file at ``file``, build the documents and persist the index."""
        records = json.loads(Path(file).read_text())
        self.docs_to_index(self.generate_docs(records), api_key)

In [72]:
database = Create_db()
folder_path = r"../database/protocolDB/ucspecific-1.json"
database.create_index(folder_path,api_key)

### Chatbot Test

In [None]:
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.schema import ( SystemMessage, HumanMessage, AIMessage)
from PyPDF2 import PdfReader
import os
from langchain.chat_models import ChatOpenAI

In [None]:
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [None]:
chat = ChatOpenAI( openai_api_key = os.environ["OPENAI_API_KEY"], model = 'gpt-3.5-turbo')

def generate_responses( chat, messages, faiss_path, query):
    """Answer ``query`` using context retrieved from the index at ``faiss_path``.

    The API key is read from ``openai_key.txt`` and the index is loaded from
    disk on every call. The three best-matching documents are pasted into a
    prompt, which is appended to ``messages`` together with the model's reply.

    Args:
        chat: Callable chat model; called with the message list.
        messages: Conversation history; extended in place.
        faiss_path: Location of a saved FAISS index.
        query: The user's question.

    Returns:
        ``(messages, answer)`` — the same list object plus the reply text.
    """
    with open(r"openai_key.txt", 'r') as key_file:
        key = key_file.read().strip()

    store = FAISS.load_local(faiss_path, OpenAIEmbeddings(openai_api_key=key))

    hits = store.similarity_search(query, k=3)
    source_knowledge = "\n".join(hit.page_content for hit in hits)
    augmented_prompt = f"""Using the contexts below, answer the query. Contexts: {source_knowledge} Query: {query}"""

    messages.append(HumanMessage(content=augmented_prompt))
    answer = chat(messages).content
    messages.append(AIMessage(content=answer))

    return messages, answer

#### All Policies Chatbot

In [None]:
question = 'Which clinical trial is sponsored by Boston Scientific'

# Seed conversation handed to the chat model before the augmented question.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
]

# Use the same forward-slash path the index was saved under; a backslash is
# only a path separator on Windows.
messages, bot_answer = generate_responses(chat, messages, "Vector_DB/policies", question)

In [None]:
bot_answer

#### XML File Chatbot

In [None]:
question = 'Which clinical trial is sponsored by Boston Scientific'

# Seed conversation handed to the chat model before the augmented question.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
]

# Use the same forward-slash path the index was saved under; a backslash is
# only a path separator on Windows.
messages, bot_answer = generate_responses(chat, messages, "Vector_DB/xml_db", question)

In [None]:
bot_answer

#### Main Chatbot

In [None]:
question = 'Which clinical trial is sponsored by Boston Scientific'

# Seed conversation handed to the chat model before the augmented question.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
]

# Query the combined index saved as "Vector_DB/main_db". This cell previously
# loaded the XML-only index, which looks like a copy of the cell above.
# NOTE(review): confirm main_db is the intended target.
# The forward slash keeps the path valid outside Windows.
messages, bot_answer = generate_responses(chat, messages, "Vector_DB/main_db", question)

In [None]:
bot_answer

### Clinical Trials API Trial

In [62]:
import requests
import json

# Fetch one study record from the ClinicalTrials.gov v2 API and, when the
# request succeeds, write the decoded JSON payload to 'study_data.json'.
base_url = "https://clinicaltrials.gov/api/v2"
nct_id = "NCT04790344"
url = f"{base_url}/studies/{nct_id}"

response = requests.get(url)

if response.status_code != 200:
    # Anything other than HTTP 200 is reported and nothing is written.
    print("Error:", response.status_code)
else:
    data = response.json()

    with open("study_data.json", "w") as json_file:
        json.dump(data, json_file)

    print("JSON data has been successfully stored in 'study_data.json'")


JSON data has been successfully stored in 'study_data.json'


In [None]:
print(response)

In [None]:
print(response.json())

### NCT ID Extraction


In [1]:
from pathlib import Path
print(Path(".").absolute())

/Users/gshipurk/Documents/Clinical Trials Github/Clinical-Trials/notebook


In [1]:
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd

# Collect the text of every <NCT_ID> element in the exported XML and save the
# identifiers as a one-column CSV.
with open('../database/clinical_trials.xml') as f:
    data = f.read()

soup = BeautifulSoup(data, 'xml')
ids = soup.find_all('NCT_ID')

open_tag, close_tag = "<NCT_ID>", "</NCT_ID>"
# Each tag is serialised to markup and the surrounding tag text is sliced off;
# elements whose markup does not start with the plain opening tag are skipped.
serialized_tags = (str(tag) for tag in ids)
nct_ids = [
    markup[len(open_tag):-len(close_tag)]
    for markup in serialized_tags
    if markup.startswith(open_tag)
]

df = pd.DataFrame({"NCT_ID": nct_ids})
df.to_csv("../database/nctID-DB/nct_ids.csv", index=False)

### NCT ID Extraction (Based on Trials)


In [2]:
import requests

# Collect the NCT IDs of every study matching the search term, following the
# API's page tokens until the result set is exhausted.
base_url = "https://clinicaltrials.gov/api/v2"
endpoint = "/studies"
nctids = []
count = 0  # number of follow-up pages fetched after the first one
filters = {
    "query.term": "University of California, Irvine",
    "pageSize": "10000",
}
url = f"{base_url}{endpoint}"

response = requests.get(url, params=filters)

if response.status_code == 200:
    data = response.json()
    nctids.extend(
        study['protocolSection']['identificationModule']['nctId']
        for study in data['studies']
    )
    next_page_token = data.get("nextPageToken")

    # Continue only while the API returns a continuation token. The previous
    # condition (`next_page_token or count==5`) also re-entered the loop with
    # no token once count reached 5; requests drops None-valued params, so
    # that iteration re-requested the first page and restarted pagination.
    while next_page_token:
        filters["pageToken"] = next_page_token
        response = requests.get(url, params=filters)

        if response.status_code != 200:
            print(f"Error fetching data for next page. Status code: {response.status_code}")
            break

        data = response.json()
        nctids.extend(
            study['protocolSection']['identificationModule']['nctId']
            for study in data['studies']
        )
        next_page_token = data.get("nextPageToken")
        count += 1
else:
    print(f"Error fetching data. Status code: {response.status_code}")

# Drop duplicate IDs; converting through a set does not preserve order.
nctids = list(set(nctids))

In [118]:
import random

random_ids = random.sample(nctids,2000)
len(random_ids)

2000

In [3]:
len(nctids)

2297

In [109]:
main_nctids = []

In [4]:
nctids.extend(nct_ids)

In [5]:
len(nctids)

2559

In [131]:
main_nctids = list(set(main_nctids))

In [132]:
len(main_nctids)

4195

In [135]:
len(main_nctids)

4195

In [128]:
import random

random_ids = random.sample(nctids,2000)
len(random_ids)

2000

In [18]:
random_ids.extend(nct_ids)

NameError: name 'nct_ids' is not defined

In [74]:
len(nctids)

2558

### Extract Data Based on NCT ID:

In [6]:
import requests
import json

def _build_protocol(nct_id, data):
    """Flatten one study's ``protocolSection`` into the record kept in ``trials_info``.

    Args:
        nct_id: Identifier the study was requested with.
        data: The ``protocolSection`` mapping from the API response.

    Returns:
        A dict keyed by upper-case field names. Optional fields are left out
        when the response does not contain them.

    Raises:
        KeyError: If a field that is read unconditionally (for example
            ``briefTitle``, ``eligibilityCriteria`` or ``stdAges``) is absent.
    """
    identification = data["identificationModule"]

    protocol = {'NCT_ID': nct_id}
    if "officialTitle" in identification:
        protocol['TITLE'] = identification["officialTitle"]
    if "organization" in identification:
        # Stored as the string form of the organization mapping.
        protocol["ORGANIZATION"] = str(identification["organization"])
    protocol['SHORT_TITLE'] = identification["briefTitle"]
    protocol['SPONSOR'] = data['sponsorCollaboratorsModule']["leadSponsor"]["name"]

    eligibility = data["eligibilityModule"]
    protocol['DETAILED_ELIGIBILITY'] = eligibility["eligibilityCriteria"]

    description = data['descriptionModule']
    if "detailedDescription" in description:
        protocol["DESCRIPTION"] = description["detailedDescription"]
    protocol["SUMMARY"] = description["briefSummary"]
    protocol["STATUS"] = data['statusModule']['overallStatus']

    # Each outcome list is stored as its string form, only when present.
    outcomes = data.get("outcomesModule", {})
    for source_key, target_key in (
        ("primaryOutcomes", "PRIMARY_OUTCOMES"),
        ("secondaryOutcomes", "SECONDARY_OUTCOMES"),
        ("otherOutcomes", "OTHER_OUTCOMES"),
    ):
        if source_key in outcomes:
            protocol[target_key] = str(outcomes[source_key])

    std_ages = eligibility["stdAges"]
    protocol["AGE_DESCRIPTION"] = (
        ", ".join(std_ages) if isinstance(std_ages, list) else std_ages
    )

    if "conditionsModule" in data:
        conditions_module = data["conditionsModule"]
        # Conditions and keywords are merged into one list, conditions first.
        protocol['CONDITIONS'] = [
            *conditions_module.get("conditions", []),
            *conditions_module.get("keywords", []),
        ]

    if "contactsLocationsModule" in data:
        contacts_locations_module = data["contactsLocationsModule"]
        if "overallOfficials" in contacts_locations_module:
            protocol["OVERALL_OFFICIALS"] = str(contacts_locations_module["overallOfficials"])

        # Build a fresh dict per location. Reusing one dict across iterations
        # made every list entry alias the same object, so all entries ended up
        # showing the values of the last location processed.
        location_keys = ("facility", "city", "state", "country")
        locations = contacts_locations_module.get("locations", [])[:15]  # cap at 15 sites
        protocol["LOCATIONS"] = str([
            {key: loc[key] for key in location_keys if key in loc}
            for loc in locations
        ])

    return protocol


# Download every study in `nctids` and keep one flattened record per study.
base_url = "https://clinicaltrials.gov/api/v2"
trials_info = []
for idx, nct_id in enumerate(nctids):
    print(idx+1)  # progress indicator
    url = f"{base_url}/studies/{nct_id}"
    try:
        response = requests.get(url)
        if response.status_code != 200:
            continue
        data = response.json()['protocolSection']
        protocol = _build_protocol(nct_id, data)
    except Exception as e:
        # Best-effort crawl: report the failure and move on. The record is NOT
        # appended, so a failed study can no longer contribute a half-filled
        # record or a duplicate of the previous study's record.
        print(f"Error fetching data for NCT ID {nct_id}: {str(e)}")
        continue

    trials_info.append(protocol)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [None]:
import requests
import json

# Fetch a single study by NCT ID for interactive inspection.
base_url = "https://clinicaltrials.gov/api/v2"
# NOTE: rebinding trials_info here discards any records collected earlier in the session.
trials_info = []
nct_id = "NCT05130268"
url = f"{base_url}/studies/{nct_id}"

response = requests.get(url)

# `data` is only (re)assigned on HTTP 200; on any other status it keeps its previous value.
if response.status_code == 200:
    data = response.json()

In [70]:
# Serialise the collected trial records to the protocol database folder.
output_path = "../database/protocolDB/ucspecific-1.json"
with open(output_path, "w") as json_file:
    json.dump(trials_info, json_file)

In [None]:
# Print the first listed contact's name for every location whose facility is
# exactly "University of California - Irvine".
# NOTE(review): indexes 'facility' and 'contacts' directly, so a location
# lacking either key raises KeyError - confirm all locations carry them.
for ele in data["protocolSection"]['contactsLocationsModule']["locations"]:
    if ele['facility'] == "University of California - Irvine":
        print(ele["contacts"][0]['name'])

In [None]:
# Build a flattened record for the single study held in `response`.
# Fields are read by direct indexing unless guarded by an `in` check, so a
# missing unguarded key raises KeyError.
protocol = {}
protocol['NCT_ID'] = nct_id    
data = response.json()
# Everything below is read from the study's protocol section.
data = data['protocolSection']
protocol['TITLE'] = data["identificationModule"]["officialTitle"]
protocol['SHORT_TITLE'] = data["identificationModule"]["briefTitle"]
protocol['SPONSOR'] = data['sponsorCollaboratorsModule']["leadSponsor"]["name"]
protocol['DETAILED_ELIGIBILITY'] = data["eligibilityModule"]["eligibilityCriteria"]
# The detailed description is optional; the brief summary is read unconditionally.
if "detailedDescription" in data['descriptionModule']:
    protocol["DESCRIPTION"] = data['descriptionModule']["detailedDescription"]
protocol["SUMMARY"] = data["descriptionModule"]["briefSummary"]
protocol["STATUS"] = data['statusModule']['overallStatus']
# Only the first primary outcome is recorded.
protocol["OUTCOME_DESCRIPTION"] = data["outcomesModule"]["primaryOutcomes"][0]["description"]
protocol["OUTCOME_MEASURE"] = data["outcomesModule"]["primaryOutcomes"][0]["measure"]
protocol["OUTCOME_TIMEFRAME"] = data["outcomesModule"]["primaryOutcomes"][0]["timeFrame"]
# Join the standard age groups into one comma-separated string.
text = ""
for std in data["eligibilityModule"]["stdAges"]:
    text += std +', '
text = text.rstrip(', ')
protocol["AGE_DESCRIPTION"] = text

# Record the first contact listed for the UC Irvine site, if that site is present.
for ele in data['contactsLocationsModule']["locations"]:
    if ele['facility'] == "University of California - Irvine":
        protocol["INVESTIGATOR_NAME"] = ele["contacts"][0]['name']

In [None]:
protocol

In [None]:
# Serialise the collected trial records to the protocol database folder.
output_path = "../database/protocolDB/moreProtocols.json"
with open(output_path, "w") as json_file:
    json.dump(trials_info, json_file)

In [None]:
import re

# Quick check of the NCT-ID regex against a sentence holding two identifiers.
text = "The trial identifier is NCT05645744 and NCT12345678 is another trial."

# "NCT" followed by exactly eight digits, delimited by word boundaries.
pattern = r'\bNCT\d{8}\b'

nct_id_regex = re.compile(pattern)
matches = nct_id_regex.findall(text)

print(matches)

In [None]:
# Search the vector store by the NCT IDs found in the text when there are any;
# otherwise fall back to searching with the question itself.
if not matches:
    results = db.similarity_search(question, k=3)
else:
    matches_text = ", ".join(matches)
    results = db.similarity_search(matches_text, k=3)


#### History Aware Retriever Trial:

In [None]:
!pip3 install matplotlib
!pip3 install scipy 
!pip3 install plotly
!pip3 install scikit-learn
!pip3 install -U langchain-openai

In [None]:
import databutton as db
import streamlit as st
import time 
import openai
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from brain import custom_search
import os
import re 

In [None]:
# Read the OpenAI API key from a local text file (surrounding whitespace stripped).
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()
    
# Expose the key both through the environment and on the openai module.
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key


In [None]:
!pwd

In [15]:
# Load the persisted FAISS index using OpenAI embeddings.
faiss_path = r"../database/vectorDB/UCDB"
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
# The flag is a boolean. The previous string "True" only worked because any
# non-empty string is truthy (the string "False" would have enabled it too).
# NOTE(review): the flag's name signals unsafe deserialization - only load
# index folders produced by a trusted source.
# NOTE(review): `db` also names the `databutton` import in an earlier cell;
# this assignment rebinds it to the vector store.
db = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)

In [None]:
!pip3 install -U langchain langchain-community 
!pip3 install langchainhub

In [16]:
retriever = db.as_retriever()

In [9]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [12]:
llm = ChatOpenAI(model_name='gpt-4o', temperature=0.1, api_key = api_key)

In [13]:
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

In [17]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instruction for the question-rewriting step. The trailing backslashes
# are line continuations inside the string, so the text is one line.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
# Prompt layout: system instruction, then the prior turns, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Combine the LLM, the base retriever and the rewrite prompt into one retriever.
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

In [18]:
input = "can you suggest trials related to brain tumor?"

In [29]:
result = history_aware_retriever.invoke({"chat_history": [],"input":input})

In [30]:
docs=[]

In [31]:
docs.extend(result)

In [32]:
docs

[Document(metadata={'source': 'NCT00030628'}, page_content='NCT00030628 A Phase III Randomized Trial Of The Role Of Whole Brain Radiation Therapy In Addition To Radiosurgery In The Management Of Patients With One To Three Cerebral Metastases Radiosurgery With or Without Whole-Brain Radiation Therapy in Treating Patients With Brain Metastases Alliance for Clinical Trials in Oncology {\'fullName\': \'Alliance for Clinical Trials in Oncology\', \'class\': \'OTHER\'} Inclusion Criteria:\n\n* Diagnosis of cerebral metastases meeting all of the following requirements:\n\n  * 1-3 de novo lesions\n  * Metastases must be from a histologically confirmed extracerebral primary site, another metastatic site, or from the metastatic brain lesion(s)\n  * Each lesion must be less than 3.0 cm by contrasted MRI of the brain\n  * Lesions must not be within 5 mm of optic chiasm or within the brainstem\n* No primary germ cell tumor, small cell carcinoma, or lymphoma\n* No leptomeningeal metastases\n* Eligib

In [47]:
from langchain_core.output_parsers import StrOutputParser

In [48]:
# System instruction for the answering step. The backslashes join the lines,
# but the leading spaces on the continuation lines are part of the string and
# therefore end up inside the prompt text.
qa_system_prompt = """You are an assistant for question-answering tasks. \
    Use the following pieces of retrieved context to answer the question. \
    Please provide a concise answer. \

{context}"""

# Prompt layout: system instruction (with the context slot), prior turns, user input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# LCEL pipeline: fill the prompt, call the model, reduce the reply to a plain string.
qa_chain =  qa_prompt | llm | StrOutputParser()

In [50]:
# Stream the answer and print each chunk as it arrives (one chunk per line).
stream_inputs = {"input": input, "chat_history": [], "context": docs}
answer_stream = qa_chain.stream(stream_inputs)
for idx, chunk in enumerate(answer_stream):
    print(chunk)


Here
 are
 some
 clinical
 trials
 related
 to
 brain
 tumors
:


1
.
 **
N
CT
000
306
28
**
:
 A
 Phase
 III
 Random
ized
 Trial
 of
 the
 Role
 of
 Whole
 Brain
 Radiation
 Therapy
 in
 Addition
 to
 Radios
urgery
 in
 the
 Management
 of
 Patients
 With
 One
 to
 Three
 Cere
bral
 Met
ast
ases
.

  
 -
 **
Objective
**
:
 Compare
 overall
 survival
,
 time
 to
 CNS
 failure
,
 quality
 of
 life
,
 and
 neuro
c
ognitive
 status
 in
 patients
 treated
 with
 radios
urgery
 with
 or
 without
 whole
 brain
 radi
otherapy
.

  
 -
 **
Status
**
:
 Completed



2
.
 **
N
CT
055
761
03
**
:
 Long
itud
inal
 Pros
pective
 Study
 of
 Neuro
c
ognitive
 Outcomes
 and
 Mult
im
odal
 Quant
itative
 Neuro
im
aging
 Outcomes
 in
 Primary
 Brain
 Tum
or
 Patients
 Receiving
 Brain
 Radi
otherapy
.

  
 -
 **
Objective
**
:
 Examine
 radiation
-induced
 imaging
 changes
 in
 normal
 brain
 tissue
 over
 time
 and
 correlate
 these
 with
 neuro
c
ognitive
 outcomes
.

  
 -
 **
Status
**
:
 Recruiti

In [36]:
result

AIMessage(content='Here are some clinical trials related to brain tumors:\n\n1. **NCT00030628**: A Phase III Randomized Trial of the Role of Whole Brain Radiation Therapy in Addition to Radiosurgery in the Management of Patients With One to Three Cerebral Metastases.\n   - **Objective**: Compare the effectiveness of radiosurgery with or without whole-brain radiation therapy in treating brain metastases.\n   - **Status**: Completed.\n\n2. **NCT05576103**: Longitudinal Prospective Study of Neurocognitive Outcomes and Multimodal Quantitative Neuroimaging Outcomes in Primary Brain Tumor Patients Receiving Brain Radiotherapy.\n   - **Objective**: Examine radiation-induced imaging changes in normal brain tissue over time and correlate these with neurocognitive outcomes.\n   - **Status**: Recruiting.\n\n3. **NCT03550391**: A Phase III Trial of Stereotactic Radiosurgery Compared With Hippocampal-Avoidant Whole Brain Radiotherapy (HA-WBRT) Plus Memantine for 5 or More Brain Metastases.\n   - **

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# System instruction for the answering step; the trailing backslashes are line
# continuations inside the string, and {context} is the slot for retrieved text.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
# Prompt layout: system instruction (with the context slot), prior turns, user input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


# Chain that answers from supplied documents using the prompt above.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full pipeline: history-aware retrieval feeding the answering chain.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
chat_history = [{'role': 'user', 'content': 'Hi! How are you?'}, {'role': 'assistant', 'content': 'Hello, How can I help you?'}, {'role': 'user', 'content': 'Could you tell me few trials related to brain tumor?'}, {'role': 'assistant', 'content': 'Based on the contexts provided, here are a few clinical trials related to brain tumors:\n\n1. **Trial Title:** Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers\n   - **NCT ID:** NCT03173950\n   - **Sponsor:** National Cancer Institute (NCI)\n   - **Objective:** Evaluate the efficacy of the immune checkpoint inhibitor Nivolumab in patients with recurrent rare central nervous system neoplasms.\n   - **Eligibility:** Patients with various rare CNS cancers, aged 18 or above, with progressive tumor growth, and specific laboratory parameters within normal range.\n   - **Status:** Recruiting\n\n2. **Trial Title:** A Randomized Phase III Trial of Pre-Operative Compared to Post-Operative Stereotactic Radiosurgery in Patients With Resectable Brain Metastases\n   - **NCT ID:** NCT05438212\n   - **Sponsor:** NRG Oncology\n   - **Objective:** Compare the addition of stereotactic radiosurgery before or after surgery in patients with brain metastases to assess the impact on overall survival and progression-free survival.\n   - **Eligibility:** Patients with resectable brain metastases, who meet specific criteria related to tumor size and location.\n   - **Status:** Recruiting\n\n3. 
**Trial Title:** ONC201 for the Treatment of Newly Diagnosed H3 K27M-mutant Diffuse Glioma Following Completion of Radiotherapy: A Randomized, Double-Blind, Placebo-Controlled, Multicenter Study\n   - **NCT ID:** NCT05580562\n   - **Sponsor:** Chimerix\n   - **Objective:** Assess whether treatment with ONC201 following radiotherapy extends overall survival and progression-free survival in patients with newly diagnosed H3 K27M-mutant diffuse glioma.\n   - **Eligibility:** Patients diagnosed with H3 K27M-mutant diffuse glioma who have completed frontline radiotherapy.\n   - **Status:** Recruiting\n\n4. **Trial Title:** Phase II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas\n   - **NCT ID:** NCT03224767\n   - **Sponsor:** Alliance for Clinical Trials in Oncology\n   - **Objective:** Evaluate the activity of BRAF and MEK inhibitor combination in untreated and previously treated papillary craniopharyngiomas.\n   - **Eligibility:** Patients with histologically proven papillary craniopharyngioma with a positive BRAF V600E mutation.\n   - **Status:** Recruiting\n\nThese trials aim to contribute to the understanding and treatment of various brain tumors, offering new insights and potential therapeutic options for patients.'}]

In [None]:
from langchain_core.messages import AIMessage, HumanMessage

# Ask a follow-up that only makes sense in light of the earlier turns.
question = "Can you give me information about the first trial you mentioned?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
# Record both sides of the exchange as typed messages. The reply was previously
# appended as a bare string, which carries no assistant role in the history.
chat_history.extend(
    [HumanMessage(content=question), AIMessage(content=ai_msg_1["answer"])]
)



In [None]:
print(ai_msg_1["answer"])

In [None]:
from langchain_community.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# System instruction for the answering step; the trailing backslashes are line
# continuations inside the string, and {context} is the slot for retrieved text.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
# Prompt layout: system instruction (with the context slot), prior turns, user input.
# NOTE: MessagesPlaceholder is not imported in this cell; it relies on an earlier import.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


# Chain that answers from supplied documents using the prompt above.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

In [None]:
# Minimal stuff-documents example: two hand-written documents are placed into
# the {context} slot of a one-message prompt.
prompt = ChatPromptTemplate.from_messages(
    [("system", "What are everyone's favorite colors:\n\n{context}")]
)
# NOTE: this rebinds the notebook-level `llm` and `docs` names.
llm = ChatOpenAI(model="gpt-3.5-turbo")
chain = create_stuff_documents_chain(llm, prompt)

favorite_color_facts = (
    "Jesse loves red but not yellow",
    "Jamal loves green but not as much as he loves orange",
)
docs = [Document(page_content=fact) for fact in favorite_color_facts]

chain.invoke({"context": docs})

In [None]:
# Run the two stages by hand: retrieve with the history-aware retriever, then
# pass the retrieved documents to the answering chain as its context.
follow_up_question = "Can you give me information about the first trial you mentioned?"
results = history_aware_retriever.invoke(
    {"input": follow_up_question, "chat_history": chat_history}
)
question_answer_chain.invoke(
    {"input": follow_up_question, "chat_history": chat_history, "context": results}
)

In [None]:
results

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [None]:
# Instruction telling the model to rewrite the latest question so it stands on
# its own, without answering it.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Prompt layout: system instruction, prior turns, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Prompt -> model -> plain string.
chain = contextualize_q_prompt | llm | StrOutputParser()

In [None]:
chain.invoke({"input":"Could you give me information about the first trial mentioned?", "chat_history":chat_history})

In [None]:
result = contextualize_q_prompt.invoke({"input":"Can you give me information about the Nivolumab trial you mentioned?", "chat_history":chat_history})
result

ChatPromptValue(messages=[SystemMessage(content='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.'), 

SystemMessage(content='\n\n        You are a helpful Assistant who answers to users questions based on multiple contexts given to you.\n\n        Keep your answer creative.\n        \n        Please take into account the previous messages as well.\n        \n        Make sure to citation for the answer from metadata.\n            \n        Reply to greetings messages.\n    '), 
HumanMessage(content='Hi! How are you?'), AIMessage(content='Hello, How can I help you?'), HumanMessage(content='Could you tell me few trials related to brain tumor?'), AIMessage(content='Based on the contexts provided, here are a few clinical trials related to brain tumors:\n\n1. **Trial Title:** Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers\n   - **NCT ID:** NCT03173950\n   - **Sponsor:** National Cancer Institute (NCI)\n   - **Objective:** Evaluate the efficacy of the immune checkpoint inhibitor Nivolumab in patients with recurrent rare central nervous system neoplasms.\n   - **Eligibility:** Patients with various rare CNS cancers, aged 18 or above, with progressive tumor growth, and specific laboratory parameters within normal range.\n   - **Status:** Recruiting\n\n2. **Trial Title:** A Randomized Phase III Trial of Pre-Operative Compared to Post-Operative Stereotactic Radiosurgery in Patients With Resectable Brain Metastases\n   - **NCT ID:** NCT05438212\n   - **Sponsor:** NRG Oncology\n   - **Objective:** Compare the addition of stereotactic radiosurgery before or after surgery in patients with brain metastases to assess the impact on overall survival and progression-free survival.\n   - **Eligibility:** Patients with resectable brain metastases, who meet specific criteria related to tumor size and location.\n   - **Status:** Recruiting\n\n3. 
**Trial Title:** ONC201 for the Treatment of Newly Diagnosed H3 K27M-mutant Diffuse Glioma Following Completion of Radiotherapy: A Randomized, Double-Blind, Placebo-Controlled, Multicenter Study\n   - **NCT ID:** NCT05580562\n   - **Sponsor:** Chimerix\n   - **Objective:** Assess whether treatment with ONC201 following radiotherapy extends overall survival and progression-free survival in patients with newly diagnosed H3 K27M-mutant diffuse glioma.\n   - **Eligibility:** Patients diagnosed with H3 K27M-mutant diffuse glioma who have completed frontline radiotherapy.\n   - **Status:** Recruiting\n\n4. **Trial Title:** Phase II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas\n   - **NCT ID:** NCT03224767\n   - **Sponsor:** Alliance for Clinical Trials in Oncology\n   - **Objective:** Evaluate the activity of BRAF and MEK inhibitor combination in untreated and previously treated papillary craniopharyngiomas.\n   - **Eligibility:** Patients with histologically proven papillary craniopharyngioma with a positive BRAF V600E mutation.\n   - **Status:** Recruiting\n\nThese trials aim to contribute to the understanding and treatment of various brain tumors, offering new insights and potential therapeutic options for patients.'), HumanMessage(content='Can you give me information about the Nivolumab trial you mentioned?')])


In [None]:
ChatPromptValue(messages=[SystemMessage(content='Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.'), SystemMessage(content='\n\n        You are a helpful Assistant who answers to users questions based on multiple contexts given to you.\n\n        Keep your answer creative.\n        \n        Please take into account the previous messages as well.\n        \n        Make sure to citation for the answer from metadata.\n            \n        Reply to greetings messages.\n    '), HumanMessage(content='Hi! How are you?'), AIMessage(content='Hello, How can I help you?'), HumanMessage(content='Could you tell me few trials related to brain tumor?'), AIMessage(content='Based on the contexts provided, here are a few clinical trials related to brain tumors:\n\n1. **Trial Title:** Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients With Recurrent Select Rare CNS Cancers\n   - **NCT ID:** NCT03173950\n   - **Sponsor:** National Cancer Institute (NCI)\n   - **Objective:** Evaluate the efficacy of the immune checkpoint inhibitor Nivolumab in patients with recurrent rare central nervous system neoplasms.\n   - **Eligibility:** Patients with various rare CNS cancers, aged 18 or above, with progressive tumor growth, and specific laboratory parameters within normal range.\n   - **Status:** Recruiting\n\n2. 
**Trial Title:** A Randomized Phase III Trial of Pre-Operative Compared to Post-Operative Stereotactic Radiosurgery in Patients With Resectable Brain Metastases\n   - **NCT ID:** NCT05438212\n   - **Sponsor:** NRG Oncology\n   - **Objective:** Compare the addition of stereotactic radiosurgery before or after surgery in patients with brain metastases to assess the impact on overall survival and progression-free survival.\n   - **Eligibility:** Patients with resectable brain metastases, who meet specific criteria related to tumor size and location.\n   - **Status:** Recruiting\n\n3. **Trial Title:** ONC201 for the Treatment of Newly Diagnosed H3 K27M-mutant Diffuse Glioma Following Completion of Radiotherapy: A Randomized, Double-Blind, Placebo-Controlled, Multicenter Study\n   - **NCT ID:** NCT05580562\n   - **Sponsor:** Chimerix\n   - **Objective:** Assess whether treatment with ONC201 following radiotherapy extends overall survival and progression-free survival in patients with newly diagnosed H3 K27M-mutant diffuse glioma.\n   - **Eligibility:** Patients diagnosed with H3 K27M-mutant diffuse glioma who have completed frontline radiotherapy.\n   - **Status:** Recruiting\n\n4. **Trial Title:** Phase II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas\n   - **NCT ID:** NCT03224767\n   - **Sponsor:** Alliance for Clinical Trials in Oncology\n   - **Objective:** Evaluate the activity of BRAF and MEK inhibitor combination in untreated and previously treated papillary craniopharyngiomas.\n   - **Eligibility:** Patients with histologically proven papillary craniopharyngioma with a positive BRAF V600E mutation.\n   - **Status:** Recruiting\n\nThese trials aim to contribute to the understanding and treatment of various brain tumors, offering new insights and potential therapeutic options for patients.'), HumanMessage(content='Can you give me information about the Nivolumab trial you mentioned?')])


### Making a Chain Using LCEL

In [None]:
from typing import Any, Callable, Dict, Optional

import streamlit as st
from langchain_community.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.vectorstores.faiss import FAISS



from langchain_core.output_parsers import StrOutputParser

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever


In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.schema import format_document
from langchain_core.messages import get_buffer_string
from langchain.prompts.prompt import PromptTemplate
from template import CONDENSE_QUESTION_PROMPT, QA_PROMPT
from operator import itemgetter
import json
from langchain.llms import OpenAI

# Prompt used to render one retrieved document: just its page text.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def get_chain(vectorstore):
    """Build a runnable that condenses the conversation and fetches context.

    The chain reads ``chat_history`` from its input, flattens it to text with
    ``get_buffer_string``, and feeds the result through
    ``CONDENSE_QUESTION_PROMPT`` and the chat model to obtain a standalone
    question. That question is then sent to the vector store's retriever.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        A runnable whose output is a mapping with two keys: ``context`` (the
        retrieved documents joined into one string) and ``question`` (the
        standalone question). No answer-generation step is attached here.
    """
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1, api_key=api_key)

    def _join_documents(docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
        # Render every document with the prompt, then glue them together.
        return document_separator.join(
            format_document(doc, document_prompt) for doc in docs
        )

    # Stage 1: rewrite the latest question so it stands on its own.
    history_as_text = RunnablePassthrough.assign(
        chat_history=lambda payload: get_buffer_string(payload["chat_history"])
    )
    condense_question = (
        history_as_text | CONDENSE_QUESTION_PROMPT | llm | StrOutputParser()
    )
    standalone = RunnableParallel(standalone_question=condense_question)

    # Stage 2: retrieve documents for the standalone question and pass the
    # question itself along unchanged.
    retrieval = {
        "context": itemgetter("standalone_question")
        | vectorstore.as_retriever()
        | _join_documents,
        "question": itemgetter("standalone_question"),
    }

    return standalone | retrieval


In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.schema import format_document
from langchain_core.messages import get_buffer_string
from langchain.prompts.prompt import PromptTemplate
from operator import itemgetter
import json
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.llms import OpenAI

# Kept for parity with the earlier cell; the get_chain defined below does not
# reference it.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def get_chain(vectorstore):
    """Assemble a history-aware retrieval chain over ``vectorstore``.

    Two prompts share the same layout (system text, the ``chat_history``
    messages, then the human ``{input}``): one asks the model to rewrite the
    latest question as a standalone question, the other asks it to answer
    from the retrieved ``{context}``.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The runnable produced by ``create_retrieval_chain``; it is invoked
        with ``input`` and ``chat_history`` keys.
    """
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1, api_key=api_key)

    rewrite_instructions = """Given a chat history and the latest user question \
    which might reference context in the chat history, formulate a standalone question \
    which can be understood without the chat history, mention the required details in the question itself, also mention the NCT ID if present in the previous trial.\
    Just return the standalone question, Do NOT answer the question, \
    just reformulate it if needed and otherwise return it as is."""

    answer_instructions = """You are an assistant for question-answering tasks. \
    Use the following pieces of retrieved context to answer the question. \
    Use three sentences maximum and keep the answer concise.\

    {context}"""

    def _chat_prompt(system_text):
        # Both prompts differ only in their system message.
        return ChatPromptTemplate.from_messages(
            [
                ("system", system_text),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )

    # Retriever that first rewrites the question using the chat history.
    history_aware_retriever = create_history_aware_retriever(
        llm, vectorstore.as_retriever(), _chat_prompt(rewrite_instructions)
    )
    # Chain that stuffs the retrieved documents into the answering prompt.
    answer_chain = create_stuff_documents_chain(llm, _chat_prompt(answer_instructions))

    return create_retrieval_chain(history_aware_retriever, answer_chain)


In [None]:
# Embedding client used to query the saved index; `api_key` is defined in an
# earlier cell.
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
# Load the FAISS index stored under ../database/vectorDB/mainDB.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized (presumably pickled) data — only safe if that directory is
# trusted; confirm before pointing this at files from elsewhere.
vectorstore = FAISS.load_local("../database/vectorDB/mainDB", embeddings, allow_dangerous_deserialization=True)

# Build the chain with whichever get_chain definition was executed last.
rag_chain = get_chain(vectorstore)

In [None]:
import json

# Read the saved conversation. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's locale default.
with open("chat_history.json", "r", encoding="utf-8") as file:
    chat_history_json = file.read()

# Parsed form of the file; later cells read `role` / `content` from each entry.
chat_history = json.loads(chat_history_json)

In [None]:
# Display the value parsed from chat_history.json.
chat_history

In [None]:
from langchain.schema import AIMessage, HumanMessage

def convert_messages_list(messages_list):
    """Turn role/content dicts into LangChain message objects.

    Args:
        messages_list: Iterable of dicts; each is read via ``.get('role')``
            and ``.get("content")``.

    Returns:
        A list with one ``HumanMessage`` per ``"user"`` entry and one
        ``AIMessage`` per ``"assistant"`` entry, in input order. Entries with
        any other (or missing) role are left out.
    """

    def _as_message(entry):
        # Map a single dict to its message object, or None when the role is
        # not one of the two recognised values.
        speaker = entry.get('role')
        text = entry.get("content")
        if speaker == "user":
            return HumanMessage(content = text)
        if speaker == "assistant":
            return AIMessage(content = text)
        return None

    candidates = (_as_message(entry) for entry in messages_list)
    return [message for message in candidates if message is not None]

# Convert the loaded chat history into message objects for the chain input.
langchain_messages = convert_messages_list(chat_history)

In [None]:
# Inspect the converted messages.
print(langchain_messages)

In [None]:
question = "Can you suggest trials?"

# Streamed chunks are dicts and only some of them carry an "answer" piece.
# Select those by key rather than skipping a fixed number of leading chunks,
# so a chunk without "answer" can never raise KeyError.
for chunk in rag_chain.stream({"chat_history": langchain_messages, "input": question}):
    if "answer" in chunk:
        print(chunk["answer"])

In [None]:
# NOTE(review): `ai_msg` is not assigned in any of the surrounding cells; this
# raises NameError unless an earlier cell defined it — confirm or remove.
ai_msg

In [None]:
from langchain.schema import AIMessage, HumanMessage

# Two-turn sample conversation for trying out get_buffer_string.
messages = [
    HumanMessage(content="Hi, how are you?"),
    AIMessage(content="Good, how are you?"),
]
# Cell output: the message list passed through get_buffer_string
# (presumably a single text transcript — check the displayed value).
get_buffer_string(messages)

In [None]:
# Display the current value of chat_history.
chat_history

In [None]:
from langchain.memory import ChatMessageHistory

# Scratch message history populated with one user turn and one AI turn.
history = ChatMessageHistory()

history.add_user_message("hi!")

history.add_ai_message("whats up?")

In [None]:
# Display the messages stored on the scratch history object.
history.messages

In [None]:
# NOTE(review): `h` is not assigned in the surrounding cells (the history object
# built just before is named `history`) — presumably `history.messages` was
# intended; confirm. This also rebinds `chat_history`, replacing the value
# loaded from chat_history.json with whatever get_buffer_string returns.
chat_history = get_buffer_string(h)

In [None]:
# Display chat_history after the rebinding in the preceding cell.
chat_history

In [None]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import numpy as np
from openai import OpenAI

In [None]:
class Identify:
    """Decide whether a user question concerns clinical trials or studies.

    Typical use: construct with the conversation and the latest question, call
    ``identify_chain()`` to obtain the chat model's verdict, then call
    ``determine_answer()`` to turn that verdict into a boolean.
    """

    def __init__(self, chat_history, input):
        """Store the conversation and the question to classify.

        Args:
            chat_history: Value substituted for the ``chat_history`` messages
                placeholder of the classification prompt.
            input: Latest user question. The name shadows the builtin but is
                kept because callers pass it by keyword.
        """
        self.chat_history = chat_history
        self.input = input
        # Set by identify_chain(); None means no verdict has been fetched yet.
        self.result = None

    def identify_chain(self):
        """Ask the chat model for a verdict and store its text in ``self.result``.

        The model is instructed to reply with just "Yes" or "No". The reply's
        content is also printed.
        """
        # Each continued line must end in a bare backslash: any character
        # after it (even a space) leaks a literal backslash and a newline
        # into the prompt.
        system_prompt = """Given a chat history and the latest user question \
        which might reference context in the chat history,\
        please answer if the user question is related to clinical trials or studies?\
        Just answer "Yes" or "No"\
        """
        main_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1, api_key=api_key)
        qa_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )

        qa_chain = qa_prompt | main_model
        self.result = qa_chain.invoke(
            {"input": self.input, "chat_history": self.chat_history}
        ).content
        print(self.result)

    def cosine_similarity(self, a, b):
        """Return the cosine of the angle between vectors ``a`` and ``b``.

        Computed as ``dot(a, b) / (norm(a) * norm(b))``.
        """
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def determine_answer(self):
        """Convert the stored verdict into a boolean.

        A verdict that is literally "yes" or "no" (ignoring case, surrounding
        whitespace and trailing punctuation or quotes) is decided directly.
        Anything else is embedded together with "Yes" and "No" and assigned
        to whichever of the two it is closer to by cosine similarity.

        Returns:
            True when the verdict is (closest to) "Yes", otherwise False.

        Raises:
            RuntimeError: If ``identify_chain()`` has not produced a verdict.
        """
        if self.result is None:
            raise RuntimeError(
                "identify_chain() must be called before determine_answer()"
            )

        # Fast path: an unambiguous reply needs no embeddings request.
        if isinstance(self.result, str):
            verdict = self.result.strip().strip(".!\"'").casefold()
            if verdict == "yes":
                return True
            if verdict == "no":
                return False

        # NOTE(review): this uses the module-level `openai` client, which
        # presumably needs its API key configured separately from `api_key`
        # (e.g. via the environment) — confirm.
        embeddings = openai.embeddings.create(
            input=["Yes", "No", self.result], model="text-embedding-ada-002"
        )
        yes_vector = embeddings.data[0].embedding
        no_vector = embeddings.data[1].embedding
        verdict_vector = embeddings.data[2].embedding

        score_yes = self.cosine_similarity(yes_vector, verdict_vector)
        score_no = self.cosine_similarity(no_vector, verdict_vector)
        # bool() so callers get a plain Python bool, not a NumPy scalar.
        return bool(score_yes > score_no)

def question_relatable(chat_history, input):
    """Report whether the latest question is about clinical trials or studies.

    Runs the two steps of ``Identify`` in order: fetch the model's verdict,
    then convert it to a boolean.

    Args:
        chat_history: Conversation passed through to ``Identify``.
        input: Latest user question (name kept for keyword callers).

    Returns:
        The result of ``Identify.determine_answer()``.
    """
    classifier = Identify(chat_history, input)
    classifier.identify_chain()
    return classifier.determine_answer()

In [None]:
# Try the classifier on a sample question.
# NOTE(review): if the cells above ran in order, `chat_history` is no longer the
# list loaded from chat_history.json (it was rebound to the output of
# get_buffer_string) — confirm this is the value the prompt's chat_history
# placeholder should receive.
question_relatable(chat_history, input = "Can you tell me trials related to brain tumor?")