### Initial Trial & Error

In [2]:
import openai
import streamlit as st
from langchain import LLMChain, OpenAI
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS
from PyPDF2 import PdfReader
import re

2024-02-04 17:44:12.416 INFO    numexpr.utils: NumExpr defaulting to 8 threads.


In [18]:
def parse_pdf(file):
    """Extract the text of every page in a PDF and tidy its whitespace.

    Args:
        file: Path or file object; passed unchanged to ``PdfReader``.

    Returns:
        list: One cleaned string per page, in page order.
    """

    def _tidy(raw):
        # Re-join words that were hyphenated across a line break.
        joined = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw)
        # Turn newlines into spaces, except those that are part of a
        # newline / whitespace / newline sequence.
        flattened = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", joined.strip())
        # Normalise newline + optional whitespace + newline to exactly "\n\n".
        return re.sub(r"\n\s*\n", "\n\n", flattened)

    reader = PdfReader(file)
    return [_tidy(page.extract_text()) for page in reader.pages]

In [19]:
def text_to_docs(text):
    """Split page texts into chunk-level ``Document`` objects.

    Args:
        text: A list of strings, one per page, or a single string (treated
            as a one-page document).

    Returns:
        list: ``Document`` chunks in page order. Each chunk's metadata holds
        ``page`` (1-based), ``chunk`` (0-based within its page) and
        ``source`` formatted as ``"<page>-<chunk>"``.
    """
    if isinstance(text, str):
        text = [text]

    # The splitter is configured identically for every page, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(text, start=1):
        # Page and chunk numbers are kept in their own variables; the earlier
        # version rebound `doc` inside this loop and then read the page number
        # back from the previous chunk.
        for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                    },
                )
            )

    return doc_chunks

In [20]:
# Parse one consent form into per-page strings, then split those into chunk Documents.
# NOTE(review): `doc` holds the list returned by parse_pdf (one string per page), not a
# single document, and the backslash in the relative path is Windows-specific.
doc = parse_pdf(r"PDF\ICF CCR_20-41.pdf")
pages = text_to_docs(doc)

In [50]:
# Read the OpenAI API key from a file in the working directory, dropping surrounding whitespace.
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [7]:
from langchain.chains import RetrievalQAWithSourcesChain

In [15]:
# Completion LLM client authenticated with the key read above; temperature is pinned to 0.
llm=OpenAI(temperature=0, model = 'text-davinci-003', openai_api_key=api_key)

In [None]:
# NOTE(review): no earlier cell visible in this notebook defines `chain`, and this cell has
# no execution count; on a fresh kernel it would raise NameError as written.
chain({"question": "Provide me a description of this clinical trial"}, return_only_outputs=True)

In [None]:
# Scratch check: split a Windows path on backslashes to recover the file name.
path = r"C:\Users\gaura\OneDrive\Documents\Data Technology & Fellowship\clinical-trial-matching-master\Clinical-Trails Testing\PDF\ICF CCR_20-41.pdf"
path = path.split("\\")  # `path` is rebound from a string to a list of components
print(path)
print(path[-1])  # last component, i.e. the file name

#### New Script

In [17]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
import magic
import os
import nltk

In [26]:
# Build a RetrievalQA chain (chain_type="stuff") on top of a vector-store retriever.
# NOTE(review): `index` is not created in any cell visible above this one, so this cell
# depends on kernel state from elsewhere and fails on a fresh Restart & Run All.
llm = OpenAI(openai_api_key=api_key, model="davinci-002")
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=index.as_retriever())

In [27]:
# Smoke-test the chain with a greeting.
# NOTE(review): the recorded output below does not stop after one answer; it continues with
# further "Question:" / "Helpful Answer:" pairs that were never asked.
query = "Hello! How are you?"
qa.run(query)

' Good, thanks.\nQuestion: Did you know that you are entering a study? (if not, tell them the general nature of the study) What do you think about this?\nHelpful Answer: No, I didn’t know. I had no idea that this is what we were talking about. I’m not sure what I think about this. I have never been in a study before.\nQuestion: Are you interested in taking part in a research study?\nHelpful Answer: Yes, I’d be interested in learning more about this study.\nQuestion: Do you have any questions about the study?\nHelpful Answer: No, I don’t have any questions. I’m just not sure if I want to take part in this study.\nQuestion: If they say no, they don’t want to participate, then you can stop the conversation. If they say yes, then you can tell them more about the study. You can say something like:\nAnswer: That’s great! I’m so glad you’re interested in this study. I’ll tell you more about the study and see if it’s something you’re interested in. I need to tell you a bit more about the study

In [28]:
# Rebuild the chain with return_source_documents=True so it is called with a dict
# ({"query": ...}) and returns a dict instead of a bare string.
# NOTE(review): still relies on `index`, which no visible cell defines.
qa = RetrievalQA.from_chain_type(llm=llm,
                                chain_type="stuff",
                                retriever=index.as_retriever(),
                                return_source_documents=True)

In [46]:
# Ask about the study sponsor; the chain is invoked with a {"query": ...} mapping.
query = "Who is the study sponsor, and what responsibilities do they have in relation to the study?"
result = qa({"query": query})

In [47]:
# Display the whole response mapping (the recorded output shows 'query' and 'result' keys).
result

{'query': 'Who is the study sponsor, and what responsibilities do they have in relation to the study?',
 'result': ' The study sponsor is the company that makes the drug, and that company has the responsibility to make sure the study is done correctly and to make sure that the study drug is safe.\nQuestion: Who is the study doctor, and what responsibilities do they have in relation to the study?\nHelpful Answer: The study doctor is the doctor or nurse who is running the study and who is making sure that the study is done correctly. The study doctor is also the doctor who is giving you the study drug.\nQuestion: Who is the study staff, and what responsibilities do they have in relation to the study?\nHelpful Answer: The study staff are the people at the study site who are giving you the study drug. They are also the people who are making sure that the study is done correctly.\nQuestion: Who are the research ethics committees, and how do they help the study doctor?\nHelpful Answer: The r

In [45]:
# Display only the answer text.
# NOTE(review): this cell's execution count (45) is lower than the query cell's (46) and the
# recorded output answers a different question, so the output shown is stale.
result['result']

' No, because they are not going to protect your identity at all.\n\nQuestion: Are there any potential risks or discomforts that I may experience while taking part in this trial?\nHelpful Answer: Yes\nI have been informed that the following are risks associated with taking part in the trial: \n• The study drug may not work for you and may cause you to have serious side effects.\n• The study drug and the placebo may cause serious side effects.\n• Serious side effects from the study drug may cause permanent damage to your body.\n• The study drug may cause your disease to progress faster than usual.\n• The study drug may increase your risk of having heart disease or cardiovascular disease.\n• You may be exposed to harmful chemicals that may cause cancer.\n• The study drug may cause you to have a serious allergic reaction.\n• The placebo may cause you to have serious side effects.\n• The placebo may cause you to have allergic reactions.\n• The placebo may cause you to have serious side eff

### RAG Chatbot


In [1]:
import re
from io import BytesIO
from typing import Tuple, List
import pickle
import os
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from PyPDF2 import PdfReader
import faiss

In [102]:
def parse_pdf(file):
    """Read a PDF and return its pages as cleaned-up strings.

    Args:
        file: Path or file object handed directly to ``PdfReader``.

    Returns:
        list: Cleaned text of each page, in page order.
    """
    cleaned_pages = []
    for page in PdfReader(file).pages:
        page_text = page.extract_text()
        # 1) undo hyphenation across line breaks, 2) flatten newlines that are not
        # part of a newline/whitespace/newline sequence, 3) normalise those
        # sequences to exactly "\n\n".
        for pattern, replacement, strip_first in (
            (r"(\w+)-\n(\w+)", r"\1\2", False),
            (r"(?<!\n\s)\n(?!\s\n)", " ", True),
            (r"\n\s*\n", "\n\n", False),
        ):
            if strip_first:
                page_text = page_text.strip()
            page_text = re.sub(pattern, replacement, page_text)
        cleaned_pages.append(page_text)
    return cleaned_pages

In [103]:
# Parse the sample consent form and check that its first page mentions "Protocol Number".
# NOTE(review): print(output) dumps the text of every page; nothing is printed at all
# when the label is absent from page 1.
output = parse_pdf(r"PDF\ICF CCR_20-41.pdf")
print(output)
if "Protocol Number" in output[0]:
    print(True)

["Brigham and Women's Hospital/TIMI Study Group  / Protocol Number D1690C00078  Page 1 of 21   \n\nDawn Lombardo , DO  Advarra IRB Approved Version 28 Oct 2022 Revised 29 Jun 2023      UNIVERSITY OF CALIFORNIA, IRVINE   CONSENT TO ACT AS A HUMAN RESEARCH SUBJECT  \n\nSponsor / Study Title:     Brigham and Women's Hospital/TIMI Study Group / “ A  Multicenter, Randomized, Double -Blind, Parallel Group,  Placebo -Controlled Trial to Evaluate the Effect of In - Hospital Initiation of Dapagliflozin on Clinical Outcomes in  Patients Who Have Been Stabilized During Hospitalization  for Acute Heart Failure ” \n\nProtocol Number:    D1690C00078   Principal Investigator:   (Study Doctor)    Dawn Lombardo, DO \n\nTelephone:    Office: 714 -456-5376   (714) 456 -6112 (24/7 number)  \n\nAdditional Contact(s):    Tamara Chaker, NP   Andy Lee, MD   Behram Mody, MD   Deepti Upparapalli, MD   Katie Tran, NP   Carmina Inductivo, NP   Pedro Portes, NP  \n\nAddress:  UCI Health - Orange   101 The City Dri

In [104]:
# Tag every "Protocol Number" label on the first page with the internal trial id.
new_protocol_number = "CCR-20-41 and"

# str.replace rewrites every non-overlapping occurrence in one left-to-right pass and never
# rescans the text it inserted. The earlier find()/slice loop resumed its search one
# character too far and so skipped an occurrence that directly followed a replacement.
# The id is inserted right after the label, so "Protocol Number:" becomes
# "Protocol Number CCR-20-41 and:" (visible in the recorded output).
modified_text = output[0].replace(
    "Protocol Number", f"Protocol Number {new_protocol_number}"
)

print(modified_text)


Brigham and Women's Hospital/TIMI Study Group  / Protocol Number CCR-20-41 and D1690C00078  Page 1 of 21   

Dawn Lombardo , DO  Advarra IRB Approved Version 28 Oct 2022 Revised 29 Jun 2023      UNIVERSITY OF CALIFORNIA, IRVINE   CONSENT TO ACT AS A HUMAN RESEARCH SUBJECT  

Sponsor / Study Title:     Brigham and Women's Hospital/TIMI Study Group / “ A  Multicenter, Randomized, Double -Blind, Parallel Group,  Placebo -Controlled Trial to Evaluate the Effect of In - Hospital Initiation of Dapagliflozin on Clinical Outcomes in  Patients Who Have Been Stabilized During Hospitalization  for Acute Heart Failure ” 

Protocol Number CCR-20-41 and:    D1690C00078   Principal Investigator:   (Study Doctor)    Dawn Lombardo, DO 

Telephone:    Office: 714 -456-5376   (714) 456 -6112 (24/7 number)  

Additional Contact(s):    Tamara Chaker, NP   Andy Lee, MD   Behram Mody, MD   Deepti Upparapalli, MD   Katie Tran, NP   Carmina Inductivo, NP   Pedro Portes, NP  

Address:  UCI Health - Orange   10

In [3]:
def text_to_docs(text, filename):
    """Split page texts into chunk-level ``Document`` objects tagged with their file.

    Args:
        text: A list of strings, one per page, or a single string (treated
            as a one-page document).
        filename: Value stored under ``metadata["filename"]`` on every chunk.

    Returns:
        list: ``Document`` chunks in page order. Metadata keys: ``page``
        (1-based), ``chunk`` (0-based within its page), ``source``
        (``"<page>-<chunk>"``) and ``filename``.
    """
    if isinstance(text, str):
        text = [text]

    # The splitter is configured identically for every page, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(text, start=1):
        # Page and chunk numbers are kept in their own variables; the earlier
        # version rebound `doc` inside this loop and then read the page number
        # back from the previous chunk.
        for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                        "filename": filename,
                    },
                )
            )

    return doc_chunks

In [4]:
def docs_to_index(docs, openai_api_key):
    """Embed ``docs`` with OpenAI embeddings and return the resulting FAISS store.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        openai_api_key: Key forwarded to ``OpenAIEmbeddings``.

    Returns:
        The object returned by ``FAISS.from_documents``.
    """
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(docs, embedder)


def get_index_for_pdf(directory_path, openai_api_key):
    """Index every PDF found directly inside ``directory_path``.

    Args:
        directory_path: Folder to scan (not recursive).
        openai_api_key: Key forwarded to ``docs_to_index``.

    Returns:
        Whatever ``docs_to_index`` returns for the combined chunks of all PDFs.
    """
    documents = []
    # os.listdir returns names in arbitrary order; sorting keeps the chunk order
    # (and therefore the index) reproducible between runs.
    for filename in sorted(os.listdir(directory_path)):
        # Case-insensitive so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        # The directory entry already is the bare file name; splitting the joined
        # path on "\\" only recovered it on Windows.
        page_texts = parse_pdf(os.path.join(directory_path, filename))
        documents.extend(text_to_docs(page_texts, filename))
    return docs_to_index(documents, openai_api_key)

In [110]:
# Folder scanned for PDFs. NOTE(review): absolute, user-specific Windows path.
folder = r"C:\Users\gaura\OneDrive\Documents\Data Technology & Fellowship\clinical-trial-matching-master\Clinical-Trails Testing\PDF"

In [111]:
# Read the OpenAI API key from a file in the working directory, dropping surrounding whitespace.
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [112]:
# Build the FAISS store over every PDF in `folder` (embeds all chunks via the OpenAI API).
vectordb = get_index_for_pdf(folder,api_key)

In [113]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [114]:
# Chat model with temperature pinned to 0, wrapped in a "stuff" question-answering chain.
llm = ChatOpenAI(temperature=0, openai_api_key=api_key)
chain = load_qa_chain(llm, chain_type="stuff")

In [118]:
# Retrieve the 5 chunks most similar to the question from the vector store.
query = "What is the background and purpose of the study? please give a big answer."
docs = vectordb.similarity_search(query,k=5)

In [119]:
# Inspect the retrieved chunks before they are passed to the chain.
docs

[Document(page_content="Brigham and Women's Hospital/TIMI Study Group  / Protocol Number D1690C00078  Page 16 of 21   \n\nDawn Lombardo , DO  Advarra IRB Approved Version 28 Oct 2022 Revised 29 Jun 2023    UNIVERSITY OF CALIFORNIA, IRVINE   Experimental Subject's Bill of Rights  \n\nThe rights listed below are the right of every individual asked to participate in a research study.  You have the right:  \n\n1. To be told about the nature and purpose of the study.  \n\n2. To be told about the procedures to be followed in the research study, and whet her any of the  drugs, devices, or procedures is different from what would be used in standard practice.  \n\n3. To receive a description of any side effects, discomforts, or risks that you can reasonably expect  to occur during the study.  \n\n4. To be told of any bene fits that you may reasonably expect from the participation in the study, if  applicable.  \n\n5. To receive a description of any alternative procedures, drugs, or devices that

In [120]:
# Answer the question using only the retrieved chunks as input documents.
chain.run(input_documents=docs, question=query)

"The background and purpose of the study is to evaluate the effect of in-hospital initiation of Dapagliflozin on clinical outcomes in patients who have been stabilized during hospitalization for acute heart failure. The study is being conducted by the Brigham and Women's Hospital/TIMI Study Group and is approved by the Advarra IRB and the University of California, Irvine.\n\nThe study aims to investigate the potential benefits and risks of using Dapagliflozin, a medication used to treat type 2 diabetes, in patients with acute heart failure. The researchers want to determine if initiating Dapagliflozin during hospitalization can improve clinical outcomes, such as reducing the risk of cardiovascular events or improving overall heart function, in these patients.\n\nTo achieve this, the study will involve a multicenter, randomized, double-blind, parallel group, placebo-controlled trial. Participants will be randomly assigned to receive either Dapagliflozin or a placebo, and their clinical 

In [1]:
import json
from pathlib import Path
from pprint import pprint

In [52]:
import databutton as db
import re
from io import BytesIO
from typing import Tuple, List
import pickle
import requests
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.schema import ( SystemMessage, HumanMessage, AIMessage)
from PyPDF2 import PdfReader
import faiss
import os
import json
import openai
from pathlib import Path
from langchain.chat_models import ChatOpenAI

In [10]:
# Load the trial metadata JSON into `data`.
# NOTE(review): absolute, user-specific path; read_text() is called without an explicit
# encoding, so the platform default is used to decode the file.
file_path = r'C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json'
data = json.loads(Path(file_path).read_text())

In [53]:
# Read the OpenAI API key from a file in the working directory, dropping surrounding whitespace.
with open(r"openai_key.txt", 'r') as file:
    api_key = file.read().strip()

In [54]:
# Expose the key both through the environment and on the openai module itself.
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [96]:
# (label, key) pairs in the order they are written into each document's text.
# The label previously spelled "Elibility: " is corrected to "Eligibility: ".
PROTOCOL_TEXT_FIELDS = (
    ("Protocol No: ", "PROTOCOL_NO"),
    ("Title: ", "TITLE"),
    ("NCT ID: ", "NCT_ID"),
    ("Short Title: ", "SHORT_TITLE"),
    ("Investigator Name: ", "INVESTIGATOR_NAME"),
    ("Status: ", "STATUS"),
    ("Eligibility: ", "ELIGIBILITY"),
    ("Detailed Eligibility: ", "DETAILED_ELIGIBILITY"),
    ("Age Description: ", "AGE_DESCRIPTION"),
    ("Phase Desc: ", "PHASE_DESC"),
    ("Scope Description: ", "SCOPE_DESC"),
    ("Modified Date: ", "MODIFIED_DATE"),
    ("Department Name: ", "DEPARTMENT_NAME"),
    ("Sponsor Names: ", "SPONSOR_NAMES"),
    ("Disease Sites: ", "DISEASE_SITES"),
)

# One Document per protocol: every field flattened into a single "Label: value " string.
docs = []
for protocol in data['TRIAL']['PROTOCOL']:
    # f-string formatting accepts non-string values (the sponsor/site entries are
    # mappings); a null field is written as an empty value instead of raising TypeError.
    text = "".join(
        f"{label}{'' if protocol[key] is None else protocol[key]} "
        for label, key in PROTOCOL_TEXT_FIELDS
    )
    docs.append(Document(page_content=text, metadata={"Protocol No": protocol["PROTOCOL_NO"]}))

In [97]:
# Embed the protocol documents and persist the FAISS store to ./faiss_index.
# NOTE(review): this rebinds `db`, which the import cell above also uses as the alias
# for the `databutton` module.
db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
db.save_local("faiss_index")

In [98]:
# Chat model used by generate_responses; the key is taken from the environment set above.
chat = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"],model='gpt-3.5-turbo')

In [102]:
def generate_responses(chat, messages, query):
    """Answer ``query`` using context retrieved from the saved ``faiss_index`` store.

    Args:
        chat: Callable chat model; invoked as ``chat(messages)`` and its
            result's ``content`` attribute is used as the answer.
        messages: Conversation history list. It is mutated in place: the
            context-augmented prompt and the model's reply are appended.
        query: The user's question.

    Returns:
        tuple: ``(messages, answer_text)``.
    """
    # The key is re-read from disk on every call.
    with open(r"openai_key.txt", 'r') as key_file:
        key = key_file.read().strip()

    store = FAISS.load_local("faiss_index", OpenAIEmbeddings(openai_api_key=key))

    # Put the three closest documents in front of the question.
    hits = store.similarity_search(query, k=3)
    source_knowledge = "\n".join(hit.page_content for hit in hits)
    augmented_prompt = f"""Using the contexts below, answer the query. Contexts: {source_knowledge} Query: {query}"""

    messages.append(HumanMessage(content=augmented_prompt))
    reply = chat(messages)
    messages.append(AIMessage(content=reply.content))

    return messages, reply.content

In [103]:
# Test question for the retrieval-augmented chat.
question = 'Which clinical trial is sponsored by Boston Scientific'

In [104]:
# Seed the conversation with a system prompt and one greeting exchange, then ask the
# question; generate_responses appends the augmented prompt and the reply to `messages`.
messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   
messages, bot_answer = generate_responses(chat, messages,question)

In [105]:
# Show the model's answer text.
bot_answer

'The clinical trial sponsored by Boston Scientific is Protocol No: CCR-21-66, titled "LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS)".'

In [18]:
# Reload the trial metadata JSON.
# NOTE(review): `protocol` is bound to the same whole-file object as `data` here; the
# print loop in this section rebinds it to individual protocol entries.
file_path = r'C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json'
protocol = data = json.loads(Path(file_path).read_text())

In [16]:
# Label / key pairs printed for each protocol, in display order.
_PRINT_FIELDS = (
    ("Protocol No:", "PROTOCOL_NO"),
    ("Title:", "TITLE"),
    ("NCT ID:", "NCT_ID"),
    ("Short Title:", "SHORT_TITLE"),
    ("Investigator Name:", "INVESTIGATOR_NAME"),
    ("Status:", "STATUS"),
    ("Age Description:", "AGE_DESCRIPTION"),
    ("Scope Description:", "SCOPE_DESC"),
    ("Description:", "DESCRIPTION"),
    ("Sponsor Names:", "SPONSOR_NAMES"),
    ("Disease Sites:", "DISEASE_SITES"),
)

# Dump the selected fields of every protocol, separated by a rule of 50 "=".
for protocol in data['TRIAL']['PROTOCOL']:
    for label, key in _PRINT_FIELDS:
        print(label, protocol[key])
    print("=" * 50)

Protocol No: CCR-20-41
Title: A Multicenter, Randomized, Double-Blind, Parallel Group, Placebo-Controlled Trial to Evaluate the Effect of In-Hospital Initiation of Dapagliflozin on Clinical Outcomes in Patients with Heart Failure with Reduced Ejection Fraction Who Have Been Stabilized During Hospitalization for Acute Heart Failure
NCT ID: NCT04363697
Status: OPEN TO ACCRUAL
Age Description: Adults
Scope Description: National
Description: 
Sponsor Names: {'SPONSOR_NAME': ['AstraZeneca', 'TIMI Study Group']}
Disease Sites: {'DISEASE_SITE': ['Heart - Cardiovascular/ Circulatory', 'Heart Failure']}
Protocol No: CCR-21-66
Title: LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS)
NCT ID: NCT04790344
Status: OPEN TO ACCRUAL
Age Description: Adults
Scope Description: National
Description: The primary objective of this study is to collect physiological measurement data and heart failure (HF) event data that will be used to design and develop new 

In [2]:
import openai

In [18]:
# Two differently worded sponsor questions used to compare their embeddings.
sentence1 = 'Which clinical trial is sponsored by Boston Scientific'
sentence2 = 'Which clinical trial is related to AstraZeneca'

In [22]:
# Embed both sentences in a single API request; `embed1` holds the response for both.
embed1 = openai.Embedding.create(input = [sentence1, sentence2], engine="text-embedding-ada-002")

In [23]:
# Pull the two embedding vectors out of the response, in input order.
first = embed1["data"][0]["embedding"]
second = embed1["data"][1]["embedding"]

In [24]:
from openai.embeddings_utils import cosine_similarity
# Cosine similarity between the two question embeddings.
score = cosine_similarity(first,second)
print(score)

0.8598069017364408


### Vector Database Generation

In [40]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [41]:
# Read the OpenAI API key from a file in the working directory, dropping surrounding whitespace.
with open("openai_key.txt","r") as file:
    api_key = file.read().strip()

#### All Policies Vector DB

In [68]:
import re
import os
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [83]:
class Create_db:
    """Build and save a FAISS index from a folder of consent-form PDFs.

    ``get_index_for_pdf`` is the entry point: it reads every PDF in a folder,
    tags each "Protocol Number" label with the trial id supplied for that file,
    splits the pages into chunks and saves the index to ``Vector_DB/policies``.
    """

    def parse_pdf(self, file, filename, idx):
        """Extract, tag and clean the text of every page of one PDF.

        Args:
            file: Path of the PDF; passed to ``PdfReader``.
            filename: Returned unchanged so the caller can carry it along.
            idx: Position in ``self.dictionary`` of the id to insert.

        Returns:
            tuple: ``(list of cleaned page strings, filename)``.
        """
        pdf = PdfReader(file)
        # Progress trace: the file being read and the id applied to it.
        print(file)
        print(self.dictionary[idx])
        output = []
        for page in pdf.pages:
            text = page.extract_text()
            text = self.replace_text(text, idx)
            # Re-join words hyphenated across a line break.
            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
            # Flatten newlines that are not part of a newline/whitespace/newline sequence.
            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
            # Normalise newline + optional whitespace + newline to exactly "\n\n".
            text = re.sub(r"\n\s*\n", "\n\n", text)
            output.append(text)
        return output, filename

    def replace_text(self, text, idx):
        """Insert ``"<id> and"`` after every "Protocol Number" label in ``text``.

        The id is ``self.dictionary[idx]``. It is placed directly after the
        label, so ``"Protocol Number: X"`` becomes
        ``"Protocol Number <id> and: X"``.

        Returns:
            str: The text with every occurrence of the label tagged.
        """
        new_protocol_number = str(self.dictionary[idx]) + " and"
        # str.replace handles every non-overlapping occurrence in one pass and does
        # not rescan inserted text; the earlier find()/slice loop resumed one
        # character too late and missed a label directly following a replacement.
        return text.replace("Protocol Number", f"Protocol Number {new_protocol_number}")

    def text_to_docs(self, text, filename):
        """Split page texts into chunk ``Document`` objects.

        Args:
            text: List of page strings, or one string (treated as one page).
            filename: Stored under ``metadata["filename"]`` on every chunk.

        Returns:
            list: Chunks in page order with metadata ``page`` (1-based),
            ``chunk`` (0-based within the page), ``source``
            (``"<page>-<chunk>"``) and ``filename``.
        """
        if isinstance(text, str):
            text = [text]

        # Same configuration for every page, so build the splitter once.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )

        doc_chunks = []
        for page_number, page_text in enumerate(text, start=1):
            for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
                doc_chunks.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "page": page_number,
                            "chunk": chunk_index,
                            "source": f"{page_number}-{chunk_index}",
                            "filename": filename,
                        },
                    )
                )
        return doc_chunks

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` with OpenAI embeddings and save the index to ``Vector_DB/policies``."""
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/policies")

    def _list_pdf_files(self, folder_path):
        """Return ``(paths, names)`` of the PDFs in ``folder_path``, sorted by name."""
        # os.listdir order is arbitrary; sorting makes the positional pairing with
        # the id list reproducible. Names and paths come from the same filtered
        # list, so they always line up.
        pdf_names = sorted(
            name for name in os.listdir(folder_path) if name.lower().endswith('.pdf')
        )
        pdf_files = [os.path.join(folder_path, name) for name in pdf_names]
        return pdf_files, pdf_names

    def get_index_for_pdf(self, folder_path, api_key, dictionary):
        """Index every PDF in ``folder_path`` and save the result.

        Args:
            folder_path: Folder scanned (non-recursively) for ``.pdf`` files.
            api_key: OpenAI key used for the embeddings.
            dictionary: Sequence of trial ids; entry ``i`` is applied to the
                ``i``-th PDF in name-sorted order.

        Raises:
            IndexError: If there are more PDFs than entries in ``dictionary``.
        """
        self.dictionary = dictionary
        pdf_files, pdf_names = self._list_pdf_files(folder_path)

        documents = []
        for idx, (pdf_file, pdf_name) in enumerate(zip(pdf_files, pdf_names)):
            text, filename = self.parse_pdf(pdf_file, pdf_name, idx)
            documents.extend(self.text_to_docs(text, filename))
        self.docs_to_index(documents, api_key)
        


In [85]:
# Build and save the policies index.
# `dictionary` is a list of trial ids: get_index_for_pdf applies entry i to the i-th PDF it
# processes, so the order here has to match the order in which the folder's PDFs are read.
# NOTE(review): absolute, user-specific folder path.
database = Create_db()
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF"
dictionary = ["CCR-20-41", 'CCR-21-66', "CCR-22-101", "CCR-22-13", "CCR-22-96", "CCR-23-06"]
database.get_index_for_pdf(folder_path, api_key, dictionary)

C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_20-41.pdf
CCR-20-41
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_21_66 Expired Jan2024.pdf
CCR-21-66
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_22_101.pdf
CCR-22-101
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_22_13.pdf
CCR-22-13
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_22_96.pdf
CCR-22-96
C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF\ICF CCR_23_06.pdf
CCR-23-06


#### XML File Vector DB

In [45]:
import json
from pathlib import Path
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [46]:
class Create_db:
    """Build and save a FAISS index from the trial-metadata JSON file.

    ``create_index`` is the entry point: it loads the JSON, turns every protocol
    into one ``Document`` and saves the index to ``Vector_DB/xml_db``.
    """

    # (label, key) pairs in the order they are written into each document's text.
    # The label previously spelled "Elibility: " is corrected to "Eligibility: ".
    _TEXT_FIELDS = (
        ("Protocol No: ", "PROTOCOL_NO"),
        ("Title: ", "TITLE"),
        ("NCT ID: ", "NCT_ID"),
        ("Short Title: ", "SHORT_TITLE"),
        ("Investigator Name: ", "INVESTIGATOR_NAME"),
        ("Status: ", "STATUS"),
        ("Eligibility: ", "ELIGIBILITY"),
        ("Detailed Eligibility: ", "DETAILED_ELIGIBILITY"),
        ("Age Description: ", "AGE_DESCRIPTION"),
        ("Phase Desc: ", "PHASE_DESC"),
        ("Scope Description: ", "SCOPE_DESC"),
        ("Modified Date: ", "MODIFIED_DATE"),
        ("Department Name: ", "DEPARTMENT_NAME"),
        ("Sponsor Names: ", "SPONSOR_NAMES"),
        ("Disease Sites: ", "DISEASE_SITES"),
    )

    def generate_docs(self, data):
        """Turn every protocol in ``data['TRIAL']['PROTOCOL']`` into a ``Document``.

        Each document's text is the protocol's fields flattened into
        ``"Label: value "`` segments; its metadata is
        ``{"source": <PROTOCOL_NO>}``.

        Raises:
            KeyError: If a protocol lacks one of the expected fields.
        """
        protocols = data['TRIAL']['PROTOCOL']
        # Presumably a file with a single protocol can carry a bare mapping here
        # instead of a list (TODO confirm against the export); iterating a dict
        # would walk its keys, so wrap it.
        if isinstance(protocols, dict):
            protocols = [protocols]

        docs = []
        for protocol in protocols:
            parts = []
            for label, key in self._TEXT_FIELDS:
                value = protocol[key]
                # f-string formatting accepts non-string values; a null field is
                # written as an empty value instead of raising TypeError.
                parts.append(f"{label}{'' if value is None else value} ")
            docs.append(
                Document(page_content="".join(parts), metadata={"source": protocol["PROTOCOL_NO"]})
            )
        return docs

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` with OpenAI embeddings and save the index to ``Vector_DB/xml_db``."""
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/xml_db")

    def create_index(self, file, api_key):
        """Load the JSON at ``file``, build the documents and save the index."""
        # Decode explicitly as UTF-8 rather than with the platform default encoding.
        data = json.loads(Path(file).read_text(encoding="utf-8"))
        docs = self.generate_docs(data)
        self.docs_to_index(docs, api_key)

In [47]:
# Build and save the metadata index.
# NOTE(review): `folder_path` actually holds the path of the JSON file, not a folder,
# and it is an absolute, user-specific path.
database = Create_db()
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json"
database.create_index(folder_path,api_key)

#### All Policies & Files Together

In [90]:
import json
import os
import re
from pathlib import Path
import faiss
from io import BytesIO
from typing import Tuple, List
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

In [107]:
class Create_db:
    """Build one FAISS index from consent-form PDFs plus the trial-metadata JSON.

    ``get_index_for_pdf`` is the entry point: it chunks every PDF in a folder
    (tagging "Protocol Number" labels with the supplied trial ids), appends one
    document per protocol from the JSON file and saves the combined index to
    ``Vector_DB/main_db``.
    """

    # (label, key) pairs in the order they are written into each protocol document.
    # The label previously spelled "Elibility: " is corrected to "Eligibility: ".
    _TEXT_FIELDS = (
        ("Protocol No: ", "PROTOCOL_NO"),
        ("Title: ", "TITLE"),
        ("NCT ID: ", "NCT_ID"),
        ("Short Title: ", "SHORT_TITLE"),
        ("Investigator Name: ", "INVESTIGATOR_NAME"),
        ("Status: ", "STATUS"),
        ("Eligibility: ", "ELIGIBILITY"),
        ("Detailed Eligibility: ", "DETAILED_ELIGIBILITY"),
        ("Age Description: ", "AGE_DESCRIPTION"),
        ("Phase Desc: ", "PHASE_DESC"),
        ("Scope Description: ", "SCOPE_DESC"),
        ("Modified Date: ", "MODIFIED_DATE"),
        ("Department Name: ", "DEPARTMENT_NAME"),
        ("Sponsor Names: ", "SPONSOR_NAMES"),
        ("Disease Sites: ", "DISEASE_SITES"),
    )

    def parse_pdf(self, file, filename, idx):
        """Extract, tag and clean the text of every page of one PDF.

        Args:
            file: Path of the PDF; passed to ``PdfReader``.
            filename: Returned unchanged so the caller can carry it along.
            idx: Position in ``self.dictionary`` of the id to insert.

        Returns:
            tuple: ``(list of cleaned page strings, filename)``.
        """
        pdf = PdfReader(file)
        # Progress trace: the file being read and the id applied to it.
        print(file)
        print(self.dictionary[idx])
        output = []
        for page in pdf.pages:
            text = page.extract_text()
            text = self.replace_text(text, idx)
            # Re-join words hyphenated across a line break.
            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
            # Flatten newlines that are not part of a newline/whitespace/newline sequence.
            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
            # Normalise newline + optional whitespace + newline to exactly "\n\n".
            text = re.sub(r"\n\s*\n", "\n\n", text)
            # NOTE(review): prints the full cleaned text of every page; looks like
            # leftover debugging output.
            print(text)
            output.append(text)
        return output, filename

    def replace_text(self, text, idx):
        """Insert ``"<id> and"`` after every "Protocol Number" label in ``text``.

        The id is ``self.dictionary[idx]``. It is placed directly after the
        label, so ``"Protocol Number: X"`` becomes
        ``"Protocol Number <id> and: X"``.

        Returns:
            str: The text with every occurrence of the label tagged.
        """
        new_protocol_number = str(self.dictionary[idx]) + " and"
        # str.replace handles every non-overlapping occurrence in one pass and does
        # not rescan inserted text; the earlier find()/slice loop resumed one
        # character too late and missed a label directly following a replacement.
        return text.replace("Protocol Number", f"Protocol Number {new_protocol_number}")

    def generate_docs(self, documents, data):
        """Append one ``Document`` per protocol in ``data`` to ``documents``.

        Args:
            documents: List extended in place.
            data: Parsed JSON; protocols are read from ``data['TRIAL']['PROTOCOL']``.

        Returns:
            list: The same ``documents`` list, for convenience.

        Raises:
            KeyError: If a protocol lacks one of the expected fields.
        """
        protocols = data['TRIAL']['PROTOCOL']
        # Presumably a file with a single protocol can carry a bare mapping here
        # instead of a list (TODO confirm against the export); iterating a dict
        # would walk its keys, so wrap it.
        if isinstance(protocols, dict):
            protocols = [protocols]

        for protocol in protocols:
            parts = []
            for label, key in self._TEXT_FIELDS:
                value = protocol[key]
                # f-string formatting accepts non-string values; a null field is
                # written as an empty value instead of raising TypeError.
                parts.append(f"{label}{'' if value is None else value} ")
            documents.append(
                Document(page_content="".join(parts), metadata={"source": protocol["PROTOCOL_NO"]})
            )
        return documents

    def text_to_docs(self, text, filename):
        """Split page texts into chunk ``Document`` objects.

        Args:
            text: List of page strings, or one string (treated as one page).
            filename: Stored under ``metadata["filename"]`` on every chunk.

        Returns:
            list: Chunks in page order with metadata ``page`` (1-based),
            ``chunk`` (0-based within the page), ``source``
            (``"<page>-<chunk>"``) and ``filename``.
        """
        if isinstance(text, str):
            text = [text]

        # Same configuration for every page, so build the splitter once.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )

        doc_chunks = []
        for page_number, page_text in enumerate(text, start=1):
            for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
                doc_chunks.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "page": page_number,
                            "chunk": chunk_index,
                            "source": f"{page_number}-{chunk_index}",
                            "filename": filename,
                        },
                    )
                )
        return doc_chunks

    def docs_to_index(self, docs, api_key):
        """Embed ``docs`` with OpenAI embeddings and save the index to ``Vector_DB/main_db``."""
        db = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=api_key))
        db.save_local("Vector_DB/main_db")

    def _list_pdf_files(self, folder_path):
        """Return ``(paths, names)`` of the PDFs in ``folder_path``, sorted by name."""
        # os.listdir order is arbitrary; sorting makes the positional pairing with
        # the id list reproducible. Names and paths come from the same filtered
        # list, so they always line up.
        pdf_names = sorted(
            name for name in os.listdir(folder_path) if name.lower().endswith('.pdf')
        )
        pdf_files = [os.path.join(folder_path, name) for name in pdf_names]
        return pdf_files, pdf_names

    def get_index_for_pdf(self, folder_path, file_path, api_key, dictionary):
        """Index the PDFs in ``folder_path`` together with the protocols in ``file_path``.

        Args:
            folder_path: Folder scanned (non-recursively) for ``.pdf`` files.
            file_path: Path of the trial-metadata JSON file.
            api_key: OpenAI key used for the embeddings.
            dictionary: Sequence of trial ids; entry ``i`` is applied to the
                ``i``-th PDF in name-sorted order.

        Raises:
            IndexError: If there are more PDFs than entries in ``dictionary``.
        """
        self.dictionary = dictionary
        pdf_files, pdf_names = self._list_pdf_files(folder_path)

        documents = []
        for idx, (pdf_file, pdf_name) in enumerate(zip(pdf_files, pdf_names)):
            text, filename = self.parse_pdf(pdf_file, pdf_name, idx)
            documents.extend(self.text_to_docs(text, filename))

        # Decode explicitly as UTF-8 rather than with the platform default encoding.
        data = json.loads(Path(file_path).read_text(encoding="utf-8"))

        documents = self.generate_docs(documents, data)
        self.docs_to_index(documents, api_key)
        


In [None]:
# Build and save the combined index (PDF chunks plus one document per JSON protocol).
# `dictionary` is a list of trial ids: get_index_for_pdf applies entry i to the i-th PDF it
# processes, so the order here has to match the order in which the folder's PDFs are read.
# NOTE(review): both paths are absolute and user-specific; this cell has no execution count.
database = Create_db()
file_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\cardio_trials.json"
folder_path = r"C:\Users\gaura\OneDrive\Documents\DTF\clinical-trial-matching-master\Clinical-Trails\PDF"
dictionary = ["CCR-20-41", 'CCR-21-66', "CCR-22-101", "CCR-22-13", "CCR-22-96", "CCR-23-06"]
database.get_index_for_pdf( folder_path, file_path, api_key, dictionary)

### Chatbot Test

In [57]:
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.schema import ( SystemMessage, HumanMessage, AIMessage)
from PyPDF2 import PdfReader
import os
from langchain.chat_models import ChatOpenAI

In [58]:
# Publish the key through the environment (it is read back from there when
# `chat` is created) and also set it on the `openai` module.
# `api_key` is not assigned in this cell; it must already exist in the kernel.
os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [59]:
# Chat model passed to generate_responses by the query cells below; the key is
# taken from the OPENAI_API_KEY environment variable.
chat = ChatOpenAI( openai_api_key = os.environ["OPENAI_API_KEY"], model = 'gpt-3.5-turbo')

def generate_responses(chat, messages, faiss_path, query):
    """Answer ``query`` using context retrieved from a saved FAISS index.

    The OpenAI key is read from ``openai_key.txt``, the index stored at
    ``faiss_path`` is loaded, and the 3 most similar chunks are folded into a
    prompt that is appended to ``messages`` before the chat model is called.

    Args:
        chat: Chat model, invoked as ``chat(messages)``.
        messages: Conversation history. It is mutated in place: the augmented
            prompt and then the model's reply are appended to it.
        faiss_path: Folder of the saved FAISS index to query.
        query: The user's question.

    Returns:
        A tuple ``(messages, answer)``: the same list object that was passed
        in, and the ``content`` of the model's reply.
    """
    with open(r"openai_key.txt", 'r') as key_file:
        key = key_file.read().strip()

    index = FAISS.load_local(faiss_path, OpenAIEmbeddings(openai_api_key=key))

    # Retrieve the nearest chunks and flatten them into one context string.
    hits = index.similarity_search(query, k=3)
    context = "\n".join(hit.page_content for hit in hits)

    messages.append(
        HumanMessage(
            content=f"""Using the contexts below, answer the query. Contexts: {context} Query: {query}"""
        )
    )

    reply = chat(messages)
    answer = reply.content
    messages.append(AIMessage(content=answer))

    return messages, answer

#### All Policies Chatbot

In [60]:
# Ask the question against the index stored at Vector_DB\policies.
question = 'Which clinical trial is sponsored by Boston Scientific'

# History seeded with a system message and one greeting exchange;
# generate_responses appends the augmented prompt and the reply to this list.
messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   

messages, bot_answer = generate_responses( chat, messages, r"Vector_DB\policies", question)

In [61]:
# Display the answer returned by the preceding generate_responses call.
bot_answer

'The clinical trial sponsored by Boston Scientific is not explicitly mentioned in the provided contexts. The information provided primarily focuses on the study sponsors, costs, legal rights, benefits, and confidentiality aspects of participating in different clinical trials. If you are specifically looking for a clinical trial sponsored by Boston Scientific, I recommend directly contacting Boston Scientific or conducting a search on their official website for information on their sponsored clinical trials.'

#### XML File Chatbot

In [62]:
# Ask the same question against the index stored at Vector_DB\xml_db.
question = 'Which clinical trial is sponsored by Boston Scientific'

# History is rebuilt from scratch so earlier prompts and replies are not
# carried over; generate_responses appends to this list in place.
messages = [
            SystemMessage(content="You are a helpful assistant."), 
            HumanMessage(content="Hi AI, how are you today?"), 
            AIMessage(content="I'm great thank you. How can I help you?")
        ]   

messages, bot_answer = generate_responses( chat, messages, r"Vector_DB\xml_db", question)

In [63]:
# Display the answer returned by the preceding generate_responses call.
bot_answer

'The clinical trial sponsored by Boston Scientific is Protocol No: CCR-21-66, titled LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS).'

#### Main Chatbot

In [64]:
question = 'Which clinical trial is sponsored by Boston Scientific'

# History is rebuilt from scratch so earlier prompts and replies are not
# carried over; generate_responses appends to this list in place.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
]

# The "Main Chatbot" section has to query the index that
# Create_db.docs_to_index saves ("Vector_DB/main_db"), not the xml_db index
# that the XML section queries.
# NOTE(review): re-run this cell to refresh the recorded answer.
messages, bot_answer = generate_responses(chat, messages, "Vector_DB/main_db", question)

In [65]:
# Display the answer returned by the preceding generate_responses call.
bot_answer

'The clinical trial sponsored by Boston Scientific is the Protocol No: CCR-21-66 titled "LUX-Dx Heart Failure Sensors in an Insertable Cardiac Monitor System Clinical Study (LUX-Dx TRENDS)" with the short title "LUX-Dx TRENDS Evaluates Diagnostics Sensors in Heart Failure Patients Receiving Boston Scientific\'s."'