### Prepare Notebook
Install dependencies, import packages, and set the credentials for the OpenAI API and Activeloop (Deep Lake).

In [None]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel.
# Each package is listed exactly once.
%pip install langchain tiktoken deeplake
%pip install langchain-community langchain-core
%pip install openai langchain_openai
%pip install streamlit streamlit_chat
%pip install faiss-cpu
%pip install selenium webdriver_manager google_colab_selenium

In [None]:
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter

import openai

In [None]:
import os
import time
from typing import List, Tuple, Dict
from urllib.parse import quote_plus, urljoin

import numpy as np
import requests
from bs4 import BeautifulSoup
import google_colab_selenium as gs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

import streamlit as st
from streamlit_chat import message

In [None]:
from getpass import getpass

# Never store real credentials in the notebook: keep any value that is already
# present in the environment and otherwise prompt for it without echoing input.
for _credential in ("OPENAI_API_KEY", "ACTIVELOOP_TOKEN"):
    if not os.environ.get(_credential):
        os.environ[_credential] = getpass(f"Enter {_credential}: ")

### Parse Medical Data
Medscape is used as the data source: articles are found through its search page and then downloaded.

In [None]:
def fetch_medscape_content(query: str, doucment_observe: int = 10) -> List[Tuple[str, str]]:
    """Search Medscape for ``query`` and download the text of the top results.

    Args:
        query: Free-text search phrase.
        doucment_observe: Maximum number of search results to download. The
            misspelled name is kept so existing keyword callers keep working.

    Returns:
        A list of ``(page_text, url)`` tuples, one per article that was
        downloaded successfully. Empty when the search page yields no links.
    """
    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&', '#' or '"' that would otherwise corrupt the query string.
    search_url = f"https://www.medscape.com/search/?q={quote_plus(query)}&plr=ref&page=1"

    driver = gs.Chrome()
    try:
        driver.get(search_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Release the browser even when loading or parsing fails.
        driver.quit()

    links = []
    for div in soup.find_all('div', class_='searchResult'):
        a_tag = div.find('a')
        href = a_tag.get('href') if a_tag else None
        if not href:
            continue
        # urljoin leaves absolute URLs untouched and resolves both
        # protocol-relative ('//host/path') and site-relative ('/path') links.
        links.append(urljoin(search_url, href))
    print(f"Found {len(links)} articles")

    documents = []
    for link in links[:doucment_observe]:
        try:
            article_response = requests.get(link, timeout=30)
            article_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable article should not abort the whole batch, and an
            # HTTP error page should not be stored as article text.
            print(f"Skipping {link}: {exc}")
            continue
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        content = article_soup.get_text(separator='\n')
        documents.append((content, link))
    return documents



In [None]:
search_url = "https://www.medscape.com/search/?q=%22symptoms%20of%20diabetes%22&plr=ref&page=1"

# The page is parsed from the browser session only. The earlier plain
# `requests` fetch was dropped because its parsed result was overwritten
# before ever being used.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')

# Selenium 4 takes the options by keyword; the driver binary is resolved from
# PATH / Selenium Manager instead of a positional 'chromedriver' argument.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(search_url)
    time.sleep(5)  # Wait for the page to load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if loading or parsing raised.
    driver.quit()

<IPython.core.display.Javascript object>

In [None]:
# Collect one article URL from the first anchor of every search-result card.
search_results = soup.find_all('div', class_='searchResult')
links = []
for div in search_results:
    a_tag = div.find('a')
    if not a_tag or 'href' not in a_tag.attrs:
        continue
    href = a_tag['href']
    # hrefs that do not start with "http" get an explicit "https:" prefix.
    href = href if href.startswith('http') else 'https:' + href
    links.append(href)

In [None]:
# Download the text of the first five search hits as (page_text, url) pairs.
documents = []
for link in links[:5]:
    # The timeout keeps a stalled server from hanging the cell indefinitely.
    article_response = requests.get(link, timeout=30)
    article_soup = BeautifulSoup(article_response.text, 'html.parser')
    content = article_soup.get_text(separator='\n')
    documents.append((content, link))

# Show a compact summary instead of dumping the full text of every page.
for content, link in documents:
    print(f"{link}: {len(content)} characters")



### Add Data to Vector Database

In [None]:
# Deep Lake dataset location, in the form hub://<organization id>/<dataset name>.
my_activeloop_org_id = "test"
my_activeloop_dataset_name = "test_ds"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# One embedding model, shared by the persistent Deep Lake store created here
# and by the in-memory FAISS indexes built in get_answer.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
deeplake_vector_store = DeepLake(
    embedding_function=embeddings,
    dataset_path=dataset_path,
)

In [None]:
def split_text_and_metadata(
    documents: List[Tuple[str, str]],
    chunk_size: int = 2000,
    chunk_overlap: int = 200,
) -> Tuple[List[str], List[Dict[str, str]]]:
    """Split documents into overlapping chunks and pair each chunk with its source.

    Args:
        documents: ``(text, source_url)`` pairs.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        ``(chunks, metadatas)``: two parallel lists where ``metadatas[i]`` is
        ``{"source": url}`` for the document that ``chunks[i]`` came from.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_texts = []
    metadatas = []
    for doc, link in documents:
        # Drop empty and whitespace-only lines before chunking.
        cleaned_doc = '\n'.join(line for line in doc.split('\n') if line.strip())
        chunks = splitter.split_text(cleaned_doc)
        split_texts.extend(chunks)
        # Build a fresh dict per chunk: `[{...}] * n` repeats ONE shared dict,
        # so a later in-place change to one chunk's metadata would silently
        # change the metadata of every chunk of that document.
        metadatas.extend({"source": link} for _ in chunks)
    return split_texts, metadatas


def add_documents_to_store_by_query(query: str) -> Tuple[List[str], List[Dict[str, str]]]:
    """Scrape Medscape for ``query`` and index the articles that are not stored yet.

    Articles whose URL already appears as a ``source`` in the Deep Lake
    dataset's metadata are skipped; the remaining ones are chunked and added
    to ``deeplake_vector_store``.

    Returns:
        ``(texts, metadatas)`` for the chunks that were newly added, or
        ``([], [])`` when the search found nothing or every article was
        already stored.
    """
    documents = fetch_medscape_content(query)
    if len(documents) == 0:
        return [], []

    # Read the stored metadata once and index it by source URL, instead of
    # re-reading and re-scanning the whole column for every fetched document.
    stored_metadata = deeplake_vector_store.vectorstore.dataset['metadata'].data()['value']
    known_sources = {entry.get('source') for entry in stored_metadata}

    new_documents = []
    for doc, link in documents:
        if link in known_sources:
            print(f"{link} is already in db")
        else:
            new_documents.append((doc, link))

    texts, metadatas = split_text_and_metadata(new_documents)
    if texts:
        # Skip the write when there is nothing new to add.
        deeplake_vector_store.add_texts(texts=texts, metadatas=metadatas)
    return texts, metadatas

In [None]:
# Chunk the downloaded articles and record, per chunk, the URL it came from.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_texts = []
metadatas = []
for doc, link in documents:
    chunks = splitter.split_text(doc)
    split_texts.extend(chunks)
    # A fresh dict per chunk: `[{...}] * n` would repeat one shared dict object.
    metadatas.extend({"source": link} for _ in chunks)

In [None]:
# Scrape Medscape for this query and add every article that is not stored yet
# to the Deep Lake dataset; the cell displays the returned (texts, metadatas).
add_documents_to_store_by_query("manage a newly diagnosed patient with atrial fibrillation")

### Get Answer from DB

In [None]:
def retrieve_documents(query: str, k: int = 7, score_threshold: float = 0.8) -> List:
    """Return the stored documents that are sufficiently similar to ``query``.

    Args:
        query: Question to search the Deep Lake store with.
        k: Number of nearest documents to request.
        score_threshold: Minimum score a document needs to be kept.
            NOTE(review): assumes a higher score means a closer match (as with
            cosine similarity) — confirm the store's distance metric.

    Returns:
        A list of documents whose own score is ``>= score_threshold``; empty
        when nothing qualifies.
    """
    # A single search yields both documents and scores. The previous version
    # queried twice and compared only the FIRST hit's score. That scalar
    # condition kept or dropped all hits at once and, as a NumPy mask, added a
    # leading axis so that len() of the result was never greater than 1.
    docs_with_scores = deeplake_vector_store.similarity_search_with_score(query, k=k)
    return [doc for doc, score in docs_with_scores if score >= score_threshold]

def get_answer(query: str):
    """Answer ``query`` with an LLM, citing the Medscape articles it relied on.

    If the Deep Lake store holds fewer than two sufficiently similar documents,
    new articles are scraped for the query, added to the store, and used as
    context. Otherwise the retrieved documents are used. In both cases the
    context is put into a temporary FAISS index that feeds a
    ``RetrievalQAWithSourcesChain`` built on the module-level ``llm``.

    Returns:
        The chain's result (the recorded outputs show a dict with the keys
        ``question``, ``answer`` and ``sources``), or ``None`` when no new
        text could be collected for the query.
    """
    retrieved_documents = retrieve_documents(query)
    if len(retrieved_documents) < 2:
        # Too little material in the persistent store: fetch fresh articles.
        print("MEMORY ANSWERING")
        split_texts, metadatas = add_documents_to_store_by_query(query)
        if len(split_texts) == 0:
            print("NO results")
            return None
        in_memory_vector_store = FAISS.from_texts(texts=split_texts, embedding=embeddings, metadatas=metadatas)
    else:
        in_memory_vector_store = FAISS.from_documents(retrieved_documents, embeddings)

    in_memory_retriever = in_memory_vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    in_memory_qa_chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=in_memory_retriever)

    # `invoke` replaces the deprecated direct call `chain({...})`.
    result = in_memory_qa_chain.invoke({"question": query})
    return result

In [None]:
# Chat model shared by every QA chain in this notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7, max_tokens=1500)

# Retriever over the persistent Deep Lake store: top-5 hits, with the
# "cos" distance metric passed through the search kwargs.
deeplake_retriever = deeplake_vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5, "distance_metric": "cos"},
)
qa_chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=deeplake_retriever)

### Results
Example queries and their answers, together with processing-time measurements.

In [None]:
%%time
answer = get_answer("What are the signs and symptoms that indicate worsening heart failure")
# Other example queries (uncomment one to try it instead):
# answer = get_answer("manage a newly diagnosed patient with atrial fibrillation")
# answer = get_answer("treatment options for an asymptomatic man with elevated PSA levels")
# answer = get_answer("underlying cause of diarrhea and chills in a patient with COPD")

# Author's timing notes (presumably wall time per query — TODO confirm):
#   90 sec for new texts
#   40 sec for db preloaded

[0.82154846 0.8165041  0.81514907 0.8147638  0.8145889  0.81025535
 0.8101315 ]


  warn_deprecated(


CPU times: user 1.64 s, sys: 165 ms, total: 1.8 s
Wall time: 15.7 s


In [None]:
# Display the result that get_answer returned in the timed cell above.
answer

{'question': 'What are the signs and symptoms that indicate worsening heart failure',
 'answer': 'Signs and symptoms that indicate worsening heart failure include decompensated congestive heart failure (CHF), abrupt onset of episodic palpitations with shortness of breath, irregularly irregular beats on auscultation, high heart rate (140 bpm), cool, diaphoretic, and hypotensive episodes, crackles in the lungs from pulmonary vascular congestion, fever, chills, malaise, persistent productive cough with rust-colored sputum tinged with green, and the use of accessory muscles for respiration.\n',
 'sources': 'https://www.medscape.com/viewarticle/979747, https://reference.medscape.com/article/151066-overview, https://www.medscape.com/viewarticle/989470, https://reference.medscape.com/article/159222-overview'}

In [None]:
%%time
# Time the retrieval step on its own (no LLM call) and display the documents it returns.
retrieve_documents("manage a newly diagnosed patient with atrial fibrillation")

[0.8290435  0.7870921  0.786822   0.7736869  0.76932687]
CPU times: user 278 ms, sys: 22.5 ms, total: 301 ms
Wall time: 5.14 s


array([Document(page_content='atrial fibrillation\n, and a recent \npercutaneous coronary intervention\n (PCI) for an ST-segment elevation \nmyocardial infarction\n (STEMI).\nOne day before he presented to the ED, the patient experienced sudden onset of progressively worsening dyspnea upon exertion, which was accompanied by increased fatigue. The shortness of breath occurred while he was walking around the house and climbing a flight of stairs. Sitting down eased his symptoms. He states that the dyspnea upon exertion represents an increase from his baseline condition.\nThe patient says that he has used two pillows while sleeping for "as long as I can remember." He does not recall any episodes of waking up at night and gasping for air. He denies any fever, chills, cough, recent sick contacts, or respiratory tract infections. He does not have any chest pain or leg swelling.\nThe patient also reports that his stools have become darker over the past 3 days; the consistency is normal. He ha

In [None]:
%%time
# Example query; note that "headacke" is misspelled in the query text.
get_answer("severe headacke pills")

CPU times: user 280 ms, sys: 14.1 ms, total: 295 ms
Wall time: 10.3 s


{'question': 'severe headacke pills',
 'answer': 'There is no specific mention of severe headache pills in the information provided.\n',
 'sources': ''}

In [None]:
# Ask one more example question and display the returned result.
result = get_answer("symptoms of diabetes")
result

{'question': 'symptoms of diabetes',
 'answer': 'Symptoms of diabetes include hyperglycemia, glycosuria, polydipsia, unexplained weight loss, nonspecific malaise, and symptoms of ketoacidosis. Diabetic peripheral neuropathy causes sensorial and motor symptoms. \n',
 'sources': 'https://reference.medscape.com/article/919999-overview, http://reference.medscape.com/features/slideshow/dmc/'}

### Run the Chatbot

In [None]:
st.title("Chat with Medscape")

# Seed the conversation once per session so the first render shows a greeting.
if "generated" not in st.session_state:
    st.session_state["generated"] = ["i am ready to help you"]

if "past" not in st.session_state:
    st.session_state["past"] = ["hello"]

# A non-empty label that is visually hidden avoids Streamlit's empty-label warning.
user_input = st.text_input("Your question", key="input", label_visibility="collapsed")

if user_input:
    # `qa` was never defined anywhere in this notebook; answer through
    # get_answer, which also returns the source links.
    result = get_answer(user_input)
    if result is None:
        output = "Sorry, I could not find any relevant Medscape articles."
    else:
        output = result["answer"].strip()
        if result.get("sources"):
            output += f"\n\nSources: {result['sources']}"
    st.session_state.past.append(user_input)
    st.session_state.generated.append(output)

if st.session_state["generated"]:
    # `past` and `generated` grow together, so they can be rendered pairwise.
    exchanges = zip(st.session_state["past"], st.session_state["generated"])
    for i, (question, reply) in enumerate(exchanges):
        message(question, is_user=True, key=str(i) + "_user")
        message(reply, key=str(i))

2024-06-13 12:52:08.971 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-06-13 12:52:08.972 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
