In [6]:
# from sentence_transformers import 
# import ctransformers
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from dotenv import load_dotenv

import os


In [7]:
def load_pdf(data):
	loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
	documents = loader.load()
	return documents

In [8]:
extracted_data = load_pdf('data/')

In [10]:
def text_split(extracted_data):
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
	return text_splitter.split_documents(extracted_data)

In [11]:
text_chunks = text_split(extracted_data)
print(len(text_chunks))

3248


In [13]:
model = "sentence-transformers/sentence-t5-base"
embeddings = HuggingFaceBgeEmbeddings(model_name=model)

In [14]:
load_dotenv()
# from pinecone_notebooks.colab import Authenticate

# Authenticate()

True

In [15]:
from pinecone import Pinecone
from pinecone import ServerlessSpec
import time

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "medical-chatbot"
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768, 
        metric="cosine", 
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"
        ) 
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)
index = pc.Index(index_name)
# [t.page_content for t in text_chunks]
index

<pinecone.data.index.Index at 0x1344e8b80>

In [16]:
from langchain_pinecone import PineconeVectorStore

# Function to connect to existing index or create a new one
def get_or_create_pinecone_index(index_name, text_chunks, embeddings):
    try:
        # Attempt to connect to the existing Pinecone index
        docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)
        print("Connected to existing Pinecone index.")

        query = "What causes malaria"
        docs = docsearch.similarity_search(query)
        # print(docs[0].page_content)
        if len(docs) == 0:
             docsearch = PineconeVectorStore.from_texts(
            [t.page_content for t in text_chunks], embedding=embeddings, index_name=index_name
        )
    except Exception as e:
        print(f"Failed to connect to existing index: {e}")
        print("Creating a new Pinecone index...")
        # Create a new Pinecone index
        docsearch = PineconeVectorStore.from_texts(
            [t.page_content for t in text_chunks], embedding=embeddings, index_name=index_name
        )
        print("New Pinecone index created.")
    return docsearch


In [17]:
docsearch = get_or_create_pinecone_index(index_name, text_chunks, embeddings)

Connected to existing Pinecone index.


In [13]:
query = "what causes malaria"

In [15]:
retriever = docsearch.as_retriever(search_type="mmr")
matched_docs = retriever.invoke(query)
for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)


## Document 0

organisms known as protozoa. The only way to getmalaria is to be bitten by a certain type of mosquito thathas bitten someone who has the disease.Thanks to mos-quito control programs, malaria has been eliminated inthe United States, almost all of Europe, and large parts ofCentral and South America. However, mosquito controlhas not worked well in other parts of the world, andmalaria continues to be a major health problem in partsof Africa, Southeast Asia, Latin America, Haiti, theDominican Republic, and some Pacific Islands. Everyyear, some 30,000 Americans and Europeans who travelto these areas get malaria. People planning to travel tothe tropics are often advised to take antimalarial drugsbefore, during, and after their trips, to help them avoidgetting the disease and bringing it home with them.These drugs kill Plasmodium or prevent its growth.
In recent years, some strains of Plasmodium have

## Document 1

but some cases are associated with hereditary diseases.
GALE E

In [67]:
found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)
for i, doc in enumerate(found_docs):
    print(f"{i + 1}.", doc.page_content, "\n")

1. organisms known as protozoa. The only way to getmalaria is to be bitten by a certain type of mosquito thathas bitten someone who has the disease.Thanks to mos-quito control programs, malaria has been eliminated inthe United States, almost all of Europe, and large parts ofCentral and South America. However, mosquito controlhas not worked well in other parts of the world, andmalaria continues to be a major health problem in partsof Africa, Southeast Asia, Latin America, Haiti, theDominican Republic, and some Pacific Islands. Everyyear, some 30,000 Americans and Europeans who travelto these areas get malaria. People planning to travel tothe tropics are often advised to take antimalarial drugsbefore, during, and after their trips, to help them avoidgetting the disease and bringing it home with them.These drugs kill Plasmodium or prevent its growth.
In recent years, some strains of Plasmodium have 

2. but some cases are associated with hereditary diseases.
GALE ENCYCLOPEDIA OF MEDICINE 

In [18]:
# from langchain.chains import RetrievalQA
from langchain import HuggingFaceHub

model = "google/flan-t5-base"

# Initialize HuggingFaceHub with your API token
hf_hub = HuggingFaceHub(
    repo_id="google/flan-t5-base",
)
retriever = docsearch.as_retriever();
qa_chain = RetrievalQA.from_chain_type(llm=hf_hub, chain_type="stuff", retriever=retriever, return_source_documents=True)

In [73]:
# query = "how much money did microsoft raise?"

results = qa_chain(query)
print(results)
# process_llm_response()

  warn_deprecated(


{'query': 'What causes malaria', 'result': 'Plasmodium', 'source_documents': [Document(page_content='organisms known as protozoa. The only way to getmalaria is to be bitten by a certain type of mosquito thathas bitten someone who has the disease.Thanks to mos-quito control programs, malaria has been eliminated inthe United States, almost all of Europe, and large parts ofCentral and South America. However, mosquito controlhas not worked well in other parts of the world, andmalaria continues to be a major health problem in partsof Africa, Southeast Asia, Latin America, Haiti, theDominican Republic, and some Pacific Islands. Everyyear, some 30,000 Americans and Europeans who travelto these areas get malaria. People planning to travel tothe tropics are often advised to take antimalarial drugsbefore, during, and after their trips, to help them avoidgetting the disease and bringing it home with them.These drugs kill Plasmodium or prevent its growth.\nIn recent years, some strains of Plasmodi

In [22]:

promptTemplate = '''
Use the following piece of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}
Only return the helpful answer below and nothing else
Helpful answer:
'''

In [23]:
from langchain import PromptTemplate

PROMPT = PromptTemplate(template=promptTemplate, input=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [None]:
# model = "google/flan-t5-base"
from langchain_community.llms import CTransformers
model_path = 'model/models--TheBloke--Llama-2-13B-chat-GGML/snapshots/3140827b4dfcb6b562cd87ee3d7f07109b014dd0/llama-2-13b-chat.ggmlv3.q5_1.bin'
llm = CTransformers(model=model_path, model_type="llama")

print(llm.invoke('AI is going to'))


In [24]:
from langchain import HuggingFaceHub

model = "google/flan-t5-base"

# Initialize HuggingFaceHub with your API token
hf_hub = HuggingFaceHub(
    repo_id="google/flan-t5-base",
)

In [25]:
retriever = docsearch.as_retriever(search_kwargs={'k': 2});
qa_chain = RetrievalQA.from_chain_type(
	llm=hf_hub,
	chain_type="stuff",
	retriever=retriever,
	return_source_documents=True,
	chain_type_kwargs=chain_type_kwargs
)

In [26]:
# while True:
# 	user_input=input(f">>>")
result = qa_chain({"query": "what is acne?"})
print("Response: ", result["result"])

  warn_deprecated(


Response:  acne is a bacterial infection
