Importing Dependencies

In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from sentence_transformers import SentenceTransformer

API insertion

In [2]:
PINECONE_API_KEY ='pcsk_6ExTpy_85KVZMw5MqgFrcj3faES5qKuCmGU7ikoTXc5JuaztphUHAeK7dT2qrg8VVfRm5R'

PINECONE_API_ENV = 'us-east-1'                # pinecone Version 3 don't need any environment parameter


DATA LOADING & EXTRACTION 

In [3]:
def load_pdf(data):

    loader = DirectoryLoader(data, glob="*.pdf" ,loader_cls=PyPDFLoader)       # Load all PDF files from a directory

    document=loader.load()           # here .load() is a method which will return a list of dictionaries, each dictionary contains the text of a pdf file
    return document

In [4]:
extracted_data = load_pdf("data/")    # Load all PDF files from a directory
#extracted_data

Alternate way for data extraction 

In [5]:
# from langchain.document_loaders import PyPDFLoader

# def load_pdf(file_path):
#     """Loads a PDF file and returns a Document object."""
#     loader = PyPDFLoader(file_path)  # Use PyPDFLoader to load a single file
#     document = loader.load()
#     return document

# extracted_data = load_pdf("/content/Medical_book.pdf")

CHUNKING OF TEXT DATA

In [6]:
def chunk_splitter(extracted_data):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500 , chunk_overlap=50)  # Split the text into chunks of 500 characters with an overlap of 50 characters

    text_chunks = splitter.split_documents(extracted_data)  # here .split() is a method which will return a list of dictionaries, each dictionary contains the text of a chunk
    return text_chunks

In [7]:
chunks = chunk_splitter(extracted_data)
#chunks
print('No of chunks:', len(chunks))

No of chunks: 5961


In [8]:
def load_embedding_model():
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding

In [9]:
embeddings = load_embedding_model()

embeddings

  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

Testing The Embedding Model 

In [10]:
result_query = embeddings.embed_query("I Am Abhishek")
print('length of query',len(result_query))

length of query 384


**PINECONE VERSION - Version: 6.0.2**

STEPS: 

1.INITIALIZE THE PINECONE \
2.CONNECT THE INDEX \
3.UPSERT THE VECTOR 

In [11]:
from pinecone import Pinecone

pc = Pinecone(api_key="pcsk_6ExTpy_85KVZMw5MqgFrcj3faES5qKuCmGU7ikoTXc5JuaztphUHAeK7dT2qrg8VVfRm5R")
print(pc.list_indexes())

[{
    "name": "medicalchatbot",
    "metric": "cosine",
    "host": "medicalchatbot-sj1ytcn.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}, {
    "name": "medicalchatbot2",
    "metric": "cosine",
    "host": "medicalchatbot2-sj1ytcn.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "embed": {
        "model": "llama-text-embed-v2",
        "field_map": {
            "text": "text"
        },
        "dimension": 384,
        "metr

In [12]:
index = pc.Index("medicalchatbot2")

In [13]:
def insert_vectors(chunks, batch_size=50):
    vectors = []

    for i, chunk in enumerate(chunks):
        embedding = embeddings.embed_query(chunk.page_content)  # 🔹 Generate embeddings for each chunk
        vectors.append({
            "id": str(i),
            "values": embedding,
            "metadata": {"text": chunk.page_content}  # Metadata for retrieval
        })

        # 🔹 Insert in Batches (Every `batch_size` chunks)
        if len(vectors) >= batch_size:
            index.upsert(vectors=vectors)  # Upsert batch
            print(f"Inserted {len(vectors)} vectors into Pinecone.")
            vectors = []  # Clear batch

    # 🔹 Insert any remaining vectors
    if vectors:
        index.upsert(vectors=vectors)
        print(f"Inserted {len(vectors)} remaining vectors into Pinecone.")

# Insert embeddings into Pinecone
insert_vectors(chunks, batch_size=50)  # 🔹 Set batch size to 50

Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors into Pinecone.
Inserted 50 vectors 

**TESTING WITH A QUERY**

In [14]:
query_embedding = embeddings.embed_query("What are the symptoms of diabetes?")
search_results = index.query(vector=query_embedding, top_k=3, include_metadata=True)

for res in search_results["matches"]:
    print(res["metadata"]["text"])  # Print retrieved text

affected and can range greatly.
• Type I diabetes mellitus. Characterized by fatigue and
an abnormally high level of glucose in the blood
(hyperglycemia).
• Amyotrophic lateral schlerosis. First signs are stum-
bling and difficulty climbing stairs. Later, muscle
cramps and twitching may be observed as well as
weakness in the hands making fastening buttons or
turning a key difficult. Speech may become slowed or
slurred. There may also be difficluty swallowing. As
begin to fall. A person with diabetes mellitus either does
not make enough insulin, or makes insulin that does not
work properly. The result is blood sugar that remains
high, a condition called hyperglycemia.
Diabetes must be diagnosed as early as possible. If
left untreated, it can damage or cause failure of the eyes,
kidneys, nerves, heart, blood vessels, and other body
organs. Hypoglycemia, or low blood sugar, may also be
discovered through blood sugar testing. Hypoglycemia is
Resources
BOOKS
A Manual of Laboratory and Diagn

In [15]:

docsearch = index  # Assign the existing index to docsearch

query = "What are Allergies"

docs = docsearch.query(vector=embeddings.embed_query(query), top_k=3, include_metadata=True)  # Use query method on the index object

print("Result:", docs)  # Print the search results

Result: {'matches': [{'id': '1176',
              'metadata': {'text': 'GALE ENCYCLOPEDIA OF MEDICINE 2118\n'
                                   'Allergies\n'
                                   'GEM - 0001 to 0432 - A  10/22/03 1:42 PM  '
                                   'Page 118'},
              'score': 0.765826583,
              'values': []},
             {'id': '1203',
              'metadata': {'text': 'GALE ENCYCLOPEDIA OF MEDICINE 2 121\n'
                                   'Allergies\n'
                                   'GEM - 0001 to 0432 - A  10/22/03 1:42 PM  '
                                   'Page 121'},
              'score': 0.739715,
              'values': []},
             {'id': '1138',
              'metadata': {'text': 'ders Co., 1993.\n'
                                   'Lawlor, G. J. Jr., T. J. Fischer, and D. '
                                   'C. Adelman. Manual of\n'
                                   'Allergy and Immunology.Boston: Little, '
      

In [16]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [17]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [18]:
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [19]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

AttributeError: 'Index' object has no attribute 'as_retriever'

In [None]:
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),  # Now it will work
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs
)