### Import prerequisites 

In [32]:
# Importing required libraries
import os       # Read secrets (e.g. the Pinecone API key) from environment variables

from langchain.prompts import PromptTemplate        # Create custom templates for prompting the language model
from langchain.chains import RetrievalQA        # Build a retrival based question answering system
from langchain.embeddings import HuggingFaceEmbeddings      # Hugging face model to generate text embeddings
# from langchain.vectorstores import Pinecone     # For storing and retrieving vector embeddings
from pinecone import Pinecone, ServerlessSpec    # Initialize and configure pinecone vector database
from langchain.document_loaders import PyPDFLoader, DirectoryLoader        # Load documents from directories, specially PDF and general files
from langchain.text_splitter import RecursiveCharacterTextSplitter      # Splits long texts into smaller chunks recursively for processing
from langchain.llms import CTransformers        # Used for model quantization


In [33]:
# Adding api key
# The key is read from the environment so the secret never lives in the notebook
# (or in version control). Any key that was previously committed in this cell
# must be treated as leaked and rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail fast with an actionable message instead of a confusing auth error later
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before starting the notebook kernel."
    )

### Extract data and create embedding vectors

In [34]:
# Read all PDF files found in a directory
def load_pdf_data(data_path):
    """Load every file matching ``*.pdf`` under ``data_path``.

    Args:
        data_path: Directory handed to ``DirectoryLoader`` as its ``path``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files
        (the notebook output shows a list with one ``Document`` per PDF page).
    """
    loader_options = {
        "path": data_path,
        "glob": "*.pdf",
        "loader_cls": PyPDFLoader,      # each matched file is parsed with PyPDFLoader
        "show_progress": True,
        "use_multithreading": True,
    }
    return DirectoryLoader(**loader_options).load()

In [35]:
# Load every PDF under data/ and report the container type and
# how many Document objects were produced
extracted_pdf_data = load_pdf_data("data/")
print(type(extracted_pdf_data))
print(f"Length of the Data : {len(extracted_pdf_data)}")


100%|██████████| 1/1 [00:14<00:00, 14.47s/it]

<class 'list'>
Length of the Data : 759





In [36]:
# Preview the first five loaded documents (page text plus source/page metadata)
extracted_pdf_data[:5]

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimedia Content\nKelly A. Quin, Editor, Ima

In [37]:
# Create text chunks
def get_text_chunks(data, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, slightly overlapping chunks.

    Args:
        data: Documents to split; passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed as the splitter's ``chunk_size``. Defaults to
            500, the value this notebook originally hard-coded.
        chunk_overlap: Value passed as the splitter's ``chunk_overlap``.
            Defaults to 20, the value this notebook originally hard-coded.

    Returns:
        The list of chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [38]:
# Split the loaded documents into chunks and report how many were produced
text_chunks = get_text_chunks(extracted_pdf_data)
print(f"Number of chunks: {len(text_chunks)}")

Number of chunks: 8447


In [39]:
# Preview the first five chunks to confirm the split looks sensible
text_chunks[:5]

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and', metadata={'source': 'data\\Gale Encyclopedia 

In [40]:
# Build the Hugging Face sentence-embedding model used throughout the notebook
def get_hugging_face_embeddings():
    """Return a ``HuggingFaceEmbeddings`` instance for all-MiniLM-L6-v2.

    Returns:
        The embeddings object constructed with
        ``model_name="sentence-transformers/all-MiniLM-L6-v2"``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [41]:
# Instantiate the embedding model once; later cells reuse this object
minilm_embeddings = get_hugging_face_embeddings()

In [42]:
# Sanity check: embed one sentence and print the vector length.
# This length must match the `dimension` the Pinecone index is created with.
ret = minilm_embeddings.embed_query("How are you doing ?")
print(f"Length of embedding: {len(ret)}")

Length of embedding: 384


### Initialize and enable our vector DB

In [43]:
# Name of the Pinecone index; must match the index name shown in the Pinecone console
PINECONE_INDEX = "chatbot-medicine"
# Length of the vectors produced by the embedding model used in this notebook
# (the sanity-check cell prints 384 for all-MiniLM-L6-v2). The index dimension
# must equal this value or upserts and queries will be rejected.
EMBEDDING_DIMENSION = 384

# Initialize pinecone client instance
pc = Pinecone(api_key=PINECONE_API_KEY, pool_threads=30)

# Create the index, if it does not exist already, so re-running this cell is safe
if PINECONE_INDEX not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=EMBEDDING_DIMENSION,
        metric='cosine',
        # ServerlessSpec is imported from `pinecone` in the imports cell
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [44]:
# Get a handle to the index used by the upsert and query cells below;
# the bare name on the last line displays the object's repr as the cell output
index = pc.Index(PINECONE_INDEX)
index

<pinecone.data.index.Index at 0x2012f1533a0>

In [45]:
# One-time ingestion of the text chunks into the Pinecone index.
# Instead of commenting this cell in and out, flip the flag below to True for the
# run that should (re)upload the data; leave it False for every other run so
# "Restart & Run All" does not re-embed and re-upload the whole corpus.
UPSERT_TO_PINECONE = False
# Number of chunks embedded and sent to Pinecone per request
UPSERT_BATCH_SIZE = 200

if UPSERT_TO_PINECONE:
    for batch_start in range(0, len(text_chunks), UPSERT_BATCH_SIZE):
        batch = text_chunks[batch_start:batch_start + UPSERT_BATCH_SIZE]

        # Accept either dict-like chunks or objects exposing `.page_content`
        batch_texts = [
            chunk['page_content'] if isinstance(chunk, dict) else chunk.page_content
            for chunk in batch
        ]

        # Embed the whole batch in one call rather than one call per chunk
        batch_vectors = minilm_embeddings.embed_documents(batch_texts)

        # Ids keep the original `vec_<global chunk position>` scheme, so a re-run
        # overwrites existing vectors instead of creating duplicates
        index.upsert(
            vectors=[
                {
                    'id': f'vec_{batch_start + offset}',
                    'values': vector,
                    'metadata': {'text_chunk': text_content}
                }
                for offset, (text_content, vector) in enumerate(zip(batch_texts, batch_vectors))
            ],
            show_progress=True
        )


Upserted vectors:   0%|          | 0/1 [00:00<?, ?it/s]

Upserted vectors: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.31it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.64it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.88it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
Upserted vectors: 100%|██████████| 1/1 [

In [47]:
# Semantic search against the Pinecone index
def get_response_from_db(query_text, top_k=3):
    """Embed ``query_text`` and look up its nearest neighbours in the index.

    Uses the notebook-level ``minilm_embeddings`` model and ``index`` handle.

    Args:
        query_text: Text to search for.
        top_k: Number of matches requested from the index (default 3).

    Returns:
        The response of ``index.query``; the notebook output shows a
        ``matches`` list whose entries carry ``id``, ``metadata`` and ``score``.
    """
    search_options = {
        # Query vector produced by the same embedding model used for the chunks
        "vector": minilm_embeddings.embed_query(query_text),
        "top_k": top_k,
        "include_values": False,    # leave the raw vector values out of the response
        "include_metadata": True,   # return the metadata stored with each vector
    }
    return index.query(**search_options)

# Example lookup: fetch the closest stored chunks for a sample question
query_text = "What is liver failure"
responses = get_response_from_db(query_text=query_text)

# Print the raw query response (ids, scores and stored metadata)
print(responses)


{'matches': [{'id': 'vec_1867',
              'metadata': {'text_chunk': 'pattern and extent of liver injury, '
                                         'indicate functionalabnormalities, '
                                         'and establish the cause of the '
                                         'condition.However, most misdiagnoses '
                                         'occur when physicians relymore on '
                                         'laboratory analysis than on detailed '
                                         'medical his-tory and the results of '
                                         'a thorough physical examination .'},
              'score': 0.602567494,
              'values': []},
             {'id': 'vec_1761',
              'metadata': {'text_chunk': 'infection. Liver transplantgives the '
                                         'best chance for long-term survival.'},
              'score': 0.564635694,
              'values': []},
            