### Import prerequisites 

In [2]:
# Importing required libraries
import os       # Read secrets (e.g. the Pinecone API key) from environment variables

from langchain.prompts import PromptTemplate        # Create custom templates for prompting the language model
from langchain.chains import RetrievalQA        # Build a retrieval based question answering system
from langchain.embeddings import HuggingFaceEmbeddings      # Hugging face model to generate text embeddings
# from langchain.vectorstores import Pinecone     # For storing and retrieving vector embeddings
from pinecone import Pinecone, ServerlessSpec    # Initialize and configure pinecone vector database (client + serverless index spec)
from langchain.document_loaders import PyPDFLoader, DirectoryLoader        # Load documents from directories, specially PDF and general files
from langchain.text_splitter import RecursiveCharacterTextSplitter      # Splits long texts into smaller chunks recursively for processing
from langchain.llms import CTransformers        # Used for model quantization


  from tqdm.autonotebook import tqdm


In [3]:
# Read the Pinecone API key from the environment rather than committing it to the notebook.
# Set it before starting the kernel, e.g. `export PINECONE_API_KEY=...`.
# NOTE(review): a literal key used to be stored in this cell; treat that key as
# compromised and rotate it in the Pinecone console.
# If the variable is unset this is None — presumably the Pinecone client then
# reports the missing key when it is constructed; verify against the client docs.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

### Extract data and create embedding vectors

In [4]:
# Helper that reads the PDFs of a directory through LangChain's loaders
def load_pdf_data(data_path):
    """Load every file matching ``*.pdf`` under ``data_path``.

    Args:
        data_path: Directory passed to ``DirectoryLoader`` as its ``path``.

    Returns:
        The result of ``DirectoryLoader.load()``; in the recorded run this is
        a list of ``Document`` objects carrying ``source`` and ``page``
        metadata.
    """
    loader_options = dict(
        path=data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,       # each matched file is parsed with PyPDFLoader
        show_progress=True,
        use_multithreading=True,
    )
    return DirectoryLoader(**loader_options).load()

In [5]:
# Read the PDFs from the local data/ folder, then report the container type
# and how many documents came back (one print call, one item per line)
extracted_pdf_data = load_pdf_data("data/")
print(
    type(extracted_pdf_data),
    f"Length of the Data : {len(extracted_pdf_data)}",
    sep="\n",
)


100%|██████████| 1/1 [00:20<00:00, 20.31s/it]

<class 'list'>
Length of the Data : 759





In [6]:
# Peek at the first five loaded documents to sanity-check the extraction
extracted_pdf_data[:5]

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimedia Content\nKelly A. Quin, Editor, Ima

In [7]:
# Create text chunks
def get_text_chunks(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        data: Documents to split, passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 500, the value this notebook always used.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value this notebook always used.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [8]:
# Split the loaded documents into chunks and report how many were produced
text_chunks = get_text_chunks(extracted_pdf_data)
print(f"Number of chunks: {len(text_chunks)}")

Number of chunks: 8447


In [9]:
# Inspect the first five chunks to confirm the splitter kept content and metadata
text_chunks[:5]

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': 'data\\Gale Encyclopedia of Medicine. Vol. 2. 2nd ed.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and', metadata={'source': 'data\\Gale Encyclopedia 

In [10]:
# Build the sentence-embedding model used for both chunks and queries
def get_hugging_face_embeddings():
    """Create the Hugging Face embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

In [11]:
# Instantiate the embedding model once; later cells reuse this object
minilm_embeddings = get_hugging_face_embeddings()

In [12]:
# Sanity-check the embedding model: embed one sample sentence and print the
# vector length. The recorded output is 384, which is the dimension the
# Pinecone index is created with further down.
ret = minilm_embeddings.embed_query("How are you doing ?")
print(f"Length of embedding: {len(ret)}")

Length of embedding: 384


### Initialize and enable our vector DB

In [13]:
# Pinecone index name same as pinecone webpage instance name
PINECONE_INDEX = "chatbot-medicine"
# Vector size of the index. It has to equal the length of the embeddings stored
# in it; the all-MiniLM-L6-v2 sanity check earlier in this notebook printed 384.
EMBEDDING_DIMENSION = 384

# Initialize pinecone client instance
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index, if it does not exist already.
# ServerlessSpec comes from the `pinecone` package and is imported next to
# `Pinecone` in the imports cell; without that import this branch raises
# NameError the first time the index has to be created.
if PINECONE_INDEX not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=EMBEDDING_DIMENSION,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

In [14]:
# Get a handle to the index; the bare name on the last line displays its repr
index = pc.Index(PINECONE_INDEX)
index

<pinecone.data.index.Index at 0x167d1f45430>

In [21]:
# Embed and upload the chunks in batches. Doing one embedding call and one
# network upsert per chunk is very slow for thousands of chunks (the recorded
# run of the per-chunk version was interrupted before it finished).
UPSERT_BATCH_SIZE = 100  # vectors per upsert request

# NOTE(review): only ids and vectors are stored, as before. A query result can
# be mapped back to its text through the numeric suffix of the id
# (`vec_{i}` -> text_chunks[i]); consider storing the text as metadata instead.
for batch_start in range(0, len(text_chunks), UPSERT_BATCH_SIZE):
    batch = text_chunks[batch_start:batch_start + UPSERT_BATCH_SIZE]

    # split_documents yields Document objects, so the text is in page_content
    batch_texts = [chunk.page_content for chunk in batch]

    # One model call per batch instead of one embed_query call per chunk
    batch_vectors = minilm_embeddings.embed_documents(batch_texts)

    # Ids keep the original `vec_{i}` scheme, where i is the chunk's position
    # in text_chunks, so re-running overwrites the same records
    index.upsert(
        vectors=[
            (f'vec_{batch_start + offset}', vector)
            for offset, vector in enumerate(batch_vectors)
        ]
    )

KeyboardInterrupt: 