In [1]:
import time
import pickle
from tqdm.autonotebook import tqdm
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.vectorstores import VectorStore
import pinecone
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
import os
from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm


In [2]:
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# extracting the PDF data
def load_pdf(data):
    loader = DirectoryLoader(
        data,
        glob='*.pdf',
        loader_cls=PyMuPDFLoader
    )
    documents = loader.load()
    return documents

In [4]:
extracted_data = load_pdf('data/') # here we can also extract multiple file/documents

In [5]:
#extracted_data

In [6]:
# create text chunks for the large document

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    
    return text_chunks


In [7]:
text_chunks = text_split(extracted_data)
print('Length of chunks',len(text_chunks))

Length of chunks 5779


In [8]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [9]:
embeddings = download_hugging_face_embeddings()

  warn_deprecated(


In [10]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
query_result = embeddings.embed_query('Hello World')
print('length of the query is',len(query_result))

length of the query is 384


In [12]:
#Initializing the Pinecone


import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )


In [14]:
# Create a Pinecone index
if 'medical' not in pc.list_indexes().names():
    pc.create_index(
        name='medical',
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(
            cloud="aws", 
            region='us-east-1'
        )
    )


In [15]:
# Connect to the index
# Connect to the index
index_host ="https://medical-5h4j6t4.svc.aped-4627-b74a.pinecone.io"
index = pc.Index("medical", host=index_host)
# Prepare vectors for upsert
vectors = []
for idx, embedding in enumerate(embeddings):
    vectors.append({"id": str(idx), "values": embedding})




In [16]:
vectors

[{'id': '0',
  'values': ('client', SentenceTransformer(
     (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
     (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
     (2): Normalize()
   ))},
 {'id': '1',
  'values': ('model_name', 'sentence-transformers/all-MiniLM-L6-v2')},
 {'id': '2', 'values': ('cache_folder', None)},
 {'id': '3', 'values': ('model_kwargs', {})},
 {'id': '4', 'values': ('encode_kwargs', {})},
 {'id': '5', 'values': ('multi_process', False)},
 {'id': '6', 'values': ('show_progress', False)}]

In [17]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [18]:

from langchain_pinecone import PineconeVectorStore

namespace = "real"
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "medical"

try:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        index_name=index_name,
        namespace=namespace
    )
except Exception as e:
    print(f"Error initializing PineconeVectorStore: {e}")
    docsearch = None

In [20]:
#Function to generate and save vectors to pickle
import pickle
def generate_and_save_vectors(embeddings, text_chunks, pickle_file):
    for i, t in zip(range(len(text_chunks)), text_chunks):
        query_result = embeddings.embed_query(t.page_content)
        index.upsert(
        vectors=[
                {
                    "id": str(i),  # Convert i to a string
                    "values": query_result, 
                    "metadata": {"text":str(text_chunks[i].page_content)} # meta data as dic
                }
            ],
            namespace="real" 
        )

    # Save vectors to pickle file
    with open(pickle_file, 'wb') as f:
        pickle.dump(vectors, f)
    
    print(f"Vectors saved to {pickle_file} successfully!")

In [21]:
# Generate vectors and save to pickle
pickle_file = 'vectors.pkl'
generate_and_save_vectors(embeddings, text_chunks, pickle_file)

Vectors saved to vectors.pkl successfully!


In [22]:
# Function to upload embeddings to Pinecone
def upload_embeddings_to_pinecone(pickle_file, index):
    # Load vectors from pickle file
    with open(pickle_file, 'rb') as f:
        vectors = pickle.load(f)
    
    batch_size = 100
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i: i + batch_size]
        index.upsert(vectors=batch)
    
    print("Data uploaded successfully to Pinecone!")

In [24]:
# Upload embeddings to Pinecone
#upload_embeddings_to_pinecone(pickle_file, index)

In [25]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [26]:
PROMPT = PromptTemplate(template=prompt_template,input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [27]:
llm = CTransformers(model='Model_instruction/llama-2-7b-chat.ggmlv3.q4_0.bin',
                    model_type='llama',
                    config={'max_new_tokens':512,
                            'temperature':0.8})

In [28]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever()
)

In [29]:
for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[0], 
        namespace=namespace, 
        top_k=1,
        include_values=True,
        include_metadata=True
    )
    print(query)

{'matches': [{'id': '0',
              'metadata': {'text': 'The GALE\n'
                                   'ENCYCLOPEDIA\n'
                                   'of MEDICINE\n'
                                   'SECOND EDITION'},
              'score': 1.00085497,
              'values': [0.0214597061,
                         -0.00809714664,
                         -0.0261781085,
                         0.0161040854,
                         -0.0319497548,
                         0.00957581494,
                         0.00321903,
                         0.19288145,
                         -0.0324234217,
                         -0.0413296893,
                         0.00583916809,
                         0.0829793066,
                         0.0454243422,
                         0.0266077686,
                         -0.113559261,
                         0.00635322882,
                         -0.0320440531,
                         -0.0304008741,
                         -

In [30]:
query

{'matches': [{'id': 'fd834f0d-89ce-4546-affb-14fde5f8a364',
              'metadata': {'author': '',
                           'creationDate': "D:20041218170002-05'00'",
                           'creator': '',
                           'file_path': 'data\\Medical_book.pdf',
                           'format': 'PDF 1.5',
                           'keywords': '',
                           'modDate': "D:20041218161531-06'00'",
                           'page': 269.0,
                           'producer': 'PDFlib+PDI 5.0.0 (SunOS)',
                           'source': 'data\\Medical_book.pdf',
                           'subject': '',
                           'text': 'els of the excitatory neurohormone '
                                   'serotonin in the\n'
                                   'brain. They do not alter levels of '
                                   'norepinephrine. These\n'
                                   'have become the drugs of choice for a '
            