In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from  langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# extracting the PDF data
def load_pdf(data):
    loader = DirectoryLoader(
        data,
        glob='*.pdf',
        loader_cls=PyMuPDFLoader
    )
    documents = loader.load()
    return documents

In [5]:
extracted_data = load_pdf('data/') # here we can also extract multiple file/documents

In [6]:
#extracted_data

In [7]:
# create text chunks for the large document

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    
    return text_chunks


In [8]:
text_chunks = text_split(extracted_data)
print('Length of chunks',len(text_chunks))

Length of chunks 5779


In [9]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

  warn_deprecated(


In [11]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
query_result = embeddings.embed_query('Hello World')
print('length of the query is',len(query_result))

length of the query is 384


In [66]:
#Initializing the Pinecone


import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )


In [67]:
# Create a Pinecone index
if 'medical' not in pc.list_indexes().names():
    pc.create_index(
        name='medical',
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(
            region='us-east-1'
        )
    )


In [68]:
# Connect to the index
# Connect to the index
index_host ="https://medical-5h4j6t4.svc.aped-4627-b74a.pinecone.io"
index = pc.Index("medical", host=index_host)
# Prepare vectors for upsert
vectors = []
for idx, embedding in enumerate(embeddings):
    vectors.append({"id": str(idx), "values": embedding})




In [69]:
vectors

[{'id': '0',
  'values': ('client', SentenceTransformer(
     (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
     (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
     (2): Normalize()
   ))},
 {'id': '1',
  'values': ('model_name', 'sentence-transformers/all-MiniLM-L6-v2')},
 {'id': '2', 'values': ('cache_folder', None)},
 {'id': '3', 'values': ('model_kwargs', {})},
 {'id': '4', 'values': ('encode_kwargs', {})},
 {'id': '5', 'values': ('multi_process', False)},
 {'id': '6', 'values': ('show_progress', False)}]

In [70]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [71]:
def upload_embeddings_to_pinecone(embeddings, index):
    vectors = []
    for idx, embedding in enumerate(embeddings):
        try:
            embedding_values = list(map(float, embedding))  # Convert each element to float
            vectors.append({"id": str(idx), "values": embedding_values})
        except ValueError:
            print(f"Skipping embedding {idx} due to non-numeric values: {embedding}")

    batch_size = 100
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i: i + batch_size]
        index.upsert(vectors=batch)

    print("Data uploaded successfully to Pinecone!")


In [72]:
# Upload embeddings to Pinecone
upload_embeddings_to_pinecone(embeddings, index)

Skipping embedding 0 due to non-numeric values: ('client', SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
))
Skipping embedding 1 due to non-numeric values: ('model_name', 'sentence-transformers/all-MiniLM-L6-v2')
Skipping embedding 2 due to non-numeric values: ('cache_folder', None)
Skipping embedding 3 due to non-numeric values: ('model_kwargs', {})
Skipping embedding 4 due to non-numeric values: ('encode_kwargs', {})
Skipping embedding 5 due to non-numeric values: ('multi_process', False)
Skipping embedding 6 due to non-numeric values: ('show_progress', False)
Data uploaded successfully to Pinecone!


In [None]:
for i, t in zip(range(len(text_chunks)), text_chunks):
   query_result = embeddings.embed_query(t.page_content)
   index.upsert(
   vectors=[
        {
            "id": str(i),  # Convert i to a string
            "values": query_result, 
            "metadata": {"text":str(text_chunks[i].page_content)} # meta data as dic
        }
    ],
    namespace="real" 
)