In [3]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [4]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [7]:
extracted_data = load_pdf("data/")

In [8]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 4000, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [9]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))


length of my chunk: 7051


In [10]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings


In [11]:
embeddings = download_hugging_face_embeddings()



In [12]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))


Length 384


In [None]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="pcsk_2hNQJX_UhPypTTXoh7QK1k7ZE56k3REZhunvaSdrrmUSyRkwQPCKuDv2YZFupXTRD3HTGd")
index_name = "medical-chast"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name       = index_name,
        dimension  = 384,              # MiniLM or Llama-Embed = 384 dims
        metric     = "cosine",
        spec       = ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)
print("✔ basic index ready — you’ll embed locally")


✔ basic index ready — you’ll embed locally


In [None]:
# ------------------------------------------------------------------
# 1.  Embed chunks and push them to Pinecone
# ------------------------------------------------------------------
import uuid, tqdm

# text_chunks  -> a list of LangChain Document objects
texts      = [d.page_content for d in text_chunks]
metadatas  = [d.metadata      for d in text_chunks]   # keeps page, file, etc.
vectors    = embeddings.embed_documents(texts)       # List[List[float]]

BATCH = 100
items = [
    (str(uuid.uuid4()), vectors[i], metadatas[i])    # (id, vector, metadata)
    for i in range(len(vectors))
]

for i in tqdm.trange(0, len(items), BATCH, desc="Upserting"):
    index.upsert(items[i : i + BATCH])

print("✅ Upsert complete — total vectors:",
      index.describe_index_stats()["total_vector_count"])

# ------------------------------------------------------------------
# 2.  Example similarity search
# ------------------------------------------------------------------

Upserting: 100%|██████████| 71/71 [02:14<00:00,  1.89s/it]


✅ Upsert complete — total vectors: 7051


TypeError: unhashable type: 'slice'

In [30]:
matches = pinecone_search("What are allergies?", k=3)

for i, m in enumerate(matches, 1):
    meta = m.get("metadata", {})
    
    # Print everything useful in metadata
    print(f"\n--- Match {i} ---")
    print(f"Score: {m.get('score', 'N/A')}")
    print(f"Metadata: {meta}")


--- Match 1 ---
Score: 0.673717141
Metadata: {'page': 158.0, 'source': 'data\\Medic.pdf'}

--- Match 2 ---
Score: 0.659275115
Metadata: {'page': 151.0, 'source': 'data\\Medic.pdf'}

--- Match 3 ---
Score: 0.659085155
Metadata: {'page': 147.0, 'source': 'data\\Medic.pdf'}


In [31]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [32]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [47]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# 1) Build a lightweight text-generation pipeline
hf_pipe = pipeline("text-generation",
                   model="distilgpt2",
                   max_new_tokens=256)


Device set to use cpu


In [50]:
import uuid, tqdm

items = [
    (
        str(uuid.uuid4()),
        vectors[i],
        {**metadatas[i], "text": texts[i]}   # <-- add the raw chunk text here
    )
    for i in range(len(vectors))
]

for i in tqdm.trange(0, len(items), 100, desc="Re-upserting w/ text"):
    index.upsert(items[i : i + 100])

# ------------------------------------------------------------------
# 1.  Helper: embed query + search Pinecone
# ------------------------------------------------------------------
def pinecone_search(question: str, k: int = 3):
    q_vec = embeddings.embed_query(question)
    res   = index.query(vector=q_vec, top_k=k, include_metadata=True)
    return res["matches"]

Re-upserting w/ text: 100%|██████████| 71/71 [02:33<00:00,  2.16s/it]


In [53]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

hf_pipe = pipeline(
    "text-generation",
    model="distilgpt2",
    max_new_tokens=256,
)
llm = HuggingFacePipeline(pipeline=hf_pipe)

# ------------------------------------------------------------------
# 3.  LangChain RetrievalQA using a *custom* retriever
# ------------------------------------------------------------------
class PineconeRetriever:
    """Very small wrapper so LangChain can call our pinecone_search()."""
    def __init__(self, k: int = 3):
        self.k = k
    def get_relevant_documents(self, query):
        matches = pinecone_search(query, self.k)
        # Convert Pinecone matches -> LangChain Document objects
        from langchain.schema import Document
        return [
            Document(page_content=m["metadata"]["text"], metadata=m["metadata"])
            for m in matches
        ]

retriever = PineconeRetriever(k=2)


Device set to use cpu


In [54]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)


ValidationError: 1 validation error for RetrievalQA
retriever
  value is not a valid dict (type=type_error.dict)

In [46]:
llm = HuggingFacePipeline(pipeline=pipe)

# Test prompt
response = llm("mather father")
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response:  and father of two.
































































































