<a href="https://colab.research.google.com/github/LaibaKhan112/36-Weeks-Preparation-Challenge/blob/main/RAG_LANGCHAIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU langchain-pinecone

In [None]:
!pip install -qU langchain langchain_community langchain-core

In [None]:
!pip install numpy==1.26.4 google-auth==2.43.0 --force-reinstall

# Initializing Pinecone

In [None]:
from google.colab import userdata

# Pull the Pinecone key from Colab's secret store so it never lives in the notebook source.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
# Report only whether a value came back; echoing the key itself would save the
# secret into the notebook's cell output.
print("PINECONE_API_KEY loaded:", bool(PINECONE_API_KEY))

In [None]:
from pinecone import Pinecone

# Client handle used below to create and open the index.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:

from pinecone import ServerlessSpec

index_name = "my-langchain-project-02"

# Only create the index when it does not exist yet, so re-running this cell is harmless.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors upserted into this index.
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle used for the upsert and query calls on this index.
index = pc.Index(index_name)

# Document Loader

In [None]:
!pip install -qU langchain-community pyPDF

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Load the PDF from the working directory; `document` holds whatever the loader returns.
loader = PyPDFLoader("Laiba Khan.docx.pdf")
document = loader.load()

In [None]:
# Display the first loaded entry to check that text was extracted from the PDF.
document[0]

# Text Splitting

In [None]:
!pip install -U langchain-text-splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split into 300-character chunks; consecutive chunks share up to 150 characters
# so a sentence cut at a boundary still appears whole in one of them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=150,
)

chunks = text_splitter.split_documents(document)

# Preview the second chunk, but do not crash on a document short enough to
# produce fewer than two chunks.
if len(chunks) > 1:
    print(chunks[1].page_content)
else:
    print(f"Only {len(chunks)} chunk(s) produced; nothing at index 1 to preview.")


In [None]:
# Display every chunk produced by the splitter for manual inspection.
chunks

# Creating Embeddings

In [None]:
!pip install -qU langchain-google-genai

In [None]:
from google.colab import userdata

# Pull the Google API key from Colab's secret store so it never lives in the notebook source.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
# Report only whether a value came back; echoing the key itself would save the
# secret into the notebook's cell output.
print("GOOGLE_API_KEY loaded:", bool(GOOGLE_API_KEY))

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", api_key=GOOGLE_API_KEY)

# Raw text of every chunk; the later embedding and upsert cells read `texts`.
texts = [piece.page_content for piece in chunks]

# NOTE(review): `vectors` is reassigned by the SentenceTransformer cell that follows, so
# these Gemini embeddings are never upserted — confirm whether this call is still needed.
vectors = embeddings.embed_documents(texts)



In [None]:
from sentence_transformers import SentenceTransformer
# Local embedding model; its output replaces the `vectors` computed in the previous cell.
model = SentenceTransformer("intfloat/multilingual-e5-large")
# NOTE(review): the E5 model card reportedly expects "passage: " / "query: " prefixes on inputs — confirm before relying on retrieval quality.
vectors = model.encode(texts).tolist()



In [None]:
# Embedding length of the first chunk; compare against the index dimension (1024).
print(len(vectors[0]))


In [None]:
from uuid import NAMESPACE_URL, uuid4, uuid5  # uuid4 kept in scope in case later cells use it

# Send records in modest batches instead of one request, so a large document
# does not exceed Pinecone's per-request limits.
UPSERT_BATCH_SIZE = 100

# Build (id, values, metadata) records. The id is derived from the chunk's
# position and text, so re-running this cell overwrites the same records
# instead of adding a fresh duplicate of every chunk under a new random id.
to_upsert = [
    (str(uuid5(NAMESPACE_URL, f"{position}:{text}")), vec, {"text": text})
    for position, (vec, text) in enumerate(zip(vectors, texts))
]

# Upsert into Pinecone, one batch at a time (no request is sent when there is nothing to upsert).
for start in range(0, len(to_upsert), UPSERT_BATCH_SIZE):
    index.upsert(vectors=to_upsert[start:start + UPSERT_BATCH_SIZE])

print("Upserted:", len(to_upsert))

In [None]:
query = "What is this PDF about?"
# Embed the question with the same model that embedded the chunks.
query_vector = model.encode(query).tolist()

results = index.query(vector=query_vector, top_k=5, include_metadata=True)

# Show each hit's similarity score followed by the chunk text stored in its metadata.
for hit in results["matches"]:
    print("Score:", hit["score"])
    print(hit["metadata"]["text"])
    print("-----")


In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# NOTE(review): `llm` is rebound to a `genai.Client` in the next cell, so this
# LangChain chat model is never used afterwards — confirm whether it is still needed.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", api_key=GOOGLE_API_KEY)

In [None]:
from google import genai

# Gemini client; answer_to_user() calls llm.models.generate_content on it.
llm = genai.Client(api_key=GOOGLE_API_KEY)

def answer_to_user(query: str, top_k: int = 2):
    """Answer *query* from the indexed PDF chunks using retrieval plus Gemini.

    The question is embedded with the module-level ``model``, the ``top_k``
    closest chunks are fetched from the Pinecone ``index``, and their text is
    passed to the module-level ``llm`` client as context.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve as context (defaults to 2).

    Returns:
        The ``text`` of the model response, or "Not found in document" when
        the index returns no matches.
    """
    # 1) Embed user query
    query_vector = model.encode(query).tolist()

    # 2) Similarity search in Pinecone
    results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

    matches = results["matches"]
    if not matches:
        # Nothing retrieved: skip the LLM call and return the same fallback the prompt asks for.
        return "Not found in document"

    # 3) Collect matched text (each chunk followed by a blank line)
    context = "".join(match["metadata"]["text"] + "\n\n" for match in matches)

    # 4) Ask LLM
    prompt = f"""
    Answer the question using the context below.
    If answer not found in context, say "Not found in document".

    Context:
    {context}

    Question:
    {query}
    """

    response = llm.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt
    )

    return response.text



In [None]:
# Smoke test: a general question; the returned answer is shown as the cell output.
answer_to_user("What is this PDF about ?")

In [None]:
# Ask a question about the person named in the PDF and print the answer.
answer = answer_to_user("Who is  Laiba?")
print(answer)


In [None]:
# Ask for a specific fact to check that retrieval surfaces the relevant chunk.
answer = answer_to_user("What is the cgpa of laiba")
print(answer)


In [None]:
# Re-print the most recently stored answer.
print(answer)