In [3]:
print("Hello")

Hello


In [4]:
%pwd

'e:\\Unibot\\research'

In [5]:
import os
os.chdir("../")

In [6]:
%pwd

'e:\\Unibot'

In [7]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Extract text from pdf file
def load_pdf_files(data):
    loader = DirectoryLoader(
        data, # path
        glob="*.pdf", # loads all the file with .pdf extension
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [9]:
extracted_data = load_pdf_files("data")
print(extracted_data)

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-09-08T06:40:24+00:00', 'author': 'Mohammad Haris', 'moddate': '2025-09-08T06:40:24+00:00', 'source': 'data\\Uniworld Studios.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content="Uniworld Studios: Company Profile and Services \n1. Company Overview \nUniworld Studios is a marketing agency based in Delhi NCR, described as the region's fastest-growing \nmarketing giant. With over a decade of experience, the agency focuses on nurturing brands, building \nstrategies, and evolving businesses into brand experiences. \nKey Metrics \n\uf0b7 Experience: 10+ Years \n\uf0b7 Clients: 600+ Satisfied Clients \n\uf0b7 Global Reach: 20+ Countries Served \n\uf0b7 Websites Developed: 500+ \n\uf0b7 Designs Delivered: 100,000+ \n\uf0b7 Leads Generated: 4.5 Million+ \n\uf0b7 Sales Pipeline Created: 2 Billion+ \n\uf0b7 Sales Generated: 350 Million+ \nFounder & CEO: Saransh Gupta \nSaran

In [10]:
len(extracted_data)

8

In [11]:
# Just taking source and page content
from typing import List
from langchain.schema import Document

def filter_to_docs_minimal(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content = doc.page_content,
                metadata = {"source": src}
            )
        )
    return minimal_docs

In [12]:
minimal_docs = filter_to_docs_minimal(extracted_data)
print(minimal_docs)

[Document(metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content="Uniworld Studios: Company Profile and Services \n1. Company Overview \nUniworld Studios is a marketing agency based in Delhi NCR, described as the region's fastest-growing \nmarketing giant. With over a decade of experience, the agency focuses on nurturing brands, building \nstrategies, and evolving businesses into brand experiences. \nKey Metrics \n\uf0b7 Experience: 10+ Years \n\uf0b7 Clients: 600+ Satisfied Clients \n\uf0b7 Global Reach: 20+ Countries Served \n\uf0b7 Websites Developed: 500+ \n\uf0b7 Designs Delivered: 100,000+ \n\uf0b7 Leads Generated: 4.5 Million+ \n\uf0b7 Sales Pipeline Created: 2 Billion+ \n\uf0b7 Sales Generated: 350 Million+ \nFounder & CEO: Saransh Gupta \nSaransh Gupta is a dynamic business and marketing expert who delivers key marketing strategies and \nmanages business operations at Uniworld Studios. A serial entrepreneur, he has vested interests in \nfamily-managed Iron & Steel, Re

In [51]:
# Split the dcocuments into chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 150,
        chunk_overlap = 50
    )
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [52]:
text_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(text_chunk)}")

Number of chunks: 93


In [53]:
text_chunk

[Document(metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content='Uniworld Studios: Company Profile and Services \n1. Company Overview'),
 Document(metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content="1. Company Overview \nUniworld Studios is a marketing agency based in Delhi NCR, described as the region's fastest-growing"),
 Document(metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content='marketing giant. With over a decade of experience, the agency focuses on nurturing brands, building'),
 Document(metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content='strategies, and evolving businesses into brand experiences. \nKey Metrics \n\uf0b7 Experience: 10+ Years \n\uf0b7 Clients: 600+ Satisfied Clients'),
 Document(metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content='\uf0b7 Clients: 600+ Satisfied Clients \n\uf0b7 Global Reach: 20+ Countries Served \n\uf0b7 Websites Developed: 500+ \n\uf0b7 Designs Delivered: 100,000+'),
 Document(metadata

In [54]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name = model_name
    )
    return embeddings

embedding = download_embeddings()

In [55]:
embedding

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [56]:
vector = embedding.embed_query("This is a test sentence to check embedding")
vector

[0.03809228911995888,
 -0.025415385141968727,
 0.051142480224370956,
 0.002041621832177043,
 0.06559532135725021,
 0.055631693452596664,
 0.04082754626870155,
 -0.059887416660785675,
 0.00482234125956893,
 -0.026000911369919777,
 0.1026691198348999,
 -0.050836432725191116,
 0.03576919436454773,
 0.042771607637405396,
 -0.07258523255586624,
 0.016767244786024094,
 0.09894763678312302,
 -0.020347049459815025,
 -0.05005906894803047,
 -0.011608455330133438,
 -0.02494693547487259,
 -0.003961370792239904,
 0.06463190913200378,
 -0.0752919390797615,
 0.04063158854842186,
 0.00020833671442233026,
 -0.05909157544374466,
 0.047096945345401764,
 0.07448336482048035,
 -0.044449567794799805,
 0.04908823221921921,
 -0.022806962952017784,
 -0.018487900495529175,
 0.060960110276937485,
 0.08270759135484695,
 0.04944857954978943,
 0.06621485203504562,
 -0.019914694130420685,
 -0.007532747462391853,
 0.015988439321517944,
 -0.0008673770353198051,
 -0.0686541348695755,
 0.016207829117774963,
 0.029175611

In [57]:
print("Vector length:", len(vector))

Vector length: 384


In [58]:
from dotenv import load_dotenv
load_dotenv()
import os

In [59]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [60]:
from pinecone import Pinecone

pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)
pc

<pinecone.pinecone.Pinecone at 0x1bd75219310>

In [61]:
# Creating Pinecone Index
from pinecone import ServerlessSpec

index_name = "unibot"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,
        metric = "cosine",
        spec = ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)

In [62]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunk,
    embedding=embedding,
    index_name=index_name
)

In [63]:
# Load Existing Index form Pinecone and use that index
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

# Add more data into the existing index


In [64]:
sample = Document(
    page_content="This is a sample data to be upserted in the existing index",
    metadata={"source": "My gut feeling"}
)

In [65]:
docsearch.add_documents(documents=[sample])

['281c9e2b-f637-4b35-9f48-8a24c0d7487e']

In [66]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [67]:
retrieved_docs = retriever.invoke("What services does uniworld provide?")
retrieved_docs

[Document(id='0447a35b-830a-4eb1-ba38-ae29109747fe', metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content='Uniworld Studios: Company Profile and Services \n1. Company Overview'),
 Document(id='0e2086b1-5784-4b43-ba1b-2708e071dda4', metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content='Uniworld Studios: Company Profile and Services \n1. Company Overview'),
 Document(id='8ab799ca-bf83-4eca-a3de-96c5269d7a7b', metadata={'source': 'data\\Uniworld Studios.pdf'}, page_content='Uniworld Studios: Company Profile and Services \n1. Company Overview')]

In [81]:
from langchain_groq import ChatGroq

chatModel = ChatGroq(model="openai/gpt-oss-120b")

In [82]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate



In [84]:
system_prompt = (
"""   
You are a helpful assistant for Uniworld Studios.
Your job is to answer the user's question **only** using the information provided in the context below.

{context}
"""
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [85]:
QnA_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, QnA_chain)

In [86]:
response = rag_chain.invoke({"input": "Give me company overview"})
print(response["answer"])

I’m sorry, but the provided context does not contain any details about Uniworld Studios’ company overview.


In [87]:
response = rag_chain.invoke({"input": "what is the sample sentence"})
print(response["answer"])

The sample sentence is:

**“This is a sample data to be upserted in the existing index.”**
