In [2]:
import pdfplumber
from pinecone import Pinecone,ServerlessSpec
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
import os
from dotenv import load_dotenv
load_dotenv()


True

In [3]:
# --- Credentials and Pinecone deployment settings (no defaults: None when unset) ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD")
PINECONE_REGION = os.environ.get("PINECONE_REGION")
PINECONE_NAMESPACE = os.environ.get("PINECONE_NAMESPACE", "pdf_namespace")

# --- Model selection ---
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-small")
LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4o-mini")

# --- Pipeline tuning knobs (environment values are strings, hence the int() cast) ---
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", 500))  # characters per chunk
TOP_K = int(os.environ.get("TOP_K", 3))  # matches requested per query

In [4]:
# Build the embedding client used for both document chunks and queries.
# The key is read with os.environ[...] so a missing variable raises inside
# the try and is reported by the handler below.
try:
    embedder = OpenAIEmbeddings(
        model=EMBEDDING_MODEL,
        api_key=os.environ["OPENAI_API_KEY"],
    )
except Exception as err:
    print(f"Error initializing Openai embdeddings :{err}")
    exit(1)

In [5]:
# Create the Pinecone client; later cells use it to create/open the index.
try:
    pc = Pinecone(
        api_key=PINECONE_API_KEY,
    )
except Exception as err:
    print(f"Error initaliing Pincone: :{err}")
    exit(1)

In [6]:
# Ensure the Pinecone index exists, then open a handle to it.
try:
    # Create the index only on first use; later runs reuse the existing one.
    if PINECONE_INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=PINECONE_INDEX_NAME,
            # NOTE(review): 1536 is assumed to match the vectors produced by the
            # default EMBEDDING_MODEL; another model may need a different
            # dimension - confirm before changing EMBEDDING_MODEL.
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(
                cloud=PINECONE_CLOUD,
                region=PINECONE_REGION
            )
        )
    # Bind the handle unconditionally. It used to live inside the `if`, so
    # `index` was never defined when the index already existed and every
    # later cell that uses it failed with NameError.
    index=pc.Index(PINECONE_INDEX_NAME)
except Exception as E:
    print(f"Error in connecting to Pinecone INDEX {E}")
    exit(1)

In [10]:
# Sanity check: display the index statistics (dimension, metric and the
# vector count of each namespace) to confirm what is currently stored.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'pdf_namespace': {'vector_count': 41}},
 'total_vector_count': 41,
 'vector_type': 'dense'}

In [7]:
# Build the chat model that turns retrieved context into the final answer.
# As with the embedder, the key is read via os.environ[...] so that a missing
# variable is caught and reported by the handler below.
try:
    llm = ChatOpenAI(model=LLM_MODEL, api_key=os.environ["OPENAI_API_KEY"])
except Exception as err:
    print(f"Error in Initialiing Openai llm :{err}")
    exit(1)

In [11]:
# Prompt with two placeholders: {context} (retrieved chunks) and {question}.
# The literal is split per section for readability; adjacent string literals
# are concatenated, so the template text is unchanged.
prompt_template = ChatPromptTemplate.from_template(
    "Based on the following context,answer the question :\n\n"
    "Context :{context} \n\n"
    "Question :{question}\n\n"
    "Answer: "
)

In [12]:
def extract_text(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The text of all pages that yielded text, joined with a newline.
        Returns "" when no page yields text or when any error occurs (the
        error is printed, not raised).
    """
    try:
        page_texts=[]
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text=page.extract_text()
                # Skip pages without text so a PDF with no extractable text
                # still yields "" and callers can detect it.
                if page_text:
                    page_texts.append(page_text)
        # Join with a newline: plain concatenation fused the last word of one
        # page with the first word of the next.
        return "\n".join(page_texts)
    except Exception as e:
        print(f"Error extracting text :{e}")
        return ""

In [13]:
def chunk_text(text,chunk_size=CHUNK_SIZE):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` characters.

    The slices do not overlap and the last one may be shorter. An empty
    ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces


In [14]:
def create_embeddings(chunks,embedder):
    """Embed every chunk with ``embedder.embed_documents``.

    Args:
        chunks: The text chunks to embed.
        embedder: Object exposing ``embed_documents(chunks)``.

    Returns:
        Whatever ``embed_documents`` returns, or ``[]`` if it raises (the
        error is printed, not re-raised).
    """
    try:
        vectors = embedder.embed_documents(chunks)
    except Exception as err:
        print(f"Error creating embeddings {err}")
        return []
    return vectors

In [15]:
def store_pinecone(chunks,embeddings,index,batch_size=100):
    """Upsert chunk embeddings into the Pinecone index in batches.

    Each vector gets the id ``chunk_<position>`` and carries its source text
    in the ``text`` metadata field. Because ids depend only on position,
    storing another document under the same namespace reuses the same ids.

    Args:
        chunks: Text chunks, in the same order as ``embeddings``.
        embeddings: One embedding vector per chunk.
        index: Pinecone index handle exposing ``upsert``.
        batch_size: Maximum number of vectors sent per upsert request.

    Returns:
        None. Success and failure are reported by printing; errors are not
        re-raised.
    """
    vectors=[
        {"id":f"chunk_{i}","values":embedding,"metadata":{"text":chunk}} for i,(chunk,embedding) in enumerate(zip(chunks,embeddings))
    ]
    try:
        # Send the vectors in slices rather than one request: a single upsert
        # holding every chunk of a large PDF can exceed Pinecone's per-request
        # limits and fail as a whole.
        for start in range(0,len(vectors),batch_size):
            index.upsert(vectors=vectors[start:start+batch_size],namespace=PINECONE_NAMESPACE)
        print(f"Upserted {len(vectors)} vectors")
    except Exception as e:
        print(f"Error in upserting to Pinecone {e}")

In [16]:
def retrieve_chunks(query,index,embedder,top_k=TOP_K):
    """Return the stored text of the ``top_k`` chunks closest to ``query``.

    The query is embedded with ``embedder.embed_query`` and the index is
    searched inside ``PINECONE_NAMESPACE`` with metadata included.

    Returns:
        The ``metadata["text"]`` value of every match, in the order the index
        returned them, or ``[]`` if anything raises (the error is printed).
    """
    try:
        query_vector = embedder.embed_query(query)
        response = index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True,
            namespace=PINECONE_NAMESPACE,
        )
        texts = []
        for match in response["matches"]:
            texts.append(match["metadata"]["text"])
        return texts
    except Exception as err:
        print(f"Error querying Pinecone {err}")
        return []

In [17]:
def gen_answer(query,rel_chunks,llm,prompt_template):
    """Ask the LLM to answer ``query`` using the retrieved chunks as context.

    Args:
        query: The user's question.
        rel_chunks: Retrieved text chunks; joined with newlines to form the
            ``context`` value of the prompt.
        llm: Object exposing ``invoke(messages)`` whose result has ``.content``.
        prompt_template: Object exposing ``format_messages(context=, question=)``.

    Returns:
        The ``content`` of the model response, or the string
        "Failed to gen answer" if anything raises (the error is printed).
    """
    try:
        messages = prompt_template.format_messages(
            context="\n".join(rel_chunks),
            question=query,
        )
        return llm.invoke(messages).content
    except Exception as err:
        print(f"Error generating answer : {err}")
        return "Failed to gen answer"

In [18]:
def process_pdf(pdf_path,query):
    """Run the full pipeline for one PDF and one question.

    Steps: extract text, chunk it, embed the chunks, store them in the index,
    retrieve the chunks relevant to ``query`` and generate an answer from
    them. Uses the module-level ``embedder``, ``index``, ``llm`` and
    ``prompt_template``.

    Returns:
        The generated answer, or a short status message when a stage yields
        nothing (no text, no embeddings, no relevant chunks).
    """
    document_text = extract_text(pdf_path)
    if not document_text:
        return "No text extracted from PDF"

    pieces = chunk_text(document_text)
    vectors = create_embeddings(pieces, embedder)
    if not vectors:
        return "Failed to create embdeddings"

    # Storage errors are reported inside store_pinecone; retrieval runs regardless.
    store_pinecone(pieces, vectors, index)

    matches = retrieve_chunks(query, index, embedder)
    if not matches:
        return "No revelant information"

    return gen_answer(query, matches, llm, prompt_template)


In [19]:
import tkinter as tk
from tkinter import filedialog

In [20]:
# Interactive entry point: pick a PDF in a file dialog, ask one question about
# it on stdin, and print the answer produced by process_pdf.
if __name__=="__main__":
    # A Tk root is created to host the file dialog and hidden right away so
    # no empty window is shown.
    root=tk.Tk()
    root.withdraw()
    try:
        pdf_path=filedialog.askopenfilename(
            title="Select a PDF file",
            filetypes=[("PDF files","*.pdf"),("All files","*.*")]
        )
    finally:
        # Destroy the hidden root once the dialog closes; otherwise each run
        # of this cell leaves another Tk instance alive in the process.
        root.destroy()
    if pdf_path:
        query=input("Ask a question about the PDF")
        try:
            answer=process_pdf(pdf_path,query)
            print("Answer :",answer)
        except Exception as e:
            print(f"Error processing PDF :{e}")
    else:
        # askopenfilename returned a falsy value (dialog cancelled).
        print("No PDF file selected")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Upserted 41 vectors
Answer : Macroeconomics is a branch of economics that studies the behavior and performance of an economy as a whole. It focuses on aggregate indicators and phenomena such as national income, overall unemployment rates, inflation, economic growth, and the interactions among those factors within an entire economy. Unlike microeconomics, which examines individual consumers and businesses, macroeconomics looks at the broad trends and patterns that impact large groups of people and economic systems. Key components of macroeconomics include fiscal policy, monetary policy, and economic indicators that help assess the health and trajectory of an economy.
