In [None]:
!pip install openai faiss-cpu PyMuPDF python-dotenv

In [7]:
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from google.colab import userdata

# Load variables from a local .env file, if one exists, into the process environment.
load_dotenv()

# Read the OpenAI API key from Colab's secrets.
# Go to the "🔑" icon in the left sidebar, click on "Secrets", and add a new secret
# named "AIproject" (the name read below) with your API key as the value.
# If that secret comes back empty, fall back to an OPENAI_API_KEY environment
# variable (for example one loaded from .env above).
openai_api_key = userdata.get("AIproject") or os.getenv("OPENAI_API_KEY")

# Fail here with a clear message instead of on the first API call further down.
if not openai_api_key:
    raise ValueError(
        'No OpenAI API key found: set the Colab secret "AIproject" '
        "or the OPENAI_API_KEY environment variable."
    )

client = OpenAI(api_key=openai_api_key)

In [8]:
def extract_text_from_pdf(file_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        A single string: the "text" extraction of each page, concatenated in
        page order.
    """
    # The context manager closes the document even if extraction of a page fails;
    # the original left the handle open.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass instead of repeated `+=` copies.
        return "".join(page.get_text("text") for page in doc)

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping, whitespace-trimmed chunks.

    Windows of ``chunk_size`` characters are taken every
    ``chunk_size - overlap`` characters, so consecutive windows share
    ``overlap`` characters.

    Args:
        text: The string to split.
        chunk_size: Window length in characters; must be positive.
        overlap: Characters shared by consecutive windows; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of stripped chunk strings. Windows that contain only
        whitespace are omitted, so the list never holds empty strings.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # With overlap >= chunk_size the window would never advance and the
    # loop below would not terminate.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size].strip()
        # Skip blank windows: an empty string is not a useful chunk and is
        # not a valid input for the embedding request made downstream.
        if chunk:
            chunks.append(chunk)
    return chunks

def get_embeddings(texts, model="text-embedding-3-small", batch_size=50):
    """Generate embeddings for a list of texts.

    The texts are sent to ``client.embeddings.create`` in consecutive batches
    and the returned vectors are collected in the order the responses list them.

    Args:
        texts: List of strings to embed.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent per request; must be >= 1.

    Returns:
        A float32 NumPy array built from the collected embedding vectors.
        For an empty ``texts`` no request is made and the result is an empty
        1-D array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(model=model, input=batch)
        embeddings.extend(item.embedding for item in response.data)
    return np.asarray(embeddings, dtype="float32")

In [9]:
from google.colab import files

# Prompt for a file upload; the keys of the result are the uploaded file names.
uploaded = files.upload()
# A cancelled/empty upload would otherwise surface as an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a file.")
file_name = next(iter(uploaded))
print(f"Uploaded: {file_name}")

# Extract text
if file_name.lower().endswith(".pdf"):
    text = extract_text_from_pdf(file_name)
else:
    # Anything that is not a PDF is read as UTF-8 text; `with` closes the handle.
    with open(file_name, "r", encoding="utf-8") as text_file:
        text = text_file.read()

print("Total text length:", len(text))

Saving The-Hindu-Review-August.pdf to The-Hindu-Review-August (1).pdf
Uploaded: The-Hindu-Review-August (1).pdf
Total text length: 195033


In [10]:
chunks = chunk_text(text)
print(f"Total chunks created: {len(chunks)}")

# Without any chunk there is nothing to embed and no embedding dimension to
# read below, so stop with a clear message instead of an IndexError.
if not chunks:
    raise ValueError("No text chunks were produced; the document appears to be empty.")

embeddings = get_embeddings(chunks)
dimension = embeddings.shape[1]

# Build FAISS index
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Retrieval maps a vector position in the index back to chunks[position],
# so both must have the same length.
assert index.ntotal == len(chunks), "index size does not match number of chunks"

print("FAISS index built successfully!")

Total chunks created: 244
FAISS index built successfully!


In [11]:
def answer_question(question, top_k=5, model="gpt-4o-mini",
                    embedding_model="text-embedding-3-small"):
    """Retrieve the most relevant chunks and ask the chat model to answer.

    Uses the notebook-level ``client``, ``index`` and ``chunks``.

    Args:
        question: The user's question.
        top_k: Number of nearest chunks requested from the FAISS index.
        model: Chat model used to generate the answer.
        embedding_model: Model used to embed the question.
            NOTE(review): this presumably has to be the same model that
            produced the vectors stored in ``index`` — confirm against the
            indexing cell.

    Returns:
        The text content of the first completion choice.
    """
    # Get embedding for the question
    response = client.embeddings.create(model=embedding_model, input=[question])
    q_embedding = np.array([response.data[0].embedding]).astype("float32")

    # Search in FAISS index (distances are not needed here)
    _, indices = index.search(q_embedding, top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors are available;
    # chunks[-1] would silently pull in the last chunk, so drop those labels.
    retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]
    context = "\n\n".join(retrieved_chunks)

    # Ask GPT for answer
    prompt = f"Answer the following question using the context below.\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant for document Q&A."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
        max_tokens=400,
    )
    return completion.choices[0].message.content

# Smoke-test the retrieval + answer pipeline with one broad question.
question = "Summarize the main idea of the document."
print(answer_question(question=question))

The document discusses a new tax bill aimed at simplifying tax compliance and enhancing transparency for taxpayers and investors. It outlines key features such as the use of simplified language, organized provisions, the introduction of a single "Tax Year," and a digital-first approach to tax administration. The bill seeks to reduce litigation by removing ambiguous provisions and aims to support the Digital India initiative through automation and clearer rules, ultimately making tax filing easier and more efficient.


In [12]:
# Interactive Q&A: keep answering until the user types 'exit'.
while True:
    try:
        # Trim surrounding whitespace so " exit " is still recognised.
        q = input("\nAsk a question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or closing stdin ends the session without a traceback.
        break
    if q.lower() == "exit":
        break
    if not q:
        # Nothing was typed: ask again instead of sending an empty question to the API.
        continue
    print("\nAnswer:", answer_question(q))


Ask a question (or type 'exit'): Give me about topic related in document

Answer: The document covers a variety of topics, including:

1. **Banking & Finance** - Information related to financial institutions and economic developments.
2. **Economy & Business** - Insights into economic trends and business activities.
3. **MoUs & Agreements** - Details on memorandums of understanding and agreements between countries or organizations.
4. **Appointments & Resignations** - Updates on personnel changes in various organizations.
5. **Awards** - Recognition and awards given in different fields.
6. **Summits, Events & Conferences** - Information about significant gatherings and discussions on various topics.
7. **Defence** - Developments and news related to national and international defense.
8. **Science & Technology** - Innovations and advancements in science and technology, including AI developments like Google's Gemini 2.5 Flash Image.
9. **Sports** - Current affairs and updates in the spo