# **Installing Libraries**

In [1]:
pip install transformers torch PyMuPDF



# **Importing Libraries**

In [3]:
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# **Load The PDF**

In [4]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# **Chunk the PDF text**

In [5]:
def chunk_text(text, max_chunk_length=1000):
    paragraphs = text.split("\n")
    chunks = []
    current_chunk = ""
    for para in paragraphs:
        if len(current_chunk) + len(para) <= max_chunk_length:
            current_chunk += para + "\n"
        else:
            chunks.append(current_chunk.strip())
            current_chunk = para + "\n"
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


# **Load LLM**

In [7]:
model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

def generate_answer(context, question, max_new_tokens=150):
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024, padding="max_length")
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id
    )
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full_output.split("Answer:")[-1].strip()

def ask_question_over_pdf(pdf_path, question):
    full_text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(full_text, max_chunk_length=1000)
    answers = []
    for i, chunk in enumerate(chunks[:5]):
        answer = generate_answer(chunk, question)
        answers.append(answer)
    return "\n\n---\n\n".join(answers)

# **Running The Pipeline**

In [9]:
pdf_path = "/content/RIL-Integrated-Annual-Report-2023-24 (1).pdf"  # Replace with your actual PDF path
question = "What are the key financial highlights?"
print("Q:", question)
print("A:", ask_question_over_pdf(pdf_path, question))

Q: What are the key financial highlights?


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


A: The Reliance 
Company has been awarded the
International
Investment

(II) Award for its commitment to

investment in the

development of

the

Indian

economy. The

Company has been awarded the

International

Investment

(II) Award for its commitment to

investment in the

development of

the

Indian

economy. The Company has been awarded the

International

Investment

(II) Award for its commitment to

investment in the

development of

the

Indian

economy. The Company has been awarded the

International

Investment

---

The financial highlights are as follows:

• Revenue increased by ₹1,592 Crore

• Net income increased by ₹1,592 Crore

• Net loss increased by ₹1,592 Crore

• Net profit increased by ₹1,592 Crore

• Net loss per share increased by ₹1,592 Crore

• Net profit per share increased by ₹1,592 Crore

• Net profit per share increased by ₹1,592 Crore

• Net profit per share increased by ₹1,592 Crore

• Net profit per share increased by ₹1,592 Cro

---

The key financial 