# **Installing Libraries**

In [None]:
pip install transformers torch PyMuPDF

# **Importing Libraries**

In [None]:
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# **Load The PDF**

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        One string holding each page's ``get_text()`` output, in page order.
    """
    # Open the document as a context manager so it is closed even when
    # extraction raises; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

# **Chunk the PDF text**

In [None]:
def chunk_text(text, max_chunk_length=1000):
    """Group the lines of *text* into chunks of roughly *max_chunk_length*.

    The text is split on newlines and lines are accumulated greedily: a line
    is added to the current chunk while the buffered length (each line plus
    its newline) plus the new line's length stays within the limit;
    otherwise the current chunk is emitted and a new one is started.

    A single line longer than ``max_chunk_length`` is kept whole, so such a
    chunk can exceed the limit. Chunks that are empty after stripping
    whitespace are never emitted, so empty or blank input yields ``[]``.

    Args:
        text: The text to split.
        max_chunk_length: Soft upper bound on the buffered chunk length,
            in characters.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.
    """
    chunks = []
    current_lines = []
    # Characters buffered so far, counting one newline per buffered line.
    current_length = 0

    for para in text.split("\n"):
        if current_length + len(para) > max_chunk_length:
            chunk = "\n".join(current_lines).strip()
            # Skip the empty flush that happens when the very first line is
            # already oversized, or when only blank lines were buffered.
            if chunk:
                chunks.append(chunk)
            current_lines = []
            current_length = 0
        current_lines.append(para)
        current_length += len(para) + 1

    tail = "\n".join(current_lines).strip()
    if tail:
        chunks.append(tail)
    return chunks


# **Load LLM**

In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "gpt2-medium"
# Load the tokenizer and the causal language model for that checkpoint.
# These module-level objects are used by generate_answer().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# If the tokenizer defines no padding token, reuse the end-of-sequence token
# for padding and record the same id on the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

def generate_answer(context, question, max_new_tokens=150):
    """Generate an answer to *question* from *context* with the loaded model.

    Builds a ``Context / Question / Answer:`` prompt, lets the module-level
    ``model`` continue it, and returns only the newly generated text.

    Args:
        context: Passage the answer should be based on.
        question: The question to ask about the passage.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated continuation, decoded without special tokens and
        stripped of surrounding whitespace.
    """
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"

    # Prompt tokens plus generated tokens must fit in the model's context
    # window, so cap the prompt at (window - max_new_tokens). The fallback
    # of 1024 matches the GPT-2 family's position limit.
    context_window = getattr(model.config, "max_position_embeddings", 1024)
    prompt_budget = max(1, context_window - max_new_tokens)

    # No padding: this is a single sequence, and padding it out to the full
    # window would make the model continue from pad tokens instead of from
    # "Answer:" and leave no room for the new tokens.
    # NOTE(review): truncation cuts from the tokenizer's configured side
    # (right by default), so an over-long context could drop the question;
    # chunks are expected to be short enough for that not to happen.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=prompt_budget,
    )
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode only what was generated after the prompt. Splitting the full
    # text on "Answer:" misfires when the context or the generated text
    # contains that marker, or when truncation removed it.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

def ask_question_over_pdf(pdf_path, question, max_chunks=5, max_chunk_length=1000):
    """Answer *question* against the leading chunks of a PDF's text.

    Extracts the PDF text, splits it into chunks, asks the question of each
    of the first ``max_chunks`` non-empty chunks independently, and joins the
    per-chunk answers.

    Args:
        pdf_path: Path of the PDF file to read.
        question: The question to ask of each chunk.
        max_chunks: How many leading chunks to query; ``None`` queries all
            of them. Defaults to 5.
        max_chunk_length: Chunk size forwarded to ``chunk_text``. Defaults
            to 1000.

    Returns:
        The per-chunk answers joined by a ``---`` separator line, or an
        empty string when the PDF yields no text.
    """
    full_text = extract_text_from_pdf(pdf_path)
    # Drop empty chunks so the model is never queried with a blank context.
    chunks = [
        chunk
        for chunk in chunk_text(full_text, max_chunk_length=max_chunk_length)
        if chunk
    ]
    answers = [generate_answer(chunk, question) for chunk in chunks[:max_chunks]]
    return "\n\n---\n\n".join(answers)

# **Running The Pipeline**

In [None]:
# Example run: ask one question about a PDF and print the question followed
# by the joined per-chunk answers returned by ask_question_over_pdf().
pdf_path = "/content/RIL-Integrated-Annual-Report-2023-24 (1).pdf"  # Replace with your actual PDF path
question = "What are the key financial highlights?"
print("Q:", question)
print("A:", ask_question_over_pdf(pdf_path, question))