<a href="https://colab.research.google.com/github/Malavika2609/text_summarization/blob/main/pdf_data_extraction_llama3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
# Using llama3-8b-8192
import pymupdf
import fitz
# Diagnostic: show which `fitz` module/version was actually imported.
# NOTE(review): the recorded output for this cell shows version 0.0.1dev2,
# which looks like the separately installed PyPI `fitz` distribution rather
# than PyMuPDF — confirm the environment. The code below only uses `pymupdf`.
print(fitz.__version__)
import sys
print(sys.modules.get("fitz"))  # Expected to resolve to PyMuPDF's `fitz` module — verify against the output

import groq
import re

from google.colab import userdata
# Fetch the Groq API key stored under the name GROQ_API_KEY via Colab's userdata
key=userdata.get('GROQ_API_KEY')

# Initialize the Groq client shared by the query function below
client = groq.Client(api_key=key)

# Function to extract text with page, section, and paragraph metadata
def extract_text_with_metadata(pdf_path, header_max_len=40):
    """Extract paragraph-level text blocks from a PDF with location metadata.

    Every page is read as PyMuPDF text blocks. A block is treated as a
    section header when its text is all upper-case or shorter than
    ``header_max_len`` characters; a header updates the running section name
    and is not returned as a paragraph. The heuristic looks at the text only
    (font weight is not inspected), so short body lines are classified as
    headers too. The current section carries over from one page to the next.

    Args:
        pdf_path: Path of the PDF file to read.
        header_max_len: Blocks shorter than this many characters are treated
            as section headers. Defaults to 40.

    Returns:
        A list of dicts with the keys ``"page"`` (1-based page number),
        ``"section"`` (text of the most recent header, or
        ``"Unknown Section"`` before any header is seen), ``"paragraph"``
        (1-based index of the block within its page) and ``"text"`` (the
        stripped block text).
    """
    text_data = []
    current_section = "Unknown Section"  # Default section if not found

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            blocks = page.get_text("blocks")  # Extract text blocks

            for block_num, block in enumerate(blocks, start=1):
                text = block[4].strip()  # Index 4 of a block tuple is its text

                # Whitespace-only blocks carry no content; without this guard
                # they pass the length test and blank out the section name.
                if not text:
                    continue

                # Section header heuristic: all caps, or a short line
                if text.isupper() or len(text) < header_max_len:
                    current_section = text  # Update current section
                    continue  # Headers are not emitted as paragraphs

                text_data.append({
                    "page": page_num,
                    "section": current_section,
                    "paragraph": block_num,
                    "text": text,
                })

    return text_data

# Query Llama 3 to find the answer
def get_answer_from_llama3(text_data, question, *, model="llama3-8b-8192"):
    """Ask a Groq-hosted chat model a question about the extracted text.

    All entries of ``text_data`` are concatenated into a single context
    string, each prefixed with its page, section and paragraph number, and
    sent together with ``question`` as one user message through the
    module-level ``client``. The request uses ``temperature=0``.

    NOTE(review): the whole document is placed in one prompt, so a long PDF
    may exceed the selected model's context window — confirm for large inputs.

    Args:
        text_data: Iterable of dicts with the keys ``"page"``, ``"section"``,
            ``"paragraph"`` and ``"text"``, as produced by
            ``extract_text_with_metadata``.
        question: The question to answer from the document.
        model: Model identifier passed to the chat completions call.
            Defaults to ``"llama3-8b-8192"``.

    Returns:
        The message content of the first choice in the response.
    """
    context = "\n\n".join(
        f"(Page {item['page']}, Section: {item['section']}, Para {item['paragraph']}): {item['text']}"
        for item in text_data
    )

    # The prompt text (including its indentation) is part of what the model
    # sees; keep it stable so answers stay comparable between runs.
    prompt = f"""
    Given the following extracted text from a legal document, answer the question:

    Context:
    {context}

    Question: {question}

    Provide the exact answer along with the Page Number, Section, and Paragraph Number in the following format:
    "Answer: <Extracted Answer>"
    "Page: <Page Number>"
    "Section: <Section Name>"
    "Paragraph: <Paragraph Number>"
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return response.choices[0].message.content

# Example usage: run one question against the sample policy PDF
pdf_path = "/content/sample_data/ANTI FRAUD POLICY 05_11_2011.pdf"
question = "What is the definition of fraud?"

# Build the paragraph records first, then pass them to the model as context
text_data = extract_text_with_metadata(pdf_path=pdf_path)
answer = get_answer_from_llama3(text_data=text_data, question=question)
print(answer)


0.0.1dev2
<module 'fitz' from '/usr/local/lib/python3.11/dist-packages/fitz/__init__.py'>
Answer: Fraud is a false representation or concealment of a material fact or any other illegal act committed intentionally to cause wrongful gains to self or others and/or wrongful loss to others.
Page: Page 2
Section: IV. DEFINITION OF FRAUD
Paragraph: 12


In [89]:
# Using llama-3.3-70b-versatile
import pymupdf
import fitz  # Intended to be PyMuPDF's legacy module name; unused below — NOTE(review): confirm it is not the separate PyPI `fitz` package
import groq
import re

from google.colab import userdata
# Fetch the Groq API key stored under the name GROQ_API_KEY via Colab's userdata
key=userdata.get('GROQ_API_KEY')

# Initialize the Groq client shared by the query function below
client = groq.Client(api_key=key)

# Function to extract text with page, section, and paragraph metadata
def extract_text_with_metadata(pdf_path, header_max_len=40):
    """Extract text blocks from a PDF, numbered per section.

    Every page is read as PyMuPDF text blocks. A block is treated as a
    section header when its text is all upper-case or shorter than
    ``header_max_len`` characters; a header updates the running section name,
    resets that section's paragraph counter to zero and is not returned as a
    paragraph. The heuristic looks at the text only (font weight is not
    inspected), so short body lines are classified as headers too. The
    current section carries over from one page to the next.

    Args:
        pdf_path: Path of the PDF file to read.
        header_max_len: Blocks shorter than this many characters are treated
            as section headers. Defaults to 40.

    Returns:
        A list of dicts with the keys ``"page"`` (1-based page number),
        ``"section"`` (text of the most recent header, or
        ``"Unknown Section"`` before any header is seen),
        ``"section_paragraph"`` (1-based position of the block within its
        section) and ``"text"`` (the stripped block text).
    """
    text_data = []
    current_section = "Unknown Section"  # Default section if not found
    # Paragraph counters per section. The default section is seeded so that
    # text appearing before the first header does not raise KeyError.
    section_paragraph_counter = {current_section: 0}

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            blocks = page.get_text("blocks")  # Extract text blocks

            for block in blocks:
                text = block[4].strip()  # Index 4 of a block tuple is its text

                # Whitespace-only blocks carry no content; without this guard
                # they pass the length test and blank out the section name.
                if not text:
                    continue

                # Section header heuristic: all caps, or a short line
                if text.isupper() or len(text) < header_max_len:
                    current_section = text  # Update current section
                    section_paragraph_counter[current_section] = 0  # Reset counter
                    continue  # Headers are not emitted as paragraphs

                # Increment paragraph count within the section
                section_paragraph_counter[current_section] += 1

                text_data.append({
                    "page": page_num,  # 1-based page number
                    "section": current_section,
                    "section_paragraph": section_paragraph_counter[current_section],
                    "text": text,
                })

    return text_data

# Query Llama 3 to find the answer and its metadata
def get_answer_from_llama3(text_data, question, *, model="llama-3.3-70b-versatile"):
    """Ask a Groq-hosted chat model a question about the extracted text.

    All entries of ``text_data`` are concatenated into a single context
    string, each prefixed with its page, section and per-section paragraph
    number, and sent together with ``question`` as one user message through
    the module-level ``client``. The request uses ``temperature=0``.

    NOTE(review): the whole document is placed in one prompt, so a long PDF
    may exceed the selected model's context window — confirm for large inputs.

    Args:
        text_data: Iterable of dicts with the keys ``"page"``, ``"section"``,
            ``"section_paragraph"`` and ``"text"``, as produced by
            ``extract_text_with_metadata``.
        question: The question to answer from the document.
        model: Model identifier passed to the chat completions call, e.g.
            ``"llama3-8b-8192"``. Defaults to ``"llama-3.3-70b-versatile"``.

    Returns:
        The message content of the first choice in the response.
    """
    context = "\n\n".join(
        f"(Page {item['page']}, Section: {item['section']}, Section Paragraph {item['section_paragraph']}): {item['text']}"
        for item in text_data
    )

    # The prompt text (including its indentation) is part of what the model
    # sees; keep it stable so answers stay comparable between runs.
    prompt = f"""
    Given the following extracted text from a legal document, answer the question:

    Context:
    {context}

    Question: {question}

    Provide the exact answer along with the page , Section, and Paragraph Number in the following format:
    "Answer: <Extracted Answer>"
    "Page: <page>"
    "Section: <Section Name>"
    "Paragraph: <Paragraph Number>"
    """

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return response.choices[0].message.content

# Example usage: run one question against the sample policy PDF
pdf_path = "/content/sample_data/ANTI FRAUD POLICY 05_11_2011.pdf"
question = "Objectives of policy?"

# Build the paragraph records first, then pass them to the model as context
text_data = extract_text_with_metadata(pdf_path=pdf_path)
answer = get_answer_from_llama3(text_data=text_data, question=question)
print(answer)


Answer: The policy has been framed to enforce controls so as to provide a system of detection and prevention of fraud, reporting of any fraud or suspected fraud and appropriate dealing of issues relating to fraud. 
The policy aims to ensure that:- 
(i) Management is aware of its responsibility for the detection and prevention of fraud, misappropriations, and other inappropriate conduct. 
(ii) Any fraud that is detected or suspected must be reported immediately to the Nodal Officer designated for the purpose of co-ordination of preliminary investigation. 
(iii) To conduct investigations into fraudulent activities.

Page: 2
Section: III. OBJECTIVES OF POLICY
Paragraph: 1, 2


In [90]:
# Show the last question next to the model's reply from the previous cell
print('Question', question)
print('Answer', answer)

Question Objectives of policy?
Answer Answer: The policy has been framed to enforce controls so as to provide a system of detection and prevention of fraud, reporting of any fraud or suspected fraud and appropriate dealing of issues relating to fraud. 
The policy aims to ensure that:- 
(i) Management is aware of its responsibility for the detection and prevention of fraud, misappropriations, and other inappropriate conduct. 
(ii) Any fraud that is detected or suspected must be reported immediately to the Nodal Officer designated for the purpose of co-ordination of preliminary investigation. 
(iii) To conduct investigations into fraudulent activities.

Page: 2
Section: III. OBJECTIVES OF POLICY
Paragraph: 1, 2


In [19]:
!pip uninstall fitz pymupdf -y
!pip install pymupdf
!pip install fitz  # PyMuPDF
!pip install groq


Found existing installation: fitz 0.0.1.dev2
Uninstalling fitz-0.0.1.dev2:
  Successfully uninstalled fitz-0.0.1.dev2
Found existing installation: PyMuPDF 1.25.3
Uninstalling PyMuPDF-1.25.3:
  Successfully uninstalled PyMuPDF-1.25.3
Collecting pymupdf
  Using cached pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Using cached pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.3
