# 1. Install & Import Dependencies


In [19]:
!pip install -q openai PyPDF2 python-docx

# import dependencies

In [20]:
import ast
import json
import os
import re

import docx
import openai
import PyPDF2
from google.colab import files

# 2. Document Upload

In [21]:
# Document upload
# files.upload() opens Colab's file picker and returns a mapping keyed by the
# uploaded file names.
uploaded = files.upload()
if not uploaded:
    # Presumably the mapping is empty when the picker is cancelled; fail with a
    # readable message instead of the bare StopIteration next() would raise.
    raise RuntimeError(
        "No file was uploaded; re-run this cell and choose a PDF, DOCX, or TXT file."
    )

# Only the first uploaded file is processed by the rest of the notebook.
filename = next(iter(uploaded))
file_ext = os.path.splitext(filename)[1].lower()

Saving sample_document.pdf to sample_document.pdf


# 3. Extract Text from PDF, DOCX, or TXT

In [44]:
def extract_text(filename):
    """Extract the text of a PDF, DOCX, or TXT file as a list of page entries.

    Args:
        filename: Path to the document. The format is chosen from the file
            extension, compared case-insensitively.

    Returns:
        A list of ``{"page_num": int, "text": str}`` dicts. PDFs produce one
        entry per page (numbered from 1); DOCX and TXT files produce a single
        entry with ``page_num`` 1 holding the whole document.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    ext = os.path.splitext(filename)[1].lower()

    if ext == ".pdf":
        # Keep the file open only while pages are being read, and always close
        # it afterwards (the reader pulls page data from the stream on demand).
        with open(filename, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            return [
                # A page with no extractable text is stored as an empty string.
                {"page_num": page_num, "text": page.extract_text() or ""}
                for page_num, page in enumerate(pdf_reader.pages, start=1)
            ]

    if ext == ".docx":
        # python-docx exposes paragraphs, not pages, so the whole document is
        # joined into one string and reported as a single "page".
        doc = docx.Document(filename)
        full_text = "\n".join(para.text for para in doc.paragraphs)
        return [{"page_num": 1, "text": full_text}]

    if ext == ".txt":
        # Plain text is read in one go and treated as a single "page".
        with open(filename, "r", encoding="utf-8") as f:
            return [{"page_num": 1, "text": f.read()}]

    raise ValueError("Unsupported file type: " + ext)

# Run the extraction on the uploaded file; `pages` is the input to the LLM parsing step.
pages = extract_text(filename)

# 4. LLM-based Parsing

In [45]:
# Read the Groq API key from the GROQ_API_KEY environment variable so the
# secret never has to be saved in the notebook. The blank placeholder is kept
# only as a fallback when the variable is not set.
groq_api_key = os.environ.get("GROQ_API_KEY", " ")

In [46]:
# Create an OpenAI client instance for interacting with the Groq API.
# The OpenAI SDK is reused as-is: only the base URL (Groq's endpoint) and the
# API key (groq_api_key) differ from a default OpenAI setup.
client = openai.OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=groq_api_key,
)

In [47]:
def _placeholder_section(page, section_type, raw_text):
    """Build the stand-in entry recorded for a page that yielded no parsed sections."""
    return {
        "subject_title": None,
        "section_type": section_type,
        "starting_page_no": page['page_num'],
        "ending_page_no": page['page_num'],
        "entities": [],
        "subsections": [],
        "raw_text": raw_text,
    }


def _extract_section_list(content):
    """Decode the list embedded in an LLM reply; return [] when none can be decoded.

    The span from the first '[' to the last ']' is isolated first (the model
    often wraps its answer in prose), then decoded as strict JSON and, failing
    that, as a Python literal (models sometimes answer with single quotes).
    """
    if not content:
        return []

    match = re.search(r'(\[.*\])', content, re.DOTALL)
    candidate = match.group(1) if match else content

    for decode in (json.loads, ast.literal_eval):
        try:
            parsed = decode(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed for this decoder; try the next one.
            continue
        if isinstance(parsed, list):
            return parsed
    return []


def parse_sections_with_llm_chunked_all_pages(pages, llm_client=None,
                                              model="llama3-70b-8192",
                                              temperature=0.2):
    """
    Processes each page individually with the LLM to avoid token limit errors,
    and ensures every page is represented in the output, even if no section is detected.

    Args:
        pages: Iterable of dicts with 'page_num' and 'text' keys.
        llm_client: Object exposing ``chat.completions.create(...)``. Defaults
            to the module-level ``client``.
        model: Model name passed to the chat completion request.
        temperature: Sampling temperature passed to the request.

    Returns:
        A flat list of section dicts in page order. Pages with no text get a
        "No Content" placeholder; pages whose reply could not be decoded into
        at least one section dict get a "No Section Detected" placeholder that
        carries the page's raw text.
    """
    all_sections = []

    for page in pages:
        page_text = page['text']

        # Blank pages never reach the LLM; they are recorded as placeholders.
        if not page_text or not page_text.strip():
            all_sections.append(_placeholder_section(page, "No Content", ""))
            continue

        # Prepare a prompt for just this page
        prompt = f"""
You are a document parser. Given the following page, extract its main sections and for each section, provide:
- subject_title (section header)
- section_type (guess if not explicit)
- starting_page_no
- ending_page_no
- entities (list of key named entities, e.g. company names, dates, financial figures, etc.)
- subsections (leave empty for now)

Format your output as a JSON list, as in this example:
[
  {{
    "subject_title": "<section_header>",
    "section_type": "Document Title",
    "starting_page_no": 1,
    "ending_page_no": 1,
    "entities": [{{"company name": "Amazon", "publication year": "2024"}}],
    "subsections": []
  }},
  ...
]

Page {page['page_num']}:
{page['text']}
"""

        # Resolve the default client lazily so that inputs needing no LLM call
        # (empty or all-blank pages) never touch it.
        if llm_client is None:
            llm_client = client

        response = llm_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )

        # Keep only dict entries; anything else in the decoded list is noise.
        sections = [
            section
            for section in _extract_section_list(response.choices[0].message.content)
            if isinstance(section, dict)
        ]

        if not sections:
            all_sections.append(
                _placeholder_section(page, "No Section Detected", page_text)
            )
            continue

        for section in sections:
            # The model only saw this page, so fill in page numbers it omitted.
            section.setdefault("starting_page_no", page['page_num'])
            section.setdefault("ending_page_no", page['page_num'])
        all_sections.extend(sections)

    return all_sections  # Return the aggregated list of all sections/pages

In [48]:
# Parse every extracted page with the LLM; the result is a flat list of section dicts.
structured_json = parse_sections_with_llm_chunked_all_pages(pages)

In [49]:
# Quick look at the raw Python repr of the parsed sections.
print(structured_json)

[{'subject_title': 'LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS', 'section_type': 'Document Title', 'starting_page_no': 1, 'ending_page_no': 1, 'entities': [{'company name': 'Microsoft Corporation'}, {'authors': ['Edward Hu', 'Yelong Shen', 'Phillip Wallis', 'Zeyuan Allen-Zhu', 'Yuanzhi Li', 'Shean Wang', 'Lu Wang', 'Weizhu Chen']}], 'subsections': []}, {'subject_title': 'ABSTRACT', 'section_type': 'Abstract', 'starting_page_no': 1, 'ending_page_no': 1, 'entities': [{'model name': 'GPT-3'}, {'parameter count': '175B'}, {'company name': 'Microsoft'}], 'subsections': []}, {'subject_title': 'INTRODUCTION', 'section_type': 'Section', 'starting_page_no': 1, 'ending_page_no': 1, 'entities': [{'model names': ['RoBERTa', 'DeBERTa', 'GPT-2', 'GPT-3']}, {'authors': ['Radford et al.', 'Liu et al.', 'Brown et al.']}, {'publication years': ['b', '2019', '2020']}], 'subsections': []}, {'subject_title': 'Introduction', 'section_type': 'Abstract', 'starting_page_no': 2, 'ending_page_no': 2, 'en

# Data in JSON

In [50]:
import json

# Render the parsed sections as human-readable JSON text:
# 4-space indentation, with non-ASCII characters left unescaped.
json_str = json.dumps(structured_json, indent=4, ensure_ascii=False)
print(json_str)

[
    {
        "subject_title": "LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS",
        "section_type": "Document Title",
        "starting_page_no": 1,
        "ending_page_no": 1,
        "entities": [
            {
                "company name": "Microsoft Corporation"
            },
            {
                "authors": [
                    "Edward Hu",
                    "Yelong Shen",
                    "Phillip Wallis",
                    "Zeyuan Allen-Zhu",
                    "Yuanzhi Li",
                    "Shean Wang",
                    "Lu Wang",
                    "Weizhu Chen"
                ]
            }
        ],
        "subsections": []
    },
    {
        "subject_title": "ABSTRACT",
        "section_type": "Abstract",
        "starting_page_no": 1,
        "ending_page_no": 1,
        "entities": [
            {
                "model name": "GPT-3"
            },
            {
                "parameter count": "175B"
            },
       

# 5. Save JSON Output

In [51]:
# 5. Save JSON Output
# Serialise the parsed sections to a JSON file in the working directory
# (2-space indent; non-ASCII characters are written as \u escapes by default).
output_filename = "parsed_document.json"
with open(output_filename, "w") as f:
    json.dump(structured_json, f, indent=2)

# Ask Colab to send the saved file to the browser as a download.
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>