In [1]:
# installing dependencies
!pip install -qq pymupdf4llm

In [6]:
# importing dependencies
import pymupdf4llm
from transformers import pipeline

In [5]:
# Build the token-classification ("ner") pipeline on the TensorFlow backend.
# One checkpoint id serves both the model weights and the tokenizer; the
# tokenizer is passed as a (name, kwargs) pair so the fast implementation is requested.
# NOTE(review): the captured run log warns that no maximum length is available
# for truncation — confirm whether long inputs need an explicit limit.
NER_CHECKPOINT = "jplu/tf-xlm-r-ner-40-lang"
ner_tokenizer_spec = (NER_CHECKPOINT, {"use_fast": True})
extract_ner_with_llm = pipeline(
    task="ner",
    model=NER_CHECKPOINT,
    tokenizer=ner_tokenizer_spec,
    framework="tf",
)

# Run NER over a piece of text, tolerating missing or blank input
def extract_ner(text):
    """Return the NER pipeline's entities for ``text``.

    Falsy values, non-string values, and strings made only of whitespace
    yield an empty list without invoking the pipeline.
    """
    # Truthiness is checked before the type, so None / "" / 0 short-circuit first.
    if not text or not isinstance(text, str):
        return []
    # A whitespace-only string carries nothing to tag.
    if not text.strip():
        return []
    return extract_ner_with_llm(text)

# Convert the PDF to Markdown, requesting page chunks and exporting images as JPG
PDF_PATH = "/content/drive/MyDrive/inbound8315523191781495843.pdf"
IMAGE_OUTPUT_DIR = "/content/drive/MyDrive/Colab_Notebooks/"

# Keyword options forwarded unchanged to pymupdf4llm.to_markdown
markdown_options = {
    "page_chunks": True,
    "write_images": True,
    "image_path": IMAGE_OUTPUT_DIR,
    "image_format": "jpg",
    "dpi": 200,
}

info = pymupdf4llm.to_markdown(doc=PDF_PATH, **markdown_options)

# Function to print structured information with real-time NER extraction
def print_info(info, page_index=0):
    """Print one page chunk's details, running NER on each textual piece.

    Args:
        info: Sequence of page-chunk dicts, as produced by
            ``pymupdf4llm.to_markdown(..., page_chunks=True)``. The selected
            chunk is read through its 'metadata', 'toc_items', 'tables',
            'images' and 'text' keys.
        page_index: Position in ``info`` of the chunk to report. Defaults to 0
            (the first chunk), which matches the previously hard-coded behaviour.

    Returns:
        None. Everything is written to standard output.

    Raises:
        IndexError: If ``info`` has no chunk at ``page_index``.
        KeyError: If the selected chunk lacks one of the keys listed above.
    """
    def report_ner(label, candidate):
        # Run NER on the candidate value and print the result under the given label.
        print(f"NER for {label}: {extract_ner(candidate)}")

    try:
        page = info[page_index]
    except IndexError:
        # Same exception type as before, but say which index was requested.
        raise IndexError(
            f"no page chunk at index {page_index}; info has {len(info)} chunk(s)"
        ) from None

    # Print metadata details, with NER on every metadata value
    print("Metadata:")
    for key, value in page['metadata'].items():
        print(f"{key}: {value}")
        report_ner("metadata", value)

    # Print table of contents items, with NER on each title
    print("\nTable of Contents:")
    for item in page['toc_items']:
        print(f"Level: {item[0]}, Title: {item[1]}, Page: {item[2]}")
        report_ner("Table of Contents title", item[1])

    # Print tables information
    print("\nTables:")
    for table in page['tables']:
        print(f"Bounding box: {table['bbox']}, Rows: {table['rows']}, Columns: {table['columns']}")
        # Only the stringified bounding box is fed to NER; swap in another
        # table attribute here if one is more relevant.
        report_ner("table bounding box", str(table['bbox']))

    # Print images information
    print("\nImages:")
    for image in page['images']:
        print(f"Image number: {image['number']}")
        print(f"Bounding box: {image['bbox']}")
        print(f"Transform: {image['transform']}")
        print(f"Width: {image['width']}, Height: {image['height']}")
        print(f"Color space: {image['cs-name']}, Resolution: {image['xres']}x{image['yres']}")
        # NER input is a synthetic description built from the image number.
        report_ner("image info", f"Image number {image['number']} details")

    # Print the extracted text followed by its NER result
    print("\nExtracted Text:")
    text = page['text']
    print(text)
    report_ner("extracted text", text)

# Print the report (with NER results) for the first page chunk of the converted PDF
print_info(info)

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some layers from the model checkpoint at jplu/tf-xlm-r-ner-40-lang were not used when initializing TFXLMRobertaForTokenClassification: ['dropout_38']
- This IS expected if you are initializing TFXLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaForTokenClassification were initialized from the model checkpoint at jplu/tf-xlm-r-ner-40-lang.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForTokenClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use 0


Processing /content/drive/MyDrive/inbound8315523191781495843.pdf...

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


]
Metadata:
format: PDF 1.5
NER for metadata: [{'entity': 'ORG', 'score': 0.6977142, 'index': 1, 'word': '▁PDF', 'start': 0, 'end': 3}, {'entity': 'ORG', 'score': 0.65732944, 'index': 2, 'word': '▁1.5', 'start': 3, 'end': 7}]
title: inbound8315523191781495843
NER for metadata: [{'entity': 'ORG', 'score': 0.41432393, 'index': 2, 'word': 'bound', 'start': 2, 'end': 7}]
author: 
NER for metadata: []
subject: 
NER for metadata: []
keywords: 
NER for metadata: []
creator: Microsoft® Word 2016
NER for metadata: [{'entity': 'ORG', 'score': 0.74120563, 'index': 1, 'word': '▁Microsoft', 'start': 0, 'end': 9}, {'entity': 'ORG', 'score': 0.7079035, 'index': 2, 'word': '®', 'start': 9, 'end': 10}, {'entity': 'ORG', 'score': 0.78591436, 'index': 3, 'word': '▁Word', 'start': 10, 'end': 15}, {'entity': 'ORG', 'score': 0.7279076, 'index': 4, 'word': '▁2016', 'start': 15, 'end': 20}]
producer: www.ilovepdf.com
NER for metadata: [{'entity': 'ORG', 'score': 0.63504404, 'index': 3, 'word': 'i', 'st