In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json
import os

annotations_dir = '/content/drive/MyDrive/DOM NLP/Annotations'

# Load one annotation file to inspect its structure
with open(os.path.join(annotations_dir, '1.json'), 'r', encoding='utf-8') as f:
    annotation_example = json.load(f)

print(annotation_example)


{'classes': ['SENDER_FIRSTNAME', 'SENDER_SECONDNAME', 'SENDER_STREETADDRESS', 'SENDER_POSTCODE', 'SENDER_CITY', 'SENDER_VATID', 'SENDER_TAXID', 'SENDER_BANK', 'SENDER_IBAN', 'SENDER_BIC_SWIFT', 'RECIPIENT_COMPANY', 'RECIPIENT_TITLE', 'RECIPIENT_FIRSTNAME', 'RECIPIENT_SECONDNAME', 'RECIPIENT_STREETADDRESS', 'RECIPIENT_POSTCODE', 'RECIPIENT_CITY', 'INVOICE_NUMBER', 'INVOICE_CLIENTNUMBER', 'INVOICE_DATE', 'INVOICE_TOTALCOST', 'INVOICE_CURRENCY', 'INVOICE_DUEDATE', 'INVOICE_PAYMENTMETHOD', 'SENDER_COMPANY', 'SENDER_TITLE'], 'annotations': [['     MARTIN  UHL \nPsychologischer Psychotherapeut \n  Tiefenpsychologisch fundierte Therapie  \n  Systemische Therapie \n \n \nAugsburger Str. 14,    87700 Memmingen \n \nTel.: 08331 / 98 43 420 \n \n   Steuer-Nr. 138/735/01477 \n \n \nMemmingen, 05. 06. 2022 \n \n \nStundenaufstellung  März / April / Mai 2022 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nDatum \nStunden Grund \n05. 03. 22 \n1 \nVorbereitung SIKK und Kooperationsvereinbarung EF

In [3]:
import os
import json

# Paths to directories
annotations_dir = '/content/drive/MyDrive/DOM NLP/Annotations'
txt_dir = '/content/drive/MyDrive/DOM NLP/Extracted_Texts'

# Function to prepare training data
def prepare_training_data():
    training_data = []

    # Loop through annotation files
    for annotation_file in os.listdir(annotations_dir):
        if annotation_file.endswith('.json'):
            file_path = os.path.join(annotations_dir, annotation_file)

            with open(file_path, 'r', encoding='utf-8') as f:
                annotations = json.load(f)

            # Loop through the 'annotations' key
            for text_data in annotations['annotations']:
                text = text_data[0]
                entity_data = text_data[1]['entities']

                entities = []
                for entity in entity_data:
                    start, end, label = entity[0], entity[1], entity[2]
                    entities.append((start, end, label))

                # Append the formatted data as (text, {"entities": [(start, end, label)]})
                training_data.append((text, {"entities": entities}))

    return training_data

# Load training data
training_data = prepare_training_data()

# Print the first item for verification
print(training_data[0])


('EMPFÄNGER\nSYSTEMICA Institut GmbH\nOlgastrasse 94\n89073 Ulm\n\nABSENDER\nProf. Dr. Thomas Messer Prinz-Adalbert-litrasse 100\n85221 Dachau\n\nDatum 03.04 2022\nRechnung\nRechnungsnummer: 07-22\nSteuernummer: 107/250/90744\nSehr geehrte Damen und Herren\nfür meine Referententätigkeit (Psychopharmakologie II) am 12.03.2022 und am 02.04.2022 erlaube ich vertragsgemäß zu berechnen\nHonorar\nEuro\n1800,00\nTotal\nEuro\n1800,00\nIch bitte um Überweisung auf folgendes Konto:\nBankverbindung.\nHypoVereinsbank Wiesbaden\nKonto Nummer\n439 45 34\nBLZ\n510 201 86\nIBAN Nummer\n03510201860004394534\nSWIFT BIC\nHYVEDEMM 478\nIch bedanke mich für die Kooperation und verbleibe für heute\nmit freundlichen Grüllen', {'entities': [(10, 33, 'RECIPIENT_COMPANY'), (34, 48, 'RECIPIENT_STREETADDRESS'), (49, 54, 'RECIPIENT_POSTCODE'), (55, 58, 'RECIPIENT_CITY'), (79, 85, 'SENDER_FIRSTNAME'), (86, 92, 'SENDER_SECONDNAME'), (93, 120, 'SENDER_STREETADDRESS'), (121, 126, 'SENDER_POSTCODE'), (127, 133, 'SENDER

In [None]:
!pip install spacy

In [7]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random

# Create a blank model
nlp = spacy.blank("de")

# Add a Named Entity Recognizer to the model
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add labels to the NER pipeline
all_labels = [
    'SENDER_COMPANY',
    'SENDER_TITLE',
    'SENDER_FIRSTNAME',
    'SENDER_SECONDNAME',
    'SENDER_STREETADDRESS',
    'SENDER_POSTCODE',
    'SENDER_CITY',
    'SENDER_VATID',
    'SENDER_TAXID',
    'SENDER_BANK',
    'SENDER_IBAN',
    'SENDER_BIC_SWIFT',
    'RECIPIENT_COMPANY',
    'RECIPIENT_TITLE',
    'RECIPIENT_FIRSTNAME',
    'RECIPIENT_SECONDNAME',
    'RECIPIENT_STREETADDRESS',
    'RECIPIENT_POSTCODE',
    'RECIPIENT_CITY',
    'INVOICE_NUMBER',
    'INVOICE_CLIENTNUMBER',
    'INVOICE_DATE',
    'INVOICE_TOTALCOST',
    'INVOICE_CURRENCY',
    'INVOICE_DUEDATE',
    'INVOICE_PAYMENTMETHOD'
]

# Add all entity labels to the NER pipeline
for label in all_labels:
    ner.add_label(label)

# Prepare the training data (convert to Example objects)
def create_examples(nlp, training_data):
    examples = []
    for text, annotations in training_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    return examples

# Get names of other components in the pipeline (if any)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Disable other pipes for training
with nlp.disable_pipes(*other_pipes):
    # Initialize the optimizer
    optimizer = nlp.begin_training()

    for iteration in range(750):  # You can change the number of iterations
        random.shuffle(training_data)
        losses = {}

        # Create Example objects from the training data
        examples = create_examples(nlp, training_data)

        # Batch up the examples using spaCy's minibatch
        batches = minibatch(examples, size=compounding(2, 64, 0.01))
        for batch in batches:
            # Update the model with SGD optimizer
            nlp.update(batch, sgd=optimizer, drop=0.0001, losses=losses)

        print(f"Iteration {iteration}, Losses: {losses}")

# Save the trained model
output_dir = '/content/drive/MyDrive/DOM NLP/NER_Model'
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Iteration 0, Losses: {'ner': 449.71692073345184}
Iteration 1, Losses: {'ner': 405.132205247879}
Iteration 2, Losses: {'ner': 482.3122944831848}
Iteration 3, Losses: {'ner': 569.5289504528046}
Iteration 4, Losses: {'ner': 524.8049120903015}
Iteration 5, Losses: {'ner': 545.3224477767944}
Iteration 6, Losses: {'ner': 571.8865671157837}
Iteration 7, Losses: {'ner': 521.8680663108826}
Iteration 8, Losses: {'ner': 390.7961845397949}
Iteration 9, Losses: {'ner': 108.14682610332966}
Iteration 10, Losses: {'ner': 92.3050288259983}
Iteration 11, Losses: {'ner': 58.0232219575264}
Iteration 12, Losses: {'ner': 64.67533734272001}
Iteration 13, Losses: {'ner': 58.8189010046699}
Iteration 14, Losses: {'ner': 63.6152143125737}
Iteration 15, Losses: {'ner': 63.68890303398439}
Iteration 16, Losses: {'ner': 67.64670499349359}
Iteration 17, Losses: {'ner': 66.83410618833295}
Iteration 18, Losses: {'ner': 63.91593351907795}
Iteration 19, Losses: {'ner': 66.81908121163724}
Iteration 20, Losses: {'ner': 73.

In [1]:
!pip install pymupdf
!pip install pdf2image



In [2]:
!pip install paddlepaddle
!pip install paddleocr
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [5]:
!pip install python-magic

Collecting python-magic
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Installing collected packages: python-magic
Successfully installed python-magic-0.4.27


In [26]:
import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
import fitz  # PyMuPDF for handling PDFs
from google.colab import files
from PIL import Image
import io
import magic
import spacy  # For NER

# Load your trained spaCy model (assuming the model is already trained)
output_dir = '/content/drive/MyDrive/DOM NLP/NER_Model'  # Adjust this path to your model location
nlp = spacy.load(output_dir)

# Initialize PaddleOCR (English and German support)
ocr = PaddleOCR(use_angle_cls=True, lang='de')  # 'de' for German, 'en' for English

# Function to extract text from PDF using PyMuPDF and PaddleOCR
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    extracted_text = ""

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        pix = page.get_pixmap()  # Render PDF page to image

        # Convert Pixmap to numpy array for PaddleOCR
        img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

        # Use PaddleOCR to extract text from the numpy image array
        result = ocr.ocr(img_array, cls=True)

        # Append OCR results to extracted_text
        for line in result:
            for word_info in line:
                text = word_info[1][0]
                extracted_text += text + "\n"

    return extracted_text

# Function to extract text from image using PaddleOCR
def extract_text_from_image(image_path):
    # Read the image using OpenCV
    image = cv2.imread(image_path)

    # Use PaddleOCR to extract text from the image
    result = ocr.ocr(image, cls=True)
    extracted_text = ""

    for line in result:
        for word_info in line:
            text = word_info[1][0]
            extracted_text += text + "\n"

    return extracted_text

# Function to perform NER using spaCy model
def perform_ner_on_text(extracted_text):
    print("\nExtracted Text:\n")
    print(extracted_text)  # Show extracted text before NER

    doc = nlp(extracted_text)  # Process the text with the spaCy NER model

    print("\nNamed Entities:\n")
    for ent in doc.ents:
        print(f"{ent.text} ({ent.label_})")  # Print extracted entities and their labels

# Function to handle the uploaded file and decide whether it's a PDF or image
def handle_uploaded_file(file_path):
    mime = magic.Magic(mime=True)
    file_type = mime.from_file(file_path)

    if 'pdf' in file_type:
        print(f"\nProcessing PDF: {file_path}")
        pdf_text = extract_text_from_pdf(file_path)
        perform_ner_on_text(pdf_text)
    elif 'image' in file_type:
        print(f"\nProcessing Image: {file_path}")
        image_text = extract_text_from_image(file_path)
        perform_ner_on_text(image_text)
    else:
        print(f"\nUnsupported file format for {file_path}. Please upload a PDF or image file.")

# Upload file from user
uploaded_files = files.upload()

# Process each uploaded file
for filename in uploaded_files.keys():
    # Save uploaded file to local disk
    file_path = os.path.join('/content', filename)
    with open(file_path, 'wb') as f:
        f.write(uploaded_files[filename])

    # Handle the uploaded file (PDF or image)
    handle_uploaded_file(file_path)


[2024/10/09 06:03:34] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25,

Saving I122.pdf to I122 (2).pdf

Processing PDF: /content/I122 (2).pdf
[2024/10/09 06:03:50] ppocr DEBUG: dt_boxes num : 55, elapsed : 0.2359790802001953
[2024/10/09 06:03:51] ppocr DEBUG: cls num  : 55, elapsed : 0.2939114570617676
[2024/10/09 06:03:54] ppocr DEBUG: rec_res num  : 55, elapsed : 3.5336716175079346

Extracted Text:

sipgate
sipgate GmbH - Gladbacher Str. 74 - 40219 Dusseldorf
Rechnungsdatum
01.12.2022
Leistungsdatum
01.12.2022
Systemica Institut GmbH
Rechnungsnummer
B2361113
Patrick Rotter
Bezahlung per
SEPA-Lastschrift
Einsteinstr. 35
Kundennummer
3137527
89077 Ulm
Seite
1 / 1
RechnungB2361113
Pos.
Art.-Nr.
Bezeichnung
Menge
Einzelpreis
Einzelpreis
ust
Gesamtpreis
netto
brutto
netto
1
sipgate.de, Telefonieguthaben
1
8,40
10,00
19%
8,40 EUR
Summe Positionen netto
8,40 EUR
19% USt. auf EUR 8,40 (DE)
1,60 EUR
Rechnungsbetrag
10,00 EUR
Der Betrag in Hhe von 10,00 EUR wird am 06.12.2022 von Ihrem Konto abgebucht
Inhaber: Systemica Institut GmbH, IBAN: DE85XXXXXXXXXXXXXXXXX2