<a href="https://colab.research.google.com/github/JosephChennattu123/UDS_OCR/blob/main/UDS_OCR_TEIL2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install python-docx

import os
from google.colab import drive
from docx import Document

# Mount Google Drive at /content/gdrive; force_remount=True is passed so an
# existing mount is remounted rather than reused.
drive.mount('/content/gdrive', force_remount=True)

def find_txt_file_with_pattern(directory, pattern="Abzug_fuer_KI"):
    """Return the path of a ``.txt`` file in *directory* whose name contains *pattern*.

    Args:
        directory: Directory to search (not recursive).
        pattern: Substring that must occur in the file name.

    Returns:
        The joined path of the alphabetically first matching regular file,
        or ``None`` when nothing matches.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # choice reproducible when several files match.
    for filename in sorted(os.listdir(directory)):
        if pattern in filename and filename.endswith(".txt"):
            candidate = os.path.join(directory, filename)
            # Skip directories that merely look like a .txt file by name.
            if os.path.isfile(candidate):
                return candidate
    return None

def load_data_from_txt(txt_file_path):
    """Parse a semicolon-separated lookup file into a dictionary.

    The file is read as windows-1252 text. Only lines that split into exactly
    four ``;``-separated fields are used; every other line is ignored. Each
    field is stripped of surrounding whitespace.

    Args:
        txt_file_path: Path of the text file to read.

    Returns:
        A dict keyed by the first field. Each value is a dict with the keys
        ``'4500_number'`` (second field, kept only if exactly 10 characters),
        ``'creditor_number'`` (third field, kept only if exactly 6 characters)
        and ``'company_name'`` (fourth field, kept only if non-empty). A field
        that fails its check is replaced by ``"X"``. A later line with the
        same first field overwrites an earlier one.
    """
    placeholder = "X"
    records = {}
    with open(txt_file_path, encoding='windows-1252') as handle:
        for raw_line in handle:
            fields = [part.strip() for part in raw_line.strip().split(";")]
            if len(fields) != 4:
                continue
            key, order_field, creditor_field, company_field = fields
            records[key] = {
                '4500_number': order_field if len(order_field) == 10 else placeholder,
                'creditor_number': creditor_field if len(creditor_field) == 6 else placeholder,
                'company_name': company_field or placeholder,
            }
    return records

def process_table_and_rename_pdfs(word_file_path, data_dict):
    """Rename the PDFs listed in a .docx table and write the new paths back.

    The first table of the document is processed row by row, skipping the
    header row. For each row, cell 2 is used as the lookup key into
    *data_dict*, cell 3 is passed through as a second name component and
    cell 4 holds the path of the PDF to rename. Rows whose PDF does not exist
    are left untouched. The document is saved in place afterwards.

    Args:
        word_file_path: Path of the .docx file to read and overwrite.
        data_dict: Mapping from lookup key to a dict with the keys
            ``'4500_number'``, ``'creditor_number'`` and ``'company_name'``.
            Missing keys or entries fall back to ``"X"``.

    Returns:
        None.
    """
    document = Document(word_file_path)
    # A document without any table would otherwise fail with an IndexError.
    if not document.tables:
        print(f"No table found in: {word_file_path}. Nothing to rename.")
        return
    table = document.tables[0]

    for row in table.rows[1:]:  # Skip header row
        cells = row.cells
        # Rows with fewer than five cells carry no PDF path column.
        if len(cells) < 5:
            continue

        eighteen_ocr_result = cells[2].text.strip()
        zw_ocr_result = cells[3].text.strip()
        pdf_file_path = cells[4].text.strip()

        # Single lookup; an unknown key yields "X" for every component.
        record = data_dict.get(eighteen_ocr_result, {})
        four500_number = record.get('4500_number', 'X')
        creditor_number = record.get('creditor_number', 'X')
        company_name = record.get('company_name', 'X')

        if not os.path.exists(pdf_file_path):
            continue

        try:
            new_file_path = rename_pdf_file(
                pdf_file_path,
                creditor_number,
                eighteen_ocr_result,
                zw_ocr_result,
                four500_number,
                company_name
            )
        except OSError as error:
            # Keep going so that renames already done on disk still get
            # recorded in the document when it is saved below.
            print(f"Could not rename '{pdf_file_path}': {error}")
            continue

        # Update the file path in the table
        cells[4].text = new_file_path

    # Save the updated document
    document.save(word_file_path)
    print(f"Updated .docx file saved at: {word_file_path}")

def rename_pdf_file(pdf_file_path, creditor_number, eighteen_ocr_result, zw_ocr_result, four500_number, company_name):
    """Rename a PDF to ``<creditor>_<eighteen>_<zw>_<4500>_<company>.pdf``.

    The file stays in its directory. If the target name is taken by another
    file, ``_1``, ``_2``, ... is appended until a free name is found. If the
    file already carries its target name (or one of the numbered variants
    reached during that search), it is left alone, so repeated runs do not
    keep shuffling suffixes.

    Args:
        pdf_file_path: Current path of the PDF.
        creditor_number: First component of the new name.
        eighteen_ocr_result: Second component of the new name.
        zw_ocr_result: Third component of the new name.
        four500_number: Fourth component of the new name.
        company_name: Fifth component of the new name.

    Returns:
        The path of the file after the call: the new path when a rename took
        place, otherwise *pdf_file_path* unchanged.

    Raises:
        OSError: If the underlying rename fails.
    """
    # Check if the file exists
    if not os.path.exists(pdf_file_path):
        print(f"File not found: {pdf_file_path}. Skipping.")
        return pdf_file_path

    directory = os.path.dirname(pdf_file_path)
    base_name = f"{creditor_number}_{eighteen_ocr_result}_{zw_ocr_result}_{four500_number}_{company_name}"
    # A path separator inside a component (e.g. a company name "A/B") would
    # point the target into a non-existent subdirectory.
    for separator in (os.sep, os.altsep):
        if separator:
            base_name = base_name.replace(separator, "_")
    extension = ".pdf"
    new_file_path = os.path.join(directory, base_name + extension)
    counter = 1

    # Ensure unique file name
    while os.path.exists(new_file_path):
        # The "collision" is the file itself: it is already named correctly.
        if os.path.samefile(new_file_path, pdf_file_path):
            print(f"File '{pdf_file_path}' already has its target name. Skipping.")
            return pdf_file_path
        new_file_path = os.path.join(directory, f"{base_name}_{counter}{extension}")
        counter += 1

    os.rename(pdf_file_path, new_file_path)
    print(f"File '{pdf_file_path}' renamed to: {new_file_path}")
    return new_file_path  # Return the new file path for updating the .docx


# Google Drive locations used by this run: the .docx holding the table of
# PDFs, and the folder that is searched for the lookup .txt file.
word_file_path = "/content/gdrive/My Drive/processed_files_with_images.docx"
directory_path = "/content/gdrive/My Drive/"

# Look for the lookup file (default name pattern) directly in that folder.
txt_file_path = find_txt_file_with_pattern(directory_path)

# Without a lookup file there is nothing to do; otherwise load it and
# rename the PDFs listed in the document.
if not txt_file_path:
    print(f"No .txt file found with pattern 'Abzug_fuer_KI' in {directory_path}")
else:
    print(f"Found .txt file: {txt_file_path}")
    data_dict = load_data_from_txt(txt_file_path)
    process_table_and_rename_pdfs(word_file_path, data_dict)


Mounted at /content/gdrive
Found .txt file: /content/gdrive/My Drive/Abzug_fuer_KI .txt
File '/content/gdrive/My Drive/Neue Rechnungen/_X_X_.pdf' renamed to: /content/gdrive/My Drive/Neue Rechnungen/X_X_X_X_X.pdf
File '/content/gdrive/My Drive/Neue Rechnungen/_X_X__1.pdf' renamed to: /content/gdrive/My Drive/Neue Rechnungen/X_X_X_X_X_1.pdf
File '/content/gdrive/My Drive/Neue Rechnungen/_X_X__2.pdf' renamed to: /content/gdrive/My Drive/Neue Rechnungen/X_X_X_X_X_2.pdf
File '/content/gdrive/My Drive/Neue Rechnungen/_X_X__3.pdf' renamed to: /content/gdrive/My Drive/Neue Rechnungen/X_X_X_X_X_3.pdf
File '/content/gdrive/My Drive/Neue Rechnungen/_X_X__4.pdf' renamed to: /content/gdrive/My Drive/Neue Rechnungen/X_X_X_X_X_4.pdf
File '/content/gdrive/My Drive/Neue Rechnungen/_X_X__5.pdf' renamed to: /content/gdrive/My Drive/Neue Rechnungen/X_X_X_X_X_5.pdf
File '/content/gdrive/My Drive/Neue Rechnungen/_X_X__6.pdf' renamed to: /content/gdrive/My Drive/Neue Rechnungen/X_X_X_X_X_6.pdf
File '/conten