# In [15]:
import os
import json
import shutil
import re
from PyPDF2 import PdfReader
from datetime import datetime
import joblib
import pandas as pd

# Restore the trained classifier and the probability cut-off used with it.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model files that come from a trusted source.
model = joblib.load("lieferschein_header_detector.pkl")
with open("lieferschein_threshold.txt", "r") as threshold_file:
    threshold = float(threshold_file.read())

# Feature extraction (must match training logic)
def extract_features(text):
    """Turn the text of one page into a single-row feature frame.

    Args:
        text: Page text as a string; an empty string is allowed.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns appear in the fixed
        order built below (size counters, keyword flags, ratios and counts).
    """
    lowered = text.lower()
    tokens = text.split()
    char_count = len(text)
    newline_count = text.count('\n')

    # Keywords are counted as whole words, ignoring case.
    keywords = ["lieferschein", "bestellnr", "lieferdatum", "kundennr", "iban", "mwst"]
    keyword_hits = len([kw for kw in keywords if re.search(rf"\b{kw}\b", text, re.IGNORECASE)])

    # NOTE(review): every flag below is a lower-cased substring test except
    # the IBAN one, which looks at the original-case text. It is left as is
    # because the features are required to match the training logic.
    has_iban = int("iban" in text)

    if char_count:
        uppercase_ratio = sum(map(str.isupper, text)) / char_count
    else:
        uppercase_ratio = 0

    date_count = len(re.findall(r'\d{2}\.\d{2}\.\d{4}', text))

    # A space never contains a newline, so the whole-text space count equals
    # the sum of the per-line counts. The divisor is the newline count (or 1).
    avg_spaces_per_line = text.count(' ') / (newline_count or 1)

    numeric_block_count = len(re.findall(r'\b\d{4,}\b', text))

    if tokens:
        avg_word_length = sum(map(len, tokens)) / len(tokens)
    else:
        avg_word_length = 0

    # Everything before the first newline (the whole text if there is none).
    first_line = text.partition('\n')[0]
    if first_line:
        first_line_caps_ratio = sum(map(str.isupper, first_line)) / len(first_line)
    else:
        first_line_caps_ratio = 0

    feature_row = {
        'text_length': char_count,
        'num_lines': newline_count,
        'num_words': len(tokens),
        'contains_lieferschein': int("lieferschein" in lowered),
        'contains_bestellnr': int("bestellnr" in lowered),
        'contains_lieferdatum': int("lieferdatum" in lowered),
        'contains_kundennr': int("kundennr" in lowered),
        'contains_iban': has_iban,
        'contains_mwst': int("mwst" in lowered),
        'uppercase_ratio': uppercase_ratio,
        'num_dates': date_count,
        'avg_spaces_per_line': avg_spaces_per_line,
        'num_keywords_matched': keyword_hits,
        'num_numeric_blocks': numeric_block_count,
        'avg_word_length': avg_word_length,
        'first_line_caps_ratio': first_line_caps_ratio,
    }
    return pd.DataFrame([feature_row])

# Prediction function
def predict_header_page(pdf_path, page_number=0):
    """Classify one page of a PDF with the loaded model.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: Zero-based index of the page to classify.

    Returns:
        1 if the model's probability for class index 1 is strictly greater
        than the module-level ``threshold``, otherwise 0.

    Raises:
        ValueError: If ``page_number`` is at or beyond the number of pages.
    """
    pages = PdfReader(pdf_path).pages
    if not page_number < len(pages):
        raise ValueError("Page number out of range")
    # extract_text() may yield nothing; treat that as an empty page.
    page_text = pages[page_number].extract_text() or ""
    header_probability = model.predict_proba(extract_features(page_text))[0][1]
    return int(header_probability > threshold)

# Work on a copy of the SAP export so the source file itself is never modified
shutil.copy("data/SAP_data.json", "data/current_data.json")

# Read the working copy
with open("data/current_data.json", "r", encoding="utf-8") as source_file:
    sap_data = json.load(source_file)

# Rewrite every non-empty delivery note date as YYYY-MM-DD.
# Values that cannot be parsed are replaced by an empty string.
for entry in sap_data:
    raw_date = entry.get("Delivery Note Date")
    if not raw_date:
        continue
    try:
        # Drop a midnight time suffix before parsing the ISO date
        date_part = raw_date.replace("T00:00:00.000", "")
        normalized_date = datetime.fromisoformat(date_part).strftime("%Y-%m-%d")
    except Exception:
        normalized_date = ""
    entry["Delivery Note Date"] = normalized_date

# Persist the normalized records back into the working copy
with open("data/current_data.json", "w", encoding="utf-8") as target_file:
    json.dump(sap_data, target_file, indent=2, ensure_ascii=False)

# Prepare lookups: the distinct delivery note numbers and vendor names that
# are searched for on every page. Missing (null) numbers and values that are
# blank after stripping carry nothing to search for, so they are left out.
delivery_notes = {
    str(entry["Delivery Note Number"]).strip()
    for entry in sap_data
    if entry["Delivery Note Number"] is not None
} - {""}
company_names = {
    entry["Vendor - Name 1"].strip()
    for entry in sap_data
    if entry["Vendor - Name 1"]
} - {""}

def build_relaxed_pattern(number):
    """Compile a regex that finds ``number`` even when it is split by whitespace.

    Every character of ``number`` is escaped and the characters are joined
    with ``\\s*``, so "4711" also matches "4 7 1 1" or a value broken across
    lines.

    Args:
        number: The text to search for, as a string.

    Returns:
        A compiled pattern. For an empty ``number`` the pattern never matches.
    """
    if not number:
        # Joining zero characters yields an empty regex, and an empty regex
        # matches any text. Return a pattern that cannot match instead.
        return re.compile(r"(?!)")
    return re.compile(r"\s*".join(re.escape(ch) for ch in number))

# One relaxed pattern per delivery note number. The page scan takes the first
# pattern that matches, so the order matters: longer numbers come first so a
# number contained in a longer one (e.g. "123" in "1234") cannot win over the
# more specific match. Sorting also makes the order independent of the set's
# iteration order, so repeated runs give the same result.
delivery_note_patterns = {
    note: build_relaxed_pattern(note)
    for note in sorted(delivery_notes, key=lambda n: (-len(n), n))
}

def build_name_pattern(name):
    """Compile a case-insensitive, whitespace-tolerant regex for a vendor name.

    All whitespace is removed from ``name`` and the remaining characters are
    escaped and joined with ``\\s*``, so the name is found regardless of how
    it is spaced or wrapped in the page text.

    Args:
        name: The vendor name, as a string.

    Returns:
        A compiled pattern. If ``name`` is empty or consists only of
        whitespace, the pattern never matches.
    """
    cleaned = re.sub(r'\s+', '', name.lower())
    if not cleaned:
        # Without any characters the joined regex would be empty and match
        # every text. Return a pattern that cannot match instead.
        return re.compile(r"(?!)")
    return re.compile(r"\s*".join(re.escape(char) for char in cleaned), re.IGNORECASE)

# One pattern per vendor name. The page scan takes the first pattern that
# matches, so longer names come first: a short name contained in a longer one
# cannot win over the more specific match. Sorting also makes the order
# independent of the set's iteration order, so repeated runs agree.
company_name_patterns = {
    name: build_name_pattern(name)
    for name in sorted(company_names, key=lambda n: (-len(n), n))
}

def normalize_address_variant(s):
    """Reduce an address (or a whole page of text) to a comparable flat form.

    The text is lower-cased, "ß" becomes "ss", every "strasse" becomes the
    canonical spelling "strafle", and spaces, commas, hyphens, dots and
    bullet characters are removed.

    Lower-casing happens first so that the street-word replacement also
    applies inside compound names ("Hauptstraße") and to upper-case text
    ("STRASSE"), not only to a capitalised stand-alone "Strasse".

    NOTE(review): "strafle" is presumably how the PDF text extraction renders
    "Straße" -- confirm against real extracted pages.

    Args:
        s: The text to normalise.

    Returns:
        The normalised, lower-case string.
    """
    flat = s.lower().replace("\u00df", "ss").replace("strasse", "strafle")
    # Separators that differ between address spellings
    for separator in (" ", ",", "-", ".", "\u2022"):
        flat = flat.replace(separator, "")
    return flat

# Collect the address spellings to look for on each page. The two lists are
# parallel: position k of the second list is the normalized form of position k
# of the first one.
address_variants = []
normalized_address_variants = []
for entry in sap_data:
    street = entry.get("Vendor - Address - Street", "")
    number = str(entry.get("Vendor - Address - Number", "")).strip()
    # Without both street and house number no variant can be built
    if not (street and number):
        continue

    zip_code = str(entry.get("Vendor - Address - ZIP Code", "")).strip()
    city = entry.get("Vendor - Address - City", "")
    country = entry.get("Vendor - Address - Country", "")

    candidates = (
        f"{street} {number}",
        f"{street}- {number}",
        f"{number} {street}",
        f"{street} {number}, {zip_code} {city}",
        f"{street} {number}, {city}",
        f"{street} {number}, {city}, {country}",
        f"{street} {number} • {zip_code} {city}",
    )

    for candidate in candidates:
        trimmed = candidate.strip()
        if not trimmed:
            continue
        address_variants.append(trimmed)
        normalized_address_variants.append(normalize_address_variant(trimmed))

# Map every spelling of a delivery note date that may appear on a page to the
# standard YYYY-MM-DD form. Entries whose date cannot be parsed are skipped.
# NOTE(review): %b / %B produce month names of the current locale, so these two
# spellings depend on the locale the script runs under.
raw_to_standard_date = {}
for entry in sap_data:
    date_str = entry.get("Delivery Note Date")
    if date_str:
        try:
            parsed = datetime.fromisoformat(date_str)
            layouts = ("%d.%m.%Y", "%Y-%m-%d", "%d/%m/%Y", "%d.%m.%y", "%d %b %Y", "%d %B %Y")
            spellings = [parsed.strftime(layout) for layout in layouts]
            raw_to_standard_date.update(dict.fromkeys(spellings, parsed.strftime("%Y-%m-%d")))
        except Exception:
            continue

# First seven characters of every vendor name that is at least that long;
# used as a fallback when the full name is not found on a page.
company_prefixes = set()
for entry in sap_data:
    # .get() only falls back to "" when the key is absent; a null value in
    # the JSON comes back as None, so coalesce it before stripping.
    name = (entry.get("Vendor - Name 1") or "").strip()
    if len(name) >= 7:
        company_prefixes.add(name[:7])

# Distinct, non-empty ZIP codes (as stripped strings) across all SAP entries
zip_code_variants = {
    code
    for code in (
        str(entry.get("Vendor - Address - ZIP Code", "")).strip()
        for entry in sap_data
    )
    if code
}

def analyze_batch_pdf(pdf_path):
    """Extract the matching fields for every page of one batch PDF.

    Each page's text is searched for the known delivery note numbers, vendor
    names, addresses, dates, vendor-name prefixes and ZIP codes, and the page
    is classified with the loaded model.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        A list with one dict per processed page, holding the keys "page"
        (1-based), "delivery_note", "company_name", "address", "date",
        "company_prefix", "zip_code", "MJAHR", "Seite" and "is_header_page".
        Fields that could not be determined hold the string "not known".
        If processing fails, the error is printed and the pages collected up
        to that point are returned.
    """
    result = []
    try:
        reader = PdfReader(pdf_path)
        for i, page in enumerate(reader.pages):
            page_number = i + 1
            text = page.extract_text() or ""
            text_flat = normalize_address_variant(text)
            text_normalized = text.lower().replace(" ", "").replace("\n", "").replace("\r", "")

            # First pattern that matches wins, in dictionary order.
            delivery_note = next(
                (note for note, pattern in delivery_note_patterns.items() if pattern.search(text)),
                "not known",
            )
            company_name = next(
                (name for name, pattern in company_name_patterns.items() if pattern.search(text)),
                "not known",
            )

            # Compare in normalized form but report the original spelling.
            address = next(
                (original for original, norm in zip(address_variants, normalized_address_variants) if norm in text_flat),
                "not known"
            )

            # NOTE(review): plain substring search -- a short spelling such as
            # "01.02.20" is also found inside "01.02.2023".
            matched_raw_date = next((d for d in raw_to_standard_date if d in text), None)
            date = raw_to_standard_date.get(matched_raw_date, "not known")
            mjahr = date[:4] if date != "not known" else "not known"

            company_prefix = next(
                (prefix for prefix in company_prefixes if prefix.lower().replace(" ", "") in text_normalized),
                "not known"
            )

            # The ZIP code is only looked up when no address was found.
            zip_code = "not known"
            if address == "not known":
                zip_code = next((z for z in zip_code_variants if z in text), "not known")

            # Keep what follows "Seite" (at most five characters) minus
            # ASCII letters and whitespace.
            seite = "not known"
            match = re.search(r"Seite.{0,5}", text)
            if match:
                seite = re.sub(r"[A-Za-z\s]", "", match.group())

            # Classify the text already extracted above. Calling
            # predict_header_page(pdf_path, i) here would open and parse the
            # whole PDF again for every page to reach the same text.
            header_probability = model.predict_proba(extract_features(text))[0][1]
            is_header = int(header_probability > threshold)

            result.append({
                "page": page_number,
                "delivery_note": delivery_note,
                "company_name": company_name,
                "address": address,
                "date": date,
                "company_prefix": company_prefix,
                "zip_code": zip_code,
                "MJAHR": mjahr,
                "Seite": seite,
                "is_header_page": is_header
            })
    except Exception as e:
        # Best effort: report the failure and keep the pages collected so far.
        print(f"❌ Failed to process {pdf_path}: {e}")
    return result

def main():
    """Analyse all batch PDFs in ``data`` and write the result to ``output.json``.

    Every file in ``data`` named ``batch_*.pdf`` is analysed page by page.
    Each page entry then receives an "MBLNR" value from the SAP records in
    two phases:

    1. By delivery note number.
    2. For entries still unmatched, by vendor name, delivery note date and
       ZIP code together.

    Entries matched by neither phase keep "MBLNR" set to "not known".
    """
    base_dir = "data"
    output = {}

    # Sorted so batches are processed and written in a reproducible order;
    # os.listdir() makes no promise about ordering.
    for filename in sorted(os.listdir(base_dir)):
        if filename.endswith(".pdf") and filename.startswith("batch_"):
            batch_path = os.path.join(base_dir, filename)
            # Cut off the trailing extension only. str.replace() would also
            # remove a ".pdf" occurring in the middle of the name.
            batch_name = filename[:-len(".pdf")]
            output[batch_name] = analyze_batch_pdf(batch_path)

    with open("data/current_data.json", "r", encoding="utf-8") as f:
        current_data = json.load(f)

    # First matching phase: look up by delivery note number. The index is
    # built once; setdefault keeps the first row per number, which is the row
    # a front-to-back scan would find.
    first_row_by_note = {}
    for row in current_data:
        first_row_by_note.setdefault(str(row["Delivery Note Number"]).strip(), row)

    for batch in output.values():
        for entry in batch:
            matched = first_row_by_note.get(entry["delivery_note"])
            entry["MBLNR"] = matched["MBLNR"] if matched else "not known"

    # Second matching phase: vendor name + date + ZIP code
    for batch in output.values():
        for entry in batch:
            if entry["MBLNR"] != "not known":
                continue
            name = entry["company_name"]
            date = entry["date"]
            zip_code = entry["zip_code"]
            matched = next(
                (
                    row for row in current_data
                    # Page entries carry the stripped vendor name, so strip
                    # the SAP value as well before comparing.
                    if (row.get("Vendor - Name 1") or "").strip() == name and
                       row.get("Delivery Note Date") == date and
                       str(row.get("Vendor - Address - ZIP Code", "")).strip() == zip_code
                ),
                None
            )
            if matched:
                entry["MBLNR"] = matched["MBLNR"]

    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    main()
