In [1]:
import os
import re
import pandas as pd
import numpy as np
from PyPDF2 import PdfReader
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
import joblib

In [3]:
# ---------- Feature Extraction ----------
def extract_features(text):
    """Compute the hand-crafted page features used to train the header classifier.

    Args:
        text: Text of a single page as one string (may be empty).

    Returns:
        dict mapping feature name to value. The key order is fixed because
        it becomes the column order of the training DataFrame.

    NOTE(review): ``contains_iban`` is tested against the raw text, so only a
    lowercase "iban" sets it, while the other ``contains_*`` flags are
    case-insensitive. The prediction cell computes the feature the same way;
    change both copies together and retrain.
    """
    lowered = text.lower()
    tokens = text.split()
    rows = text.split('\n')
    char_total = len(text)
    newline_total = text.count('\n')

    # Whole-word, case-insensitive keyword hits.
    keywords = ["lieferschein", "bestellnr", "lieferdatum", "kundennr", "iban", "mwst"]
    keyword_hits = 0
    for kw in keywords:
        if re.search(rf"\b{kw}\b", text, re.IGNORECASE):
            keyword_hits += 1

    # Share of uppercase characters across the whole page.
    upper_total = sum(1 for ch in text if ch.isupper())
    if char_total:
        upper_share = upper_total / char_total
    else:
        upper_share = 0

    # Spaces are summed per line and divided by the newline count (at least 1).
    space_total = 0
    for row in rows:
        space_total += row.count(' ')
    spaces_per_line = space_total / (newline_total or 1)

    if tokens:
        mean_token_len = np.mean([len(tok) for tok in tokens])
    else:
        mean_token_len = 0

    # str.split always yields at least one element, so rows[0] exists.
    header_row = rows[0]
    if len(header_row) > 0:
        header_caps_share = sum(1 for ch in header_row if ch.isupper()) / len(header_row)
    else:
        header_caps_share = 0

    feature_row = {}
    feature_row['text_length'] = char_total
    feature_row['num_lines'] = newline_total
    feature_row['num_words'] = len(tokens)
    feature_row['contains_lieferschein'] = int("lieferschein" in lowered)
    feature_row['contains_bestellnr'] = int("bestellnr" in lowered)
    feature_row['contains_lieferdatum'] = int("lieferdatum" in lowered)
    feature_row['contains_kundennr'] = int("kundennr" in lowered)
    feature_row['contains_iban'] = int("iban" in text)  # raw text, see NOTE above
    feature_row['contains_mwst'] = int("mwst" in lowered)
    feature_row['uppercase_ratio'] = upper_share
    feature_row['num_dates'] = len(re.findall(r'\d{2}\.\d{2}\.\d{4}', text))
    feature_row['avg_spaces_per_line'] = spaces_per_line
    feature_row['num_keywords_matched'] = keyword_hits
    feature_row['num_numeric_blocks'] = len(re.findall(r'\b\d{4,}\b', text))
    feature_row['avg_word_length'] = mean_token_len
    feature_row['first_line_caps_ratio'] = header_caps_share
    return feature_row

# ---------- Process all PDFs ----------
def process_pdf_dir(folder_path="data/superbatch"):
    """Build a per-page feature table from every PDF in ``folder_path``.

    Each page of each file whose name ends in ``.pdf`` (case-insensitive)
    becomes one row of features from ``extract_features``. The first page of
    a file is labelled 1 and every later page 0.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, which would make the row order (and any seeded sampling
    applied to the result) differ between runs or machines.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        pandas.DataFrame with one row per page: the feature columns plus
        ``label``. Empty (no columns) when no page could be read.

    A file that raises while being read is reported on stdout and skipped;
    rows from pages read before the failure are kept.
    """
    records = []
    # sorted() makes the row order deterministic.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(folder_path, filename)
        try:
            reader = PdfReader(filepath)
            for i, page in enumerate(reader.pages):
                # extract_text() may return None; treat that as empty text.
                text = page.extract_text() or ""
                features = extract_features(text)
                features['label'] = 1 if i == 0 else 0
                records.append(features)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error reading {filename}: {e}")
    return pd.DataFrame(records)

# ---------- Load and balance ----------
df = process_pdf_dir("data/superbatch")
# Fail with a clear message instead of a KeyError on the missing 'label' column.
if df.empty:
    raise ValueError("No PDF pages could be read from data/superbatch")
df_0 = df[df['label'] == 0]
df_1 = df[df['label'] == 1]
# Downsample the larger class so both labels are equally represented.
min_size = min(len(df_0), len(df_1))
if min_size == 0:
    raise ValueError("Need at least one page of each label to build a balanced training set")
df_balanced = pd.concat([
    df_0.sample(min_size, random_state=42),
    df_1.sample(min_size, random_state=42)
]).sample(frac=1, random_state=42)  # shuffle the concatenated rows

X = df_balanced.drop(columns=['label'])
y = df_balanced['label']

# ---------- K-Fold CV ----------
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 1, 1: 3})
cv_scores = cross_val_score(model, X, y, cv=kfold, scoring='f1')
print("Cross-validated F1 scores:", cv_scores)
print("Mean F1:", cv_scores.mean())

# ---------- Final train/test split ----------
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# ---------- Threshold tuning ----------
# NOTE(review): the threshold is selected and evaluated on the same test
# split, so the metrics printed below are optimistic.
y_prob = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Pick the first threshold returned by the curve with precision >= 0.9 and
# recall >= 0.6. None marks "nothing qualified"; a 0.0 sentinel could be
# confused with a genuine threshold of 0.0.
safe_thresh = None
for p, r, t in zip(precision, recall, thresholds):
    if p >= 0.9 and r >= 0.6:
        safe_thresh = float(t)
        break
if safe_thresh is None:
    safe_thresh = 0.75  # fallback default

print(f"Using threshold: {safe_thresh:.3f}")
# NOTE(review): precision_recall_curve scores each threshold with
# `score >= threshold`; the strict `>` here predicts samples exactly at the
# threshold as negative. Kept strict so this evaluation matches the cut
# used by predict_header_page.
y_pred = (y_prob > safe_thresh).astype(int)

# ---------- Evaluation ----------
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

# ---------- Save model and threshold ----------
joblib.dump(model, "lieferschein_header_detector.pkl")
with open("lieferschein_threshold.txt", "w") as f:
    f.write(str(safe_thresh))


Cross-validated F1 scores: [1.         0.8        0.8        0.85714286 0.8       ]
Mean F1: 0.8514285714285714
Using threshold: 0.750
[[4 0]
 [2 1]]
              precision    recall  f1-score   support

           0     0.6667    1.0000    0.8000         4
           1     1.0000    0.3333    0.5000         3

    accuracy                         0.7143         7
   macro avg     0.8333    0.6667    0.6500         7
weighted avg     0.8095    0.7143    0.6714         7



In [4]:
# Display the per-page feature table returned by process_pdf_dir (all pages, before class balancing).
df

Unnamed: 0,text_length,num_lines,num_words,contains_lieferschein,contains_bestellnr,contains_lieferdatum,contains_kundennr,contains_iban,contains_mwst,uppercase_ratio,num_dates,avg_spaces_per_line,num_keywords_matched,num_numeric_blocks,avg_word_length,first_line_caps_ratio,label
0,2390,56,325,1,1,0,0,0,0,0.133054,2,5.821429,2,39,6.178462,0.033333,1
1,1651,47,192,1,0,0,0,0,0,0.163537,3,4.297872,1,14,7.302083,0.789474,1
2,2134,30,255,1,0,0,0,0,0,0.095595,0,9.366667,1,10,7.149020,0.071429,1
3,2051,25,252,1,1,0,0,0,0,0.095563,0,10.320000,2,9,7.015873,0.071429,0
4,731,20,96,1,0,0,0,0,0,0.124487,0,4.850000,1,7,6.395833,0.122222,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1619,36,196,0,0,0,0,0,0,0.125386,1,5.500000,0,13,7.066327,0.142857,1
97,1716,55,186,1,0,0,1,0,0,0.120629,1,3.490909,3,16,7.897849,0.363636,1
98,1236,33,156,1,0,0,0,0,0,0.188511,2,4.727273,1,23,6.711538,0.076923,1
99,604,22,71,0,0,0,0,0,0,0.135762,1,3.181818,0,14,7.211268,0.052632,0


## Predict:

In [5]:
# Restore the persisted classifier and its decision threshold.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
model = joblib.load("lieferschein_header_detector.pkl")
with open("lieferschein_threshold.txt", "r") as f:
    threshold_text = f.read()
threshold = float(threshold_text)

# Feature extraction (must match training logic)
def extract_features(text):
    """Compute the page features for prediction as a one-row DataFrame.

    Args:
        text: Text of a single page as one string (may be empty).

    Returns:
        pandas.DataFrame with exactly one row. The column order is fixed and
        is what gets passed to ``model.predict_proba``.

    NOTE(review): ``contains_iban`` is tested against the raw text, so only a
    lowercase "iban" sets it, while the other ``contains_*`` flags are
    case-insensitive. The training cell computes the feature the same way;
    change both copies together and retrain.
    """
    lowered = text.lower()
    tokens = text.split()
    rows = text.split('\n')
    char_total = len(text)
    newline_total = text.count('\n')

    # Whole-word, case-insensitive keyword hits.
    keywords = ["lieferschein", "bestellnr", "lieferdatum", "kundennr", "iban", "mwst"]
    keyword_hits = len([kw for kw in keywords if re.search(rf"\b{kw}\b", text, re.IGNORECASE)])

    # Share of uppercase characters across the whole page.
    upper_total = len([ch for ch in text if ch.isupper()])
    upper_share = 0
    if char_total:
        upper_share = upper_total / char_total

    # Spaces are summed per line and divided by the newline count (at least 1).
    spaces_per_line = sum(row.count(' ') for row in rows) / (newline_total or 1)

    mean_token_len = 0
    if tokens:
        mean_token_len = sum(len(tok) for tok in tokens) / len(tokens)

    # str.split always yields at least one element, so rows[0] exists.
    header_row = rows[0]
    header_caps_share = 0
    if len(header_row) > 0:
        header_caps_share = len([ch for ch in header_row if ch.isupper()]) / len(header_row)

    feature_row = {
        'text_length': char_total,
        'num_lines': newline_total,
        'num_words': len(tokens),
        'contains_lieferschein': int("lieferschein" in lowered),
        'contains_bestellnr': int("bestellnr" in lowered),
        'contains_lieferdatum': int("lieferdatum" in lowered),
        'contains_kundennr': int("kundennr" in lowered),
        'contains_iban': int("iban" in text),  # raw text, see NOTE above
        'contains_mwst': int("mwst" in lowered),
        'uppercase_ratio': upper_share,
        'num_dates': len(re.findall(r'\d{2}\.\d{2}\.\d{4}', text)),
        'avg_spaces_per_line': spaces_per_line,
        'num_keywords_matched': keyword_hits,
        'num_numeric_blocks': len(re.findall(r'\b\d{4,}\b', text)),
        'avg_word_length': mean_token_len,
        'first_line_caps_ratio': header_caps_share,
    }
    return pd.DataFrame([feature_row])

# Prediction function
def predict_header_page(pdf_path, page_number=0):
    """Classify one page of a PDF as a header page.

    Uses the module-level ``model`` and ``threshold`` loaded earlier in this cell.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: Index of the page to classify (default 0). Negative
            values are not rejected by the range check below.

    Returns:
        1 if the predicted probability of class 1 is strictly greater than
        ``threshold``, otherwise 0.

    Raises:
        ValueError: if ``page_number`` is greater than or equal to the number
            of pages in the file.
    """
    pdf = PdfReader(pdf_path)
    if len(pdf.pages) <= page_number:
        raise ValueError("Page number out of range")

    # extract_text() may return None; treat that as empty text.
    page_text = pdf.pages[page_number].extract_text() or ""
    feature_frame = extract_features(page_text)

    # Column 1 of predict_proba is the probability of label 1.
    header_prob = model.predict_proba(feature_frame)[0][1]
    if header_prob > threshold:
        return 1
    return 0

In [13]:
# Classify pages at indices 1..14 of one batch file and print one verdict per page.
path = "data/batch_4_2022_1.pdf"
for i in range(1, 15):
    result = predict_header_page(path, page_number=i)
    if result == 1:
        print("Predicted as header")
    else:
        print("Not a header")

Predicted as header
Not a header
Not a header
Not a header
Not a header
Not a header
Predicted as header
Not a header
Not a header
Not a header
Predicted as header
Not a header
Not a header
Not a header
