In [1]:
import DTW_utils as utils
import DTW
import pandas as pd


In [2]:
# Ground-truth table: the file has no header row, so the two columns are
# named here ("loc" is the key later matched against extracted word locations).
transcriptions = pd.read_csv(
    "transcription.tsv",
    sep="\t",
    header=None,
).set_axis(["loc", "word"], axis="columns")

# Lookup table: loc -> transcribed word (a repeated loc keeps its last word).
transcription_dict = {
    loc: word
    for loc, word in zip(transcriptions["loc"], transcriptions["word"])
}


In [3]:
# Image numbers of the training split; train.tsv has no header row.
train_pages = pd.read_csv(
    "train.tsv",
    delimiter="\t",
    header=None,
    names=["image_nr"],
)

In [4]:
# Skip the first page, since its number of doc_nrs != number of polygons.
# Dropping by index label (instead of slicing with [1:]) keeps this cell
# idempotent: once row 0 is gone, re-running the cell removes nothing more.
train_pages = train_pages.drop(index=0, errors="ignore")

In [5]:
train_db = []      # one record per word: loc, transcription, features
train_images = {}  # loc -> cropped word image

for page_nr in train_pages["image_nr"]:
    for word_img, _polygon, word_loc in utils.extract_word_images(page_nr):
        # Keep the crop itself so it can be written to disk afterwards.
        train_images[word_loc] = word_img

        # Transcription of this word; a loc missing from the table raises KeyError.
        label = transcription_dict[word_loc]

        # Preprocess the crop, then compute its features.
        features = utils.extract_features(utils.preprocess_img(word_img))

        train_db.append(dict(loc=word_loc, word=label, features=features))

In [6]:
import os
import pickle

# Write each cropped training word to train_words/<loc>.png.
os.makedirs("train_words", exist_ok=True)
for word_loc in train_images:
    train_images[word_loc].save(f"train_words/{word_loc}.png")

# Persist the list of training records as a single pickle file.
os.makedirs("train_features", exist_ok=True)
with open("train_features/train_db.pkl", "wb") as db_file:
    pickle.dump(train_db, db_file)

In [16]:
# Image numbers of the validation split; validation.tsv has no header row.
validation_pages = pd.read_csv(
    "validation.tsv",
    delimiter="\t",
    header=None,
    names=["image_nr"],
)

In [17]:
validation_db = []      # one record per word: loc, transcription, features
validation_images = {}  # loc -> cropped word image

for page_nr in validation_pages["image_nr"]:
    for word_img, _polygon, word_loc in utils.extract_word_images(page_nr):
        # Keep the crop itself so it can be written to disk afterwards.
        validation_images[word_loc] = word_img

        # Transcription of this word; a loc missing from the table raises KeyError.
        label = transcription_dict[word_loc]

        # Preprocess the crop, then compute its features.
        features = utils.extract_features(utils.preprocess_img(word_img))

        validation_db.append(dict(loc=word_loc, word=label, features=features))

length doc_nrs 203
length polygons 203
length doc_nrs 276
length polygons 276
length doc_nrs 266
length polygons 266
length doc_nrs 306
length polygons 306
length doc_nrs 242
length polygons 242


In [20]:
# Write each cropped validation word to validation_words/<loc>.png.
os.makedirs("validation_words", exist_ok=True)
for word_loc in validation_images:
    validation_images[word_loc].save(f"validation_words/{word_loc}.png")

# Persist the list of validation records as a single pickle file.
os.makedirs("validation_features", exist_ok=True)
with open("validation_features/validation_db.pkl", "wb") as db_file:
    pickle.dump(validation_db, db_file)

# Competition part

In [18]:
import numpy as np
def extract_word_images_comp(image_nr: int, return_format: str = "PIL"):
    """
    Extract word images from a scanned manuscript page and its SVG.

    Reads ``images_comp/{image_nr}.jpg`` and ``locations_comp/{image_nr}.svg``.
    For every ``path`` element in the SVG, the pixels outside the path's polygon
    are painted white and the page is cropped to the polygon's bounding box.

    Parameters
    ----------
    image_nr : int
        Page number used to build the image and SVG file names.
    return_format : str, default "PIL"
        ``"pil"`` (case-insensitive) returns each crop as a ``PIL.Image``;
        any other value returns it as a NumPy array.

    Returns
    -------
    list of (word_image, polygon_points, location)
        ``polygon_points`` is an array of (x, y) rows and ``location`` is the
        path's ``id`` attribute (``None`` when the attribute is absent).

    Raises
    ------
    ValueError
        If no coordinate pairs can be parsed from a path's ``d`` attribute.
    KeyError
        If a path element has no ``d`` attribute.
    """
    import re

    from PIL import Image, ImageDraw
    from lxml import etree

    image_path = f"images_comp/{image_nr}.jpg"
    svg_path = f"locations_comp/{image_nr}.svg"

    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    width, height = image.size

    # Convert the page once; each word only copies its own bounding box below
    # instead of duplicating the full page for every polygon.
    page_np = np.array(image)

    # Parse SVG file
    polygons = etree.parse(svg_path).getroot().findall(".//{*}path")

    # NOTE: the transcription table is intentionally not read here; the
    # locations come from the SVG `id` attributes alone.

    as_pil = return_format.lower() == "pil"
    word_images = []
    for polygon in polygons:
        location = polygon.attrib.get("id")
        d = polygon.attrib["d"]

        # "x y" pairs separated by whitespace; both values may carry a sign.
        coords = re.findall(r"[-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+", d)
        if not coords:
            raise ValueError(
                f"no coordinate pairs found in path {location!r} of {svg_path}"
            )
        points = np.array([tuple(map(float, pair.split())) for pair in coords])

        # Build mask: 1 inside (and on the outline of) the polygon, 0 elsewhere.
        mask = Image.new("L", (width, height), 0)
        ImageDraw.Draw(mask).polygon([tuple(p) for p in points], outline=1, fill=1)
        mask_np = np.array(mask)

        # Bounding box, clamped at 0 so a negative coordinate cannot wrap
        # around to the far side of the page when used as a slice index.
        left, top = (max(int(v), 0) for v in points.min(axis=0))
        right, bottom = (max(int(v), 0) for v in points.max(axis=0))

        # Crop first, then whiten everything outside the polygon.
        crop = page_np[top:bottom, left:right].copy()
        crop[mask_np[top:bottom, left:right] == 0] = (255, 255, 255)

        if as_pil:
            crop = Image.fromarray(crop)

        word_images.append((crop, points, location))

    return word_images

In [19]:
# Image numbers of the competition split; test_comp.tsv has no header row.
comp_pages = pd.read_csv(
    "test_comp.tsv",
    delimiter="\t",
    header=None,
    names=["image_nr"],
)

In [20]:
comp_db = []      # one record per word: loc and features (no transcription here)
comp_images = []  # cropped word images, in the same order as comp_db

for page_nr in comp_pages["image_nr"]:
    for word_img, _polygon, word_loc in extract_word_images_comp(page_nr):
        # Keep the crop itself so it can be written to disk afterwards.
        comp_images.append(word_img)

        # Preprocess the crop, then compute its features.
        features = utils.extract_features(utils.preprocess_img(word_img))

        comp_db.append(dict(loc=word_loc, features=features))

In [21]:
import os
import pickle

# Write each cropped competition word to comp_words/<position>.png, where
# position is the word's index in comp_images.
os.makedirs("comp_words", exist_ok=True)
for position in range(len(comp_images)):
    comp_images[position].save(f"comp_words/{position}.png")

# Persist the list of competition records as a single pickle file.
os.makedirs("comp_features", exist_ok=True)
with open("comp_features/comp_db.pkl", "wb") as db_file:
    pickle.dump(comp_db, db_file)