In [5]:
import sys
sys.path.append("../")

import typing
import csv
import pathlib
import random
import time

import numpy as np
import numpy.typing as npt
from IPython.display import display
import pypdf

from src.algorithm.clustering import ClusteringModel
from src.algorithm.elbow import Elbow
from src.types import CvLabelData
import src.visualization as vis

In [6]:
cv_data_path = pathlib.Path("../data/cv_pdf").resolve()
data_count: int = len(list(cv_data_path.glob("*.pdf")))

print("Er zitten", data_count, "CV's in", cv_data_path)

Er zitten 116 CV's in /home/jimmy/school/jaar-1/blok-d/ipass-23-24/data/cv_pdf


In [7]:
cv_label_data_path = pathlib.Path("../data/cv_cols.csv").resolve()

print("Databestand bestaat:", cv_label_data_path.exists())

cv_label_data: list[CvLabelData] = []

csv_file: typing.TextIO
with cv_label_data_path.open("r") as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)

    cv_label_data = [CvLabelData(*[int(col) for col in row]) for row in csv_reader]

print("Er zijn", len(cv_label_data), "CV's gelabeld in de dataset.")

random_cv: int = random.randint(0, len(cv_label_data))
print("CV", cv_label_data[random_cv].cv_index, "heeft", cv_label_data[random_cv].num_cols, "kolom(men).")

Databestand bestaat: True
Er zijn 116 CV's gelabeld in de dataset.
CV 77 heeft 2 kolom(men).


In [8]:
character_position_data: list[list[tuple[int, int]]] = []

display_handle_progress = display("0% van CV's omgezet naar datapunten.", display_id=True)
display_handle_time = display(display_id=True)
display_handle_size = display(display_id=True)

start_time = time.time()

for i, cv in enumerate(cv_label_data):
    cv_path = next(cv_data_path.glob(f"cv-{cv.cv_index}-*.pdf"))

    pdf_reader = pypdf.PdfReader(cv_path)
    page: pypdf.PageObject
    pdf_text = "\n".join([page.extract_text(extraction_mode="layout") for page in pdf_reader.pages])

    pdf_character_positions: list[tuple[int, int]] = []
    for y, line in enumerate(pdf_text.splitlines()):
        for x, char in enumerate(line):
            if not char.isspace():
                pdf_character_positions.append((x, y))

    character_position_data.append(pdf_character_positions)
    
    display_handle_progress.update(f"{round((i + 1) / data_count * 100, 0)}% van de CV's omgezet naar datapunten.")  # type: ignore

end_time = time.time()

display_handle_time.update(f"Klaar in {round(end_time - start_time, 1)} seconden.")  # type: ignore

dataset = character_position_data

display_handle_size.update(f"De dataset neem ongeveer {sys.getsizeof(dataset) / 1_000}KB aan ruimte in.")  # type: ignore


"100.0% van de CV's omgezet naar datapunten."

'Klaar in 12.0 seconden.'

'De dataset neem ongeveer 1.08KB aan ruimte in.'

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)


In [9]:
estimated_num_cols: npt.NDArray[np.int64] = np.zeros(len(cv_label_data), np.int64)
min_k, max_k = 1, 5

display_handle_progress = display(f"0% van de optimale clusters gevonden.", display_id=True)
display_handle_time = display(display_id=True)

start_time = time.time()

for i in range(len(estimated_num_cols)):
    cv_data = np.array(dataset[i])
    if len(cv_data) == 0:
        continue

    model = ClusteringModel(cv_data, 1, "lloyds")
    elbow = Elbow(model, min_k, max_k, cv_data)
    estimated_num_cols[i] = elbow.find_elbow()

    display_handle_progress.update(f"{round((i + 1) / data_count * 100, 0)}% van de optimale clusters gevonden.")  # type: ignore
    display_handle_time.update(f"Verlopen tijd: {round(time.time() - start_time, 1)} seconden.")  # type: ignore

end_time = time.time()

display_handle_time.update(f"Verlopen tijd: {round(end_time - start_time, 1)}s, gemiddelde duur: {round((end_time - start_time) / data_count, 1)}s")  # type: ignore

'100.0% van de optimale clusters gevonden.'

'Verlopen tijd: 124.8s, gemiddelde duur: 1.1s'

In [10]:
correctness_labels = [cv.num_cols == estimated_num_cols[i] for i, cv in enumerate(cv_label_data)]
print(estimated_num_cols)
num_correct = sum(correctness_labels)
num_incorrect = len(correctness_labels) - num_correct

display(f"Aantal correct: {num_correct}, aantal incorrect: {num_incorrect}, score: {round(num_correct / len(correctness_labels) * 100, 2)}%")

[1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1
 1 1 2 1 1 1 1 1 1 2 1 2 1 1 0 2 1 1 2 1 1 1 1 1 0 1 1 1 1 1 2 1 1 1 1 1 1
 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1
 1 1 1 1 1]


'Aantal correct: 17, aantal incorrect: 99, score: 14.66'