# Import packages

In [None]:
# # If needed, install the following packages:

# !pip install azure-ai-documentintelligence==1.0.0b1
# !pip install numpy
# !pip install pandas
# !pip install tensorflow
# !pip install scikit-learn
# !pip install scipy

In [1]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential

In [2]:
import copy
import io
import json
import os
import random
import re
import shutil
from collections import defaultdict, Counter

import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import euclidean_distances as L2, cosine_similarity as cs
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

2025-01-17 21:45:27.436717: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-17 21:45:28.662942: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-17 21:45:29.772070: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737150330.600415   99608 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737150330.837609   99608 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-17 21:45:33.122077: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

# Load Model

In [3]:
# Path of the trained weights file; it is passed to `siamese_network.load_weights` further down.
MODEL_PATH = 'model.keras'

# Define Model

Use the model parameters defined in training to configure the model settings.

In [4]:
# Side length, in pixels, of the square 3-channel inputs the network is built for.
IMG_SIZE = 224
# Keras application class instantiated as the backbone of the embedding model.
MODEL_ARCHITECTURE = tf.keras.applications.ResNet50V2
# Number of units of the Dense layer stacked on the backbone, i.e. the embedding dimension.
LAST_LAYER_SIZE = 2048

In [5]:
# Backbone without its classification head; `pooling='avg'` makes it output one
# feature vector per image.
embeddings_base_model = MODEL_ARCHITECTURE(
    include_top=False,
    weights="imagenet",
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    pooling='avg',
)
# Embedding model: backbone -> linear projection -> L2 normalisation along axis 1,
# so every embedding has unit length.
embeddings_model = models.Sequential([
        embeddings_base_model,  
        layers.Dense(LAST_LAYER_SIZE, activation=None), 
        layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)) 
    ])
# Three image inputs that all go through the SAME `embeddings_model` instance,
# i.e. the three branches share their weights.
image_input1 = Input(shape=(IMG_SIZE,IMG_SIZE,3),name='Image1')
image_input2 = Input(shape=(IMG_SIZE,IMG_SIZE,3),name='Image2')
image_input3 = Input(shape=(IMG_SIZE,IMG_SIZE,3),name='Image3')

anchor = embeddings_model(image_input1)
positive = embeddings_model(image_input2)
negative = embeddings_model(image_input3)

# NOTE(review): the anchor/positive/negative naming suggests a triplet-loss training
# setup; the training code is not visible here - confirm. The three-input model is
# rebuilt only so that the saved weights can be loaded into the same topology.
siamese_network = Model(inputs=[image_input1,image_input2,image_input3], outputs=[anchor,positive,negative])

2025-01-17 21:45:44.677255: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [6]:
# Load the weights stored at MODEL_PATH into the three-input network defined above.
# NOTE(review): presumably requires the architecture above to match the one used for
# training (same backbone, IMG_SIZE and LAST_LAYER_SIZE) - confirm against the training notebook.
siamese_network.load_weights(MODEL_PATH)

In [None]:
# Single-image embedding model used by the rest of the notebook.
# NOTE(review): this relies on the shared `embeddings_model` Sequential being the last
# entry of `siamese_network.layers`; referencing `embeddings_model` directly looks
# equivalent and less fragile - confirm.
image_embeddings_model = siamese_network.layers[-1]

# Image Preprocessing

The images need to be preprocessed according to the preprocessing of the training set.
<br>
In order to do that, the OCR text will be extracted and some transformations will be applied to the images.
<br>
All the images need to be in one folder, whose path must be set in DOCS_PATH.
<br>
The extracted OCR in json files will be stored in IMAGES_OCR_PATH.
<br>
The preprocessed images will be stored in DOCS_PREPROCESSED_PATH.

In [7]:
# Folder containing the original document images; it is walked recursively by the
# OCR-extraction and preprocessing loops below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DOCS_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/images/'

In [8]:
# Folder where the preprocessed images are written.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DOCS_PREPROCESSED_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/preprocessed_images/'
# makedirs(exist_ok=True) also creates missing parent folders and is safe to re-run,
# unlike the exists()/mkdir() pair, which fails when the parent folder is absent.
os.makedirs(DOCS_PREPROCESSED_PATH, exist_ok=True)

In [9]:
# OCR extraction configuration

# Folder where the OCR results (one JSON file per image) are stored.
IMAGES_OCR_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/images_ocr/'
# makedirs(exist_ok=True) also creates missing parent folders and is safe to re-run.
os.makedirs(IMAGES_OCR_PATH, exist_ok=True)

AZURE_DI_ENDPOINT = 'https://doc-intelligence-smartops-t102.cognitiveservices.azure.com/'
# The subscription key is a secret and must not live in the notebook: export it as the
# AZURE_DI_SUBSCRIPTION_KEY environment variable before starting the kernel.
# NOTE(review): a key was previously committed in this cell; it should be rotated.
AZURE_DI_SUBSCRIPTION_KEY = os.environ.get('AZURE_DI_SUBSCRIPTION_KEY', '')
# Extra keyword arguments forwarded to `begin_analyze_document` by `do_azure_di_ocr`.
AZURE_DI_DEFAULT_CONFIG = {} 
# Key that `do_azure_di_ocr` strips from the config before calling the service.
AZURE_DI_CONFIG_READING_ORDER = 'readingOrder'
AZURE_DI_MODEL_ID = 'prebuilt-read'
AZURE_DI_CONTENT_TYPE = 'application/octet-stream'

## OCR

In [10]:
def do_azure_di_ocr(client, image: np.ndarray):
    """Run the Azure Document Intelligence model on an in-memory image.

    Parameters
    ----------
    client
        Object exposing ``begin_analyze_document`` (a ``DocumentIntelligenceClient``
        in this notebook).
    image : np.ndarray
        Image array; it is JPEG-encoded with ``cv2.imencode`` before being uploaded.

    Returns
    -------
    The value returned by ``poller.result()`` for the analysis request.

    Raises
    ------
    ValueError
        If OpenCV reports that the image could not be JPEG-encoded.
    """
    # NOTE(review): cv2.imencode interprets the channels as BGR, while the caller in this
    # notebook passes an array coming from load_img/img_to_array (presumably RGB).
    # Confirm that the swapped channels do not matter for OCR.
    encoded_ok, jpeg_buffer = cv2.imencode('.jpg', image)
    if not encoded_ok:
        # Fail loudly instead of uploading an invalid or empty buffer.
        raise ValueError('cv2.imencode failed to encode the image as JPEG')
    jpeg_stream = io.BytesIO(jpeg_buffer)
    # The reading-order option is never forwarded to the service.
    config = {key: value for key, value in AZURE_DI_DEFAULT_CONFIG.items() if (key != AZURE_DI_CONFIG_READING_ORDER)}
    poller = client.begin_analyze_document(
        model_id=AZURE_DI_MODEL_ID,
        analyze_request=jpeg_stream,
        content_type=AZURE_DI_CONTENT_TYPE,
        **config
    )
    return poller.result()

In [11]:
def get_bounding_box(bb_list):
    """Collapse a flat polygon ``[x0, y0, x1, y1, ...]`` into an axis-aligned box.

    Returns ``[x_min, y_min, x_max, y_max]`` with every value converted with ``int``.
    """
    # Even positions hold the x coordinates, odd positions the y coordinates.
    xs, ys = bb_list[0::2], bb_list[1::2]
    extremes = (min(xs), min(ys), max(xs), max(ys))
    return [int(value) for value in extremes]

In [12]:
def get_image_words(image_path):
    """OCR an image file and return its words as plain dictionaries.

    The image is loaded from ``image_path``, sent to ``do_azure_di_ocr`` through the
    module-level ``azure_di_client``, and every word of the FIRST page of the result is
    converted to ``{'bbox': [x_min, y_min, x_max, y_max], 'text': <content>}``.
    """
    image_array = img_to_array(load_img(image_path))
    ocr_result = do_azure_di_ocr(azure_di_client, image_array)
    # Only page 0 is read; any further pages of the result are ignored.
    first_page_words = ocr_result['pages'][0]['words']
    return [
        {'bbox': get_bounding_box(bb_list=ocr_word.polygon),
         'text': ocr_word.content}
        for ocr_word in first_page_words
    ]

In [None]:
# Extract and save OCR
# Every image found under DOCS_PATH that has no OCR file yet is sent to the OCR
# service, and the resulting word list is stored as JSON in IMAGES_OCR_PATH.

azure_di_client = DocumentIntelligenceClient(endpoint=AZURE_DI_ENDPOINT, credential=AzureKeyCredential(AZURE_DI_SUBSCRIPTION_KEY))
for dirpath, dirnames, filenames in os.walk(DOCS_PATH):
    # Progress is reported against the folder being walked, so the counter stays
    # meaningful when DOCS_PATH contains sub-folders.
    total_in_folder = len(filenames)
    for i, image_name in enumerate(filenames):
        ocr_path = os.path.join(IMAGES_OCR_PATH, image_name.replace('.jpg', '.json'))
        # A direct existence check avoids re-listing the whole OCR folder for every image.
        if os.path.exists(ocr_path):
            continue
        print(f'{i+1}/{total_in_folder}')
        img_words = get_image_words(os.path.join(dirpath, image_name))
        with open(ocr_path, 'w') as f:
            json.dump(img_words, f)
## Image Alterations

In [15]:
def print_2_images_from_array(img1, img2, text_img1='Image 1', text_img2='Image 2'):
    """Display two images side by side, each under its own title."""
    # NOTE(review): `plt` is not imported in any visible cell; this function needs
    # `import matplotlib.pyplot as plt` to have been executed beforehand.
    _, panels = plt.subplots(1,2,figsize=(20, 8))
    for panel, picture, caption in zip(panels, (img1, img2), (text_img1, text_img2)):
        panel.imshow(picture)
        panel.title.set_text(caption)
    plt.show()

In [16]:
def print_3_images_from_array(img1, img2, img3, 
                              text_img1='Image 1', 
                              text_img2='Image 2', 
                              text_img3='Image 3'):
    """Display three images side by side, each under its own title."""
    # NOTE(review): `plt` is not imported in any visible cell; this function needs
    # `import matplotlib.pyplot as plt` to have been executed beforehand.
    pictures = (img1, img2, img3)
    captions = (text_img1, text_img2, text_img3)
    _, panels = plt.subplots(1, 3, figsize=(30, 10))
    for panel, picture, caption in zip(panels, pictures, captions):
        panel.imshow(picture)
        panel.title.set_text(caption)
    plt.show()

In [86]:
def preprocess_with_white_padding(img):
    """Pad an image with white pixels (value 255) so that it becomes square.

    Images taller than wide are padded on the left and on the right; images wider than
    tall are padded at the bottom only.

    Parameters
    ----------
    img
        Array or tensor indexed as (height, width, channels).

    Returns
    -------
    tf.Tensor
        The square image. A tensor is returned in every branch, including when the
        input is already square, so callers always receive the same type.
    """
    height, width = img.shape[0], img.shape[1]
    if height > width:
        missing = height - width
        pad_left = missing // 2
        # The odd pixel, if any, goes to the right so the result is exactly square.
        paddings = [[0, 0], [pad_left, missing - pad_left], [0, 0]]
    elif width > height:
        paddings = [[0, width - height], [0, 0], [0, 0]]
    else:
        # Already square: nothing to pad, but keep the return type consistent.
        return tf.convert_to_tensor(img)
    # Pad the image with white pixels using constant value 255
    return tf.pad(img, paddings, constant_values=255)

In [None]:
def get_img_max_coords_from_ocr(image, image_ocr):
    """Return the box ``(x_min, y_min, x_max, y_max)`` enclosing every OCR word.

    The minima start at the image width/height and the maxima at 0, so with no words
    the result is ``(width, height, 0, 0)``.
    """
    # Boxes are visited last-to-first so that, on equal values, the same element as in
    # a sequential running min/max is kept.
    boxes = [entry['bbox'] for entry in image_ocr][::-1]
    left = min([box[0] for box in boxes] + [image.shape[1]])
    top = min([box[1] for box in boxes] + [image.shape[0]])
    right = max([box[2] for box in boxes] + [0])
    bottom = max([box[3] for box in boxes] + [0])
    return left, top, right, bottom

In [14]:
def crop_white_border(image, image_ocr):
    """Crop the image to its content and re-place that content on a white canvas.

    The content box is the union of the OCR word boxes and of the bounding rectangles of
    every sufficiently large non-white contour. The cropped region is pasted on a white
    canvas of the ORIGINAL size, aligned to the top and centred horizontally.

    Parameters
    ----------
    image
        Array indexed as (height, width, channels).
    image_ocr
        List of word dictionaries with a 'bbox' entry, as read by
        ``get_img_max_coords_from_ocr``.

    Returns
    -------
    Array with the same shape and dtype as ``image``.
    """
    # Contours smaller than 1/2000 of the image area are ignored.
    min_contour_area = (image.shape[0] * image.shape[1]) / 2000
    # NOTE(review): COLOR_BGR2GRAY is used although the pipeline loads images with
    # load_img (presumably RGB); the grey levels differ slightly from an RGB conversion -
    # confirm this matches the training preprocessing before changing it.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # Convert the image to grayscale
    # Pixels with grey level <= 245 become foreground (255), brighter ones background (0).
    _, thresh = cv2.threshold(gray, 245, 255, cv2.THRESH_BINARY_INV) # Threshold the image to separate foreground and background
    thresh = thresh.astype(np.uint8) # Ensure the thresholded image is 8-bit unsigned integer
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # Find contours of non-white regions
    # Start from the box enclosing the OCR words, then grow it with each large contour.
    x_min, y_min, x_max, y_max = get_img_max_coords_from_ocr(image, image_ocr)
    for contour in contours:
        if cv2.contourArea(contour) > min_contour_area:
            x, y, w, h = cv2.boundingRect(contour)
            x_min = min(x, x_min)
            y_min = min(y, y_min)
            x_max = max(x + w, x_max)
            y_max = max(y + h, y_max)
    # Crop the image based on the combined bounding box.
    # With no words and no large contour the box is inverted and the crop is empty,
    # which yields an all-white canvas below.
    cropped_image = image[y_min:y_max, x_min:x_max] 

    # Paste the cropped area on a white canvas: top-aligned, horizontally centred.
    original_height, original_width = image.shape[:2]
    cropped_height, cropped_width = cropped_image.shape[:2]
    padded_image = np.ones((original_height, original_width, image.shape[2]), dtype=image.dtype) * 255
    horizontal_offset = (original_width - cropped_width) // 2
    padded_image[:cropped_height, horizontal_offset:horizontal_offset + cropped_width] = cropped_image
    
    return padded_image

In [80]:
def thicken_borders(image, color=(0, 0, 0), dilate_iterations=2, canny_threshold1=75, canny_threshold2=150):
    """Draw thickened Canny edges over an image.

    Parameters
    ----------
    image
        TensorFlow tensor or NumPy array indexed as (height, width, channels); it is
        converted to a ``uint8`` NumPy array.
    color
        Value written on every pixel of the thickened edges.
    dilate_iterations
        Number of dilation passes applied to the detected edges.
    canny_threshold1, canny_threshold2
        Hysteresis thresholds passed to ``cv2.Canny``.

    Returns
    -------
    np.ndarray
        ``uint8`` copy of the image with the thickened edges painted in ``color``.
    """
    # Accept both tensors (the padded branch of the preprocessing) and plain arrays
    # (e.g. an already-square image that was never converted to a tensor).
    if hasattr(image, 'numpy'):
        image = image.numpy()
    image = np.asarray(image).astype(np.uint8)  # Convert to uint8
    # Kernel side grows with the image: mean of height and width divided by 400, rounded.
    # NOTE(review): this can be 0 for small images, giving an empty kernel - confirm
    # whether a minimum of 1 was intended.
    thickness = max(int(round(((image.shape[0] + image.shape[1]) / 2) / 400, 0)), 0)
    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    # Apply Gaussian blur to reduce noise
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    # Detect edges using Canny
    edges = cv2.Canny(gray, canny_threshold1, canny_threshold2, L2gradient=True)
    # Dilate the edges to thicken them
    kernel = np.ones((thickness, thickness), np.uint8)
    thickened_edges = cv2.dilate(edges, kernel, iterations=dilate_iterations)
    # Create a mask for the thickened edges
    thicken_edge_mask = thickened_edges > 0
    # Work on a copy so the converted input array is left untouched
    result = image.copy()
    # Overlay the thickened borders in the specified color
    result[thicken_edge_mask] = color
    return result

In [81]:
def highlight_word(img, word, color):
    """Paint a filled rectangle of ``color`` over a word's bounding box, in place.

    ``word['bbox']`` is read as ``[x_min, y_min, x_max, y_max]``; ``img`` is modified
    directly and nothing is returned.
    """
    x_min, y_min, x_max, y_max = (word['bbox'][position] for position in range(4))
    # thickness=-1 asks OpenCV for a filled rectangle instead of an outline.
    cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, thickness=-1)

def highlight_text(img, img_ocr):
    """Cover numeric and date-like words with filled colour boxes.

    Words that are entirely numeric (optionally wrapped in ``$``, ``€`` or ``₹``) are
    painted with colour ``(255, 0, 0)``; otherwise, words containing a date such as
    ``12/31/2024`` or ``1-2-24`` are painted with ``(0, 255, 0)``. Other words are left
    untouched.

    Parameters
    ----------
    img
        Image array, modified in place through ``highlight_word``.
    img_ocr
        List of word dictionaries with 'text' and 'bbox' entries.

    Returns
    -------
    The same ``img`` object that was passed in.
    """
    # Raw strings keep the patterns identical while avoiding invalid-escape warnings.
    # The number pattern needs at least two characters, so a lone digit is not matched.
    number_pattern = r'^[\$€₹]*[0-9,\-]+\.*[0-9]+[\$€₹]*$'
    date_pattern = r'[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}'
    for word in img_ocr:
        text = word['text']
        if re.search(number_pattern, text): # Highlight numbers
            highlight_word(img, word, color=(255, 0, 0))
        elif re.search(date_pattern, text): # Highlight dates
            highlight_word(img, word, color=(0, 255, 0))
    return img

In [82]:
def get_text_size(img_ocr):
    """Return the median height of the OCR word boxes.

    The height of a word is ``bbox[3] - bbox[1]`` (y_max minus y_min).
    """
    words_height = np.array([word['bbox'][3] - word['bbox'][1] for word in img_ocr])
    # The notebook imports numpy as `np`; the bare name `numpy` is not defined.
    return np.median(words_height)

In [83]:
def doc_preprocessing(img, img_ocr):
    """Apply the full preprocessing chain to a document image.

    Steps, in order: highlight numbers and dates, crop the white border, pad to a square
    with white pixels, and thicken the detected edges. The result of the last step is
    returned.
    """
    highlighted = highlight_text(img, img_ocr)
    cropped = crop_white_border(highlighted, img_ocr)
    squared = preprocess_with_white_padding(cropped) # add white padding to keep proportions
    return thicken_borders(squared)

In [84]:
def preprocess_image(image_path, img_ocr_path):
    """Load an image and its OCR JSON file, then run ``doc_preprocessing`` on them."""
    image_array = img_to_array(load_img(image_path))
    with open(img_ocr_path) as ocr_file:
        ocr_words = json.load(ocr_file)
    return doc_preprocessing(image_array, ocr_words)

## Image Preprocessing Execution

In [24]:
# Save preprocessed images
# Every image under DOCS_PATH that has an OCR file is preprocessed and written, under
# the same file name, to DOCS_PREPROCESSED_PATH. Images without OCR are skipped.

for dirpath, dirnames, filenames in os.walk(DOCS_PATH):
    for filename in filenames:
        image_ocr_path = os.path.join(IMAGES_OCR_PATH, filename.replace('.jpg', '.json'))
        # A direct existence check avoids re-listing the whole OCR folder for every image.
        if not os.path.exists(image_ocr_path):
            continue
        image_path = os.path.join(dirpath, filename)
        img_pp = preprocess_image(image_path, image_ocr_path)
        # cv2.imwrite expects BGR channel order, the preprocessing works in RGB.
        img_pp_bgr = cv2.cvtColor(img_pp, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(DOCS_PREPROCESSED_PATH, filename), img_pp_bgr)

# Calculate embeddings code

Using the model and the preprocessed images, the embedding representation of each image will be calculated.
<br>
It is necessary to set BATCH_SIZE, the number of images that are processed at the same time: the bigger, the faster. Its value is limited by the machine resources.

In [17]:
# Number of images embedded per `model.predict` call in `get_all_embeddings`.
BATCH_SIZE = 32

In [18]:
# Generator used only for its `standardize` step, configured to rescale pixels by 1/255.
datagen = ImageDataGenerator(rescale=1/255.)
def preprocess_image_with_datagen(datagen, image_path, img_size=IMG_SIZE):
    """Load an image resized to ``img_size`` x ``img_size`` and standardise it.

    Returns the array produced by ``datagen.standardize`` for the loaded image.
    """
    resized_image = load_img(image_path, target_size=(img_size, img_size))
    return datagen.standardize(img_to_array(resized_image))

In [19]:
def get_doc_embeddings_batch(model, datagen, batch_filenames, img_size=IMG_SIZE):
    """Embed a list of image files with a single ``model.predict`` call.

    Every path of ``batch_filenames`` is loaded with ``preprocess_image_with_datagen``,
    the images are stacked into one array and predicted as one batch. The prediction is
    returned as is, one row per input path and in the same order.
    """
    stacked_images = np.stack(
        [preprocess_image_with_datagen(datagen, path, img_size) for path in batch_filenames]
    )
    return model.predict(stacked_images, batch_size=len(batch_filenames))
    
def get_all_embeddings(folder_path, filter_doc_names=None, batch_size=BATCH_SIZE):
    """Embed every image found under ``folder_path`` (walked recursively).

    Parameters
    ----------
    folder_path
        Root folder of the images to embed.
    filter_doc_names
        Optional collection of file names; when non-empty, only files whose name is in
        it are embedded. ``None`` or an empty collection means "no filter".
    batch_size
        Number of images sent to the model per prediction call.

    Returns
    -------
    dict
        Maps file name (without folder) to its embedding. Files sharing a name in
        different sub-folders overwrite each other.

    Notes
    -----
    Uses the module-level ``image_embeddings_model`` and ``datagen``.
    """
    # A set makes the per-file membership test constant-time.
    allowed_names = set(filter_doc_names) if filter_doc_names else None
    all_embeddings = {}
    pending_names = []
    pending_paths = []

    def _embed_pending():
        # Embed the accumulated files, store the results and empty the buffers so that
        # no image is ever embedded twice.
        embeddings = get_doc_embeddings_batch(image_embeddings_model, datagen, pending_paths)
        all_embeddings.update(zip(pending_names, embeddings))
        pending_names.clear()
        pending_paths.clear()

    for dirpath, dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            if allowed_names is not None and filename not in allowed_names:
                continue
            pending_names.append(filename)
            pending_paths.append(os.path.join(dirpath, filename))
            if len(pending_names) == batch_size:
                _embed_pending()
    # Last, incomplete batch: flushed once, after the whole tree has been walked.
    if pending_names:
        _embed_pending()
    return all_embeddings

# Classify documents from templates base documents

During this testing, we will use a set of documents categorized by template, referred to as "base documents." We will then measure the distance between each uncategorized document and the base documents. Each uncategorized document will be assigned the template of the closest base document. These classified documents will be stored in STORE_CLASSIFICATION_PATH. If no base document lies at a distance lower than the defined THRESHOLD, the document will be classified as "Others."

In [7]:
# Folder of base documents; the name of the sub-folder holding a file is used as its
# template name (see `get_template_mapping_by_doc`).
BASE_DOCS_TEMPLATES_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB Templates/'
# Preprocessed images from which the base-document embeddings are computed.
# NOTE(review): same folder as UNCATEGORIZED_DOCS_PREPROCESSED_PATH; the two sets are
# separated by file name only.
BASE_DOCS_PREPROCESSED_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/All EOB/preprocessed_images/'
# Original images of the documents to classify; classified copies are taken from here.
UNCATEGORIZED_DOCS_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/All EOB/images/'
# Preprocessed images from which the uncategorized-document embeddings are computed.
UNCATEGORIZED_DOCS_PREPROCESSED_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/All EOB/preprocessed_images/'
# Output folder: one sub-folder per assigned template is created inside it.
STORE_CLASSIFICATION_PATH = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/All EOB/classification/'
# Maximum Euclidean distance to the closest base document for its template to be assigned.
THRESHOLD = 0.5

In [11]:
def get_template_mapping_by_doc(data_path):
    """Map every file name found under ``data_path`` to the name of its folder.

    The tree is walked recursively; when the same file name appears in several
    folders, the folder visited last wins.
    """
    return {
        filename: os.path.basename(dirpath)
        for dirpath, _, filenames in os.walk(data_path)
        for filename in filenames
    }

In [12]:
def classify_docs(base_docs_embeddings, uncategorized_docs_embeddings, template_mapping, threshold=THRESHOLD):
    """Assign each uncategorized document the template of its nearest base document.

    Parameters
    ----------
    base_docs_embeddings, uncategorized_docs_embeddings
        Dictionaries mapping document name to embedding array.
    template_mapping
        Dictionary mapping base-document name to template name.
    threshold
        A template is assigned only when the Euclidean distance to the nearest base
        document is strictly below this value; otherwise the template is 'Others'.

    Returns
    -------
    dict
        ``{doc_name: {'closest_template': str, 'distance': float}}`` where the distance
        is the nearest-base-document distance rounded to two decimals.
    """
    base_matrix = np.array([vector.flatten() for vector in base_docs_embeddings.values()])
    base_doc_names = list(base_docs_embeddings.keys())
    query_matrix = np.array([vector.flatten() for vector in uncategorized_docs_embeddings.values()]) 
    # Rows are base documents, columns are uncategorized documents.
    distances = cdist(base_matrix, query_matrix, metric="euclidean")

    docs_classification = {}
    for column, doc_name in enumerate(uncategorized_docs_embeddings.keys()):
        column_distances = distances[:, column]
        nearest_row = np.argmin(column_distances)
        nearest_distance = column_distances[nearest_row]
        if nearest_distance < threshold:
            template = template_mapping[base_doc_names[nearest_row]]
        else:
            template = 'Others'
        docs_classification[doc_name] = {
            'closest_template': template,
            'distance': round(float(nearest_distance), 2),
        }

    return docs_classification

In [13]:
def save_classified_docs(data_path, store_path, docs_classification):
    """Copy every classified document into the folder of its assigned template.

    Parameters
    ----------
    data_path
        Folder containing the documents to copy.
    store_path
        Output folder; one sub-folder per template name is created inside it (missing
        parent folders are created as well).
    docs_classification
        ``{doc_name: {'closest_template': str, ...}}`` as returned by ``classify_docs``.

    A document that cannot be copied (e.g. missing source file) is reported with a
    printed message and skipped; the remaining documents are still processed.
    """
    for doc_name, classification in docs_classification.items():
        template_dir = os.path.join(store_path, classification['closest_template'])
        os.makedirs(template_dir, exist_ok=True)

        source_path = os.path.join(data_path, doc_name)
        try:
            # shutil.copy avoids the shell: names with quotes, `$` or spaces are safe,
            # and the code no longer depends on a Unix `cp` binary.
            shutil.copy(source_path, os.path.join(template_dir, doc_name))
        except OSError as error:
            print(f'Could not copy "{source_path}": {error}')

In [14]:
# Names of all the files found under BASE_DOCS_TEMPLATES_PATH, whatever their sub-folder.
base_docs_names = [
    filename
    for _, _, filenames in os.walk(BASE_DOCS_TEMPLATES_PATH)
    for filename in filenames
]

In [21]:
# Embed only the preprocessed images whose file name is one of the base documents
# collected from BASE_DOCS_TEMPLATES_PATH.
base_docs_embeddings = get_all_embeddings(BASE_DOCS_PREPROCESSED_PATH, batch_size=BATCH_SIZE, filter_doc_names=base_docs_names)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 459ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 526ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 456ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 454ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 462ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [22]:
# Exclude the documents that are already base documents of a template.
# A set keeps the membership test constant-time instead of scanning the list per file.
_base_docs_names_lookup = set(base_docs_names)
uncategorized_doc_names = [doc_name for doc_name in os.listdir(UNCATEGORIZED_DOCS_PATH) if doc_name not in _base_docs_names_lookup]

In [24]:
# Embed only the preprocessed images whose file name is in `uncategorized_doc_names`,
# i.e. images listed in UNCATEGORIZED_DOCS_PATH that are not base documents.
uncategorized_docs_embeddings = get_all_embeddings(UNCATEGORIZED_DOCS_PREPROCESSED_PATH, batch_size=BATCH_SIZE, filter_doc_names=uncategorized_doc_names)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 484ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 474ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 477ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 473ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 446ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [30]:
# Base-document file name -> template (name of the folder holding it).
template_mapping = get_template_mapping_by_doc(BASE_DOCS_TEMPLATES_PATH)
# Nearest-base-document classification, using the default THRESHOLD.
docs_classification = classify_docs(base_docs_embeddings, uncategorized_docs_embeddings, template_mapping)
# Copy the ORIGINAL (non-preprocessed) images into one folder per assigned template.
save_classified_docs(UNCATEGORIZED_DOCS_PATH, STORE_CLASSIFICATION_PATH, docs_classification)

# Clustering

In this testing exercise, given a set of uncategorized documents, we will compare every pair of documents and group them if their distance is lower than THRESHOLD.

In [21]:
# Maximum Euclidean distance for two documents to be linked into the same cluster.
# This re-assigns the THRESHOLD already set in the classification section (same value).
THRESHOLD = 0.5

In [22]:
def get_all_distances(embeddings):
    """Return the square matrix of pairwise Euclidean distances between embeddings.

    ``embeddings`` maps a name to an array; every array is flattened first. Row and
    column ``i`` of the result correspond to the ``i``-th entry of the dictionary.
    """
    flat_vectors = [vector.flatten() for vector in embeddings.values()]
    matrix = np.array(flat_vectors)
    return cdist(matrix, matrix, metric="euclidean")

In [23]:
def template_clustering(embeddings, threshold=THRESHOLD):
    """Group documents that are linked by chains of close neighbours.

    Two documents are linked when their Euclidean distance is strictly below
    ``threshold``; groups sharing at least one document are merged until all groups are
    disjoint. The number of resulting clusters is printed.

    Parameters
    ----------
    embeddings
        Dictionary mapping document name to embedding array.
    threshold
        Maximum distance for two documents to be linked.

    Returns
    -------
    dict
        Maps document name to the integer index of its cluster.
    """
    doc_names = list(embeddings.keys())
    distances = get_all_distances(embeddings)

    # One candidate group per document: every document closer than `threshold` to it.
    pending = [
        [doc_names[column] for column in np.where(distances[row] < threshold)[0]]
        for row in range(len(doc_names))
    ]

    final_clusters = []
    while pending:
        current = pending.pop(0)
        position = 0
        while position < len(pending):
            combined = current + pending[position]
            unique_docs = set(combined)
            if len(unique_docs) == len(combined):
                # No repeated name: the two groups are disjoint, look at the next one.
                position += 1
                continue
            # Overlap: absorb the group and rescan from the start, because the grown
            # group may now overlap groups that were skipped earlier.
            current = list(unique_docs)
            del pending[position]
            position = 0
        final_clusters.append(current)

    print(len(final_clusters))

    return {
        doc_name: cluster_index
        for cluster_index, cluster in enumerate(final_clusters)
        for doc_name in cluster
    }


In [24]:
def create_clutering_folders(data_path, store_path, clustering_result):
    """Copy every clustered document into a folder named after its cluster.

    Parameters
    ----------
    data_path
        Folder (walked recursively) containing the documents to copy.
    store_path
        Output folder. Cluster ``n`` is stored in ``<store_path>/<basename>_<n>``, where
        ``<basename>`` is the last component of ``store_path``. Missing folders are
        created.
    clustering_result
        Dictionary mapping file name to cluster index.

    Files absent from ``clustering_result`` are reported with a printed message. A file
    that cannot be copied is reported and skipped; the others are still processed.
    """
    cluster_prefix = os.path.basename(store_path)
    for dirpath, dirnames, filenames in os.walk(data_path):
        for filename in filenames:
            if filename not in clustering_result:
                print('Document ', filename, ' not in clustering results')
                continue
            cluster_n = clustering_result[filename]
            cluster_storage_path = os.path.join(store_path, f'{cluster_prefix}_{cluster_n}')
            os.makedirs(cluster_storage_path, exist_ok=True)
            source_path = os.path.join(dirpath, filename)
            try:
                # shutil.copy avoids the shell: names with quotes, `$` or spaces are
                # safe, and the code no longer depends on a Unix `cp` binary.
                shutil.copy(source_path, os.path.join(cluster_storage_path, filename))
            except OSError as error:
                print(f'Could not copy "{source_path}": {error}')
    

In [25]:
# Original images: the files copied into the cluster folders come from here.
data_path = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/images/'
# Preprocessed images: the embeddings are computed from these.
preprocessed_data_path = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/preprocessed_images/'
# Output folder of the clustering; its last component is also the cluster-folder prefix.
store_path = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/clustering'
# No name filter: every file under `preprocessed_data_path` is embedded.
embeddings = get_all_embeddings(preprocessed_data_path)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 453ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 475ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 448ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 492ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [26]:
# Cluster the documents (the number of clusters is printed by `template_clustering`),
# then copy the original images into one folder per cluster.
clustering_result = template_clustering(embeddings, threshold=THRESHOLD)
create_clutering_folders(data_path, store_path, clustering_result)


362


# Iterative Clustering

This is the same exercise as before but using a different approach to cluster the documents.

In [163]:
def _merge_overlapping_clusters(clusters):
    """Join clusters sharing at least one document until all clusters are disjoint."""
    pending = list(clusters)
    merged = []
    while pending:
        current = pending.pop(0)
        position = 0
        while position < len(pending):
            combined = current + pending[position]
            unique_docs = set(combined)
            if len(unique_docs) != len(combined):  # a repeated name means an overlap
                current = list(unique_docs)
                pending.pop(position)
                # The grown cluster may now overlap clusters that were skipped earlier.
                position = 0
            else:
                position += 1
        merged.append(current)
    return merged


def iterative_template_clustering(embeddings, initial_threshold, avg_threshold):
    """Cluster documents in two passes: by document distance, then by cluster centroid.

    Pass 1 links documents whose Euclidean distance is strictly below
    ``initial_threshold`` and merges overlapping groups. Pass 2 averages the embeddings
    of each pass-1 cluster, links clusters whose averaged embeddings are closer than
    ``avg_threshold`` and merges overlapping groups again.

    Parameters
    ----------
    embeddings
        Dictionary mapping document name to embedding array.
    initial_threshold
        Maximum document-to-document distance used in pass 1.
    avg_threshold
        Maximum distance between averaged cluster embeddings used in pass 2.

    Returns
    -------
    dict
        Maps document name to the integer index of its final cluster.
    """
    doc_names = list(embeddings.keys())
    distances = get_all_distances(embeddings)

    # Pass 1: one candidate group per document, then merge the overlapping groups.
    neighbour_groups = [
        [doc_names[j] for j in np.where(distances[i] < initial_threshold)[0]]
        for i in range(len(doc_names))
    ]
    initial_clusters = _merge_overlapping_clusters(neighbour_groups)

    # Pass 2: represent every cluster by the mean of its members' embeddings.
    avg_clusters_embeddings = {str(i): np.mean([embeddings[doc_name] for doc_name in cluster], axis=0) for i, cluster in enumerate(initial_clusters)}
    avg_distances = get_all_distances(avg_clusters_embeddings)
    averaged_clusters = []
    for i in range(avg_distances.shape[0]):
        close_clusters = [initial_clusters[j] for j in np.where(avg_distances[i] < avg_threshold)[0]]
        # Flatten the neighbouring clusters into a single list of document names.
        averaged_clusters.append([doc_name for cluster in close_clusters for doc_name in cluster])

    final_clusters = _merge_overlapping_clusters(averaged_clusters)

    clustering_result = {}
    for i, cluster in enumerate(final_clusters):
        for doc_name in cluster:
            clustering_result[doc_name] = i

    return clustering_result

In [153]:
# NOTE: this cell re-assigns `data_path`, `preprocessed_data_path`, `store_path` and
# `embeddings` from the previous section, pointing them to a different dataset.
# Original images: the files copied into the cluster folders come from here.
data_path = '/home/165252@USTDEV.COM/Documents/Documents Datasets/Navistar/original_feb_21/JPGs'
# Preprocessed images: the embeddings are computed from these.
preprocessed_data_path = '/home/165252@USTDEV.COM/Documents/Documents Datasets/Navistar/original_feb_21/preprocessed_images'
# Output folder of the clustering; its last component is also the cluster-folder prefix.
store_path = '/home/165252@USTDEV.COM/Documents/Template Identification/Test/Test1 - Navistar Feb/iterative_classified'
embeddings = get_all_embeddings(preprocessed_data_path)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 508ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 647ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 513ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 816ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 872ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 494ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 538ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 584ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 575ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 589ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 494ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 588ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [167]:
# Two-pass clustering: documents are linked below 0.3, cluster averages below 0.5.
clustering_result = iterative_template_clustering(embeddings, initial_threshold=0.3, avg_threshold=0.5)
# Copy the original images into one folder per final cluster.
create_clutering_folders(data_path, store_path, clustering_result)

# Compare 2 images

In [86]:
# NOTE(review): this cell repeats, unchanged, the `datagen` and
# `preprocess_image_with_datagen` definitions of the "Calculate embeddings code"
# section; running it simply re-binds both names to equivalent objects.
datagen = ImageDataGenerator(rescale=1/255.)
def preprocess_image_with_datagen(datagen, image_path, img_size=IMG_SIZE):
    """Load an image resized to ``img_size`` x ``img_size`` and standardise it with ``datagen``."""
    img = load_img(image_path, target_size=(img_size, img_size))
    img = img_to_array(img)
    img = datagen.standardize(img)
    return img

In [87]:
def get_distance_of_2_images(model, datagen, image1_path, image2_path):
    """Return the Euclidean distance between the embeddings of two image files.

    Both images are loaded at ``IMG_SIZE`` with ``preprocess_image_with_datagen``,
    embedded one at a time with ``model.predict`` and compared with ``L2`` (scikit-learn
    ``euclidean_distances``), whose result is returned unchanged.
    """
    images = [
        preprocess_image_with_datagen(datagen, path, img_size=IMG_SIZE)
        for path in (image1_path, image2_path)
    ]
    # expand_dims adds the batch axis expected by `predict`.
    first_embedding, second_embedding = (
        model.predict(np.expand_dims(image, axis=0)) for image in images
    )
    return L2(first_embedding, second_embedding)

In [89]:
# Two preprocessed pages to compare.
image1_path = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/preprocessed_images/23657_20240913_99995270_page_1.jpg'
image2_path = '/home/dut-ftp-user/Documents/miguel/JupyterWorkspace/Similarity Classifier Siamese Network/EOB 2024-10-15/preprocessed_images/23847_20240917_99995270_page_1.jpg'
# The distance is the cell output: a 1x1 array (0.7040956 in the recorded run).
get_distance_of_2_images(image_embeddings_model, datagen, image1_path, image2_path)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step


array([[0.7040956]], dtype=float32)