In [20]:
from keras.preprocessing.image import load_img
from skimage.feature import hog
from skimage import data, exposure
from scipy.spatial import distance
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

import os
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [21]:
## create list of image files

# enter paths to all directories from which you want to cluster images
paths = ["/home/tschernn/becore-clustering/preprocessed_imgs/binarized_imgs/writable_area_notar", "/home/tschernn/becore-clustering/preprocessed_imgs/binarized_imgs/writable_area_papst_oe"]
# change the working directory to the path where the images are located

# this list holds all the image filenames
charters = []

# iterate over files in path
for path in paths:
    with os.scandir(path) as files:
        for file in files:
            if file.name.endswith(('.png', '.jpg')):
              # adds the image files to the list
                charters.append((file.name, path.split('/')[-1]))

# save list of charters and folders
charter_p = r"/home/tschernn/becore-clustering/charter_list.pkl"

with open(charter_p,'wb') as file:
    pickle.dump(charters, file)

charters[1:-1:50]

[('wa_77d63365260e4f4f6b999bfb2dc7db88.jpg', 'writable_area_notar'),
 ('wa_9edaeeb95f8fbfba8211d1b029e40fc52.jpg', 'writable_area_papst_oe'),
 ('wa_ec6be3bcdb3d8136fc8807576dfcdde02.jpg', 'writable_area_papst_oe'),
 ('wa_30111c186b060268789e4a4ec66d962a.jpg', 'writable_area_papst_oe')]

In [22]:
## reshape imgs

img_size = (1000, 1000)

def reshape_img(file):
    # load the image as a PIL of the chosen size
    img = load_img(file, target_size=img_size)
    # convert from 'PIL.Image.Image' to numpy array => img.shape is (224, 224, 3)
    img = np.array(img) 
    # reshape the array for the model - reshape(num_of_samples, dim 1, dim 2, channels) => img.shape is (1, 224, 224, 3)
    reshaped_img = img.reshape(1, img_size[0], img_size[1], 3)
    return reshaped_img

### save and export image array
 
imgs_list = []
for charter in charters:
    reshaped = reshape_img(f'/home/tschernn/becore-clustering/preprocessed_imgs/binarized_imgs/{charter[1]}/{charter[0]}')
    imgs_list.append(reshaped)
imgs_array = np.array(imgs_list)
imgs_array = imgs_array.squeeze()
imgs_array.shape

p = r"/home/tschernn/becore-clustering/img_array.pkl"

with open(p,'wb') as file:
    pickle.dump(imgs_array, file)
    print(f'Saved image arrays as {file.name}.')

Saved image arrays as /home/tschernn/becore-clustering/img_array.pkl.


In [23]:
def extract_features_hog(image):
    feat, hog_image = hog(image, orientations=9, 
                        pixels_per_cell=(128, 128), cells_per_block=(2, 2), 
                        visualize=True, channel_axis=-1)
    return feat, hog_image

In [24]:
feature_list = []
hog_image_list = []

len_imgs = len(imgs_array)

for i in range(len_imgs):
    feature, hog_image = extract_features_hog(imgs_array[i])
    
    feature_list.append(feature)
    hog_image_list.append(hog_image)

In [25]:
# create an empty nxn distance matrix
distance_matrix = np.zeros((len_imgs, len_imgs))

for i in range(len_imgs):
    fd_i = feature_list[i]
    for k in range(i):
        fd_k = feature_list[k]
        # measure Jensen–Shannon distance between each feature vector and add to the distance matrix
        distance_matrix[i, k] = distance.jensenshannon(fd_i, fd_k)

# symmetrize the matrix as distance matrix is symmetric
distance_matrix = np.maximum(distance_matrix, distance_matrix.transpose())
data = distance_matrix

data.shape

(155, 155)

In [26]:
### export data as pickle file

p = r"/home/tschernn/becore-clustering/features_hog.pkl"

with open(p,'wb') as file:
    pickle.dump(distance_matrix, file)
    print(f'Saved features as {file.name}.')

Saved features as /home/tschernn/becore-clustering/features_hog.pkl.
