In [2]:
### dimensionality reduction with PCA
## input: pickled array of feature vectors
## output: pickled array of PCA-reduced feature vectors, plus an interactive 2-D scatter plot saved as HTML

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import pickle

import pandas as pd
import os
import re

from io import BytesIO
from PIL import Image
import base64

from bokeh.plotting import figure, save, output_file
from bokeh.models import ColumnDataSource, HoverTool, CategoricalColorMapper
from bokeh.palettes import Category10
from bokeh.io import output_file

In [3]:
# import features

feature_path = r"/home/tschernn/becore-clustering/pickles/features_hog.pkl"

# NOTE(review): unpickling can execute arbitrary code stored in the file;
# only load feature files that were produced by a trusted pipeline.
with open(feature_path, 'rb') as file:
    image_features = pickle.load(file)

# Some feature pickles appear to be stored as a mapping (hence the conversion
# that used to be toggled by hand via a commented-out line). Handle both
# layouts here: a dict is flattened to an array of its values, anything else
# is passed through unchanged.
if isinstance(image_features, dict):
    feature_array = np.array(list(image_features.values()))
else:
    feature_array = image_features

In [4]:
## option 1: reduce dimensions for clustering (5-10) and save

# make instance of the model
# n_components is either a desired amount of retained variance (e.g. 0.98),
# or the number of dimensions to reduce to (e.g. 50)
pca = PCA(n_components = 10, random_state = 47)

# fit the model and project the data in one pass; fit_transform already
# returns the projected data, so a separate transform() call would only
# repeat the projection.
# NOTE: feature_array is overwritten with the reduced data, so any later cell
# that reads feature_array sees the PCA output rather than the raw features.
feature_array = pca.fit_transform(feature_array)
print("Dimensions of our data after PCA  = " + str(feature_array.shape))

# Derive the output path from the input path. splitext strips only the final
# extension, so dots in directory names do not truncate the path, and the
# suffix check keeps the name stable when this cell is executed again.
feature_root, _ = os.path.splitext(feature_path)
if not feature_root.endswith('_dimred_pca'):
    feature_root += '_dimred_pca'
feature_path = feature_root + '.pkl'

### export data as pickle file

with open(feature_path,'wb') as file:
    pickle.dump(feature_array, file)
    print(f'Saved features as {file.name}.')

array([[ 54.10946   ,   6.78184   ,   3.996446  , ...,  -5.9402175 ,
          2.2573185 ,   0.07432115],
       [ -5.740635  ,  -4.742322  ,   3.1904168 , ...,   1.3557707 ,
          1.1334515 ,  -1.0469033 ],
       [-14.180774  ,   1.963369  ,   5.1786504 , ...,   1.6927397 ,
         -1.0850767 ,   0.32431412],
       ...,
       [-17.478882  ,   3.553806  ,  10.483156  , ...,  -0.41904357,
          0.24606015,  -2.693224  ],
       [  1.4703169 ,   1.1796021 ,  -8.481215  , ...,  -0.26435247,
          1.6769    ,  -3.3750725 ],
       [-13.148406  ,   4.2342772 ,   2.7886002 , ...,  -0.88735485,
         -5.989387  ,  -0.21445554]], dtype=float32)

In [5]:

# Project the features to 2 dimensions for plotting. The result has to be
# captured: fit_transform returns the projected data and does not modify its
# input, and `embedding` is what the rest of this cell plots.
# NOTE(review): feature_array is whatever the previous cell left behind
# (possibly already PCA-reduced) - confirm that this is intended.
pca = PCA(n_components = 2, random_state = 47)
embedding = pca.fit_transform(feature_array)

# NOTE(review): `charters` is not defined in the visible cells; presumably a
# list of image paths in the same order as the rows of feature_array - confirm.

# convert to a DataFrame for bokeh
df = pd.DataFrame(embedding, columns=('x', 'y'))
df['label'] = [path.split('/')[-2] for path in charters]  # categorical labels for coloring

# NOTE(review): the file name mentions ResNet/UMAP although this cell computes
# a PCA projection - confirm the target so another plot is not overwritten.
output_file("visualizations/austria_clahe_retinex_resnet_umap.html") # saves HTML of plot

# === Define same-scribe prefixes ===
same_scribe_prefixes = ['fa17d', '2a924', '0c0fb', 'e37dc', '7e387', 
                        '155ec', '08312', 'b1a27', '60234', '0c20d']

# === Create a flag for same-scribe samples ===
# str.startswith accepts a tuple of prefixes, which replaces the inner loop.
same_scribe_flag = [
    fname.split('/')[-1].startswith(tuple(same_scribe_prefixes))
    for fname in charters
]

# convert images to base64 for thumbnail display
def encode_image_to_base64(image_path):
    """Return a base64 string of a PNG thumbnail of the image at image_path.

    The image is opened with PIL, shrunk in place with thumbnail((100, 100)),
    written to an in-memory buffer as PNG, and the buffer's bytes are
    base64-encoded and decoded to str.
    """
    png_buffer = BytesIO()
    with Image.open(image_path) as thumb:
        thumb.thumbnail((100, 100))
        thumb.save(png_buffer, format="PNG")
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode()

# Extract parent directories and simplify names after "writable_area_"
directories = [os.path.basename(os.path.dirname(path)).lower() for path in charters]
simplified_dirs = [re.sub(r'^.*writable_area_', '', d) for d in directories]

# Create unique color mapping for simplified directories.
# The factors are sorted because set iteration order for strings can change
# between interpreter runs, which would shuffle the directory -> color
# assignment (and the legend order) every time the notebook is re-executed.
unique_dirs = sorted(set(simplified_dirs))
# Category10 offers 10 colors; repeat the palette when there are more directories.
palette = Category10[10] if len(unique_dirs) <= 10 else Category10[10] * (len(unique_dirs) // 10 + 1)
color_mapper = CategoricalColorMapper(factors=unique_dirs, palette=palette[:len(unique_dirs)])

# Assign a color to each point based on its directory.
# A dict gives the same factor -> color pairing as the mapper without a
# linear factors.index() search for every point.
dir_to_color = dict(zip(unique_dirs, palette))
data_colors = [dir_to_color[d] for d in simplified_dirs]

# Extract filenames and encode images (thumbnails are embedded as data URIs)
filenames = [os.path.basename(path) for path in charters]
encoded_images = [f"data:image/png;base64,{encode_image_to_base64(path)}" for path in charters]

# create main data dict; every column has one entry per path in charters
data = dict(
    x = embedding[:, 0],
    y = embedding[:, 1],
    directory = simplified_dirs,
    filename = filenames,
    image = encoded_images,
    same_scribe = same_scribe_flag,
    color = data_colors
)

# split data into two ColumnDataSources so the two groups can be drawn with
# different glyphs
same_scribe_indices = [i for i, flag in enumerate(data['same_scribe']) if flag]
other_indices = [i for i, flag in enumerate(data['same_scribe']) if not flag]

same_scribe_source = ColumnDataSource({key: [data[key][i] for i in same_scribe_indices] for key in data})
other_source = ColumnDataSource({key: [data[key][i] for i in other_indices] for key in data})

# Create figure
# 'hover' is intentionally NOT listed in `tools`: the customised HoverTool is
# added explicitly below, and listing it here as well would register a second
# hover tool with the default tooltip, so two tooltips would pop up per point.
# NOTE(review): the title mentions ResNet + UMAP; confirm it matches the
# features and projection actually plotted here.
p = figure(title='Austrian Charters - CLAHE & Retinex + ResNet + UMAP - Same Scribe Highlighted',
           tools='pan,wheel_zoom,reset,save',
           width=800, height=600)

# Plot "Other" points (colored by directory, one legend entry per directory)
p.scatter(
    'x', 'y',
    source=other_source,
    legend_field='directory',
    color={'field': 'directory', 'transform': color_mapper},
    alpha=0.7, size=8
)

# Plot same-scribe charters (diamonds, red outline, same color inside)
p.scatter(
    'x', 'y',
    marker='diamond',
    source=same_scribe_source,
    size=12,
    line_color='red',
    fill_color='color',
    line_width=2,
    alpha=0.9,
    legend_label="Same Scribe Charters"
)

# Add hover tool with embedded images (@filename / @image refer to data source columns)
hover = HoverTool(tooltips="""
    <div>
        <div><strong>Filename:</strong> @filename</div>
        <div><img src="@image" alt="Image" style="width:100px;height:100px;"/></div>
    </div>
""")
p.add_tools(hover)

# Style the legend
p.legend.title = 'Image Directories'
#p.legend.location = "best"
p.legend.click_policy = "hide"  # Allows toggling visibility of groups

# Writes the plot to the file configured earlier via output_file()
save(p)

Dimensions of our data after PCA  = (420, 290)
