In [26]:
import base64
import json
import os
import pickle
import re
from datetime import datetime
from io import BytesIO

import ipywidgets as widgets
import numpy as np
import pandas as pd
from bokeh.io import output_file
from bokeh.models import ColumnDataSource, HoverTool, CategoricalColorMapper
from bokeh.palettes import Category10
from bokeh.plotting import figure, save, output_file
from IPython.display import display
from PIL import Image
from sklearn.manifold import TSNE

In [27]:
## load the pickled image features and the pickled list of charter image paths
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load pickles that were produced by this project.

# optional label pickle (currently disabled)
# label_path = r"/home/tschernn/becore-clustering/pickles/labels_kmeans_resnet.pkl"
# with open(label_path, 'rb') as file:
#     labels = pickle.load(file)

feature_path = r"/home/tschernn/becore-clustering/pickles/features_vit.pkl"
charter_list = r"/home/tschernn/becore-clustering/pickles/charter_list.pkl"

# feature data that is later handed to t-SNE
with open(feature_path, mode='rb') as feature_file:
    image_features = pickle.load(feature_file)

# image file paths; presumably in the same order as image_features -- TODO confirm
with open(charter_list, mode='rb') as charter_file:
    image_paths = pickle.load(charter_file)

In [30]:
## option 2: reduce dimensions for visualizations (2) and save plot
# fill in settings and run name!

# set t-SNE and plot settings; this dict is written verbatim to settings.json,
# so every t-SNE entry below must actually be passed to TSNE further down
# NOTE(review): "notes" mentions UMAP although this cell runs t-SNE -- confirm the wording.
vis_settings = {
    "n_components": 2, # 2 dimensions for visualization
    "perplexity": 30,
    "metric": "cosine",
    "random_state": 42,
    "feature_extractor": "DINO ViT",
    "preprocessing": "Binarized",
    "charter_selection": "All Austrian Charters",
    "special_highlighting": "Same Scribe Highlighted",
    "notes": "First attempt tuning UMAP for better separation."
}

# fail early (before any output directory is created) if features and paths
# cannot be matched one-to-one; everything below pairs them up by position
n_feature_rows = np.shape(image_features)[0]
if n_feature_rows != len(image_paths):
    raise ValueError(
        f"image_features has {n_feature_rows} rows but image_paths has "
        f"{len(image_paths)} entries; they must describe the same images"
    )

# set up output directory with timestamp
# NOTE(review): run_name says "resnet" while feature_extractor says "DINO ViT" -- confirm.
run_name = "austria_binarized_resnet_tsne"
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
output_dir = f"visualizations/{timestamp}_{run_name}"

os.makedirs(output_dir, exist_ok=True)

# save experiment dir with plot and settings
with open(os.path.join(output_dir, "settings.json"), "w") as f:
    json.dump(vis_settings, f, indent=4)
output_file(os.path.join(output_dir, "plot.html"))

######

# the metric is passed explicitly so the embedding really uses the metric that
# settings.json records (previously TSNE silently fell back to its default)
tsne = TSNE(n_components = vis_settings['n_components'],
            perplexity = vis_settings['perplexity'],
            metric = vis_settings['metric'],
            random_state = vis_settings['random_state'])
features_2d = tsne.fit_transform(image_features)

# convert to a DataFrame for bokeh
df = pd.DataFrame(features_2d, columns=('x', 'y'))
df['label'] = [path.split('/')[-2] for path in image_paths]  # categorical labels for coloring

# === Define same-scribe prefixes ===
same_scribe_prefixes = ['0c0fb', 'e37dc', '08312', 'b1a27', '60234', '0c20d']

# other_scribe_prefixes = ['7e387', 'fa17d', '2a924', '155ec']

# === Create a flag for same-scribe samples ===
# True when the file name (last '/'-separated component) starts with any prefix;
# str.startswith accepts a tuple of candidates
same_scribe_flag = [
    fname.split('/')[-1].startswith(tuple(same_scribe_prefixes))
    for fname in image_paths
]

# convert images to base64 for thumbnail display
def encode_image_to_base64(image_path):
    """Return a base64 string of a PNG thumbnail of the image at ``image_path``.

    The image is opened, shrunk in place with ``thumbnail((100, 100))``,
    written as PNG into an in-memory buffer, and the buffer content is
    base64-encoded and decoded to ``str``.
    """
    png_buffer = BytesIO()
    with Image.open(image_path) as thumb:
        thumb.thumbnail((100, 100))
        thumb.save(png_buffer, format="PNG")
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode()

# Extract parent directories and simplify names after "writable_area_"
directories = [os.path.basename(os.path.dirname(path)).lower() for path in image_paths]
simplified_dirs = [re.sub(r'^.*writable_area_', '', d) for d in directories]

# Create unique color mapping for simplified directories.
# The factors are sorted so the directory -> color assignment is the same on
# every run; the iteration order of a set of strings changes between
# interpreter sessions, which made colors differ from plot to plot.
unique_dirs = sorted(set(simplified_dirs))
base_palette = Category10[10]
# cycle through the 10 base colors when there are more than 10 directories
palette = [base_palette[i % len(base_palette)] for i in range(len(unique_dirs))]
color_mapper = CategoricalColorMapper(factors=unique_dirs, palette=palette)

# Assign a color to each point based on its directory (dict lookup per point)
dir_to_color = dict(zip(unique_dirs, palette))
data_colors = [dir_to_color[d] for d in simplified_dirs]

# Extract filenames and encode images
filenames = [os.path.basename(path) for path in image_paths]
encoded_images = [f"data:image/png;base64,{encode_image_to_base64(path)}" for path in image_paths]

# create main data dict
data = dict(
    x = features_2d[:, 0],
    y = features_2d[:, 1],
    directory = simplified_dirs,
    filename = filenames,
    image = encoded_images,
    same_scribe = same_scribe_flag,
    color = data_colors
)

# split data into two ColumnDataSources
same_scribe_indices = [i for i, flag in enumerate(data['same_scribe']) if flag]
other_indices = [i for i, flag in enumerate(data['same_scribe']) if not flag]

same_scribe_source = ColumnDataSource({key: [data[key][i] for i in same_scribe_indices] for key in data})
other_source = ColumnDataSource({key: [data[key][i] for i in other_indices] for key in data})

# Create figure.
# 'hover' is intentionally NOT listed in tools: the custom HoverTool added below
# is the only hover tool, otherwise a second default tooltip (index/x/y) is
# drawn next to the thumbnail tooltip.
p = figure(title=f'{vis_settings["charter_selection"]} - {vis_settings["preprocessing"]} - {vis_settings["feature_extractor"]} - t-SNE - {vis_settings["special_highlighting"]}',
           tools='pan,wheel_zoom,reset,save',
           width=800, height=600)

# Plot "Other" points (colored by directory)
p.scatter(
    'x', 'y',
    source=other_source,
    legend_field='directory',
    color={'field': 'directory', 'transform': color_mapper},
    alpha=0.7, size=8
)

# Plot same-scribe charters (diamonds, red outline, same color inside)
p.scatter(
    'x', 'y',
    marker='diamond',
    source=same_scribe_source,
    size=12,
    line_color='red',
    fill_color='color',
    line_width=2,
    alpha=0.9,
    legend_label="Same Scribe Charters"
)

# Add hover tool with embedded images
hover = HoverTool(tooltips="""
    <div>
        <div><strong>Filename:</strong> @filename</div>
        <div><img src="@image" alt="Image" style="width:100px;height:100px;"/></div>
    </div>
""")
p.add_tools(hover)

# Style the legend
p.legend.title = 'Image Directories'
#p.legend.location = "best"
p.legend.click_policy = "hide"  # Allows toggling visibility of groups

# writes the plot to the file registered earlier via output_file(...)
save(p)

'/home/tschernn/becore-clustering/visualizations/austria_binarize_resnet_tsne.html'