In [None]:
import pandas as pd

# Usage
# Load the merged results table. NOTE: unpickling can execute arbitrary code,
# so "merged.pkl" must come from a trusted source.
mdf = pd.read_pickle("merged.pkl")
# Keep only the rows for the fine-tuned ViT model on the Bristol dataset.
df = mdf.loc[mdf["model"].eq("ViT-Finetuned") & mdf["dataset"].eq("Bristol")]

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


def visualize_cumulative_individual_distribution(name, df, figsize=(12, 6), thresholds=None):
    """
    Visualize the cumulative percentage of individuals remaining at different
    image count thresholds.

    Each row of ``df`` counts as one image of the individual named in its
    'label' column. For every cutoff ``i`` from 1 up to the largest per-label
    image count, the plot shows the percentage of labels that have at least
    ``i`` rows.

    Parameters:
    name (str): Text prepended to the plot title (e.g. the dataset name).
    df (pandas.DataFrame): DataFrame containing a 'label' column.
    figsize (tuple): Figure size in inches. Default is (12, 6).
    thresholds (iterable of int, optional): Cutoffs to annotate on the plot and
        print afterwards. Defaults to 1-10, 20 and 50. Cutoffs below 1 or above
        the largest per-label image count are skipped.

    Returns:
    None: Displays the plot and prints the percentages at the thresholds.
    """
    if thresholds is None:
        thresholds = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50)

    # Count the occurrences of each label
    label_counts = df["label"].value_counts()
    # A categorical 'label' column reports unused categories with a count of 0;
    # those individuals are not in this DataFrame and must not enter the total.
    label_counts = label_counts[label_counts > 0]

    total_individuals = len(label_counts)
    if total_individuals == 0:
        # Nothing to plot; bail out instead of failing on max() of an empty
        # Series and on a division by zero.
        print(f"{name}: no labelled images to plot.")
        return

    # Calculate cumulative percentages: entry i-1 is the share of individuals
    # having at least i images.
    max_images = int(label_counts.max())
    cumulative_percentages = [
        (label_counts >= cutoff).sum() / total_individuals * 100 for cutoff in range(1, max_images + 1)
    ]

    # Create the plot
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(range(1, max_images + 1), cumulative_percentages, marker="o")
    ax.set_title(f"{name} Cumulative Percentage of Individuals vs. Minimum Image Count")
    ax.set_xlabel("Minimum Number of Images per Individual")
    ax.set_ylabel("Percentage of Individuals Remaining")
    ax.set_ylim(0, 100)
    ax.grid(True)

    # Add description text to the plot
    description = (
        "How many individuals remain at a given minimum image count cutoff?\n"
        "This plot shows how many individuals remain in the dataset "
        "as the minimum required number of images per individual increases."
    )
    ax.text(0.5, -0.15, description, transform=ax.transAxes, ha="center", va="center", fontsize=10)

    # Resolve the thresholds once so the annotations and the printed summary
    # always report the same points.
    reported = [
        (threshold, cumulative_percentages[threshold - 1])
        for threshold in thresholds
        if 1 <= threshold <= max_images
    ]

    # Add percentage labels at specific points
    for threshold, percentage in reported:
        ax.annotate(
            f"{percentage:.1f}%", (threshold, percentage), textcoords="offset points", xytext=(0, 10), ha="center"
        )

    plt.tight_layout()
    plt.show()

    # Print some additional statistics
    print("Percentage of individuals remaining at specific thresholds:")
    for threshold, percentage in reported:
        print(f"  ≥ {threshold} images: {percentage:.1f}%")


# 1. Histogram of individuals by how many images each one has
label_counts = df["label"].value_counts()
# Index: number of images; value: how many individuals have exactly that many.
count_distribution = label_counts.value_counts().sort_index()

plt.figure(figsize=(12, 6))
# rot=0 keeps the x tick labels horizontal.
count_distribution.plot.bar(rot=0)
plt.gca().set(
    title="Distribution of Number of Images per Individual",
    xlabel="Number of Images",
    ylabel="Number of Individuals",
)
plt.tight_layout()
plt.show()


# 2. Cumulative view, titled with the dataset name taken from the first row.
visualize_cumulative_individual_distribution(df["dataset"].iloc[0], df)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import output_notebook
import colorcet as cc
from PIL import Image
import io
import base64

# Configure Bokeh so that later show() calls render inline in the notebook.
output_notebook()


def reduce_dimensions(embeddings, method="pca", n_components=2, **kwargs):
    """
    Project embeddings into a lower-dimensional space.

    Args:
    embeddings (np.array): The input embeddings, one row per sample.
    method (str): Which reducer to use, 'pca' or 'tsne' (case-insensitive).
    n_components (int): Number of output dimensions.
    **kwargs: Extra keyword arguments forwarded to the reducer's constructor.

    Returns:
    np.array: The result of the reducer's fit_transform on the embeddings.

    Raises:
    ValueError: If method is neither 'pca' nor 'tsne'.
    """
    method_key = method.lower()
    # Reject unknown methods before any reducer is constructed.
    if method_key not in ("pca", "tsne"):
        raise ValueError("Method must be either 'pca' or 'tsne'")

    reducer_cls = PCA if method_key == "pca" else TSNE
    projector = reducer_cls(n_components=n_components, **kwargs)
    return projector.fit_transform(embeddings)


def image_to_base64(img):
    """
    Convert a PIL image to a base64-encoded JPEG string.

    Args:
    img (PIL.Image.Image): The image to encode. Images whose mode the JPEG
        writer cannot store (e.g. 'RGBA', 'LA', 'P') are converted to 'RGB'
        first; the input object itself is not modified.

    Returns:
    str: The JPEG bytes encoded as base64 text.
    """
    # Modes Pillow's JPEG writer accepts directly; saving any other mode raises
    # OSError ("cannot write mode ... as JPEG").
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(img, "mode", "RGB") not in jpeg_modes:
        img = img.convert("RGB")

    buffered = io.BytesIO()
    img.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()


def visualize_embeddings(df, method="pca", n_components=2, **kwargs):
    """
    Visualize embeddings using either PCA or t-SNE.

    Args:
    df (pd.DataFrame): The input DataFrame. It must provide an 'embedding'
        column (one vector per row), a 'label' column (used for colors and the
        legend) and an 'input' column of PIL images (shown on hover).
    method (str): The dimensionality reduction method ('pca' or 'tsne').
    n_components (int): The number of dimensions to reduce to. Only the first
        two components are plotted, so this must be at least 2.
    **kwargs: Additional arguments to pass to the dimensionality reduction method.

    Returns:
    bokeh.plotting.figure: The Bokeh figure object.

    Raises:
    ValueError: If n_components is smaller than 2.
    """
    # Fail fast: with fewer than two components the scatter plot cannot be
    # drawn, and the reduction below (t-SNE in particular) can be expensive.
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to draw a 2D scatter plot")

    # Perform dimensionality reduction
    embeddings = np.vstack(df["embedding"].values)
    embeddings_2d = reduce_dimensions(embeddings, method, n_components, **kwargs)

    # Create a color map. The palette is finite, so wrap around when there are
    # more labels than colors instead of leaving some labels without a color.
    unique_labels = df["label"].unique()
    palette = cc.glasbey
    color_map = {label: palette[index % len(palette)] for index, label in enumerate(unique_labels)}

    # Prepare data for Bokeh
    data = {
        "x": embeddings_2d[:, 0],
        "y": embeddings_2d[:, 1],
        "color": [color_map[label] for label in df["label"]],
        "class": df["label"],
        "image": [image_to_base64(img) for img in df["input"]],
    }

    source = ColumnDataSource(data=data)

    # Create the figure
    p = figure(
        width=1920,
        height=1080,
        title=f"2D Projection of Classes using {method.upper()}",
        tools="pan,wheel_zoom,box_zoom,reset",
    )

    # Add the scatter plot
    p.scatter(x="x", y="y", size=12, fill_color="color", line_color="black", source=source, legend_field="class")

    # Add hover tool: the tooltip embeds each point's image as a data URI.
    hover = HoverTool(tooltips='<img src="data:image/jpeg;base64,@image" width="128" height="128">')
    p.add_tools(hover)

    return p

In [None]:
# Pretrained ViT embeddings of MNIST: PCA projection first, then t-SNE.
df = mdf.loc[mdf["model"].eq("ViT-Pretrained") & mdf["dataset"].eq("MNIST")]
fig = visualize_embeddings(df, "pca")
show(fig)
fig = visualize_embeddings(df, "tsne")
show(fig)

In [None]:
# Synthetic embeddings ("Synthetic 20c 20n"): PCA projection first, then t-SNE.
df = mdf.loc[mdf["model"].eq("Synthetic") & mdf["dataset"].eq("Synthetic 20c 20n")]
fig = visualize_embeddings(df, "pca")
show(fig)
# t-SNE perplexity is the number of distinct labels, but never below 30.
fig = visualize_embeddings(df, "tsne", perplexity=max(30, df["label"].nunique(dropna=False)))
show(fig)

In [None]:
# Fine-tuned ViT embeddings of the Bristol dataset: PCA projection first, then t-SNE.
df = mdf.loc[mdf["model"].eq("ViT-Finetuned") & mdf["dataset"].eq("Bristol")]
fig = visualize_embeddings(df, "pca")
show(fig)
# t-SNE perplexity is the number of distinct labels, but never below 30.
fig = visualize_embeddings(df, "tsne", perplexity=max(30, df["label"].nunique(dropna=False)))
show(fig)