In [None]:
# Import libraries
import os
import sys
import torch
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans
from adjustText import adjust_text
from IPython.display import display

sys.path.append('../src')

In [None]:
# Prepare for comparison
# Load ImageNet classnames as texts
from open_clip import IMAGENET_CLASSNAMES


# Load tuple from file
def load_tuple(filename):
    """Return the object unpickled from the file at `filename`.

    The file is opened in binary mode and is closed again once loading ends.
    NOTE(review): unpickling can run arbitrary code, so only point this at
    files you trust.
    """
    with open(filename, 'rb') as handle:
        payload = pickle.load(handle)
    return payload


IMAGENET_CLASSNAMES_ZH = load_tuple("./results/imagenet_classnames_zh.pkl")

# Load several text embeddings from ../data/ImageNet/text_embeds/
TEXT_EMBEDS_DIR = "../data/ImageNet/text_embeds"


def _load_text_embeds(name):
    """Load `<TEXT_EMBEDS_DIR>/<name>.pt` onto the CPU and return its transpose.

    `map_location="cpu"` lets a file that was saved from a GPU be read on a
    machine without CUDA; the t-SNE plotting code converts the data to a CPU
    numpy array anyway.
    NOTE(review): the transpose presumably turns a (dim, n_classes) matrix into
    one row per class -- confirm against the script that wrote these files.
    """
    return torch.load(f"{TEXT_EMBEDS_DIR}/{name}.pt", map_location="cpu").t()


imagenet_overall_prompt_bert_tiny_uncased = _load_text_embeds(
    "imagenet_overall_prompt_bert_tiny_uncased")
imagenet_overall_prompt_flan_t5_xxl = _load_text_embeds(
    "imagenet_overall_prompt_flan_t5_xxl")
imagenet_single_template_bert_tiny_uncased = _load_text_embeds(
    "imagenet_single_template_bert_tiny_uncased")
imagenet_single_template_flan_t5_xxl = _load_text_embeds(
    "imagenet_single_template_flan_t5_xxl")

In [None]:
# progress_output = display('', display_id=True)


def print_progress(iteration, total):
    """Print how many text-adjustment iterations have completed so far.

    Parameters:
    iteration: Number of iterations completed.
    total: Total number of iterations.
    """
    message = f"Adjusting text: {iteration}/{total} iterations completed"
    print(message)


def tsne_plot(plot_name, avoid_text_overlap, tensor_data, labels, apply_scaling, **tsne_kwargs):
    """
    Apply t-SNE on a given tensor, plot the labelled results and save them as a PDF.

    The figure is saved under ./results/ as the plot title with spaces replaced
    by underscores plus '.pdf' (the directory is created if missing), then shown
    and closed.

    Parameters:
    plot_name (str): The name of the plot.
    avoid_text_overlap (bool): Whether to use adjust_text to avoid text overlap.
    tensor_data (torch.Tensor): An m x n tensor of data points.
    labels (tuple of str): A tuple of strings representing the labels of each data point.
    apply_scaling (bool): Whether to apply StandardScaler before t-SNE.
    **tsne_kwargs: Additional keyword arguments for the t-SNE function. An
        `n_iter` entry is passed as `max_iter` when the installed scikit-learn
        TSNE accepts that name.

    Returns:
    The array returned by TSNE.fit_transform, one row per data point.
    """
    # Local import: only needed for the keyword-compatibility check below
    import inspect

    # Convert tensor to numpy array
    data = tensor_data.detach().cpu().numpy()

    # Apply StandardScaler if required
    if apply_scaling:
        data = StandardScaler().fit_transform(data)

    # scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and later dropped
    # the old name. Translate it here so the same settings dict works on both
    # older and current releases. tsne_kwargs is this call's own dict, so
    # editing it does not touch the caller's settings.
    accepted = inspect.signature(TSNE).parameters
    if ('n_iter' in tsne_kwargs
            and 'max_iter' not in tsne_kwargs
            and 'max_iter' in accepted):
        tsne_kwargs['max_iter'] = tsne_kwargs.pop('n_iter')

    # Apply t-SNE
    tsne_results = TSNE(**tsne_kwargs).fit_transform(data)

    # Plotting the results
    fig, ax = plt.subplots(figsize=(80, 80))
    texts = []
    for i, label in enumerate(labels):
        x, y = tsne_results[i, 0], tsne_results[i, 1]
        # One scatter call per point: each call draws its own collection, so
        # matplotlib's colour cycle is applied point by point.
        ax.scatter(x, y)
        # SimHei is requested for the label text -- presumably so that Chinese
        # class names render; the font has to be installed for that to work.
        if avoid_text_overlap:
            # Collect the Text artists so adjust_text can move them afterwards
            texts.append(
                ax.text(
                    x,
                    y,
                    label,
                    fontproperties="SimHei",
                    ha='center',
                    va='center'
                )
            )
        else:
            # Fixed placement: label centred 10 points above the marker
            ax.annotate(
                label,
                (x, y),
                textcoords="offset points",
                fontproperties="SimHei",
                xytext=(0, 10), ha='center'
            )

    if avoid_text_overlap:
        # Use adjust_text to automatically adjust labels
        # NOTE(review): progress_callback is forwarded as-is; confirm that the
        # installed adjustText version accepts it.
        adjust_text(
            texts,
            x=tsne_results[:, 0],
            y=tsne_results[:, 1],
            progress_callback=print_progress
        )

    title = f't-SNE plot of {plot_name}'
    ax.set_title(title)

    # Make sure the output directory exists before writing the PDF
    os.makedirs('./results/', exist_ok=True)
    fig.savefig(
        os.path.join('./results/', title.replace(' ', '_') + '.pdf'),
        format='pdf'
    )
    plt.show()
    # Release the (very large) figure so repeated calls do not pile up in memory
    plt.close(fig)

    return tsne_results

In [None]:
# TSNE settings
default_tsne_setting = {
    # Dimension of the embedded space
    'n_components': 2,
    # Related to the number of nearest neighbors considered for each point
    'perplexity': 10,
    # Controls how tight natural clusters in the original space are in the embedded space
    'early_exaggeration': 12.0,
    # The learning rate for t-SNE optimization
    'learning_rate': 'auto',
    # Maximum number of iterations for the optimization
    # NOTE: scikit-learn 1.5 renamed this TSNE parameter to `max_iter`
    'n_iter': 1000,
    # Maximum number of iterations without progress before stopping the optimization
    'n_iter_without_progress': 300,
    # If the gradient norm is below this threshold, the optimization will stop
    'min_grad_norm': 1e-7,
    # The metric to use when calculating distance between instances
    'metric': 'cityblock',
    # Initialization of embedding. Possible options are 'random', 'pca', and a numpy array
    'init': 'pca',
    # Seed for the random number generator
    'random_state': 666,
    # The gradient calculation algorithm ('barnes_hut' or 'exact')
    'method': 'barnes_hut',
    # Trade-off between speed and accuracy for 'barnes_hut' method
    'angle': 0.5,
    # Number of parallel jobs to run
    'n_jobs': 10,
    # Verbosity level
    'verbose': 1,
}

# Per-embedding settings. Every entry starts as its OWN copy of the defaults,
# so tuning one embedding (e.g. its perplexity) cannot silently change the
# others or the defaults themselves.
tsne_settings = {
    name: dict(default_tsne_setting)
    for name in (
        "imagenet_overall_prompt_bert_tiny_uncased",
        "imagenet_overall_prompt_flan_t5_xxl",
        "imagenet_single_template_bert_tiny_uncased",
        "imagenet_single_template_flan_t5_xxl",
    )
}

In [None]:
# Run t-SNE on imagenet_overall_prompt_bert_tiny_uncased and plot it with the
# IMAGENET_CLASSNAMES_ZH labels. avoid_text_overlap=True sends the labels
# through adjust_text; pass False to annotate at a fixed offset instead.
tsne_imagenet_overall_prompt_bert_tiny_uncased = tsne_plot(
    plot_name="imagenet_overall_prompt_bert_tiny_uncased",
    tensor_data=imagenet_overall_prompt_bert_tiny_uncased,
    labels=IMAGENET_CLASSNAMES_ZH,
    apply_scaling=True,
    avoid_text_overlap=True,
    **tsne_settings["imagenet_overall_prompt_bert_tiny_uncased"]
)

In [None]:
# Run t-SNE on imagenet_overall_prompt_flan_t5_xxl and plot it with the
# IMAGENET_CLASSNAMES_ZH labels. avoid_text_overlap=True sends the labels
# through adjust_text; pass False to annotate at a fixed offset instead.
tsne_imagenet_overall_prompt_flan_t5_xxl = tsne_plot(
    plot_name="imagenet_overall_prompt_flan_t5_xxl",
    tensor_data=imagenet_overall_prompt_flan_t5_xxl,
    labels=IMAGENET_CLASSNAMES_ZH,
    apply_scaling=True,
    avoid_text_overlap=True,
    **tsne_settings["imagenet_overall_prompt_flan_t5_xxl"]
)

In [None]:
# Run t-SNE on imagenet_single_template_bert_tiny_uncased and plot it with the
# IMAGENET_CLASSNAMES_ZH labels. avoid_text_overlap=True sends the labels
# through adjust_text; pass False to annotate at a fixed offset instead.
tsne_imagenet_single_template_bert_tiny_uncased = tsne_plot(
    plot_name="imagenet_single_template_bert_tiny_uncased",
    tensor_data=imagenet_single_template_bert_tiny_uncased,
    labels=IMAGENET_CLASSNAMES_ZH,
    apply_scaling=True,
    avoid_text_overlap=True,
    **tsne_settings["imagenet_single_template_bert_tiny_uncased"]
)

In [None]:
# Run t-SNE on imagenet_single_template_flan_t5_xxl and plot it with the
# IMAGENET_CLASSNAMES_ZH labels. avoid_text_overlap=True sends the labels
# through adjust_text; pass False to annotate at a fixed offset instead.
tsne_imagenet_single_template_flan_t5_xxl = tsne_plot(
    plot_name="imagenet_single_template_flan_t5_xxl",
    tensor_data=imagenet_single_template_flan_t5_xxl,
    labels=IMAGENET_CLASSNAMES_ZH,
    apply_scaling=True,
    avoid_text_overlap=True,
    **tsne_settings["imagenet_single_template_flan_t5_xxl"]
)

In [None]:
def calculate_dbi(data_tensor, k, **kmeans_args):
    """
    Cluster the data with k-means and return the Davies-Bouldin index of that clustering.

    Parameters:
    data_tensor (torch.Tensor or array-like): The data points, one row per
        sample. Tensors are converted to numpy arrays; anything else is handed
        to scikit-learn unchanged.
    k (int): Number of clusters for k-means.
    **kmeans_args: Additional keyword arguments for the KMeans constructor.

    Returns:
    The value returned by davies_bouldin_score for the k-means labels
    (lower values indicate better-separated clusters).
    """
    # Convert tensor to numpy array if it is a tensor for compatibility with sklearn
    if isinstance(data_tensor, torch.Tensor):
        # detach() first: .numpy() raises on tensors that require grad
        data = data_tensor.detach().cpu().numpy()
    else:
        data = data_tensor

    # Perform k-means clustering and take the cluster label of each sample
    labels = KMeans(n_clusters=k, **kmeans_args).fit(data).labels_

    # Calculate Davies-Bouldin Index
    return davies_bouldin_score(data, labels)


# Keyword arguments passed through calculate_dbi to the KMeans constructor
kmeans_args = dict(
    init='k-means++',    # Centroid initialization method
    n_init=10,           # Number of times k-means is run with different initializations
    max_iter=300,        # Maximum number of iterations for a single run
    tol=1e-4,            # Tolerance to declare convergence
    random_state=666,    # Fixed seed for reproducibility
    algorithm='lloyd',   # K-means algorithm variant to use
)

In [None]:

# Inputs to score, listed in the column order dbi_value_1 .. dbi_value_4.
# NOTE(review): these are the t-SNE outputs returned by tsne_plot, so the index
# is computed on the projected points, not on the original embeddings.
dbi_inputs = (
    tsne_imagenet_single_template_bert_tiny_uncased,
    tsne_imagenet_single_template_flan_t5_xxl,
    tsne_imagenet_overall_prompt_bert_tiny_uncased,
    tsne_imagenet_overall_prompt_flan_t5_xxl,
)

# One row per cluster count k = 2, 4, ..., 20: [k, four DBI values rounded to 2 decimals]
results = []
for k in range(2, 22, 2):
    dbi_value_1, dbi_value_2, dbi_value_3, dbi_value_4 = (
        calculate_dbi(points, k, **kmeans_args) for points in dbi_inputs
    )
    results.append(
        [k] + [
            round(value, 2)
            for value in (dbi_value_1, dbi_value_2, dbi_value_3, dbi_value_4)
        ]
    )

# Tabulate the rows and write them to an xlsx file (without the index column)
df_results = pd.DataFrame(
    results,
    columns=['k', 'dbi_value_1', 'dbi_value_2', 'dbi_value_3', 'dbi_value_4']
)
df_results.to_excel('./results/dbi_comparison.xlsx', index=False)