In [4]:
import os 

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
# Global matplotlib styling shared by every figure in this notebook.
params = {"axes.linewidth": 1.5}  # line width of the axes frame
matplotlib.rcParams.update(params)

# Typeset all figure text with LaTeX (needs a LaTeX installation on the machine).
matplotlib.rcParams.update({"text.usetex": True})

## Replicating the embedding comparison from the Echo embeddings paper with our models

In [None]:
# Model family to analyse; switch by (un)commenting exactly one assignment.
# family = "Sheared-LLaMA-1.3B"
# family = "Llama-2-7b-chat-hf"
family = "Mistral-7B-Instruct-v0.2"

skip_B = True # compute similarity only based on A
# skip_B = False # compute similarity based on A, B

# Single source of truth: result identifier -> display name used as the plot
# title. Insertion order is the order in which the models are plotted.
MODEL_NAMES = {
    # baselines
    f"{family}_uni_initialize_skip-{skip_B}": "Uni",
    f"{family}_bi_initialize_skip-{skip_B}": "Bi (no training)",

    # our models
    f"{family}_mlm_initialize_skip-{skip_B}": "Bi + MNTP",
    f"{family}_mlm_simcse_merge_skip-{skip_B}": "Bi + MNTP + SimCSE",

    # echo
    f"{family}_echo_skip-{skip_B}": "Echo embeddings",
}

# Derived from MODEL_NAMES so the list of models and their display names can
# never drift apart (every entry is guaranteed to have a title).
MODELS = list(MODEL_NAMES)

# Directory containing the `structure_1_<model>.npy` similarity arrays.
# Set the LLM2VEC_ECHO_DATA_DIR environment variable to run on another machine;
# the original location is kept as the default.
data_path = os.environ.get(
    "LLM2VEC_ECHO_DATA_DIR",
    "/Users/mariusmosbach/Development/plots/llm2vec/data/echo_analysis_output",
)

# Output directory for the per-model PDFs. Set LLM2VEC_PLOT_DIR to redirect it;
# the original location is kept as the default.
plot_dir = os.environ.get(
    "LLM2VEC_PLOT_DIR",
    "/Users/mariusmosbach/Development/plots/llm2vec/saved_plots/embedding_analysis",
)
os.makedirs(plot_dir, exist_ok=True)  # savefig fails if the directory is missing

bins = 15

# Keyword arguments shared by both histograms.
# NOTE(review): `width` is forwarded to the bar patches, so bars appear to be
# drawn 0.01 wide independent of the computed bin width -- presumably a
# deliberate look for this figure; confirm.
hist_style = dict(bins=bins, width=.01, linewidth=2.0, edgecolor="black", alpha=0.75, density=True)

# One figure per model: overlaid density histograms of query/sentence similarities.
for model in MODELS:
    name = f"structure_1_{model}.npy"
    similarities = np.load(os.path.join(data_path, name))

    # Columns 0 and 2 are indexed below; fail early with a readable message.
    if similarities.ndim != 2 or similarities.shape[1] < 3:
        raise ValueError(
            f"{name}: expected a 2-D array with at least 3 columns, got shape {similarities.shape}"
        )

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5.0, 3.5), dpi=120)

    ax.hist(similarities[:, 0], label=r"Sim(q, $s^-$)", **hist_style)
    # Column 1 (Sim(q, s^+) for examples with a different prefix) is intentionally not plotted.
    ax.hist(similarities[:, 2], label=r"Sim(q, $s^{+}$)", **hist_style)  # these samples have the same prefix

    ax.set_xlabel("Sim (cosine similarity)", fontsize=18)
    ax.set_ylabel("Density", fontsize=18)
    ax.tick_params(axis='both', which='major', labelsize=18)
    ax.set_title(MODEL_NAMES[model], fontsize=18)
    ax.legend(loc="best", fontsize=18)

    fig.tight_layout()
    plot_path = os.path.join(plot_dir, f"{model}.pdf")
    fig.savefig(plot_path, dpi='figure', bbox_inches='tight')
    plt.show()
    plt.close(fig)  # release this figure explicitly so figures do not accumulate across iterations