In [None]:
import os
import json
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [None]:
# Colab-only helper used to mount Google Drive into the runtime's filesystem.
from google.colab import drive

# Mount Drive at /content/drive; the result paths configured in the next cell live under this mount point.
drive.mount('/content/drive')

In [None]:
# Root folder (on the mounted Drive) that holds the per-model result folders.
BASE_PATH = "/content/drive/MyDrive/BachelorThesisResults"
# Sub-path between a run folder and its per-benchmark JSON files.
DEFAULT_OFFSET_PATH = "no_model_name_available/no_revision_available"
# Every plot produced by this notebook is written below this folder.
GLOBAL_SAVE_DIR = f"{BASE_PATH}/plots_new"

In [None]:
# Benchmark names grouped by task category; the key is also used as the output sub-folder name.
tasks_dict = {
    "classification": [
        "Banking77Classification",
        "EmotionClassification",
        "TweetSentimentExtractionClassification",
        "AmazonCounterfactualClassification",
        "MassiveIntentClassification",
        "MassiveScenarioClassification",
        "MTOPDomainClassification",
        "MTOPIntentClassification",
    ],
    "clustering": [
        "ArXivHierarchicalClusteringP2P",
        "ArXivHierarchicalClusteringS2S",
        "BiorxivClusteringP2P.v2",
        "BiorxivClusteringS2S.v2",
        "MedrxivClusteringP2P.v2",
        "MedrxivClusteringS2S.v2",
        "RedditClustering.v2",
        "StackExchangeClustering.v2",
        "StackExchangeClusteringP2P.v2",
        "TwentyNewsgroupsClustering.v2",
    ],
    "sts": [
        "BIOSSES",
        "SICK-R",
        "STS12",
        "STS13",
        "STS14",
        "STS15",
        "STS16",
        "STSBenchmark",
        "STS17",
        "STS22",
    ],
    "pairclass": [
        "SprintDuplicateQuestions",
        "TwitterSemEval2015",
        "TwitterURLCorpus",
    ],
    "retrieval": [
        "ArguAna",
        "CQADupstackWebmastersRetrieval",
        "NFCorpus",
    ],
    "rerank": [
        "AskUbuntuDupQuestions",
        "MindSmallReranking",
        "StackOverflowDupQuestions",
    ],
    "summ": [
        "SummEval",
    ],
}

# "all" is every benchmark of the categories above, concatenated in category order.
# It is added last so it stays the final key of the dict.
tasks_dict["all"] = [
    benchmark
    for category_benchmarks in tasks_dict.values()
    for benchmark in category_benchmarks
]

In [None]:
# Per-model settings, keyed by model name. Each row below is:
# (model name, result folder under <BASE_PATH>/v1, evaluated embedding sizes from largest to smallest).
models_dict = {
    name: {
        "path": f"{BASE_PATH}/v1/{results_folder}",
        "dims": sizes,
        "quantization_techniques": ["float32", "int8", "binary"],
        "offset_path": DEFAULT_OFFSET_PATH,
    }
    for name, results_folder, sizes in (
        ("nomic-embed-text-v1.5", "nomic-ai", [768, 512, 256, 128, 64, 32, 16, 8]),
        ("mxbai-embed-large-v1", "mixedbread-ai", [1024, 512, 256, 128, 64, 32, 16, 8]),
        ("stella_en_400M_v5", "dunzhang", [8192, 4096, 2048, 1024, 512, 256, 128, 64]),
    )
}


# Performance

In [None]:
def get_main_score(base_path, model_name, embedding_size, quantization_method, benchmark_name, offset_path="no_model_name_available/no_revision_available"):
    """
    Return the main_score for one model, one embedding size, one quantization type, and one benchmark.

    The result file is looked up at
    <base_path>/<model_name>_<embedding_size>_<quantization_method>/<offset_path>/<benchmark_name>.json
    and the score is read from data["scores"]["test"][0]["main_score"].

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    embedding_size (int): Embedding size of the model.
    quantization_method (str): Quantization method used.
    benchmark_name (str): Name of the benchmark.
    offset_path (str): Sub-path between the run folder and the benchmark JSON file.

    Returns:
    The value stored under "main_score" (expected to be a float), or None when the
    file is missing, is not valid JSON, or does not have the expected structure.
    """

    subfolder = f"{model_name}_{embedding_size}_{quantization_method}"
    file_path = os.path.join(base_path, subfolder, offset_path, f"{benchmark_name}.json")

    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data["scores"]["test"][0]["main_score"]
    except (FileNotFoundError, json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        # One unreadable result must not abort a whole plotting run: a truncated or
        # corrupt JSON file, an empty "test" list and an unexpected layout are
        # reported and treated exactly like a missing file.
        print(f"Error reading file {file_path}: {e}")
        return None

def load_scores(base_path, model_name, dims, quantization_techniques, tasks, offset_path):
    """
    Collect the main scores of one model for every task, quantization technique and dimension.

    Returns:
    dict: scores[task][technique] is the list of scores in the order of dims.
    Configurations for which get_main_score returns None are left out, so a list
    can be shorter than dims.
    """
    scores = {}
    for task in tasks:
        per_technique = scores.setdefault(task, {})
        for technique in quantization_techniques:
            collected = per_technique.setdefault(technique, [])
            looked_up = (
                get_main_score(base_path, model_name, dim, technique, task, offset_path=offset_path)
                for dim in dims
            )
            collected.extend(value for value in looked_up if value is not None)
    return scores

def is_valid_task_category(category):
    """Return True when category is one of the task-category names the plotting helpers accept."""
    accepted_categories = (
        "classification",
        "clustering",
        "sts",
        "pairclass",
        "retrieval",
        "rerank",
        "summ",
        "all",
    )
    return category in accepted_categories

def plot_task_scores(scores, dims, global_save_dir, model_name, task_category, task, plot_figures):
    """
    Plot the score of one task against the embedding dimension, one line per quantization technique.

    Args:
    scores (dict): scores[task][technique] -> list of scores, as built by load_scores.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    task_category (str): Task category, used as an output sub-folder.
    task (str): Task to plot; also the file name of the saved PNG.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    Raises:
    ValueError: If task_category is not an accepted category.

    The figure is saved to
    <global_save_dir>/PerBenchmarkAndModel/<model_name>/<task_category>/<task>.png.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    # Draw on a dedicated figure. Without this the plot lands on whatever figure is
    # current, so its size depends on which cells were executed before. The size
    # matches the summary plots of this notebook.
    fig = plt.figure(figsize=(10, 5))

    for technique, technique_scores in scores[task].items():
        # A shorter list means at least one result file was missing for this technique.
        if len(technique_scores) == len(dims):
            plt.plot(dims, technique_scores, label=technique)
        else:
            print(f"Skipping plot for {technique} on task {task} due to dimension mismatch")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Score')
    plt.ylim(-0.15, 1.05)
    plt.legend()
    plt.grid(True, which="both", ls="--")

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/{task}.png")
    if plot_figures:
        plt.show()
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def plot_technique_summary(scores, dims, global_save_dir, model_name, quantization_techniques, task_category, plot_figures):
    """
    For each quantization technique, plot every task's score curve in gray plus the per-dimension average in red.

    Only tasks that have one score per entry of dims for the technique are used.
    A technique without any such task is skipped and produces no file.

    Args:
    scores (dict): scores[task][technique] -> list of scores, as built by load_scores.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    quantization_techniques (list): Techniques to produce a figure for.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    Raises:
    ValueError: If task_category is not an accepted category.

    Each figure is saved to
    <global_save_dir>/PerBenchmarkAndModel/<model_name>/<task_category>/technique_<technique>.png.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"

    for technique in quantization_techniques:
        complete_curves = [task_scores[technique] for task_scores in scores.values() if len(task_scores[technique]) == len(dims)]

        if not complete_curves:
            continue

        average_scores = np.mean(complete_curves, axis=0)

        fig = plt.figure(figsize=(10, 5))

        for curve in complete_curves:
            sns.lineplot(x=dims, y=curve, color='gray', alpha=0.7)

        plt.plot(dims, average_scores, color='red', linestyle='--', linewidth=3, label='Average')

        plt.xscale('log', base=2)
        plt.xticks(dims, [str(dim) for dim in dims])
        plt.xlabel('Dimension (base 10)')
        plt.ylabel('Score')
        plt.ylim(-0.15, 1.05)

        plt.grid(True, which="both", ls="--")

        plt.legend(loc='best')

        os.makedirs(save_dir, exist_ok=True)

        plt.savefig(f"{save_dir}/technique_{technique}.png")
        if plot_figures:
            plt.show()
        # One figure is created per technique; close it so figures do not pile up.
        plt.close(fig)

def plot_all_technique_averages(scores, dims, global_save_dir, model_name, quantization_techniques, task_category, plot_figures):
    """
    Plot, in a single figure, the per-dimension average score of every quantization technique.

    For each technique the average is taken over the tasks that have one score per
    entry of dims; techniques without any such task are left out of the figure.

    Args:
    scores (dict): scores[task][technique] -> list of scores, as built by load_scores.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    quantization_techniques (list): Techniques to draw.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    The figure is saved to
    <global_save_dir>/PerBenchmarkAndModel/<model_name>/<task_category>/all_averages.png.
    """
    fig = plt.figure(figsize=(10, 5))

    for technique in quantization_techniques:
        complete_curves = [task_scores[technique] for task_scores in scores.values() if len(task_scores[technique]) == len(dims)]

        if not complete_curves:
            continue

        average_scores = np.mean(complete_curves, axis=0)
        plt.plot(dims, average_scores, linestyle='--', linewidth=2, label=technique)

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Score')
    plt.ylim(-0.15, 1.05)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='best')

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages.png")
    if plot_figures:
        plt.show()
    # Close the figure created above; clearing it would keep it open.
    plt.close(fig)


def plot_all_technique_averages_relative(scores, dims, global_save_dir, model_name, quantization_techniques, task_category, plot_figures):
    """
    Plot the average score of every quantization technique relative to the float32 score at dims[0].

    For each task the reference is the first float32 score. A task is used only if
    its float32 list has one score per entry of dims (so the first score belongs to
    dims[0]) and that reference is non-zero. Tasks without such a reference are
    skipped entirely, so that raw scores are never averaged together with ratios.

    Args:
    scores (dict): scores[task][technique] -> list of scores, as built by load_scores.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    quantization_techniques (list): Techniques to draw.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    The figure is saved to
    <global_save_dir>/PerBenchmarkAndModel/<model_name>/<task_category>/all_averages_relative.png.
    """
    fig = plt.figure(figsize=(10, 5))

    # Reference score per task; tasks without a usable reference are not in the dict.
    reference_scores = {}
    for task, task_scores in scores.items():
        float32_scores = task_scores.get('float32', [])
        if len(float32_scores) == len(dims) and len(float32_scores) > 0 and float32_scores[0] != 0:
            reference_scores[task] = float32_scores[0]
        else:
            print(f"Skipping task {task} as it doesn't have a complete, non-zero float32 reference.")

    for technique in quantization_techniques:
        all_relative_scores = []
        for task, task_scores in scores.items():
            if task not in reference_scores:
                continue
            if technique not in task_scores or len(task_scores[technique]) != len(dims):
                print(f"Skipping task {task} for technique {technique} as it has missing results.")
                continue

            reference = reference_scores[task]
            all_relative_scores.append([score / reference for score in task_scores[technique]])

        if not all_relative_scores:
            continue

        average_relative_scores = np.mean(all_relative_scores, axis=0)
        plt.plot(dims, average_relative_scores, linestyle='--', linewidth=2, label=technique)

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Relative Score')
    plt.ylim(-0.15, 1.05)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='best')

    save_path = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_path, exist_ok=True)

    plt.savefig(f"{save_path}/all_averages_relative.png")
    if plot_figures:
        plt.show()
    # Close the figure created above; clearing it would keep it open.
    plt.close(fig)

def generate_results(base_path, model_name, dims, quantization_techniques, tasks_classification, task_category, model_offset_path, plot_figures=True):
    """
    Load the scores of one model for one task category and write all performance plots for it.

    Args:
    base_path (str): Directory containing the model's run folders.
    model_name (str): Name of the model; also used in the output path.
    dims (list): Embedding dimensions that were evaluated.
    quantization_techniques (list): Quantization techniques that were evaluated.
    tasks_classification (list): Benchmark names belonging to task_category.
    task_category (str): Task category; "all" skips the per-task plots.
    model_offset_path (str): Sub-path between a run folder and its benchmark JSON files.
    plot_figures (bool): Whether the figures are also shown, not only saved.

    Plots are written below the notebook-level GLOBAL_SAVE_DIR.
    """
    # Use the arguments rather than the notebook globals MODEL_NAME / MODEL_OFFSET_PATH,
    # so the result depends only on what the caller passes in.
    scores = load_scores(base_path, model_name, dims, quantization_techniques, tasks_classification, model_offset_path)

    if task_category != "all":
        for task in tasks_classification:
            plot_task_scores(scores, dims, GLOBAL_SAVE_DIR, model_name, task_category, task, plot_figures)

    plot_technique_summary(scores, dims, GLOBAL_SAVE_DIR, model_name, quantization_techniques, task_category, plot_figures)
    plot_all_technique_averages(scores, dims, GLOBAL_SAVE_DIR, model_name, quantization_techniques, task_category, plot_figures)
    plot_all_technique_averages_relative(scores, dims, GLOBAL_SAVE_DIR, model_name, quantization_techniques, task_category, plot_figures)

## Stella en_400M_v5

In [None]:
# Select the model for the following cell and pull its settings out of models_dict.
MODEL_NAME = "stella_en_400M_v5"
MODEL_OFFSET_PATH, MODEL_BASE_PATH = (
    models_dict[MODEL_NAME][key] for key in ("offset_path", "path")
)
dims, quantization_techniques = (
    models_dict[MODEL_NAME][key] for key in ("dims", "quantization_techniques")
)

In [None]:
# Write the performance plots of the selected model for every task category (saved only, not shown).
for task_category, tasks in tasks_dict.items():
    generate_results(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks_classification=tasks,
        task_category=task_category,
        model_offset_path=MODEL_OFFSET_PATH,
        plot_figures=False,
    )

## MixedBread-AI V1

In [None]:
# Select the model for the following cell and pull its settings out of models_dict.
MODEL_NAME = "mxbai-embed-large-v1"
MODEL_OFFSET_PATH, MODEL_BASE_PATH = (
    models_dict[MODEL_NAME][key] for key in ("offset_path", "path")
)
dims, quantization_techniques = (
    models_dict[MODEL_NAME][key] for key in ("dims", "quantization_techniques")
)

In [None]:
# Write the performance plots of the selected model for every task category (saved only, not shown).
for task_category, tasks in tasks_dict.items():
    generate_results(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks_classification=tasks,
        task_category=task_category,
        model_offset_path=MODEL_OFFSET_PATH,
        plot_figures=False,
    )

## Nomic-embed-text-v1.5

In [None]:
# Select the model for the following cell and pull its settings out of models_dict.
MODEL_NAME = "nomic-embed-text-v1.5"
MODEL_BASE_PATH, MODEL_OFFSET_PATH = (
    models_dict[MODEL_NAME][key] for key in ("path", "offset_path")
)
dims, quantization_techniques = (
    models_dict[MODEL_NAME][key] for key in ("dims", "quantization_techniques")
)

In [None]:
# Write the performance plots of the selected model for every task category (saved only, not shown).
for task_category, tasks in tasks_dict.items():
    generate_results(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks_classification=tasks,
        task_category=task_category,
        model_offset_path=MODEL_OFFSET_PATH,
        plot_figures=False,
    )

# Time

In [None]:
def get_evaluation_time(base_path, model_name, embedding_size, quantization_method, benchmark_name, offset_path="no_model_name_available/no_revision_available"):
    """
    Return the evaluation_time for one model, one embedding size, one quantization type, and one benchmark.

    The result file is looked up at
    <base_path>/<model_name>_<embedding_size>_<quantization_method>/<offset_path>/<benchmark_name>.json
    and the value is read from its top-level "evaluation_time" key.

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    embedding_size (int): Embedding size of the model.
    quantization_method (str): Quantization method used.
    benchmark_name (str): Name of the benchmark.
    offset_path (str): Sub-path between the run folder and the benchmark JSON file.

    Returns:
    The value stored under "evaluation_time" (expected to be a float), or None when
    the file is missing, is not valid JSON, or does not have the expected structure.
    """

    subfolder = f"{model_name}_{embedding_size}_{quantization_method}"
    file_path = os.path.join(base_path, subfolder, offset_path, f"{benchmark_name}.json")

    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data["evaluation_time"]
    except (FileNotFoundError, json.JSONDecodeError, KeyError, TypeError) as e:
        # A corrupt file or a JSON document that is not an object is reported and
        # treated like a missing file instead of aborting the whole run.
        print(f"Error reading file {file_path}: {e}")
        return None

def load_evaluation_times(base_path, model_name, dims, quantization_techniques, tasks, offset_path):
    """
    Collect the evaluation times of one model for every task, quantization technique and dimension.

    Returns:
    dict: times[task][technique] is the list of evaluation times in the order of dims.
    Configurations for which get_evaluation_time returns None are left out, so a
    list can be shorter than dims.
    """
    times = {}
    for task in tasks:
        per_technique = times.setdefault(task, {})
        for technique in quantization_techniques:
            measured = per_technique.setdefault(technique, [])
            for dim in dims:
                seconds = get_evaluation_time(base_path, model_name, dim, technique, task, offset_path=offset_path)
                if seconds is None:
                    continue
                measured.append(seconds)
    return times

def plot_task_evaluation_times(times, dims, global_save_dir, model_name, task_category, task, plot_figures):
    """
    Plot the evaluation time of one task against the embedding dimension, one line per quantization technique.

    Args:
    times (dict): times[task][technique] -> list of times, as built by load_evaluation_times.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    task_category (str): Task category, used as an output sub-folder.
    task (str): Task to plot; also the file name of the saved PNG.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    Raises:
    ValueError: If task_category is not an accepted category.

    The figure is saved to
    <global_save_dir>/PerBenchmarkAndModelEfficiency/<model_name>/<task_category>/<task>.png.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    # Draw on a dedicated figure. Without this the plot lands on whatever figure is
    # current, so its size depends on which cells were executed before. The size
    # matches the summary plots of this notebook.
    fig = plt.figure(figsize=(10, 5))

    for technique, technique_times in times[task].items():
        # A shorter list means at least one result file was missing for this technique.
        if len(technique_times) == len(dims):
            plt.plot(dims, technique_times, label=technique)
        else:
            print(f"Skipping plot for {technique} on task {task} due to dimension mismatch")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Evaluation Time (s)')
    plt.legend()
    plt.grid(True, which="both", ls="--")

    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/{task}.png")
    if plot_figures:
        plt.show()
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def plot_technique_evaluation_time_summary(times, dims, global_save_dir, model_name, quantization_techniques, task_category, plot_figures):
    """
    For each quantization technique, plot every task's evaluation-time curve in gray plus the per-dimension average in red.

    Only tasks that have one time per entry of dims for the technique are used.
    A technique without any such task is skipped and produces no file.

    Args:
    times (dict): times[task][technique] -> list of times, as built by load_evaluation_times.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    quantization_techniques (list): Techniques to produce a figure for.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    Raises:
    ValueError: If task_category is not an accepted category.

    Each figure is saved to
    <global_save_dir>/PerBenchmarkAndModelEfficiency/<model_name>/<task_category>/technique_<technique>.png.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"

    for technique in quantization_techniques:
        complete_curves = [task_times[technique] for task_times in times.values() if len(task_times[technique]) == len(dims)]

        if not complete_curves:
            continue

        average_times = np.mean(complete_curves, axis=0)

        fig = plt.figure(figsize=(10, 5))

        for curve in complete_curves:
            sns.lineplot(x=dims, y=curve, color='gray', alpha=0.7)

        plt.plot(dims, average_times, color='red', linestyle='--', linewidth=3, label='Average')

        plt.xscale('log', base=2)
        plt.xticks(dims, [str(dim) for dim in dims])
        plt.xlabel('Dimension (base 10)')
        plt.ylabel('Evaluation Time (s)')
        plt.grid(True, which="both", ls="--")

        plt.legend(loc='best')

        os.makedirs(save_dir, exist_ok=True)

        plt.savefig(f"{save_dir}/technique_{technique}.png")
        if plot_figures:
            plt.show()
        # One figure is created per technique; close it so figures do not pile up.
        plt.close(fig)

def plot_all_technique_evaluation_time_averages(times, dims, global_save_dir, model_name, quantization_techniques, task_category, plot_figures):
    """
    Plot, in a single figure, the per-dimension average evaluation time of every quantization technique.

    For each technique the average is taken over the tasks that have one time per
    entry of dims; techniques without any such task are left out of the figure.

    Args:
    times (dict): times[task][technique] -> list of times, as built by load_evaluation_times.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    quantization_techniques (list): Techniques to draw.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    The figure is saved to
    <global_save_dir>/PerBenchmarkAndModelEfficiency/<model_name>/<task_category>/all_averages.png.
    """
    fig = plt.figure(figsize=(10, 5))

    for technique in quantization_techniques:
        complete_curves = [task_times[technique] for task_times in times.values() if len(task_times[technique]) == len(dims)]

        if not complete_curves:
            continue

        average_times = np.mean(complete_curves, axis=0)
        plt.plot(dims, average_times, linestyle='--', linewidth=2, label=technique)

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Evaluation Time (s)')

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='best')

    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages.png")
    if plot_figures:
        plt.show()
    # Close the figure created above; clearing it would keep it open.
    plt.close(fig)

def plot_all_technique_evaluation_time_averages_relative(times, dims, global_save_dir, model_name, quantization_techniques, task_category, plot_figures):
    """
    Plot the average evaluation time of every quantization technique relative to the float32 time at dims[0].

    A task contributes to a technique's average only when both its float32 list and
    the technique's list have one time per entry of dims and the float32 reference
    (first entry) is non-zero.

    Args:
    times (dict): times[task][technique] -> list of times, as built by load_evaluation_times.
    dims (list): Embedding dimensions used as x values.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    quantization_techniques (list): Techniques to draw.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    The figure is saved to
    <global_save_dir>/PerBenchmarkAndModelEfficiency/<model_name>/<task_category>/all_averages_evaluation_time_relative.png.
    """
    fig = plt.figure(figsize=(10, 5))

    for technique in quantization_techniques:
        relative_times = []
        for task, task_times in times.items():
            # .get keeps a missing float32 entry from raising; such tasks are skipped.
            float32_times = task_times.get('float32', [])
            technique_times = task_times.get(technique, [])
            if len(float32_times) != len(dims) or len(technique_times) != len(dims):
                continue
            if len(float32_times) == 0:
                continue

            reference_time = float32_times[0]
            if reference_time == 0:
                # A zero reference would make the ratio undefined.
                continue
            relative_times.append([t / reference_time for t in technique_times])

        if not relative_times:
            continue

        average_relative_times = np.mean(relative_times, axis=0)
        plt.plot(dims, average_relative_times, linestyle='--', linewidth=2, label=technique)

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Relative Evaluation Time')

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='best')

    # Create the output folder here as well, so this function does not depend on
    # another plotting function having been called first.
    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages_evaluation_time_relative.png")
    if plot_figures:
        plt.show()
    # Close the figure created above; clearing it would keep it open.
    plt.close(fig)

def generate_evaluation_time_results(base_path, model_name, dims, quantization_techniques, tasks, task_category, model_offset_path, plot_figures=True):
    """
    Load the evaluation times of one model for one task category and write all efficiency plots for it.

    Args:
    base_path (str): Directory containing the model's run folders.
    model_name (str): Name of the model; also used in the output path.
    dims (list): Embedding dimensions that were evaluated.
    quantization_techniques (list): Quantization techniques that were evaluated.
    tasks (list): Benchmark names belonging to task_category.
    task_category (str): Task category; "all" skips the per-task plots.
    model_offset_path (str): Sub-path between a run folder and its benchmark JSON files.
    plot_figures (bool): Whether the figures are also shown, not only saved.

    Plots are written below the notebook-level GLOBAL_SAVE_DIR.
    """
    # Use the model_offset_path argument rather than the notebook global
    # MODEL_OFFSET_PATH, so the result depends only on what the caller passes in.
    times = load_evaluation_times(base_path, model_name, dims, quantization_techniques, tasks, model_offset_path)

    if task_category != "all":
        for task in tasks:
            plot_task_evaluation_times(times, dims, GLOBAL_SAVE_DIR, model_name, task_category, task, plot_figures)

    plot_technique_evaluation_time_summary(times, dims, GLOBAL_SAVE_DIR, model_name, quantization_techniques, task_category, plot_figures)
    plot_all_technique_evaluation_time_averages(times, dims, GLOBAL_SAVE_DIR, model_name, quantization_techniques, task_category, plot_figures)
    plot_all_technique_evaluation_time_averages_relative(times, dims, GLOBAL_SAVE_DIR, model_name, quantization_techniques, task_category, plot_figures)



## Stella en_400M_v5

In [None]:
# Select the model for the following cell and pull its settings out of models_dict.
MODEL_NAME = "stella_en_400M_v5"
MODEL_OFFSET_PATH, MODEL_BASE_PATH = (
    models_dict[MODEL_NAME][key] for key in ("offset_path", "path")
)
dims, quantization_techniques = (
    models_dict[MODEL_NAME][key] for key in ("dims", "quantization_techniques")
)

In [None]:
# Write (and show) the evaluation-time plots of the selected model for every task category.
for task_category, tasks in tasks_dict.items():
    generate_evaluation_time_results(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks=tasks,
        task_category=task_category,
        model_offset_path=MODEL_OFFSET_PATH,
        plot_figures=True,
    )

## MixedBread-AI V1

In [None]:
# Select the model for the following cell and pull its settings out of models_dict.
MODEL_NAME = "mxbai-embed-large-v1"
MODEL_OFFSET_PATH, MODEL_BASE_PATH = (
    models_dict[MODEL_NAME][key] for key in ("offset_path", "path")
)
dims, quantization_techniques = (
    models_dict[MODEL_NAME][key] for key in ("dims", "quantization_techniques")
)

In [None]:
# Write (and show) the evaluation-time plots of the selected model for every task category.
for task_category, tasks in tasks_dict.items():
    generate_evaluation_time_results(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks=tasks,
        task_category=task_category,
        model_offset_path=MODEL_OFFSET_PATH,
        plot_figures=True,
    )

## Nomic-embed-text-v1.5

In [None]:
# Select the model for the following cell and pull its settings out of models_dict.
MODEL_NAME = "nomic-embed-text-v1.5"
MODEL_BASE_PATH, MODEL_OFFSET_PATH = (
    models_dict[MODEL_NAME][key] for key in ("path", "offset_path")
)
dims, quantization_techniques = (
    models_dict[MODEL_NAME][key] for key in ("dims", "quantization_techniques")
)

In [None]:
# Write the evaluation-time plots of the selected model for every task category (saved only, not shown).
for task_category, tasks in tasks_dict.items():
    generate_evaluation_time_results(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks=tasks,
        task_category=task_category,
        model_offset_path=MODEL_OFFSET_PATH,
        plot_figures=False,
    )

# Accuracy-Compute Trade-off

In [None]:
def get_main_score(base_path, model_name, embedding_size, quantization_method, benchmark_name, offset_path="no_model_name_available/no_revision_available"):
    """
    Return the main_score for one model, one embedding size, one quantization type, and one benchmark.

    NOTE: this re-defines the function of the same name from the Performance
    section; the later definition is the one in effect after this cell runs.

    The result file is looked up at
    <base_path>/<model_name>_<embedding_size>_<quantization_method>/<offset_path>/<benchmark_name>.json
    and the score is read from data["scores"]["test"][0]["main_score"].

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    embedding_size (int): Embedding size of the model.
    quantization_method (str): Quantization method used.
    benchmark_name (str): Name of the benchmark.
    offset_path (str): Sub-path between the run folder and the benchmark JSON file.

    Returns:
    The value stored under "main_score" (expected to be a float), or None when the
    file is missing, is not valid JSON, or does not have the expected structure.
    """

    subfolder = f"{model_name}_{embedding_size}_{quantization_method}"
    file_path = os.path.join(base_path, subfolder, offset_path, f"{benchmark_name}.json")

    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data["scores"]["test"][0]["main_score"]
    except (FileNotFoundError, json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        # One unreadable result must not abort a whole plotting run: a truncated or
        # corrupt JSON file, an empty "test" list and an unexpected layout are
        # reported and treated exactly like a missing file.
        print(f"Error reading file {file_path}: {e}")
        return None

def load_scores(base_path, model_name, dims, quantization_techniques, tasks, offset_path):
    """
    Collect the main scores of one model for every task, quantization technique and dimension.

    NOTE: this re-defines the function of the same name from the Performance section.

    Returns:
    dict: scores[task][technique] is the list of scores in the order of dims.
    Configurations for which get_main_score returns None are left out, so a list
    can be shorter than dims.
    """
    scores = {}
    for task in tasks:
        task_bucket = scores.setdefault(task, {})
        for technique in quantization_techniques:
            technique_bucket = task_bucket.setdefault(technique, [])
            for dim in dims:
                value = get_main_score(base_path, model_name, dim, technique, task, offset_path=offset_path)
                if value is None:
                    continue
                technique_bucket.append(value)
    return scores

def calculate_memory_used(embedding_size, quantization_method):
    """
    Return the number of bits one embedding occupies: embedding_size times the bits per value.

    float32 uses 32 bits per value, int8 uses 8 and binary uses 1.

    Raises:
    ValueError: If quantization_method is none of 'float32', 'int8' or 'binary'.
    """
    bits_per_value = (('float32', 32), ('int8', 8), ('binary', 1))
    for method_name, bits in bits_per_value:
        if quantization_method == method_name:
            return embedding_size * bits
    raise ValueError(f"Unknown quantization method: {quantization_method}")

def plot_accuracy_compute_tradeoff(scores, dims, quantization_techniques, global_save_dir, model_name, task_category, task, plot_figures):
    """
    Scatter-plot the score of one task against the memory used per embedding.

    Color encodes the quantization technique and marker shape encodes the position
    of the dimension in dims. Techniques that do not have one score per entry of
    dims for this task are left out.

    Args:
    scores (dict): scores[task][technique] -> list of scores, as built by load_scores.
    dims (list): Embedding dimensions that were evaluated.
    quantization_techniques (list): Techniques to draw.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    task_category (str): Task category, used as an output sub-folder.
    task (str): Task to plot; also the file name of the saved PNG.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    The figure is saved to
    <global_save_dir>/AccuracyCompute/<model_name>/<task_category>/<task>.png.
    """
    fig = plt.figure(figsize=(10, 5))
    colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    for technique_idx, technique in enumerate(quantization_techniques):
        technique_scores = scores[task][technique]
        if len(technique_scores) != len(dims):
            continue
        memory_used = [calculate_memory_used(dim, technique) for dim in dims]
        for idx, (mem, score) in enumerate(zip(memory_used, technique_scores)):
            # Only the first point of a technique carries the label, so the
            # technique legend gets one entry per technique.
            plt.scatter(mem, score, label=f"{technique}" if idx == 0 else "",
                        color=colors[technique_idx], marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Memory Used (bits)')
    plt.ylabel('Performance Score')
    plt.xscale('log', base=2)
    plt.grid(True, which="both", ls="--")

    # First legend: techniques (colors). It is re-added as an artist because the
    # second plt.legend call below would otherwise replace it.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    technique_legend = plt.legend(by_label.values(), by_label.keys(), loc='best')
    plt.gca().add_artist(technique_legend)

    # Second legend: dimensions (marker shapes). Markers cycle in the same way as in
    # the scatter calls, so every dimension gets an entry even with more dims than shapes.
    marker_handles = [plt.Line2D([0], [0], marker=markers[idx % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for idx in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='lower right', bbox_to_anchor=(1, 0))

    save_dir = f"{global_save_dir}/AccuracyCompute/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/{task}.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close the figure created above; clearing it would keep it open.
    plt.close(fig)


def plot_average_accuracy_compute_tradeoff(scores, dims, quantization_techniques, global_save_dir, model_name, task_category, plot_figures):
    """
    Scatter-plot the task-averaged score of every quantization technique against the memory used per embedding.

    For each technique the average is taken per dimension over the tasks that have
    one score per entry of dims. Color encodes the technique and marker shape
    encodes the position of the dimension in dims.

    Args:
    scores (dict): scores[task][technique] -> list of scores, as built by load_scores.
    dims (list): Embedding dimensions that were evaluated.
    quantization_techniques (list): Techniques to draw.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    The figure is saved to
    <global_save_dir>/AccuracyCompute/<model_name>/<task_category>/all_averages.png.
    """
    fig = plt.figure(figsize=(12, 6))
    colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    # Stays an empty list for techniques without any complete task.
    average_scores = {technique: [] for technique in quantization_techniques}
    for technique in quantization_techniques:
        complete_curves = [task_scores[technique] for task_scores in scores.values() if len(task_scores[technique]) == len(dims)]
        if complete_curves:
            average_scores[technique] = np.mean(complete_curves, axis=0)

    # The loop variable is named technique_average so it does not shadow the scores parameter.
    for technique_idx, (technique, technique_average) in enumerate(average_scores.items()):
        if len(technique_average) == 0:
            continue
        memory_used = [calculate_memory_used(dim, technique) for dim in dims]
        for idx, (mem, score) in enumerate(zip(memory_used, technique_average)):
            plt.scatter(mem, score, label=f"{technique}" if idx == 0 else "",
                        color=colors[technique_idx], marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Memory Used (bits)')
    plt.ylabel('Average Performance Score')
    plt.xscale('log', base=2)
    plt.grid(True, which="both", ls="--")

    # First legend: techniques (colors). It is re-added as an artist because the
    # second plt.legend call below would otherwise replace it.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    technique_legend = plt.legend(by_label.values(), by_label.keys(), loc='best')
    plt.gca().add_artist(technique_legend)

    # Second legend: dimensions (marker shapes), cycling like the scatter calls.
    marker_handles = [plt.Line2D([0], [0], marker=markers[idx % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for idx in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='lower right', bbox_to_anchor=(1, 0))

    save_dir = f"{global_save_dir}/AccuracyCompute/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close the figure created above; clearing it would keep it open.
    plt.close(fig)


def plot_average_accuracy_compute_tradeoff_relative(scores, dims, quantization_techniques, global_save_dir, model_name, task_category, plot_figures):
    """
    Scatter-plot the task-averaged score, relative to the float32 score at dims[0], against the memory used per embedding.

    For each task the reference is the first float32 score. A task is used only if
    its float32 list has one score per entry of dims (so the first score belongs to
    dims[0]) and that reference is non-zero. Tasks without such a reference are
    skipped entirely, so that raw scores are never averaged together with ratios.

    Args:
    scores (dict): scores[task][technique] -> list of scores, as built by load_scores.
    dims (list): Embedding dimensions that were evaluated.
    quantization_techniques (list): Techniques to draw.
    global_save_dir (str): Root output folder for plots.
    model_name (str): Model name, used as an output sub-folder.
    task_category (str): Task category, used as an output sub-folder.
    plot_figures (bool): Whether to call plt.show() in addition to saving.

    The figure is saved to
    <global_save_dir>/AccuracyCompute/<model_name>/<task_category>/all_averages_relative.png.
    """
    fig = plt.figure(figsize=(12, 6))
    colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    # Reference score per task; tasks without a usable reference are not in the dict.
    reference_scores = {}
    for task, task_scores in scores.items():
        float32_scores = task_scores.get('float32', [])
        if len(float32_scores) == len(dims) and len(float32_scores) > 0 and float32_scores[0] != 0:
            reference_scores[task] = float32_scores[0]
        else:
            print(f"Skipping task {task} as it doesn't have a complete, non-zero float32 reference.")

    # Stays an empty list for techniques without any usable task.
    average_scores = {technique: [] for technique in quantization_techniques}
    for technique in quantization_techniques:
        all_relative_scores = []
        for task, task_scores in scores.items():
            if task not in reference_scores:
                continue
            if technique not in task_scores or len(task_scores[technique]) != len(dims):
                print(f"Skipping task {task} for technique {technique} as it has missing results.")
                continue

            reference = reference_scores[task]
            all_relative_scores.append([score / reference for score in task_scores[technique]])

        if all_relative_scores:
            average_scores[technique] = np.mean(all_relative_scores, axis=0)

    # The loop variable is named technique_average so it does not shadow the scores parameter.
    for technique_idx, (technique, technique_average) in enumerate(average_scores.items()):
        if len(technique_average) == 0:
            continue
        memory_used = [calculate_memory_used(dim, technique) for dim in dims]
        for idx, (mem, score) in enumerate(zip(memory_used, technique_average)):
            plt.scatter(mem, score, label=f"{technique}" if idx == 0 else "",
                        color=colors[technique_idx], marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Memory Used (bits)')
    plt.ylabel('Relative Average Performance Score')
    plt.xscale('log', base=2)
    plt.grid(True, which="both", ls="--")

    # First legend: techniques (colors). It is re-added as an artist because the
    # second plt.legend call below would otherwise replace it.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    technique_legend = plt.legend(by_label.values(), by_label.keys(), loc='best')
    plt.gca().add_artist(technique_legend)

    # Second legend: dimensions (marker shapes), cycling like the scatter calls.
    marker_handles = [plt.Line2D([0], [0], marker=markers[idx % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for idx in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='lower right', bbox_to_anchor=(1, 0))

    save_path = f"{global_save_dir}/AccuracyCompute/{model_name}/{task_category}"
    os.makedirs(save_path, exist_ok=True)

    plt.savefig(f"{save_path}/all_averages_relative.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close the figure created above; clearing it would keep it open.
    plt.close(fig)


In [None]:
def generate_accuracy_compute_tradeoff_results(base_path, model_name, dims, quantization_techniques, tasks, task_category, model_offset_path, plot_figures=True):
    """
    Load the scores for one model and produce all accuracy/compute trade-off plots for one task category.

    Args:
    base_path (str): Base directory containing the model result folders.
    model_name (str): Name of the model.
    dims (list): Embedding sizes to load.
    quantization_techniques (list): Quantization techniques to load.
    tasks (list): Benchmark names belonging to the category.
    task_category (str): Category name; per-task plots are skipped for "all".
    model_offset_path (str): Sub-path forwarded to load_scores.
    plot_figures (bool): Forwarded to the plotting helpers.
    """
    scores = load_scores(base_path, model_name, dims, quantization_techniques, tasks, model_offset_path)

    # The "all" category only gets the averaged plots, no per-task figures.
    wants_per_task_plots = task_category != "all"
    for task in (tasks if wants_per_task_plots else ()):
        plot_accuracy_compute_tradeoff(scores, dims, quantization_techniques, GLOBAL_SAVE_DIR, model_name, task_category, task, plot_figures)

    average_plotters = (
        plot_average_accuracy_compute_tradeoff,
        plot_average_accuracy_compute_tradeoff_relative,
    )
    for plot_average in average_plotters:
        plot_average(scores, dims, quantization_techniques, GLOBAL_SAVE_DIR, model_name, task_category, plot_figures)


## Stella en_400M_v5

In [None]:
# Select the Stella model and read its result location, embedding sizes and
# quantization techniques from models_dict. These globals are consumed by the
# driver loop in the next cell.
MODEL_NAME = "stella_en_400M_v5"
MODEL_OFFSET_PATH = models_dict[MODEL_NAME]["offset_path"]
MODEL_BASE_PATH = models_dict[MODEL_NAME]["path"]
dims = models_dict[MODEL_NAME]["dims"]
quantization_techniques = models_dict[MODEL_NAME]["quantization_techniques"]

In [None]:
# Produce the accuracy/compute trade-off plots for every task category in tasks_dict
# using the model currently selected through the MODEL_* globals.
for task_category, tasks in tasks_dict.items():
  generate_accuracy_compute_tradeoff_results(MODEL_BASE_PATH, MODEL_NAME, dims, quantization_techniques, tasks, task_category, MODEL_OFFSET_PATH, plot_figures=True)

## Mixed-Bread-Ai v1

In [None]:
# Select the mxbai-embed-large-v1 model and read its result location, embedding
# sizes and quantization techniques from models_dict. These globals are consumed
# by the driver loop in the next cell.
MODEL_NAME = "mxbai-embed-large-v1"
MODEL_OFFSET_PATH = models_dict[MODEL_NAME]["offset_path"]
MODEL_BASE_PATH = models_dict[MODEL_NAME]["path"]
dims = models_dict[MODEL_NAME]["dims"]
quantization_techniques = models_dict[MODEL_NAME]["quantization_techniques"]

In [None]:
# Produce the accuracy/compute trade-off plots for every task category in tasks_dict
# using the model currently selected through the MODEL_* globals.
for task_category, tasks in tasks_dict.items():
  generate_accuracy_compute_tradeoff_results(MODEL_BASE_PATH, MODEL_NAME, dims, quantization_techniques, tasks, task_category, MODEL_OFFSET_PATH, plot_figures=True)

## Nomic-embed-text-v1.5

In [None]:
# Select the nomic-embed-text-v1.5 model and read its result location, embedding
# sizes and quantization techniques from models_dict. These globals are consumed
# by the driver loop in the next cell.
MODEL_NAME = "nomic-embed-text-v1.5"
MODEL_BASE_PATH = models_dict[MODEL_NAME]["path"]
MODEL_OFFSET_PATH = models_dict[MODEL_NAME]["offset_path"]
dims = models_dict[MODEL_NAME]["dims"]
quantization_techniques = models_dict[MODEL_NAME]["quantization_techniques"]

In [None]:
# Produce the accuracy/compute trade-off plots for every task category in tasks_dict
# using the model currently selected through the MODEL_* globals.
for task_category, tasks in tasks_dict.items():
  generate_accuracy_compute_tradeoff_results(MODEL_BASE_PATH, MODEL_NAME, dims, quantization_techniques, tasks, task_category, MODEL_OFFSET_PATH, plot_figures=True)

# 2D Model

In [None]:
# Configuration for the 2D model. Unlike the models above, the values are set
# directly here instead of being read from models_dict.
MODEL_NAME = "mxbai-embed-2d-large-v1"
MODEL_OFFSET_PATH = DEFAULT_OFFSET_PATH
MODEL_BASE_PATH = f"{BASE_PATH}/2D/mixedbread-ai"
dims = [1024, 512, 256, 128, 64, 32, 16, 8]
quantization_techniques = ["float32", "int8", "binary"]
# Passed as `inference_layers` to the 2D helpers below; each value is also part
# of the result folder name built in get_main_score / get_evaluation_time.
INFERENCE_LAYERS = [24, 20, 16, 12]

## Performance, Intramodel Comparison

In [None]:

# Matplotlib line style per quantization technique. The plotting helpers look it
# up with LINE_STYLES.get(technique, '-'), so unknown techniques fall back to a solid line.
LINE_STYLES = {
    'float32': '-',
    'int8': '--',
    'binary': ':',
}


def get_main_score(base_path, model_name, embedding_size, quantization_method, benchmark_name, inference_layer, offset_path="no_model_name_available/no_revision_available"):
    """
    Return the main_score for one model, one embedding size, one quantization type, one benchmark, and one inference layer.

    The result file is expected at
    <base_path>/<model_name>_<embedding_size>_<quantization_method>_<inference_layer>/<offset_path>/<benchmark_name>.json
    and the score is read from data["scores"]["test"][0]["main_score"].

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    embedding_size (int): Embedding size of the model.
    quantization_method (str): Quantization method used.
    benchmark_name (str): Name of the benchmark.
    inference_layer (int): Inference layer number.
    offset_path (str): Sub-path between the model folder and the benchmark file.

    Returns:
    float or None: The main_score for the specified configuration, or None when the
    file is missing, is not valid JSON, or does not have the expected structure.
    """

    subfolder = f"{model_name}_{embedding_size}_{quantization_method}_{inference_layer}"
    file_path = os.path.join(base_path, subfolder, offset_path, f"{benchmark_name}.json")

    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data["scores"]["test"][0]["main_score"]
    # IndexError: empty "test" list; TypeError: unexpected JSON shape;
    # JSONDecodeError: truncated/corrupt file. All are treated like a missing
    # result so one bad file does not abort the whole loading loop.
    except (FileNotFoundError, KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
        print(f"Error reading file {file_path}: {e}")
        return None

def load_scores(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, offset_path):
    """
    Collect the main scores for every task / technique / layer combination.

    Returns a nested dict task -> technique -> layer -> list of scores. Scores are
    appended in the order of `dims`; configurations for which get_main_score
    returns None are left out, so a list can be shorter than `dims`.
    """
    scores = {}
    for task in tasks:
        by_technique = scores.setdefault(task, {})
        for technique in quantization_techniques:
            by_layer = by_technique.setdefault(technique, {layer: [] for layer in inference_layers})
            for dim in dims:
                for layer in inference_layers:
                    value = get_main_score(base_path, model_name, dim, technique, task, layer, offset_path=offset_path)
                    if value is None:
                        continue
                    by_layer[layer].append(value)
    return scores

def is_valid_task_category(category):
    """Return True when `category` is one of the task-category names accepted by the plotting helpers."""
    known_categories = (
        "classification",
        "clustering",
        "sts",
        "pairclass",
        "retrieval",
        "rerank",
        "summ",
        "all",
    )
    return category in known_categories

def plot_task_scores(scores, dims, global_save_dir, model_name, task_category, task, plot_figures):
    """
    Plot the score per embedding dimension for a single task and save it as <task>.png.

    One line is drawn per (technique, layer) pair: the line style encodes the
    quantization technique (LINE_STYLES) and the colour encodes the layer.
    Combinations whose score list does not have one entry per dimension are skipped.

    Args:
    scores (dict): Nested mapping task -> technique -> layer -> list of scores.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    task_category (str): Task category (used in the output path).
    task (str): Task to plot; must be a key of `scores`.
    plot_figures (bool): When True the figure is also shown.

    Raises:
    ValueError: If task_category is rejected by is_valid_task_category.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    # Layers are taken from the first technique of the task.
    first_technique_layers = next(iter(scores[task].values()), {})
    inference_layers = list(first_technique_layers.keys())
    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))

    # Create an explicit figure (same size as the sibling plots) instead of
    # drawing on whatever figure happens to be current.
    fig = plt.figure(figsize=(10, 5))

    for technique, layers_scores in scores[task].items():
        for layer, technique_scores in layers_scores.items():
            if len(technique_scores) == len(dims):
                plt.plot(dims, technique_scores, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"{technique} (Layer {layer})")
            else:
                print(f"Skipping plot for {technique} on task {task} at layer {layer} due to dimension mismatch")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Score')
    plt.ylim(-0.15, 1.05)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0)
    plt.grid(True, which="both", ls="--")

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    # The legend is placed outside the axes; bbox_inches='tight' keeps it in the saved file.
    plt.savefig(f"{save_dir}/{task}.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def plot_technique_summary(scores, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures):
    """
    Save one figure per quantization technique with the task-averaged score per dimension, one line per layer.

    Only tasks whose score list for the (technique, layer) pair has one entry per
    dimension contribute to the average; a layer without any such task is not drawn.

    Args:
    scores (dict): Nested mapping task -> technique -> layer -> list of scores.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot; one file technique_<name>.png each.
    inference_layers (list): Layers to plot.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True each figure is also shown.

    Raises:
    ValueError: If task_category is rejected by is_valid_task_category.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))
    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"

    for technique in quantization_techniques:
        fig = plt.figure(figsize=(10, 5))

        for layer in inference_layers:
            all_scores = [task_scores[technique][layer] for task_scores in scores.values() if len(task_scores[technique][layer]) == len(dims)]

            if not all_scores:
                continue

            average_scores = np.mean(all_scores, axis=0)
            plt.plot(dims, average_scores, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"Layer {layer}")

        plt.xscale('log', base=2)
        plt.xticks(dims, [str(dim) for dim in dims])
        plt.xlabel('Dimension (base 10)')
        plt.ylabel('Score')
        plt.ylim(-0.15, 1.05)

        plt.grid(True, which="both", ls="--")

        plt.legend(loc='best')

        os.makedirs(save_dir, exist_ok=True)

        plt.savefig(f"{save_dir}/technique_{technique}.png")
        if plot_figures:
            plt.show()
        # A new figure is created per technique; close it so figures do not pile up.
        plt.close(fig)

def plot_all_technique_averages(scores, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures):
    """
    Plot the task-averaged score per dimension for every (technique, layer) pair in one figure (all_averages.png).

    Line style encodes the technique (LINE_STYLES), colour encodes the layer. Only
    tasks with one score per dimension contribute to a pair's average.

    Args:
    scores (dict): Nested mapping task -> technique -> layer -> list of scores.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot.
    inference_layers (list): Layers to plot.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True the figure is also shown.
    """
    fig = plt.figure(figsize=(10, 5))

    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))

    for technique in quantization_techniques:
        for layer in inference_layers:
            all_scores = [task_scores[technique][layer] for task_scores in scores.values() if len(task_scores[technique][layer]) == len(dims)]

            if not all_scores:
                continue

            average_scores = np.mean(all_scores, axis=0)
            plt.plot(dims, average_scores, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"{technique} (Layer {layer})")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Score')
    plt.ylim(-0.15, 1.05)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0)

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of only clearing so the figure created above is released.
    plt.close(fig)

def plot_all_technique_averages_relative(scores, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures):
    """
    Plot task-averaged scores relative to a float32 reference for every (technique, layer) pair (all_averages_relative.png).

    For each task and layer the reference is the first float32 score stored for
    that layer. Every score of the task at that layer is divided by it before
    averaging over tasks. When no usable reference exists (missing or zero), the
    raw score is used instead.

    Args:
    scores (dict): Nested mapping task -> technique -> layer -> list of scores.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot.
    inference_layers (list): Layers to plot.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True the figure is also shown.
    """
    fig = plt.figure(figsize=(10, 5))

    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))

    reference_scores = {task: {layer: None for layer in inference_layers} for task in scores.keys()}

    for task, task_scores in scores.items():
        for layer in inference_layers:
            # Guard against missing float32 results: an empty list previously
            # raised IndexError here and aborted the whole plot.
            float32_scores = task_scores.get('float32', {}).get(layer, [])
            reference_scores[task][layer] = float32_scores[0] if float32_scores else None

    for technique in quantization_techniques:
        for layer in inference_layers:
            all_relative_scores = []
            for task, task_scores in scores.items():
                if len(task_scores[technique][layer]) != len(dims):
                    continue

                reference = reference_scores[task][layer]
                if reference is not None and reference != 0:
                    relative_scores = [score / reference for score in task_scores[technique][layer]]
                else:
                    # No usable reference: fall back to the raw scores.
                    relative_scores = list(task_scores[technique][layer])

                all_relative_scores.append(relative_scores)

            if not all_relative_scores:
                continue

            average_relative_scores = np.mean(all_relative_scores, axis=0)
            plt.plot(dims, average_relative_scores, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"{technique} (Layer {layer})")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Relative Score')
    plt.ylim(-0.15, 1.05)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0)

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages_relative.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of only clearing so the figure created above is released.
    plt.close(fig)

def plot_all_technique_vs_layers(scores, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures):
    """
    Plot the task-averaged score against the inference layer, one line per (dimension, technique) pair (all_vs_layers.png).

    Colour and line style encode the technique, the marker encodes the dimension.
    Layers for which no task has a complete score list are left out of that line.

    Args:
    scores (dict): Nested mapping task -> technique -> layer -> list of scores (one per entry of dims).
    dims (list): Embedding sizes the score lists are indexed by.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot.
    inference_layers (list): Layers, used as x values.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True the figure is also shown.
    """
    fig = plt.figure(figsize=(10, 5))

    color_palette = sns.color_palette("husl", len(quantization_techniques))
    technique_color_map = dict(zip(quantization_techniques, color_palette))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    for dim_idx, dim in enumerate(dims):
        for technique in quantization_techniques:
            # Track x and y together: skipping a layer without data must not
            # leave y shorter than x (plt.plot would raise on the mismatch).
            plotted_layers = []
            average_scores = []
            for layer in inference_layers:
                all_scores = [task_scores[technique][layer][dim_idx] for task_scores in scores.values() if len(task_scores[technique][layer]) == len(dims)]

                if not all_scores:
                    continue

                plotted_layers.append(layer)
                average_scores.append(np.mean(all_scores))

            if not average_scores:
                continue

            plt.plot(plotted_layers, average_scores, linestyle=LINE_STYLES.get(technique, '-'), color=technique_color_map[technique],
                     marker=markers[dim_idx % len(markers)], linewidth=2, label=f"{technique} ({dim}d)")

    plt.xticks(inference_layers, [str(layer) for layer in inference_layers])
    plt.xlabel('Inference Layer')
    plt.ylabel('Score')
    plt.ylim(-0.15, 1.05)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='best')

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_vs_layers.png")
    if plot_figures:
        plt.show()
    # Close instead of only clearing so the figure created above is released.
    plt.close(fig)


def plot_all_technique_vs_layers_relative(scores, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures):
    """
    Plot task-averaged scores relative to a float32 baseline against the inference layer (all_vs_layers_relative.png).

    The baseline of a task is its float32 score at the first entry of
    `inference_layers` and the largest entry of `dims`. Tasks without a usable
    (present, non-zero) baseline do not contribute. Layers without any
    contributing task are left out of the corresponding line.

    Args:
    scores (dict): Nested mapping task -> technique -> layer -> list of scores (one per entry of dims).
    dims (list): Embedding sizes the score lists are indexed by.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot.
    inference_layers (list): Layers, used as x values; the first one is the baseline layer.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True the figure is also shown.

    Raises:
    ValueError: If no task provides a baseline score.
    """
    color_palette = sns.color_palette("husl", len(quantization_techniques))
    technique_color_map = dict(zip(quantization_techniques, color_palette))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    max_dim = max(dims)
    baseline_layer = inference_layers[0]
    baseline_scores = {}

    for task_name, task_scores in scores.items():
        if "float32" in task_scores and baseline_layer in task_scores["float32"]:
            if len(task_scores["float32"][baseline_layer]) == len(dims):
                baseline_scores[task_name] = task_scores["float32"][baseline_layer][dims.index(max_dim)]

    if not baseline_scores:
        raise ValueError("Baseline scores could not be determined. Please check the inputs and data structure.")

    # Create the figure only after validation so the error path leaves no empty figure behind.
    fig = plt.figure(figsize=(10, 5))

    for dim_idx, dim in enumerate(dims):
        for technique in quantization_techniques:
            # Keep x and y in step: a layer without data is dropped from both.
            plotted_layers = []
            relative_scores = []
            for layer in inference_layers:
                all_scores = []
                for task_name, task_scores in scores.items():
                    if technique in task_scores and layer in task_scores[technique]:
                        if len(task_scores[technique][layer]) == len(dims):
                            score = task_scores[technique][layer][dim_idx]
                            baseline_score = baseline_scores.get(task_name, None)
                            # A zero baseline would raise ZeroDivisionError; skip such tasks.
                            if baseline_score is not None and baseline_score != 0:
                                all_scores.append(score / baseline_score)

                if all_scores:
                    plotted_layers.append(layer)
                    relative_scores.append(np.mean(all_scores))

            if not relative_scores:
                continue

            plt.plot(plotted_layers, relative_scores, linestyle=LINE_STYLES.get(technique, '-'), color=technique_color_map[technique],
                     marker=markers[dim_idx % len(markers)], linewidth=2, label=f"{technique} ({dim}d)")

    plt.xticks(inference_layers, [str(layer) for layer in inference_layers])
    plt.xlabel('Inference Layer')
    plt.ylabel('Relative Score')
    plt.ylim(-0.15, 1.05)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='best')

    save_dir = f"{global_save_dir}/PerBenchmarkAndModel/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_vs_layers_relative.png")
    if plot_figures:
        plt.show()
    # Close instead of only clearing so the figure created above is released.
    plt.close(fig)




def generate_results(base_path, model_name, dims, quantization_techniques, tasks_classification, inference_layers, task_category, model_offset_path, plot_figures=True):
    """
    Load the scores of one task category and produce all performance plots for it.

    Per-task plots are skipped for the "all" category. The score-vs-layer plots
    are produced from a separate load restricted to the 1024-dimensional embeddings.
    """
    scores = load_scores(base_path, model_name, dims, quantization_techniques, tasks_classification, inference_layers, model_offset_path)

    if task_category != "all":
        for task in tasks_classification:
            plot_task_scores(scores, dims, GLOBAL_SAVE_DIR, model_name, task_category, task, plot_figures)

    # Arguments shared by every summary plotter after (scores, dims).
    shared_args = (GLOBAL_SAVE_DIR, model_name, quantization_techniques, inference_layers, task_category, plot_figures)

    per_dimension_plotters = (
        plot_technique_summary,
        plot_all_technique_averages,
        plot_all_technique_averages_relative,
    )
    for plotter in per_dimension_plotters:
        plotter(scores, dims, *shared_args)

    full_dim_only = [1024]
    scores_last_dim = load_scores(base_path, model_name, full_dim_only, quantization_techniques, tasks_classification, inference_layers, model_offset_path)
    for plotter in (plot_all_technique_vs_layers, plot_all_technique_vs_layers_relative):
        plotter(scores_last_dim, full_dim_only, *shared_args)

In [None]:
# Produce the performance plots of the 2D model for every task category in tasks_dict.
for task_category, tasks in tasks_dict.items():
    generate_results(MODEL_BASE_PATH, MODEL_NAME, dims, quantization_techniques, tasks, INFERENCE_LAYERS, task_category, MODEL_OFFSET_PATH, plot_figures=True)

## Efficiency, Intramodel Comparison

In [None]:
# Matplotlib line style per quantization technique, looked up by the plotting
# helpers with LINE_STYLES.get(technique, '-'). Re-declared here with the same
# entries as the earlier LINE_STYLES definition in this notebook.
LINE_STYLES = {
    'float32': '-',
    'int8': '--',
    'binary': ':',
}

def get_evaluation_time(base_path, model_name, embedding_size, quantization_method, benchmark_name, inference_layer, offset_path="no_model_name_available/no_revision_available"):
    """
    Return the evaluation_time for one model, one embedding size, one quantization type, one benchmark, and one inference layer.

    The result file is expected at
    <base_path>/<model_name>_<embedding_size>_<quantization_method>_<inference_layer>/<offset_path>/<benchmark_name>.json
    and the value is read from its top-level "evaluation_time" key.

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    embedding_size (int): Embedding size of the model.
    quantization_method (str): Quantization method used.
    benchmark_name (str): Name of the benchmark.
    inference_layer (int): Inference layer number.
    offset_path (str): Sub-path between the model folder and the benchmark file.

    Returns:
    float or None: The evaluation_time for the specified configuration, or None when
    the file is missing, is not valid JSON, or lacks the expected key.
    """

    subfolder = f"{model_name}_{embedding_size}_{quantization_method}_{inference_layer}"
    file_path = os.path.join(base_path, subfolder, offset_path, f"{benchmark_name}.json")

    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data["evaluation_time"]
    # TypeError: top-level JSON value is not an object; JSONDecodeError:
    # truncated/corrupt file. Both are treated like a missing result so one bad
    # file does not abort the whole loading loop.
    except (FileNotFoundError, KeyError, TypeError, json.JSONDecodeError) as e:
        print(f"Error reading file {file_path}: {e}")
        return None

def load_evaluation_times(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, offset_path):
    """
    Collect the evaluation times for every task / technique / layer combination.

    Returns a nested dict task -> technique -> layer -> list of times. Times are
    appended in the order of `dims`; configurations for which get_evaluation_time
    returns None are left out, so a list can be shorter than `dims`.
    """
    times = {}
    for task in tasks:
        by_technique = times.setdefault(task, {})
        for technique in quantization_techniques:
            by_layer = by_technique.setdefault(technique, {layer: [] for layer in inference_layers})
            for dim in dims:
                for layer in inference_layers:
                    elapsed = get_evaluation_time(base_path, model_name, dim, technique, task, layer, offset_path=offset_path)
                    if elapsed is None:
                        continue
                    by_layer[layer].append(elapsed)
    return times

def plot_task_evaluation_times(times, dims, global_save_dir, model_name, task_category, task, plot_figures):
    """
    Plot the evaluation time per embedding dimension for a single task and save it as <task>.png.

    One line is drawn per (technique, layer) pair: the line style encodes the
    quantization technique (LINE_STYLES) and the colour encodes the layer.
    Combinations whose list does not have one entry per dimension are skipped.

    Args:
    times (dict): Nested mapping task -> technique -> layer -> list of evaluation times.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    task_category (str): Task category (used in the output path).
    task (str): Task to plot; must be a key of `times`.
    plot_figures (bool): When True the figure is also shown.

    Raises:
    ValueError: If task_category is rejected by is_valid_task_category.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    # Layers are taken from the first technique of the task.
    first_technique_layers = next(iter(times[task].values()), {})
    inference_layers = list(first_technique_layers.keys())
    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))

    # Create an explicit figure (same size as the sibling plots) instead of
    # drawing on whatever figure happens to be current.
    fig = plt.figure(figsize=(10, 5))

    for technique, layers_times in times[task].items():
        for layer, technique_times in layers_times.items():
            if len(technique_times) == len(dims):
                plt.plot(dims, technique_times, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"{technique} (Layer {layer})")
            else:
                print(f"Skipping plot for {technique} on task {task} at layer {layer} due to dimension mismatch")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Evaluation Time (s)')

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0)

    plt.grid(True, which="both", ls="--")

    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/{task}.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)




def plot_all_technique_evaluation_time_averages(times, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures):
    """
    Plot the task-averaged evaluation time per dimension for every (technique, layer) pair in one figure (all_averages.png).

    Line style encodes the technique (LINE_STYLES), colour encodes the layer. Only
    tasks with one time per dimension contribute to a pair's average.

    Args:
    times (dict): Nested mapping task -> technique -> layer -> list of evaluation times.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot.
    inference_layers (list): Layers to plot.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True the figure is also shown.
    """
    fig = plt.figure(figsize=(10, 5))

    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))

    for technique in quantization_techniques:
        for layer in inference_layers:
            all_times = [task_times[technique][layer] for task_times in times.values() if len(task_times[technique][layer]) == len(dims)]

            if not all_times:
                continue

            average_times = np.mean(all_times, axis=0)
            plt.plot(dims, average_times, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"{technique} (Layer {layer})")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Evaluation Time (s)')
    plt.ylim(0, None)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0)

    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of only clearing so the figure created above is released.
    plt.close(fig)

def plot_technique_evaluation_time_summary(times, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures):
    """
    Save one figure per quantization technique with the task-averaged evaluation time per dimension, one line per layer.

    Only tasks whose list for the (technique, layer) pair has one entry per
    dimension contribute to the average; a layer without any such task is not drawn.

    Args:
    times (dict): Nested mapping task -> technique -> layer -> list of evaluation times.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot; one file technique_<name>.png each.
    inference_layers (list): Layers to plot.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True each figure is also shown.

    Raises:
    ValueError: If task_category is rejected by is_valid_task_category.
    """
    if not is_valid_task_category(task_category):
        raise ValueError("Invalid task category. Received {}, but expected one of: classification, clustering, sts, pairclass, retrieval, rerank, summ, all".format(task_category))

    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))
    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"

    for technique in quantization_techniques:
        fig = plt.figure(figsize=(10, 5))

        for layer in inference_layers:
            all_times = [task_times[technique][layer] for task_times in times.values() if len(task_times[technique][layer]) == len(dims)]

            if not all_times:
                continue

            average_times = np.mean(all_times, axis=0)
            plt.plot(dims, average_times, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"Layer {layer}")

        plt.xscale('log', base=2)
        plt.xticks(dims, [str(dim) for dim in dims])
        plt.xlabel('Dimension (base 10)')
        plt.ylabel('Evaluation Time (s)')
        plt.ylim(0, None)

        plt.grid(True, which="both", ls="--")

        plt.legend(loc='best')

        os.makedirs(save_dir, exist_ok=True)

        plt.savefig(f"{save_dir}/technique_{technique}.png")
        if plot_figures:
            plt.show()
        # A new figure is created per technique; close it so figures do not pile up.
        plt.close(fig)


def plot_all_technique_evaluation_time_averages_relative(times, dims, global_save_dir, model_name, quantization_techniques, inference_layers, task_category, plot_figures, reference_layer=24):
    """
    Plot task-averaged evaluation times relative to a float32 reference (all_averages_evaluation_time_relative.png).

    For each task the reference is the first float32 time stored for
    `reference_layer`. Every time of the task is divided by it before averaging
    over tasks. Tasks without a usable (present, non-zero) reference are ignored.

    Args:
    times (dict): Nested mapping task -> technique -> layer -> list of evaluation times.
    dims (list): Embedding sizes, used as x values.
    global_save_dir (str): Root directory for saved figures.
    model_name (str): Name of the model (used in the output path).
    quantization_techniques (list): Techniques to plot.
    inference_layers (list): Layers to plot.
    task_category (str): Task category (used in the output path).
    plot_figures (bool): When True the figure is also shown.
    reference_layer (int): Layer whose float32 time is the normalisation reference (default 24).
    """
    fig = plt.figure(figsize=(10, 5))

    color_palette = sns.color_palette("husl", len(inference_layers))
    layer_color_map = dict(zip(inference_layers, color_palette))

    for technique in quantization_techniques:
        for layer in inference_layers:
            relative_times = []
            for task, task_times in times.items():
                reference_times = task_times.get('float32', {}).get(reference_layer)
                if not reference_times:
                    continue

                reference_time = reference_times[0]
                # A zero reference would raise ZeroDivisionError below; skip the task.
                if reference_time == 0:
                    continue

                if len(task_times[technique][layer]) == len(dims):
                    relative_times.append([t / reference_time for t in task_times[technique][layer]])

            if not relative_times:
                continue

            average_relative_times = np.mean(relative_times, axis=0)
            plt.plot(dims, average_relative_times, linestyle=LINE_STYLES.get(technique, '-'), color=layer_color_map[layer], linewidth=2, label=f"{technique} (Layer {layer})")

    plt.xscale('log', base=2)
    plt.xticks(dims, [str(dim) for dim in dims])
    plt.xlabel('Dimension (base 10)')
    plt.ylabel('Relative Evaluation Time')
    plt.ylim(0, None)

    plt.grid(True, which="both", ls="--")

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0)

    save_dir = f"{global_save_dir}/PerBenchmarkAndModelEfficiency/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages_evaluation_time_relative.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of only clearing so the figure created above is released.
    plt.close(fig)



def generate_evaluation_time_results(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, task_category, model_offset_path, plot_figures=True):
    """
    Load the evaluation times of one task category and produce all efficiency plots for it.

    Per-task plots are skipped for the "all" category; the summary plots are always produced.
    """
    times = load_evaluation_times(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, model_offset_path)

    wants_per_task_plots = task_category != "all"
    for task in (tasks if wants_per_task_plots else ()):
        plot_task_evaluation_times(times, dims, GLOBAL_SAVE_DIR, model_name, task_category, task, plot_figures)

    summary_plotters = (
        plot_technique_evaluation_time_summary,
        plot_all_technique_evaluation_time_averages,
        plot_all_technique_evaluation_time_averages_relative,
    )
    for plotter in summary_plotters:
        plotter(times, dims, GLOBAL_SAVE_DIR, model_name, quantization_techniques, inference_layers, task_category, plot_figures)


In [None]:
# Produce the evaluation-time plots of the 2D model for every task category in tasks_dict.
for task_category, tasks in tasks_dict.items():
    generate_evaluation_time_results(MODEL_BASE_PATH, MODEL_NAME, dims, quantization_techniques, tasks, INFERENCE_LAYERS, task_category, MODEL_OFFSET_PATH, plot_figures=True)

## Accuracy–Compute Trade-off

In [None]:
# Matplotlib line style per quantization technique. Re-declared here with the
# same entries as the earlier LINE_STYLES definitions in this notebook.
LINE_STYLES = {
    'float32': '-',
    'int8': '--',
    'binary': ':',
}

def get_main_score(base_path, model_name, embedding_size, quantization_method, benchmark_name, inference_layer, offset_path="no_model_name_available/no_revision_available"):
    """
    Return the main_score for one model, one embedding size, one quantization type, one benchmark, and one inference layer.

    Note: this cell re-defines a function of the same name from the performance
    section earlier in this notebook.

    The result file is expected at
    <base_path>/<model_name>_<embedding_size>_<quantization_method>_<inference_layer>/<offset_path>/<benchmark_name>.json
    and the score is read from data["scores"]["test"][0]["main_score"].

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    embedding_size (int): Embedding size of the model.
    quantization_method (str): Quantization method used.
    benchmark_name (str): Name of the benchmark.
    inference_layer (int): Inference layer number.
    offset_path (str): Sub-path between the model folder and the benchmark file.

    Returns:
    float or None: The main_score for the specified configuration, or None when the
    file is missing, is not valid JSON, or does not have the expected structure.
    """

    subfolder = f"{model_name}_{embedding_size}_{quantization_method}_{inference_layer}"
    file_path = os.path.join(base_path, subfolder, offset_path, f"{benchmark_name}.json")

    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data["scores"]["test"][0]["main_score"]
    # IndexError: empty "test" list; TypeError: unexpected JSON shape;
    # JSONDecodeError: truncated/corrupt file. All are treated like a missing
    # result so one bad file does not abort the whole loading loop.
    except (FileNotFoundError, KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
        print(f"Error reading file {file_path}: {e}")
        return None

def load_scores(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, offset_path):
    """
    Gather main scores into a nested dict task -> technique -> layer -> list of scores.

    Scores are appended in the order of `dims`. A configuration for which
    get_main_score returns None contributes nothing, so a list may end up
    shorter than `dims`.
    """
    scores = {}
    for task in tasks:
        technique_table = scores.setdefault(task, {})
        for technique in quantization_techniques:
            layer_table = technique_table.setdefault(technique, {layer: [] for layer in inference_layers})
            for dim in dims:
                for layer in inference_layers:
                    main_score = get_main_score(base_path, model_name, dim, technique, task, layer, offset_path=offset_path)
                    if main_score is None:
                        continue
                    layer_table[layer].append(main_score)
    return scores

def calculate_memory_used(embedding_size, quantization_method):
    """
    Return the number of bits one embedding occupies.

    The result is embedding_size multiplied by the bits per value of the
    quantization method: 32 for 'float32', 8 for 'int8', 1 for 'binary'.

    Raises:
    ValueError: If quantization_method is none of the three known names.
    """
    bits_per_value = (
        ('float32', 32),
        ('int8', 8),
        ('binary', 1),
    )
    for method_name, bits in bits_per_value:
        if quantization_method == method_name:
            return embedding_size * bits
    raise ValueError(f"Unknown quantization method: {quantization_method}")

def plot_accuracy_compute_tradeoff(scores, dims, quantization_techniques, global_save_dir, model_name, task_category, task, inference_layers, plot_figures):
    """
    Scatter the scores of a single task against the memory used per embedding.

    One point is drawn per (technique, layer, dimension). The colour encodes the
    quantization technique and the marker shape encodes the embedding dimension.
    The figure is written to
    {global_save_dir}/AccuracyCompute/{model_name}/{task_category}/{task}.png.

    Args:
    scores (dict): scores[task][technique][layer] -> list of scores, one per entry of dims.
    dims (sequence): Embedding sizes, in the order the scores were collected.
    quantization_techniques (sequence): Quantization methods to plot.
    global_save_dir (str): Root directory for saved plots.
    model_name (str): Name of the model (used in the output path).
    task_category (str): Task category (used in the output path).
    task (str): Task whose scores are plotted.
    inference_layers (iterable): Inference layer numbers to plot.
    plot_figures (bool): If true, also display the figure with plt.show().

    Returns:
    None
    """
    fig = plt.figure(figsize=(10, 5))
    colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    for technique_idx, technique in enumerate(quantization_techniques):
        for layer in inference_layers:
            technique_scores = scores[task][technique][layer]
            # A shorter list cannot be matched to dims by position, so it is skipped.
            if len(technique_scores) != len(dims):
                continue
            memory_used = [calculate_memory_used(dim, technique) for dim in dims]
            for idx, (mem, score) in enumerate(zip(memory_used, technique_scores)):
                # Only the first point of a series carries a label, so the
                # legend gets one entry per series.
                plt.scatter(mem, score, label=f"{technique} (Layer {layer}, {dims[idx]}d)" if idx == 0 else "",
                            color=colors[technique_idx], marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Memory Used (bits)')
    plt.ylabel('Performance Score')
    plt.xscale('log', base=2)
    plt.grid(True, which="both", ls="--")

    # First legend: one entry per labelled series (duplicates collapsed by label).
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    legend1 = plt.legend(by_label.values(), by_label.keys(), loc='best')
    # Keep the first legend alive; the next plt.legend call would replace it.
    plt.gca().add_artist(legend1)

    # Second legend: marker shape -> dimension, cycling markers exactly as the scatter does.
    marker_handles = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for i in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='lower right', bbox_to_anchor=(1, 0))

    save_dir = f"{global_save_dir}/AccuracyCompute/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/{task}.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of clear: a cleared figure stays registered with pyplot.
    plt.close(fig)

def plot_average_accuracy_compute_tradeoff(scores, dims, quantization_techniques, global_save_dir, model_name, task_category, inference_layers, plot_figures):
    """
    Scatter the task-averaged scores against the memory used per embedding.

    For every (technique, layer) the scores are averaged across tasks,
    separately for each dimension. Only tasks whose score list has one entry
    per dimension take part in the average. The colour encodes the technique
    (lightness varies with the layer) and the marker shape encodes the
    dimension. The figure is written to
    {global_save_dir}/AccuracyCompute/{model_name}/{task_category}/all_averages.png.

    Args:
    scores (dict): scores[task][technique][layer] -> list of scores, one per entry of dims.
    dims (sequence): Embedding sizes, in the order the scores were collected.
    quantization_techniques (sequence): Quantization methods to plot.
    global_save_dir (str): Root directory for saved plots.
    model_name (str): Name of the model (used in the output path).
    task_category (str): Task category (used in the output path).
    inference_layers (sequence): Inference layer numbers to plot.
    plot_figures (bool): If true, also display the figure with plt.show().

    Returns:
    None
    """
    fig = plt.figure(figsize=(12, 6))
    base_colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    # average_scores[technique][layer] stays [] when no task has complete data.
    average_scores = {technique: {layer: [] for layer in inference_layers} for technique in quantization_techniques}
    for technique in quantization_techniques:
        for layer in inference_layers:
            complete_score_lists = [
                task_scores[technique][layer]
                for task_scores in scores.values()
                if len(task_scores[technique][layer]) == len(dims)
            ]
            if complete_score_lists:
                # axis=0 averages over tasks and keeps one value per dimension.
                average_scores[technique][layer] = np.mean(complete_score_lists, axis=0)

    for technique_idx, (technique, layers_scores) in enumerate(average_scores.items()):
        base_color = base_colors[technique_idx]
        for layer_idx, (layer, layer_scores) in enumerate(layers_scores.items()):
            if len(layer_scores) == 0:
                continue
            memory_used = [calculate_memory_used(dim, technique) for dim in dims]
            # Same hue per technique; lightness grows with the layer position.
            color = sns.set_hls_values(base_color, l=0.4 + 0.6 * (layer_idx / len(inference_layers)))
            for idx, (mem, score) in enumerate(zip(memory_used, layer_scores)):
                plt.scatter(mem, score, label=f"{technique} (Layer {layer})" if idx == 0 else "",
                            color=color, marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Memory Used (bits)')
    plt.ylabel('Average Performance Score')
    plt.xscale('log', base=2)
    plt.grid(True, which="both", ls="--")

    # First legend: one entry per technique/layer series.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    legend1 = plt.legend(by_label.values(), by_label.keys(), loc='upper left')
    # Keep the first legend alive; the next plt.legend call would replace it.
    plt.gca().add_artist(legend1)

    # Second legend: marker shape -> dimension, cycling markers exactly as the scatter does.
    marker_handles = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for i in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='lower right', bbox_to_anchor=(1, 0))

    save_dir = f"{global_save_dir}/AccuracyCompute/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of clear: a cleared figure stays registered with pyplot.
    plt.close(fig)

def plot_average_accuracy_compute_tradeoff_relative(scores, dims, quantization_techniques, global_save_dir, model_name, task_category, inference_layers, plot_figures):
    """
    Scatter task-averaged scores, relative to a float32 reference, against memory used.

    For every task and layer the reference is the first float32 score stored for
    that layer. Each score is divided by its reference before averaging across
    tasks, separately for each dimension. Only tasks whose score list has one
    entry per dimension take part. When a task has no usable reference (no
    float32 data, or a reference of 0) its raw scores are averaged in unchanged.
    The figure is written to
    {global_save_dir}/AccuracyCompute/{model_name}/{task_category}/all_averages_relative.png.

    Args:
    scores (dict): scores[task][technique][layer] -> list of scores, one per entry of dims.
    dims (sequence): Embedding sizes, in the order the scores were collected.
    quantization_techniques (sequence): Quantization methods to plot.
    global_save_dir (str): Root directory for saved plots.
    model_name (str): Name of the model (used in the output path).
    task_category (str): Task category (used in the output path).
    inference_layers (sequence): Inference layer numbers to plot.
    plot_figures (bool): If true, also display the figure with plt.show().

    Returns:
    None
    """
    fig = plt.figure(figsize=(12, 6))
    base_colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    # reference_scores[task][layer] is None when there is nothing to normalise by.
    reference_scores = {task: {layer: None for layer in inference_layers} for task in scores.keys()}
    for task, task_scores in scores.items():
        for layer in inference_layers:
            float32_scores = task_scores['float32'][layer] if 'float32' in task_scores else []
            # An empty list means no float32 result was loaded for this layer;
            # indexing it would raise IndexError, so fall back to "no reference".
            if len(float32_scores) > 0:
                reference_scores[task][layer] = float32_scores[0]

    average_scores = {technique: {layer: [] for layer in inference_layers} for technique in quantization_techniques}
    for technique in quantization_techniques:
        for layer in inference_layers:
            all_relative_scores = []
            for task, task_scores in scores.items():
                task_layer_scores = task_scores[technique][layer]
                if len(task_layer_scores) != len(dims):
                    continue
                reference = reference_scores[task][layer]
                if reference is not None and reference != 0:
                    relative_scores = [score / reference for score in task_layer_scores]
                else:
                    # No usable reference: keep the raw scores.
                    relative_scores = list(task_layer_scores)
                all_relative_scores.append(relative_scores)

            if all_relative_scores:
                # axis=0 averages over tasks and keeps one value per dimension.
                average_scores[technique][layer] = np.mean(all_relative_scores, axis=0)

    for technique_idx, (technique, layers_scores) in enumerate(average_scores.items()):
        base_color = base_colors[technique_idx]
        for layer_idx, (layer, layer_scores) in enumerate(layers_scores.items()):
            if len(layer_scores) == 0:
                continue
            memory_used = [calculate_memory_used(dim, technique) for dim in dims]
            # Same hue per technique; lightness grows with the layer position.
            color = sns.set_hls_values(base_color, l=0.4 + 0.6 * (layer_idx / len(inference_layers)))
            for idx, (mem, score) in enumerate(zip(memory_used, layer_scores)):
                plt.scatter(mem, score, label=f"{technique} (Layer {layer})" if idx == 0 else "",
                            color=color, marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Memory Used (bits)')
    plt.ylabel('Relative Average Performance Score')
    plt.xscale('log', base=2)
    plt.grid(True, which="both", ls="--")

    # First legend: one entry per technique/layer series.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    legend1 = plt.legend(by_label.values(), by_label.keys(), loc='upper left')
    # Keep the first legend alive; the next plt.legend call would replace it.
    plt.gca().add_artist(legend1)

    # Second legend: marker shape -> dimension, cycling markers exactly as the scatter does.
    marker_handles = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for i in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='lower right', bbox_to_anchor=(1, 0))

    save_dir = f"{global_save_dir}/AccuracyCompute/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/all_averages_relative.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of clear: a cleared figure stays registered with pyplot.
    plt.close(fig)


def generate_accuracy_compute_results(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, task_category, model_offset_path, plot_figures=True):
    """
    Load the scores of one task category and draw its averaged accuracy/memory plots.

    Two figures are produced: the absolute task average and the average relative
    to float32. Both are saved below the module-level GLOBAL_SAVE_DIR.

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    dims (sequence): Embedding sizes to evaluate.
    quantization_techniques (sequence): Quantization methods to evaluate.
    tasks (sequence): Benchmark names belonging to the category.
    inference_layers (sequence): Inference layer numbers to evaluate.
    task_category (str): Category name (used in the output path).
    model_offset_path (str): Sub-path forwarded to load_scores.
    plot_figures (bool): If true, the figures are also displayed.

    Returns:
    None
    """
    scores = load_scores(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, model_offset_path)

    # NOTE(review): a per-task loop with an empty body (`pass`) used to run here
    # for every category except "all". It had no effect and was removed.
    # Presumably it once called plot_accuracy_compute_tradeoff; confirm whether
    # per-task plots should be re-enabled.
    plot_average_accuracy_compute_tradeoff(scores, dims, quantization_techniques, GLOBAL_SAVE_DIR, model_name, task_category, inference_layers, plot_figures)
    plot_average_accuracy_compute_tradeoff_relative(scores, dims, quantization_techniques, GLOBAL_SAVE_DIR, model_name, task_category, inference_layers, plot_figures)

In [None]:
# Draw the accuracy-vs-memory plots for every task category in tasks_dict.
for task_category, tasks in tasks_dict.items():
    generate_accuracy_compute_results(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks=tasks,
        inference_layers=INFERENCE_LAYERS,
        task_category=task_category,
        model_offset_path=MODEL_OFFSET_PATH,
        plot_figures=True,
    )

## Evaluation Time vs. Accuracy Trade-off

In [None]:
def load_evaluation_times(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, offset_path):
    """
    Read the "evaluation_time" entry of every result file.

    Each result is expected at
    {base_path}/{model_name}_{dim}_{technique}_{layer}/{offset_path}/{task}.json.

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    dims (iterable): Embedding sizes to look up.
    quantization_techniques (iterable): Quantization methods to look up.
    tasks (iterable): Benchmark names to look up.
    inference_layers (iterable): Inference layer numbers to look up.
    offset_path (str): Sub-path between the model folder and the result file.

    Returns:
    dict: collected[task][technique][layer] -> list of evaluation times in the
    order of dims. A missing file or a missing key is reported with print and
    skipped, so a list can be shorter than dims.
    """
    # Build the empty nested structure first so every key exists up front.
    collected = {}
    for task_name in tasks:
        per_method = {}
        for method in quantization_techniques:
            per_method[method] = {layer_id: [] for layer_id in inference_layers}
        collected[task_name] = per_method

    # The lookup order is task -> technique -> dimension -> layer.
    for task_name in tasks:
        result_file = f"{task_name}.json"
        for method in quantization_techniques:
            for size in dims:
                for layer_id in inference_layers:
                    run_dir = f"{model_name}_{size}_{method}_{layer_id}"
                    file_path = os.path.join(base_path, run_dir, offset_path, result_file)
                    try:
                        with open(file_path, 'r') as handle:
                            payload = json.load(handle)
                        collected[task_name][method][layer_id].append(payload["evaluation_time"])
                    except (FileNotFoundError, KeyError) as e:
                        print(f"Error reading file {file_path}: {e}")
    return collected

def load_scores(base_path, model_name, dims, quantization_techniques, tasks, inference_layers, offset_path):
    """
    Read the test "main_score" of every result file.

    Each result is expected at
    {base_path}/{model_name}_{dim}_{technique}_{layer}/{offset_path}/{task}.json
    and the score is taken from data["scores"]["test"][0]["main_score"].

    Note: a function with the same name is defined earlier in this notebook
    (it delegates to get_main_score). Running this cell rebinds the name to
    this version.

    Args:
    base_path (str): Base directory containing the model folders.
    model_name (str): Name of the model.
    dims (iterable): Embedding sizes to look up.
    quantization_techniques (iterable): Quantization methods to look up.
    tasks (iterable): Benchmark names to look up.
    inference_layers (iterable): Inference layer numbers to look up.
    offset_path (str): Sub-path between the model folder and the result file.

    Returns:
    dict: collected[task][technique][layer] -> list of main scores in the order
    of dims. A missing file or a missing key is reported with print and
    skipped, so a list can be shorter than dims.
    """
    # Build the empty nested structure first so every key exists up front.
    collected = {}
    for task_name in tasks:
        per_method = {}
        for method in quantization_techniques:
            per_method[method] = {layer_id: [] for layer_id in inference_layers}
        collected[task_name] = per_method

    # The lookup order is task -> technique -> dimension -> layer.
    for task_name in tasks:
        result_file = f"{task_name}.json"
        for method in quantization_techniques:
            for size in dims:
                for layer_id in inference_layers:
                    run_dir = f"{model_name}_{size}_{method}_{layer_id}"
                    file_path = os.path.join(base_path, run_dir, offset_path, result_file)
                    try:
                        with open(file_path, 'r') as handle:
                            payload = json.load(handle)
                        collected[task_name][method][layer_id].append(payload["scores"]["test"][0]["main_score"])
                    except (FileNotFoundError, KeyError) as e:
                        print(f"Error reading file {file_path}: {e}")
    return collected

def plot_relative_evaluation_time_vs_scores(times, scores, dims, quantization_techniques, global_save_dir, model_name, task_category, inference_layers, plot_figures):
    """
    Scatter per-task relative scores against relative evaluation times.

    For every task the baseline is the float32 result at the highest inference
    layer and the largest dimension. Each time and score of that task is
    divided by its baseline. Tasks whose float32 baseline is incomplete or zero
    are reported with print and skipped. The colour encodes the technique
    (lightness varies with the layer) and the marker shape encodes the
    dimension. The figure is written to
    {global_save_dir}/EvaluationTimeVsScores/{model_name}/{task_category}/evaluation_time_vs_scores.png.

    Args:
    times (dict): times[task][technique][layer] -> list of evaluation times, one per entry of dims.
    scores (dict): scores[task][technique][layer] -> list of scores, one per entry of dims.
    dims (list): Embedding sizes, in the order the values were collected.
    quantization_techniques (sequence): Quantization methods to plot.
    global_save_dir (str): Root directory for saved plots.
    model_name (str): Name of the model (used in the output path).
    task_category (str): Task category (used in the output path).
    inference_layers (sequence): Inference layer numbers to plot.
    plot_figures (bool): If true, also display the figure with plt.show().

    Returns:
    None

    Raises:
    KeyError: If a task has no 'float32' entry in times or scores.
    """
    fig = plt.figure(figsize=(14, 8))
    base_colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    baseline_times = {}
    baseline_scores = {}
    for task in scores.keys():
        baseline_layer = max(inference_layers)
        baseline_idx = dims.index(max(dims))
        reference_times = times[task]['float32'][baseline_layer]
        reference_scores = scores[task]['float32'][baseline_layer]
        # The loaders drop failed lookups, so only a full-length list lines up
        # with dims by position. Indexing a shorter one would either raise
        # IndexError or pick the value of a different dimension.
        if len(reference_times) != len(dims) or len(reference_scores) != len(dims):
            print(f"Skipping {task}: incomplete float32 baseline for layer {baseline_layer}")
            continue
        if reference_times[baseline_idx] == 0 or reference_scores[baseline_idx] == 0:
            print(f"Skipping {task}: float32 baseline is zero")
            continue
        baseline_times[task] = reference_times[baseline_idx]
        baseline_scores[task] = reference_scores[baseline_idx]

    for technique_idx, technique in enumerate(quantization_techniques):
        base_color = base_colors[technique_idx]
        for layer_idx, layer in enumerate(inference_layers):
            # Same hue per technique; lightness grows with the layer position.
            color = sns.set_hls_values(base_color, l=0.4 + 0.6 * (layer_idx / len(inference_layers)))
            for task in baseline_times:
                task_scores = scores[task][technique][layer]
                evaluation_times = times[task][technique][layer]
                if len(task_scores) != len(dims) or len(evaluation_times) != len(dims):
                    continue
                relative_times = [time / baseline_times[task] for time in evaluation_times]
                relative_scores = [score / baseline_scores[task] for score in task_scores]
                for idx, (rel_time, rel_score) in enumerate(zip(relative_times, relative_scores)):
                    plt.scatter(rel_time, rel_score, label=f"{technique} (Layer {layer})" if idx == 0 else "",
                                color=color, marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Relative Evaluation Time')
    plt.ylabel('Relative Task Score')
    plt.grid(True, which="both", ls="--")

    # First legend: one entry per technique/layer; every task repeats the same label.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    legend1 = plt.legend(by_label.values(), by_label.keys(), loc='upper left', bbox_to_anchor=(1, 1), title="Techniques and Layers")
    # Keep the first legend alive; the next plt.legend call would replace it.
    plt.gca().add_artist(legend1)

    # Second legend: marker shape -> dimension, cycling markers exactly as the scatter does.
    marker_handles = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for i in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='upper left', bbox_to_anchor=(1, 0.5), title="Dimensions")

    # Leave the right quarter of the canvas free for the two outside legends.
    plt.tight_layout(rect=[0, 0, 0.75, 1])

    save_dir = f"{global_save_dir}/EvaluationTimeVsScores/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/evaluation_time_vs_scores.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of clear: a cleared figure stays registered with pyplot.
    plt.close(fig)

def plot_average_relative_evaluation_time_vs_scores(times, scores, dims, quantization_techniques, global_save_dir, model_name, task_category, inference_layers, plot_figures):
    """
    Scatter task-averaged relative scores against task-averaged relative evaluation times.

    For every task the baseline is the float32 result at the highest inference
    layer and the largest dimension. Times and scores are divided by the
    baseline of their task and then averaged across tasks, separately for every
    (technique, layer, dimension). Tasks whose float32 baseline is incomplete
    or zero are reported with print and skipped. The colour encodes the
    technique (lightness varies with the layer) and the marker shape encodes
    the dimension. The figure is written to
    {global_save_dir}/EvaluationTimeVsScores/{model_name}/{task_category}/average_evaluation_time_vs_scores.png.

    Args:
    times (dict): times[task][technique][layer] -> list of evaluation times, one per entry of dims.
    scores (dict): scores[task][technique][layer] -> list of scores, one per entry of dims.
    dims (list): Embedding sizes, in the order the values were collected.
    quantization_techniques (sequence): Quantization methods to plot.
    global_save_dir (str): Root directory for saved plots.
    model_name (str): Name of the model (used in the output path).
    task_category (str): Task category (used in the output path).
    inference_layers (sequence): Inference layer numbers to plot.
    plot_figures (bool): If true, also display the figure with plt.show().

    Returns:
    None

    Raises:
    KeyError: If a task has no 'float32' entry in times or scores.
    """
    fig = plt.figure(figsize=(14, 8))
    base_colors = sns.color_palette("husl", len(quantization_techniques))
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', '+']

    baseline_times = {}
    baseline_scores = {}
    for task in scores.keys():
        baseline_layer = max(inference_layers)
        baseline_idx = dims.index(max(dims))
        reference_times = times[task]['float32'][baseline_layer]
        reference_scores = scores[task]['float32'][baseline_layer]
        # The loaders drop failed lookups, so only a full-length list lines up
        # with dims by position. Indexing a shorter one would either raise
        # IndexError or pick the value of a different dimension.
        if len(reference_times) != len(dims) or len(reference_scores) != len(dims):
            print(f"Skipping {task}: incomplete float32 baseline for layer {baseline_layer}")
            continue
        if reference_times[baseline_idx] == 0 or reference_scores[baseline_idx] == 0:
            print(f"Skipping {task}: float32 baseline is zero")
            continue
        baseline_times[task] = reference_times[baseline_idx]
        baseline_scores[task] = reference_scores[baseline_idx]

    average_relative_times = {technique: {layer: [] for layer in inference_layers} for technique in quantization_techniques}
    average_relative_scores = {technique: {layer: [] for layer in inference_layers} for technique in quantization_techniques}

    for technique in quantization_techniques:
        for layer in inference_layers:
            # Tasks that have a baseline and one value per dimension for this series.
            usable_tasks = [
                task for task in baseline_times
                if len(scores[task][technique][layer]) == len(dims)
                and len(times[task][technique][layer]) == len(dims)
            ]
            if not usable_tasks:
                continue
            # Index by position rather than dims.index(dim), so repeated
            # dimension values cannot alias one another.
            for dim_idx in range(len(dims)):
                rel_times = [times[task][technique][layer][dim_idx] / baseline_times[task] for task in usable_tasks]
                rel_scores = [scores[task][technique][layer][dim_idx] / baseline_scores[task] for task in usable_tasks]
                average_relative_times[technique][layer].append(np.mean(rel_times))
                average_relative_scores[technique][layer].append(np.mean(rel_scores))

    for technique_idx, technique in enumerate(quantization_techniques):
        base_color = base_colors[technique_idx]
        for layer_idx, layer in enumerate(inference_layers):
            # Same hue per technique; lightness grows with the layer position.
            color = sns.set_hls_values(base_color, l=0.4 + 0.6 * (layer_idx / len(inference_layers)))
            for idx, (rel_time, rel_score) in enumerate(zip(average_relative_times[technique][layer], average_relative_scores[technique][layer])):
                # Dimensions are shown by the marker legend, so the series label
                # only names the technique and the layer.
                plt.scatter(rel_time, rel_score, label=f"{technique} (Layer {layer})" if idx == 0 else "",
                            color=color, marker=markers[idx % len(markers)], s=100, edgecolor='k')

    plt.xlabel('Relative Evaluation Time')
    plt.ylabel('Relative Task Score')
    plt.grid(True, which="both", ls="--")

    # First legend: one entry per technique/layer series.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    legend1 = plt.legend(by_label.values(), by_label.keys(), loc='upper left', bbox_to_anchor=(1, 1), title="Techniques and Layers")
    # Keep the first legend alive; the next plt.legend call would replace it.
    plt.gca().add_artist(legend1)

    # Second legend: marker shape -> dimension, cycling markers exactly as the scatter does.
    marker_handles = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', markerfacecolor='k', markersize=10, linestyle='None') for i in range(len(dims))]
    marker_labels = [f'{dim}d' for dim in dims]
    plt.legend(marker_handles, marker_labels, loc='upper left', bbox_to_anchor=(1, 0.5), title="Dimensions")

    # Leave the right quarter of the canvas free for the two outside legends.
    plt.tight_layout(rect=[0, 0, 0.75, 1])

    save_dir = f"{global_save_dir}/EvaluationTimeVsScores/{model_name}/{task_category}"
    os.makedirs(save_dir, exist_ok=True)

    plt.savefig(f"{save_dir}/average_evaluation_time_vs_scores.png", bbox_inches='tight')
    if plot_figures:
        plt.show()
    # Close instead of clear: a cleared figure stays registered with pyplot.
    plt.close(fig)


In [None]:
# Draw the averaged evaluation-time-vs-score plot for every task category in tasks_dict.
for task_category, tasks in tasks_dict.items():
    times = load_evaluation_times(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks=tasks,
        inference_layers=INFERENCE_LAYERS,
        offset_path=MODEL_OFFSET_PATH,
    )
    scores = load_scores(
        base_path=MODEL_BASE_PATH,
        model_name=MODEL_NAME,
        dims=dims,
        quantization_techniques=quantization_techniques,
        tasks=tasks,
        inference_layers=INFERENCE_LAYERS,
        offset_path=MODEL_OFFSET_PATH,
    )

    plot_average_relative_evaluation_time_vs_scores(
        times=times,
        scores=scores,
        dims=dims,
        quantization_techniques=quantization_techniques,
        global_save_dir=GLOBAL_SAVE_DIR,
        model_name=MODEL_NAME,
        task_category=task_category,
        inference_layers=INFERENCE_LAYERS,
        plot_figures=True,
    )