In [None]:
!pip install sentence-transformers
!pip install mteb

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
import numpy as np
import torch
import mteb
import gc
from google.colab import drive
# Mount Google Drive so that evaluation results survive the end of the Colab session.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

# Root folder for all result files. Built from the mount point (absolute path) so the
# results land on Drive regardless of the notebook's current working directory.
base_out = f"{drive_mount_point}/MyDrive/BachelorThesisResults/2D"

In [None]:
class CustomModel:
    """
    Wrapper around a SentenceTransformer that can truncate the encoder depth,
    truncate the embedding dimension and quantize the resulting embeddings.

    The object exposes ``encode`` and ``model_card_data`` so it can be handed to
    an evaluation harness in place of a plain SentenceTransformer.

    Attributes:
        model (SentenceTransformer): The underlying sentence transformer model.
        device (torch.device): CUDA if available, otherwise CPU; passed to every encode call.
        dimensions (int): The number of dimensions the embeddings are truncated to.
        cast_to_fp32 (bool): Whether quantized embeddings are cast back to float32.
        quantization_method (str): The precision passed to ``quantize_embeddings``.
        model_card_data: The model card data of the wrapped model.
    """

    # Caller-supplied keyword arguments that are forwarded to SentenceTransformer.encode.
    # Everything else is dropped on purpose: in particular "precision" must NOT be
    # forwarded, because quantization is applied separately in `quantize` and the
    # embeddings would otherwise be quantized twice.
    _FORWARDED_ENCODE_KWARGS = ("batch_size", "show_progress_bar")

    def __init__(self, model_name="mixedbread-ai/mxbai-embed-large-v1", quantization_method="float32", dimensions=1024, custom_layer=-1, cast_to_fp32=False):
        """
        Loads the model and optionally cuts off the upper encoder layers.

        Args:
            model_name (str): The name of the model to be loaded.
            quantization_method (str): Precision handed to ``quantize_embeddings``
                (this notebook uses 'float32', 'int8' and 'binary').
            dimensions (int): Passed as ``truncate_dim`` to the SentenceTransformer.
            custom_layer (int): Number of encoder layers to keep, counted from the
                bottom. ``-1`` keeps the model unchanged.
            cast_to_fp32 (bool): If True, ``encode`` casts its result to float32.
        """
        # NOTE(review): trust_remote_code=True executes code shipped with the model
        # repository; only use it with model repositories you trust.
        self.model = SentenceTransformer(model_name, trust_remote_code=True, truncate_dim=dimensions)
        if custom_layer != -1:
            # Keep only the first `custom_layer` layers. Assumes the first module of the
            # SentenceTransformer exposes `auto_model.encoder.layer` — TODO confirm for
            # models other than the ones used in this notebook.
            self.model[0].auto_model.encoder.layer = self.model[0].auto_model.encoder.layer[:custom_layer]

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dimensions = dimensions
        self.cast_to_fp32 = cast_to_fp32
        self.quantization_method = quantization_method
        self.model_card_data = self.model.model_card_data

    def encode(self, sentences, **kwargs):
        """
        Encodes sentences into embeddings and quantizes them.

        Args:
            sentences (list of str): The sentences to be encoded.
            **kwargs: Extra options from the caller. Only the keys listed in
                ``_FORWARDED_ENCODE_KWARGS`` (e.g. ``batch_size``) are passed on to
                the underlying model; all other keys are ignored.

        Returns:
            numpy.ndarray: The quantized embeddings, cast to float32 when
            ``cast_to_fp32`` is set.
        """
        forwarded = {key: kwargs[key] for key in self._FORWARDED_ENCODE_KWARGS if key in kwargs}
        embeddings = self.quantize(self.model.encode(sentences, device=self.device, **forwarded))
        if self.cast_to_fp32:
            return embeddings.astype("float32")
        return embeddings

    def quantize(self, embeddings):
        """
        Quantizes the embeddings using the configured precision.

        Args:
            embeddings (numpy.ndarray): The embeddings to be quantized.

        Returns:
            numpy.ndarray: The quantized embeddings.
        """
        # NOTE(review): no calibration data or ranges are supplied here, so for integer
        # precisions the library presumably derives the value ranges from the batch
        # being quantized — confirm this is the intended evaluation setup.
        return quantize_embeddings(embeddings, precision=self.quantization_method)


In [None]:
# MTEB tasks evaluated for every model configuration, grouped by task category.
# The order of the entries is kept exactly as in the original list.
tasks = [
    # Classification
    "Banking77Classification",
    "EmotionClassification",
    "TweetSentimentExtractionClassification",
    "AmazonCounterfactualClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    # Clustering
    "ArXivHierarchicalClusteringP2P",
    "ArXivHierarchicalClusteringS2S",
    "BiorxivClusteringP2P.v2",
    "BiorxivClusteringS2S.v2",
    "MedrxivClusteringP2P.v2",
    "MedrxivClusteringS2S.v2",
    "RedditClustering.v2",
    "StackExchangeClustering.v2",
    "StackExchangeClusteringP2P.v2",
    "TwentyNewsgroupsClustering.v2",
    # Semantic textual similarity (STS)
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STSBenchmark",
    "STS17",
    "STS22",
    # Pair classification
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
    # Retrieval
    "ArguAna",
    "CQADupstackWebmastersRetrieval",
    "NFCorpus",
    # Reranking
    "AskUbuntuDupQuestions",
    "MindSmallReranking",
    "StackOverflowDupQuestions",
    # Summarization
    "SummEval",
]

In [None]:
# Sweep configuration read by test_all_models_varying_dim().
# Number of encoder layers to keep: [24, 20, 16, 12]
custom_num_layers = list(range(24, 8, -4))
model_name = "mixedbread-ai/mxbai-embed-2d-large-v1"
# Embedding sizes to truncate to, halving each step: [1024, 512, 256, 128, 64, 32, 16, 8]
dimensions = [2 ** exponent for exponent in range(10, 2, -1)]

def test_all_models_varying_dim():
    """
    Runs the MTEB evaluation for every combination of quantization method,
    number of encoder layers and embedding size.

    Reads the module-level ``custom_num_layers``, ``dimensions``, ``model_name``,
    ``tasks`` and ``base_out``. For each configuration a fresh ``CustomModel`` is
    built, evaluated on the "test" split, and its results are written to
    ``{base_out}/{model_name}_{emb_size}_{quantization_method}_{num_layers}``.
    The model is released after each run so consecutive configurations do not
    accumulate GPU memory.
    """
    for quantization_method in ["float32", "int8", "binary"]:
        for num_layers in custom_num_layers:
            for emb_size in dimensions:
                # An odd embedding size ends the sweep over sizes for this layer count.
                if emb_size % 2 != 0:
                    break

                # Binary quantization is skipped for fewer than 8 dimensions
                # (presumably because bits are packed into whole bytes — verify).
                if emb_size < 8 and quantization_method == "binary":
                    continue

                evaluation = mteb.MTEB(tasks=tasks, task_langs=["en"])
                print("Running Model {} with dimensions {} and quantization {} for Layer-No. {}".format(model_name, emb_size, quantization_method, num_layers))

                model = CustomModel(
                    model_name=model_name,
                    dimensions=emb_size,
                    quantization_method=quantization_method,
                    custom_layer=num_layers,
                )

                evaluation.run(
                    model,
                    output_folder=f"{base_out}/{model_name}_{emb_size}_{quantization_method}_{num_layers}",
                    eval_splits=["test"],
                    encode_kwargs={"precision": quantization_method, "batch_size": 128},
                )

                # Drop the references first and collect garbage, THEN empty the CUDA
                # cache: memory still referenced by a live model cannot be released
                # by empty_cache().
                del model
                del evaluation
                gc.collect()
                torch.cuda.empty_cache()

In [None]:
# Start the full sweep: every quantization method x layer count x embedding size
# defined above is evaluated in turn, so this cell is expected to run for a long time.
test_all_models_varying_dim()

In [None]:
# Release the Colab runtime once the sweep above has finished, so the session does not
# keep running idle. NOTE(review): anything not written to Drive is presumably lost
# after this call — confirm all results are saved under the Drive output folder.
from google.colab import runtime
runtime.unassign()