In [3]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import os

# import logging

# #### Just some code to print debug information to stdout
# logging.basicConfig(format='%(asctime)s - %(message)s',
#                     datefmt='%Y-%m-%d %H:%M:%S',
#                     level=logging.INFO,
#                     handlers=[LoggingHandler()])


from torchdr import PCA, TSNE
from sentence_transformers import SentenceTransformer
import pandas as pd


In [4]:
# Sentence-transformers checkpoints to benchmark, in evaluation order.
# Raw notes kept from earlier runs (not re-verified in this notebook):
#   'all-mpnet-base-v2'  # 0.5481, 0.2312, 0.2559 (1 mins) (2312, 2559)
#   'all-mpnet-base-v2'  # [full 10] 2414 -> 2684kwo, 2440kw, 2298kp, 2740kpo, 2352kso, 2399ks
#   'sentence-t5-xl'     # [full] 0.6754, 0.2543, 0.2990
model_name_list = ['all-MiniLM-L12-v1', 'all-mpnet-base-v2']

# Registry mapping a reduction-type name to the class that implements it.
# Only PCA is enabled; the t-SNE entry is kept commented out.
reduction_classes = dict(
    pca=PCA,
    # tsne=TSNE,
)

class idenity_reduction:
    """No-op stand-in for a reducer: exposes ``fit``/``transform`` but changes nothing."""

    def __init__(self, *args, **kwargs):
        """Accept and discard any positional or keyword arguments."""

    def fit(self, x):
        """Ignore ``x``; there is nothing to learn. Returns None."""
        return None

    def transform(self, x):
        """Hand back ``x`` itself, unmodified."""
        return x

class ST_wrapper(SentenceTransformer):
    """SentenceTransformer with an optional reduction step applied to its embeddings.

    Adds ``encode_queries`` and ``encode_corpus`` on top of ``encode``. The reducer
    is fit inside ``encode_queries`` (on the query embeddings only) and merely
    applied inside ``encode_corpus``.
    NOTE(review): ``encode_corpus`` therefore presumably has to run after
    ``encode_queries`` for any non-identity reduction — confirm the call order
    used by the retrieval code that drives this wrapper.
    """

    def __init__(self, model_name, reduction_type = 'x', reduction_kwargs=None, *args, **kwargs):
        """Load the sentence-transformers model and build the reducer.

        Args:
            model_name: passed as the first argument to ``SentenceTransformer.__init__``.
            reduction_type: ``'x'`` selects the identity (no reduction); any other
                value is looked up in ``reduction_classes``.
            reduction_kwargs: keyword arguments for the reducer's constructor;
                ``None`` (the default) means no arguments.
            *args, **kwargs: forwarded to ``SentenceTransformer.__init__``.

        Raises:
            KeyError: if ``reduction_type`` is neither ``'x'`` nor a key of
                ``reduction_classes``.
        """
        super().__init__(model_name, *args, **kwargs)
        if reduction_type == 'x':
            self.reduction = idenity_reduction()
            return

        try:
            reduction_cls = reduction_classes[reduction_type]
        except KeyError:
            # Same exception type as the bare lookup, but says what was expected.
            raise KeyError(
                f"unknown reduction_type {reduction_type!r}; "
                f"expected 'x' or one of {sorted(reduction_classes)}"
            ) from None
        # `None` default instead of a shared mutable `{}` default argument.
        self.reduction = reduction_cls(**(reduction_kwargs or {}))

    def encode_queries(self, queries, *args, **kwargs):
        """Encode ``queries``, fit the reducer on those embeddings, return them reduced.

        Extra arguments are forwarded to ``self.encode``.
        """
        embeddings = self.encode(queries, *args, **kwargs)
        # The reducer is (re)fit on every call, using only the query embeddings.
        self.reduction.fit(embeddings)
        return self.reduction.transform(embeddings)

    def encode_corpus(self, corpus, *args, **kwargs):
        """Encode ``corpus`` and apply the reducer without fitting it.

        Extra arguments are forwarded to ``self.encode``.
        """
        embeddings = self.encode(corpus, *args, **kwargs)
        # Debug output: shape of the raw embeddings before reduction.
        print(embeddings.shape)
        return self.reduction.transform(embeddings)


# Constructor keyword arguments to use for each reduction-type name.
# 'x' (no reduction) takes none; the t-SNE entry is kept commented out.
reduction_kwargs_choices = dict(
    x={},
    pca=dict(n_components=128),
    # tsne=dict(perplexity=30),
)


# Directory that receives the downloaded and unzipped BEIR datasets.
out_dir = "./beir"
# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of testing os.path.exists() before calling os.mkdir().
os.makedirs(out_dir, exist_ok=True)
#### /print debug information to stdout

#### Datasets to benchmark; each archive is downloaded and unzipped below
dataset_list = ['scifact', ]

# Collects one single-column DataFrame per (model, reduction) run.
df_list = []
for dataset in dataset_list:

    # dataset = "scidocs"
    url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
    data_path = util.download_and_unzip(url, out_dir)

    #### Load the test split from the folder the archive was unpacked into
    corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

    for model_name in model_name_list:
        for reduction_type in ['x', 'pca']:
            # Column label is "<model>+<reduction>", e.g. "all-mpnet-base-v2+pca".
            # NOTE(review): the label omits the dataset name, so adding a second
            # dataset would presumably yield duplicate column names — confirm
            # before extending dataset_list.
            results_dict = {f"{model_name}+{reduction_type}": {}}

            # Wrap the (optionally reducing) encoder in the dense exact-search retriever.
            model = DRES(
                ST_wrapper(model_name, reduction_type, reduction_kwargs_choices[reduction_type]),
                batch_size=128,
            )

            retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
            results = retriever.retrieve(corpus, queries)
            ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

            # Flatten the four metric dicts into the single column for this run.
            for result in (ndcg, _map, recall, precision):
                results_dict[f"{model_name}+{reduction_type}"].update(result)

            df = pd.DataFrame(results_dict)
            df_list.append(df)

  0%|          | 0/5183 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

torch.Size([5183, 384])


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

torch.Size([5183, 384])


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

torch.Size([5183, 768])


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

torch.Size([5183, 768])


In [12]:
# Place every "<model>+<reduction>" column side by side, aligned on the metric index.
all_df = pd.concat(df_list, axis=1)
# all_df.to_csv(f"./RE-results.csv")
# all_df
# The loop variable is `model_name`, not `model`: `model` is the retriever object
# bound by the evaluation cell, and reusing that name here would silently replace
# it with a plain string for any cell run afterwards.
for model_name in model_name_list:
    col_pca = f"{model_name}+pca"
    col_x = f"{model_name}+x"
    # Relative change of the PCA run versus the unreduced run, in percent (1 decimal).
    all_df[f"{model_name} % change"] = ((all_df[col_pca] - all_df[col_x]) / all_df[col_x] * 100).round(1)
# Display with columns in reverse alphabetical order, which groups each model's
# "+x", "+pca" and "% change" columns together.
all_df[sorted(all_df.columns, reverse=True)]

Unnamed: 0,all-mpnet-base-v2+x,all-mpnet-base-v2+pca,all-mpnet-base-v2 % change,all-MiniLM-L12-v1+x,all-MiniLM-L12-v1+pca,all-MiniLM-L12-v1 % change
MAP@1,0.47789,0.45872,-4.0,0.48139,0.45206,-6.1
MAP@10,0.58434,0.56075,-4.0,0.57537,0.54391,-5.5
MAP@100,0.59424,0.57104,-3.9,0.58381,0.55368,-5.2
MAP@1000,0.59454,0.57132,-3.9,0.58409,0.55398,-5.2
MAP@3,0.55327,0.53157,-3.9,0.54461,0.51169,-6.0
MAP@5,0.57367,0.54806,-4.5,0.56156,0.52871,-5.8
NDCG@1,0.49667,0.48,-3.4,0.49667,0.47,-5.4
NDCG@10,0.63309,0.61067,-3.5,0.6217,0.59294,-4.6
NDCG@100,0.67379,0.65479,-2.8,0.6599,0.63539,-3.7
NDCG@1000,0.68032,0.66179,-2.7,0.66767,0.64449,-3.5
