# In this notebook we benchmark selected libraries for generating long-text embeddings

In [1]:
#install important libraries
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [2]:
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  
import numpy as np
import os
import warnings
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data  import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import models
from transformers import (
    get_cosine_schedule_with_warmup,
    AutoTokenizer,
    AutoModel,
    DataCollatorWithPadding
)

from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score
from datasets import load_dataset

from torch.utils.data import DataLoader, Dataset


warnings.simplefilter(action='ignore', category=FutureWarning)


**Embedding normalization for computing similarity**

In [3]:
def normalization(embeds):
  '''
  Scales each row of a batch of sentence embeddings to unit L2 length.

  embeds
  ------
      batch of embeddings with one embedding per row (norms are taken along axis 1)

  Returns
  -------
      embeds divided row-wise by its L2 norms; a row of zeros is divided by zero
      and is not handled specially
  '''
  row_lengths = np.linalg.norm(embeds, ord=2, axis=1, keepdims=True)
  unit_rows = embeds / row_lengths
  return unit_rows

**CLS pooling for transformer models**

In [4]:
def cls_pooling(model_output):
    '''
    Picks the vector at sequence position 0 for every item in the batch.

    model_output
    ------------
        indexable model output whose element 0 is indexed as [batch, position, ...]
        (presumably the last hidden state of a transformer - confirm against the model used)

    Returns
    -------
        the slice at position 0 along the second axis of model_output[0]
    '''
    token_states = model_output[0]
    first_position = token_states[:, 0]
    return first_position

**Shortening decimal places for easily-readable output**

In [5]:
def to_short(x):
    '''
    Formats a number as a string with exactly two digits after the decimal point.
    '''
    return format(x, ".2f")

# Import benchmark datasets and perform data analysis

**BUCC Dataset import & analysis**

In [6]:
# Load the "default" configuration of the BUCC bitext-mining benchmark via `datasets`.
bucc_ds = load_dataset("mteb/bucc-bitext-mining", "default")

README.md:   0%|          | 0.00/5.38k [00:00<?, ?B/s]

de-en.jsonl.gz:   0%|          | 0.00/834k [00:00<?, ?B/s]

fr-en.jsonl.gz:   0%|          | 0.00/713k [00:00<?, ?B/s]

ru-en.jsonl.gz:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

zh-en.jsonl.gz:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/35000 [00:00<?, ? examples/s]

In [7]:
# Display the available splits with their features and row counts.
bucc_ds

DatasetDict({
    test: Dataset({
        features: ['sentence1', 'sentence2', 'lang'],
        num_rows: 35000
    })
})

**Create a pandas DataFrame out of dataset for easier manipulation**

In [8]:
# Convert the single 'test' split to a pandas DataFrame in one call instead of
# copying it column by column into an initially empty frame.
bucc_df = bucc_ds['test'].to_pandas()

In [9]:
# Number of sentence pairs (rows) in the frame.
len(bucc_df)

35000

**Check each column for missing values**

In [10]:
# For every column print the distinct values of its null mask;
# a lone [False] means the column has no missing entries.
for _, column in bucc_df.items():
    null_mask = column.isnull()
    print(null_mask.unique())

[False]
[False]
[False]


**The dataset contains 4 different language pairs (see the counts below), all of which we are going to use for the experiments**

In [11]:
# Number of sentence pairs per language pair.
bucc_df['lang'].value_counts()

lang
ru-en    14435
de-en     9580
fr-en     9086
zh-en     1899
Name: count, dtype: int64

# Performing benchmarks on BUCC dataset

**Models for generating embeddings**

In [12]:
# Identifiers of the benchmarked models; perform_experiments dispatches on these exact strings.
# NOTE(review): this rebinds `models`, shadowing `from torchvision import models` from the import cell.
models = ['microsoft/infoxlm-base', 
          'LaBSE',
          'mUSE']

**Load the LaBSE preprocessor and encoder from TensorFlow Hub**

In [13]:
# Text preprocessing layer whose output is fed to the LaBSE encoder (see labs_embd).
labse_preprocessor = hub.KerasLayer(
    "https://kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/cmlm-multilingual-preprocess/2")
# LaBSE encoder layer; labs_embd reads the 'default' entry of its output as the sentence embedding.
labse_encoder = hub.KerasLayer("https://www.kaggle.com/models/google/labse/TensorFlow2/labse/2")

**Load mUSE from TensorFlow Hub and InfoXLM from Hugging Face**

In [14]:
# Load the multilingual-large Universal Sentence Encoder; mUSE_embd calls the returned
# object directly on string tensors.
mUSE = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/multilingual-large/2")

In [15]:
# Tokenizer and encoder for InfoXLM (base); both are loaded from the same checkpoint name
# and are passed together to model_embd.
xlmb_tknzr = AutoTokenizer.from_pretrained('microsoft/infoxlm-base')
xlmb_mdl = AutoModel.from_pretrained('microsoft/infoxlm-base')

config.json:   0%|          | 0.00/512 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/942M [00:00<?, ?B/s]

In [16]:
def prep_res_df(input_df:pd.DataFrame)->pd.DataFrame:
    '''
    Builds the empty frame that will collect the experiment results.

    input_df
    --------
        pd.DataFrame with a 'lang' column; its distinct values (in order of first
        appearance) become the result columns

    Returns
    -------
        pd.DataFrame with no rows and the columns 'model' followed by one column per language.
    '''
    lang_cols = input_df['lang'].unique()
    return pd.DataFrame(columns=['model', *lang_cols])

In [17]:
# Language pairs present in BUCC, in order of first appearance. Converted to a plain list
# to match the `langs:list` parameter of perform_experiments and the Tatoeba section.
langs = list(bucc_df['lang'].unique())

**Constants that define the comparison environment**

In [18]:
# Number of sentence pairs that are embedded and scored against each other in one batch.
batch_size = 512

# Run the PyTorch model on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

**Now, we define functions for working with different models** 

In [19]:
def model_embd(model:AutoModel,tokenizer:AutoTokenizer,new_snts:list,eng_snts:list)->float:
    '''
    This function is used for evaluating XLM model.
    It computes embedding of provided sentences and compare them with English counterparts.
    Then computes average accuracy based on cosine similarity over batches.

    Within a batch, the sentence at position k of new_snts counts as correctly retrieved
    when its most similar English sentence (by cosine similarity) is at position k.
    Reads the module-level `batch_size` and `device` at call time.

    model
    -----
        transformer encoder; it is moved to `device` and its output is pooled with cls_pooling
    tokenizer
    ---------
        tokenizer matching the model
    new_snts
    --------
        list of sentences in one of the supported languages
    eng_snts
    --------
        list of sentences in english to be compared with, aligned index-by-index with new_snts
    Returns
    -------
        float, average accuracy among all batches (each batch accuracy is rounded to two
        decimals before averaging); NaN when no sentences are given
    '''
    # Moving the model does not depend on the batch, so do it once up front.
    model = model.to(device)

    acc = []
    for start in range(0, len(new_snts), batch_size):
        new_snts_b = new_snts[start:start + batch_size]
        eng_snts_b = eng_snts[start:start + batch_size]

        # Ground truth: row k of the similarity matrix should pick column k.
        real = list(range(len(eng_snts_b)))

        # truncation=True caps inputs at the tokenizer's maximum length so that
        # over-long sentences cannot exceed what the encoder accepts.
        new_encoded = tokenizer(new_snts_b, padding=True, truncation=True, return_tensors='pt').to(device)
        eng_encoded = tokenizer(eng_snts_b, padding=True, truncation=True, return_tensors='pt').to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            ot_new = model(**new_encoded)
            ot_eng = model(**eng_encoded)

        new_embd = cls_pooling(ot_new).cpu()
        eng_embd = cls_pooling(ot_eng).cpu()

        sim_mtx = cosine_similarity(new_embd.numpy(), eng_embd.numpy())

        retrieved_indices = np.argmax(sim_mtx, axis=1)
        acc.append(round(accuracy_score(real, retrieved_indices), 2))

    if not acc:
        # No batches were processed; avoid np.mean([]) and its RuntimeWarning.
        return float('nan')
    return np.mean(acc)

In [20]:
def labs_embd(new_snts:list,eng_snts:list)->float:
    '''
    This function is used for evaluating LaBSe model.
    It computes embedding of provided sentences and compare them with English counterparts.
    Then computes average accuracy based on cosine similarity over batches.

    Within a batch, the sentence at position k of new_snts counts as correctly retrieved
    when its most similar English sentence (by cosine similarity) is at position k.
    Uses the module-level `labse_preprocessor`, `labse_encoder`, `normalization`
    and `batch_size`.

    new_snts
    --------
        list of sentences in one of the supported languages
    eng_snts
    --------
        list of sentences in english to be compared with, aligned index-by-index with new_snts
    Returns
    -------
        float, average accuracy among all batches (each batch accuracy is rounded to two
        decimals before averaging); NaN when no sentences are given

    '''
    acc = []
    for start in range(0, len(new_snts), batch_size):
        new_snts_b = tf.constant(new_snts[start:start + batch_size])
        eng_snts_b = tf.constant(eng_snts[start:start + batch_size])

        # Ground truth: row k of the similarity matrix should pick column k.
        real = list(range(len(eng_snts_b)))

        new_embeds = normalization(labse_encoder(labse_preprocessor(new_snts_b))['default'])
        eng_embeds = normalization(labse_encoder(labse_preprocessor(eng_snts_b))['default'])

        # Both sides are converted the same way; .numpy() already yields a host array,
        # so the extra .cpu() hop on one side is not needed.
        similarity_matrix = cosine_similarity(new_embeds.numpy(), eng_embeds.numpy())
        retrieved_indices = np.argmax(similarity_matrix, axis=1)
        acc.append(round(accuracy_score(real, retrieved_indices), 2))

    if not acc:
        # No batches were processed; avoid np.mean([]) and its RuntimeWarning.
        return float('nan')
    return np.mean(acc)

In [21]:
def mUSE_embd(new_snts:list,eng_snts:list)->float:
    '''
    This function is used for evaluating mUSE model.
    It computes embedding of provided sentences and compare them with English counterparts.
    Then computes average accuracy based on cosine similarity over batches.

    Within a batch, the sentence at position k of new_snts counts as correctly retrieved
    when its most similar English sentence (by cosine similarity) is at position k.
    Uses the module-level `mUSE` model and `batch_size`.

    new_snts
    --------
        list of sentences in one of the supported languages
    eng_snts
    --------
        list of sentences in english to be compared with, aligned index-by-index with new_snts
    Returns
    -------
        float, average accuracy among all batches (each batch accuracy is rounded to two
        decimals before averaging); NaN when no sentences are given

    '''
    acc = []
    for start in range(0, len(new_snts), batch_size):
        new_snts_b = tf.constant(new_snts[start:start + batch_size])
        eng_snts_b = tf.constant(eng_snts[start:start + batch_size])

        # Ground truth: row k of the similarity matrix should pick column k.
        real = list(range(len(eng_snts_b)))

        new_embeds = mUSE(new_snts_b)
        eng_embeds = mUSE(eng_snts_b)

        # Both sides are converted the same way; .numpy() already yields a host array,
        # so the extra .cpu() hop on one side is not needed.
        similarity_matrix = cosine_similarity(new_embeds.numpy(), eng_embeds.numpy())
        retrieved_indices = np.argmax(similarity_matrix, axis=1)
        acc.append(round(accuracy_score(real, retrieved_indices), 2))

    if not acc:
        # No batches were processed; avoid np.mean([]) and its RuntimeWarning.
        return float('nan')
    return np.mean(acc)

In [22]:
def perform_experiments(models:list, in_df:pd.DataFrame, langs:list)->pd.DataFrame:
    '''
    Performs experiment with given dataset on all provided models and languages.
    Computes average accuracy for all of the languages and saves it to a dataframe.

    models
    ------
        list of model identifiers that are used for benchmarks; supported values are
        'LaBSE', 'microsoft/infoxlm-base' and 'mUSE'
    in_df
    -----
        pandas.DataFrame with columns 'sentence1', 'sentence2' and 'lang' that stores
        texts in all provided languages
    langs
    -----
        list of languages that are used in benchmarks; the result columns come from
        in_df['lang'] (via prep_res_df), so langs must match those values in number and order

    Returns
    -------
        pandas.DataFrame of results, indexed by model, one column per language

    Raises
    ------
        ValueError if models contains an unsupported identifier (checked before any
        embedding is computed)
    '''
    # Lambdas defer the lookup of the evaluation functions and the loaded InfoXLM
    # objects until the corresponding model is actually run.
    evaluators = {
        'LaBSE': lambda new, eng: labs_embd(new, eng),
        'microsoft/infoxlm-base': lambda new, eng: model_embd(xlmb_mdl, xlmb_tknzr, new, eng),
        'mUSE': lambda new, eng: mUSE_embd(new, eng),
    }
    unknown = [m for m in models if m not in evaluators]
    if unknown:
        # Previously an unknown name only surfaced later as a mismatched-columns error.
        raise ValueError(f"Unsupported model(s): {unknown}; expected one of {list(evaluators)}")

    res_df = prep_res_df(in_df)
    size = 20000  # upper bound on the number of sentence pairs evaluated per language
    for m in models:
        evaluate = evaluators[m]
        acc = [m]  # result row: model name followed by one accuracy per language

        for l in langs:
            new_df = in_df[in_df['lang'] == l]
            # head(size) returns the whole selection when it has fewer than `size` rows.
            new_snts = list(new_df['sentence1'].head(size))
            eng_snts = list(new_df['sentence2'].head(size))
            acc.append(evaluate(new_snts, eng_snts))

        print(acc)
        res_df.loc[len(res_df)] = acc
    res_df = res_df.set_index('model')
    return res_df
        

In [23]:
def save_res(latex_table:str,name:str)->None:
    '''
    Saves the results to latex table
    latex_table
    -----------
        string that defines created latex table
    name
    ----
        string that defines the name of the file; '.txt' is appended to it
    Returns
    -------
    None
    '''
    # The context manager closes the file; an explicit encoding keeps the output
    # independent of the platform's default.
    with open(name + '.txt', 'w', encoding='utf-8') as f:
        f.write(latex_table)

In [24]:
# Evaluate every model on every BUCC language pair; result is one row per model.
res_df = perform_experiments(models, bucc_df,langs)

In [25]:
# Render every language-pair column with two decimals and export the table as LaTeX.
latex_table = res_df.style.format(
    {column: to_short for column in ("de-en", "fr-en", "ru-en", "zh-en")}
).to_latex()
# Swap the generated column spec for centred, ruled columns and replace the
# `model & & & & \\` row with a horizontal rule.
latex_table = (
    latex_table
    .replace('{lrrrr}', '{l||c|c|c|c}')
    .replace('model &  &  &  &  \\\\', '\\hline')
)

In [26]:
# Write the BUCC table to 'bucc.txt' (save_res appends the extension).
save_res(latex_table,'bucc')

**Tatoeba dataset import and analysis**

In [27]:
# Language pairs requested from Tatoeba; every pair contains 'en', which crt_tat_df relies on
# to decide which side is the English one.
tatoeba_pairs = [('en','fr'),('en','ru'),('de','en'),('cs','en')]

In [28]:
def crt_tat_df(tatoeba_pairs)->pd.DataFrame:
    '''
    Builds a dataframe of long Tatoeba sentence pairs for the given language pairs.

    For every pair the 'train' split is loaded and only translations whose combined
    character length (both sides together) exceeds a threshold are kept: 200 characters,
    or 100 when the first language of the pair is 'cs'.

    tatoeba_pairs
    -------------
        iterable of 2-tuples of language codes; each pair is expected to contain 'en'
        (when the first code is not 'en', the second one is taken as the English side)

    Returns
    -------
        pd.DataFrame with columns 'sentence1' (non-English sentence), 'sentence2'
        (English sentence) and 'lang' (code of the non-English language)
    '''
    rows = []
    for p in tatoeba_pairs:
        # NOTE(review): trust_remote_code=True allows the dataset's loading script to run locally.
        dataset = load_dataset("tatoeba", lang1=p[0], lang2=p[1], split='train', trust_remote_code=True)
        size = 200 if p[0] != 'cs' else 100
        e_idx = 0 if p[0] == 'en' else 1
        # Index of the other (non-English) element of the pair. The former `e_idx-1 % 2`
        # parsed as `e_idx - (1 % 2)` and only worked through negative tuple indexing.
        n_idx = 1 - e_idx
        n_eng = p[n_idx]

        for d in dataset['translation']:
            if len(d[p[0]]) + len(d[p[1]]) > size:
                rows.append([d[p[n_idx]], d[p[e_idx]], n_eng])

    # Build the frame once instead of appending row by row with .loc.
    return pd.DataFrame(rows, columns=['sentence1','sentence2','lang'])
            

In [29]:
# Rebinds the global batch size (512 in the BUCC section) for the Tatoeba runs; the
# embedding functions read `batch_size` at call time, so this affects every later call.
batch_size = 256

In [30]:
# Download the Tatoeba pairs and keep only the long sentence pairs (see crt_tat_df).
tat_df = crt_tat_df(tatoeba_pairs)

README.md:   0%|          | 0.00/8.93k [00:00<?, ?B/s]

tatoeba.py:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.83M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/9.88M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/944k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [31]:
# Number of retained sentence pairs per non-English language.
tat_df['lang'].value_counts()

lang
de    5017
fr    3469
ru    2598
cs    2226
Name: count, dtype: int64

In [32]:
# Rebinds `langs` (previously the BUCC language pairs) to the Tatoeba languages,
# in order of first appearance in tat_df.
langs = list(tat_df['lang'].unique())

In [33]:
# Display the language order that will define the result columns.
langs

['fr', 'ru', 'de', 'cs']

**Now we perform experiments with Tatoeba**

In [None]:
# Evaluate every model on every Tatoeba language; this overwrites the BUCC `res_df`.
res_df = perform_experiments(models, tat_df, langs)

In [None]:
# Render every language column with two decimals and export the table as LaTeX.
latex_table = res_df.style.format(
    {column: to_short for column in ("fr", "ru", "de", "cs")}
).to_latex()
# Swap the generated column spec for centred, ruled columns and replace the
# `model & & & & \\` row with a horizontal rule.
latex_table = (
    latex_table
    .replace('{lrrrr}', '{l||c|c|c|c}')
    .replace('model &  &  &  &  \\\\', '\\hline')
)

In [None]:
# Write the Tatoeba table to 'tatoeba.txt' (save_res appends the extension).
save_res(latex_table,'tatoeba')