In [1]:
# !pip install -U sentence-transformers

In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  
import numpy as np
import os
import warnings
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import models
from transformers import (
    get_cosine_schedule_with_warmup,
    AutoTokenizer,
    AutoModel,
    DataCollatorWithPadding
)

from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score


warnings.simplefilter(action='ignore', category=FutureWarning)


**Embedding normalization for computing similarity**

In [None]:
def normalization(embeds):
  """Scale every row of `embeds` to unit L2 norm.

  Args:
    embeds: 2-D array-like of embeddings, one embedding per row
      (the norm is taken along axis 1).

  Returns:
    `embeds` divided row-wise by its L2 norms. All-zero rows are returned
    unchanged (as zeros) instead of producing NaN from a 0/0 division.
  """
  norms = np.linalg.norm(embeds, 2, axis=1, keepdims=True)
  # Patch zero norms in place (keeps the dtype of `norms`): dividing a zero
  # vector by 1 leaves it as zeros rather than NaN.
  norms[norms == 0] = 1
  return embeds/norms

**CLS pooling for transformer models**

In [None]:
def cls_pooling(model_output, attention_mask):
    """Pool a sequence of token vectors by keeping only the first position.

    Args:
        model_output: indexable model output whose element 0 holds the
            per-token vectors (presumably the last hidden state laid out as
            (batch, sequence, hidden) -- confirm against the model used).
        attention_mask: accepted for signature compatibility with other
            pooling functions; not used here.

    Returns:
        The slice `[:, 0]` of the first output element, i.e. the vector at
        position 0 for every item in the batch.
    """
    token_vectors = model_output[0]
    first_position = token_vectors[:, 0]
    return first_position

**BUCC Dataset import & analysis**

In [None]:
# Load the "default" configuration of the mteb/bucc-bitext-mining dataset via `datasets.load_dataset`.
bucc_ds = load_dataset("mteb/bucc-bitext-mining", "default")

In [None]:
# Display the loaded dataset object as the cell output.
bucc_ds

In [None]:
# Copy the test split into a pandas DataFrame, one column per dataset feature.
test_split = bucc_ds['test']
bucc_df = pd.DataFrame()
for feature_name in test_split.features:
    bucc_df[feature_name] = test_split[feature_name]

In [None]:
# Number of rows copied from the test split.
len(bucc_df)

**Check for empty columns**

In [None]:
# For every column, print the distinct values of its null mask:
# [False] means the column has no missing entries.
for column_name in bucc_df.columns:
    null_mask = bucc_df[column_name].isnull()
    print(null_mask.unique())

In [None]:
# Number of rows per distinct value of the `lang` column.
bucc_df['lang'].value_counts()

**Save used models**

In [None]:
# Labels of the evaluated models; the evaluation loop matches on these exact strings.
# NOTE(review): this rebinds the name `models`, which the import cell also binds via
# `from torchvision import models` -- the torchvision module is unreachable after this line.
models = [ 'info/base', 'infoxlm/large']

**Import LaBSE, since it is not on Hugging Face**

In [None]:
# labse_preprocessor = hub.KerasLayer(
#     "https://kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/cmlm-multilingual-preprocess/2")
# labse_encoder = hub.KerasLayer("https://www.kaggle.com/models/google/labse/TensorFlow2/labse/2")

**Import models from huggingface**

In [None]:

# Tokenizer and encoder for the 'KnutJaegersberg/infoxlm-large-sentence-embeddings' checkpoint.
xlml_tknzr = AutoTokenizer.from_pretrained('KnutJaegersberg/infoxlm-large-sentence-embeddings')
xlml_mdl = AutoModel.from_pretrained('KnutJaegersberg/infoxlm-large-sentence-embeddings')

# Tokenizer and encoder for the 'microsoft/infoxlm-base' checkpoint.
xlmb_tknzr = AutoTokenizer.from_pretrained('microsoft/infoxlm-base')
xlmb_mdl = AutoModel.from_pretrained('microsoft/infoxlm-base')

In [None]:
# Empty results table: a 'model' column followed by one column per distinct `lang` value.
a = ['model'] 
b =bucc_df['lang'].unique()
cols = [*a, *b]
res_df = pd.DataFrame(columns=cols)

In [None]:
# Distinct values of the `lang` column, in order of first appearance; iterated by the evaluation loop.
langs = bucc_df['lang'].unique()

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"


def _encode_cls(model, tokenizer, sentences, batch_size):
    """Return the CLS-pooled embeddings of `sentences` as one CPU tensor, encoded in mini-batches."""
    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # truncation=True keeps over-long inputs within the tokenizer's maximum length.
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            output = model(**encoded)
        # Move each batch to the CPU right away so device memory holds one batch at a time.
        pooled_batches.append(cls_pooling(output, encoded['attention_mask']).cpu())
    return torch.cat(pooled_batches, dim=0)


def model_embd(model,tokenizer,new_snts,eng_snts,batch_size=64):
    """Score a model on sentence retrieval between two aligned sentence lists.

    Each sentence in `new_snts` is matched to the `eng_snts` sentence whose
    CLS embedding has the highest cosine similarity; a match is correct when
    it lands on the sentence at the same list position.

    Args:
        model: encoder called as `model(**encoded)`; moved to `device`.
        tokenizer: callable tokenizer matching `model`.
        new_snts: list of query sentences.
        eng_snts: list of candidate sentences, aligned index-by-index with
            `new_snts`.
        batch_size: number of sentences encoded per forward pass. Encoding
            everything in a single pass can exhaust memory for large inputs.

    Returns:
        Retrieval accuracy rounded to two decimals.
    """
    # Gold alignment: query i is paired with candidate i.
    real = list(range(len(eng_snts)))
    model = model.to(device)

    new_embd = _encode_cls(model, tokenizer, new_snts, batch_size)
    eng_embd = _encode_cls(model, tokenizer, eng_snts, batch_size)

    sim_mtx = cosine_similarity(new_embd.numpy(), eng_embd.numpy())

    # Row-wise argmax: index of the best-matching candidate for every query.
    retrieved_indices = np.argmax(sim_mtx, axis=1)
    return round(accuracy_score(real,retrieved_indices),2)

In [None]:
def labs_embd(new_snts,eng_snts):
    """Score LaBSE on sentence retrieval between two aligned sentence lists.

    Prepared for the LaBSE experiments. Requires `labse_preprocessor` and
    `labse_encoder` to be defined; the cell that loads them is currently
    commented out.

    Args:
        new_snts: list of query sentences.
        eng_snts: list of candidate sentences, aligned index-by-index with
            `new_snts`.

    Returns:
        Retrieval accuracy rounded to two decimals.
    """
    # Gold alignment: query i is paired with candidate i. Built before the
    # lists are converted to tensors; it was previously undefined (NameError).
    real = list(range(len(eng_snts)))

    new_snts = tf.constant(new_snts)
    eng_snts = tf.constant(eng_snts)

    new_embeds = normalization(labse_encoder(labse_preprocessor(new_snts))['default'])
    print("embs 1 done")

    eng_embeds = normalization(labse_encoder(labse_preprocessor(eng_snts))["default"])
    print("embs 2 done")

    # np.asarray accepts TensorFlow eager tensors as well as NumPy arrays;
    # TensorFlow tensors have no `.cpu()` method, so the old call failed.
    similarity_matrix = cosine_similarity(np.asarray(new_embeds), np.asarray(eng_embeds))
    retrieved_indices = np.argmax(similarity_matrix, axis=1)

    return round(accuracy_score(real,retrieved_indices),2)

In [None]:
size = 3000  # maximum number of sentence pairs evaluated per language pair

# Encoder/tokenizer pair behind every label that may appear in `models`.
# LaBSE is not listed: it would be scored with labs_embd(new_snts, eng_snts)
# once its TF-Hub layers are loaded.
encoders = {
    'info/base': (xlmb_mdl, xlmb_tknzr),
    'infoxlm/large': (xlml_mdl, xlml_tknzr),
}

for model_name in models:
    # Fail early on an unknown label instead of building a too-short row that
    # only errors later, at the DataFrame assignment.
    if model_name not in encoders:
        raise KeyError(f"No encoder registered for model {model_name!r}")
    encoder, encoder_tknzr = encoders[model_name]

    # One result row: the model label followed by one accuracy per language pair.
    acc = [model_name]
    for lang in langs:
        lang_df = bucc_df[bucc_df['lang'] == lang]
        # head(size) already returns every row when the frame is shorter than `size`.
        new_snts = list(lang_df['sentence1'].head(size))
        eng_snts = list(lang_df['sentence2'].head(size))
        acc.append(model_embd(encoder, encoder_tknzr, new_snts, eng_snts))
    print(acc)
    res_df.loc[len(res_df)] = acc
    
        

In [None]:
# Use the model label as the row index. Guarded so that re-running this cell
# is a no-op instead of a KeyError once 'model' is no longer a column.
if 'model' in res_df.columns:
    res_df = res_df.set_index('model')

In [None]:
def to_short(x):
    """Format a number as a string with exactly two digits after the decimal point."""
    return format(x, ".2f")

In [None]:
# Render the table before opening the output file, so a formatting error
# cannot leave behind an empty, truncated file.
latex_table = res_df.style.format({
    "de-en": to_short,
    "fr-en": to_short,
    "ru-en": to_short,
    "zh-en":to_short
    }).to_latex()

# Swap the generated column spec for a centred one with vertical rules.
latex_table = latex_table.replace('{lrrrr}', '{l||c|c|c|c}')
# Replace the header row that only carries the index name with a horizontal rule.
latex_table = latex_table.replace('model &  &  &  &  \\\\', '\\hline')

# The `with` block closes the file on exit; no explicit close() is needed.
with open('file.txt','w') as f:
    f.write(latex_table)