In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity

In [2]:
# Load the LaBSE sentence encoder. No device is hard-coded: sentence-transformers
# then selects a GPU when one is available and otherwise falls back to CPU, so
# the notebook also runs on machines without CUDA.
# NOTE(review): the model is loaded even when the cached pickle below exists;
# it is kept at top level in case later cells use `embedder`.
embedder = SentenceTransformer('sentence-transformers/LaBSE')
original_filename = "../../Data/pipeline_df_with_features.pickle"
updated_filename = "../../Data/pipeline_df_with_features_updated.pickle"

if os.path.exists(updated_filename):
    # Cache hit: reuse the frame that already carries the recomputed columns.
    df = pd.read_pickle(updated_filename)
else:
    df = pd.read_pickle(original_filename)
    batch_size = 512

    # encode() runs on the device the model was loaded onto, so no per-call
    # device argument is needed.
    source_embeddings = embedder.encode(
        df['fr'].tolist(),
        convert_to_tensor=True,
        batch_size=batch_size,
    )
    translated_embeddings = embedder.encode(
        df['en'].tolist(),
        convert_to_tensor=True,
        batch_size=batch_size,
    )

    # Row-wise cosine similarity (default dim=1): the i-th 'fr' embedding is
    # compared with the i-th 'en' embedding.
    similarities = cosine_similarity(source_embeddings, translated_embeddings)
    df['similarity_update'] = similarities.cpu().numpy()
    # Positive values mean the recomputed similarity is higher than the stored one.
    df['similarity_change'] = df["similarity_update"] - df["similarity"]

    # Persist the result so later runs take the cached branch above.
    df.to_pickle(updated_filename)

In [5]:
# Quantiles equal to the standard normal CDF at -3, -2, -1, 0, +1, +2 and +3
# sigma. The median (0.5) is listed explicitly because `rows` selects '50%';
# this avoids relying on describe() adding the median implicitly.
percentiles = [0.00135, 0.02275, 0.1587, 0.5, 0.8413, 0.97725, 0.99865]
# Index labels as describe() renders the percentiles above (one decimal place).
rows = ['0.1%', '2.3%', '15.9%', '50%', '84.1%', '97.7%', '99.9%']
columns = ["similarity", "similarity_update"]

# Side-by-side distribution of the stored and the recomputed similarity.
df[columns].describe(percentiles).loc[rows].round(3)

Unnamed: 0,similarity,similarity_update
0.1%,0.701,0.395
2.3%,0.71,0.649
15.9%,0.758,0.796
50%,0.847,0.871
84.1%,0.922,0.917
97.7%,0.963,0.951
99.9%,0.983,0.976


In [None]:
# TODO: it looks like the text needs to be cleaned before we match it.

# Distribution of the per-row change (similarity_update - similarity), reported
# at the percentiles and row labels defined in the previous cell.
change_stats = df[['similarity_change']].describe(percentiles=percentiles)
change_stats.loc[rows].round(3)

Unnamed: 0,similarity_change
0.1%,-0.361
2.3%,-0.152
15.9%,-0.057
50%,0.01
84.1%,0.09
97.7%,0.166
99.9%,0.222
