<a href="https://colab.research.google.com/github/LorenFiorini/ideal-octo-spork/blob/main/cross_en_de_sentence_xformer_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install packages
!pip install sentence-transformers torch pandas

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [2]:
# get German stsb dataset
# Clones the repository into ./german-STSbenchmark; the German test split is
# later read from its data/deepl/ directory.
!git clone https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark.git

Cloning into 'german-STSbenchmark'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 37 (delta 13), reused 27 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (37/37), 776.06 KiB | 9.58 MiB/s, done.
Resolving deltas: 100% (13/13), done.


In [3]:
# get English stsb dataset
# Downloads the gzipped TSV into the working directory as stsbenchmark.tsv.gz;
# it is opened with gzip when the English test samples are loaded.
!wget https://sbert.net/datasets/stsbenchmark.tsv.gz

--2024-10-11 12:55:02--  https://sbert.net/datasets/stsbenchmark.tsv.gz
Resolving sbert.net (sbert.net)... 188.114.97.0, 188.114.96.0, 2a06:98c1:3121::, ...
Connecting to sbert.net (sbert.net)|188.114.97.0|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/stsbenchmark.tsv.gz [following]
--2024-10-11 12:55:02--  https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/stsbenchmark.tsv.gz
Resolving public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)... 130.83.167.186
Connecting to public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)|130.83.167.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 392336 (383K) [application/octet-stream]
Saving to: ‘stsbenchmark.tsv.gz’


2024-10-11 12:55:03 (9.97 MB/s) - ‘stsbenchmark.tsv.gz’ saved [392336/392336]



In [4]:
# imports
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import pandas as pd
import csv
import gzip
import math

  from tqdm.autonotebook import tqdm, trange


In [6]:
# load German data to de_test_samples

def create_dataset(df):
    """Convert an STS dataframe into a list of ``InputExample`` objects.

    Args:
        df: DataFrame with the columns ``s1`` and ``s2`` (sentence strings)
            and ``label`` (numeric similarity score that is divided by 5).

    Returns:
        A list with one ``InputExample`` per row, holding the
        whitespace-stripped sentence pair and the label rescaled to 0 ... 1.

    Raises:
        AssertionError: if a sentence is not a string, is empty after
            stripping, or the rescaled score is not a float within [0, 1].
    """
    s1 = df['s1'].tolist()
    s2 = df['s2'].tolist()
    label = df['label'].tolist()

    samples = []
    for _s1, _s2, _label in zip(s1, s2, label):
        score = _label / 5.0

        # Check the types first so a missing value (e.g. NaN) fails with an
        # assertion instead of an AttributeError on .strip().
        assert isinstance(_s1, str)
        assert isinstance(_s2, str)

        # str.strip() returns a new string, so the result has to be kept;
        # calling it without assignment leaves the sentence unchanged.
        _s1 = _s1.strip()
        _s2 = _s2.strip()

        assert len(_s1) > 0
        assert len(_s2) > 0
        assert isinstance(score, float)
        assert 0.0 <= score <= 1.0
        samples.append(InputExample(texts=[_s1, _s2], label=score))
    return samples

# Read the tab-separated German test split with explicit column names and
# quote characters treated as ordinary text, then build the InputExample list.
de_test_samples = create_dataset(
    pd.read_csv(
        './german-STSbenchmark/data/deepl/stsb_de_test.csv',
        sep='\t',
        quoting=csv.QUOTE_NONE,
        names=['label', 's1', 's2'],
    )
)
# Sanity check on the number of loaded pairs.
assert len(de_test_samples) == 1379

In [None]:
# Preview only the first few samples instead of rendering the whole list.
de_test_samples[:5]

In [8]:
# load English data to en_test_samples

en_test_samples = []
with gzip.open('./stsbenchmark.tsv.gz', mode='rt', encoding='utf8') as tsv_file:
    for record in csv.DictReader(tsv_file, delimiter='\t', quoting=csv.QUOTE_NONE):
        # Normalize score to range 0 ... 1
        normalized = float(record['score']) / 5.0
        example = InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=normalized,
        )
        # Rows belonging to other splits are parsed but not kept.
        if record['split'] != 'test':
            continue
        en_test_samples.append(example)

# Sanity check on the number of loaded pairs.
assert len(en_test_samples) == 1379

In [9]:
# create cross dataset
# sentence 1 English, sentence 2 German
# and
# sentence 1 German, sentence 2 English

all_cross_data = []

# The German and English test sets are paired position by position.
for de_example, en_example in zip(de_test_samples, en_test_samples):
    gold_de = de_example.label
    gold_en = en_example.label

    # Both rows must carry the same gold score, otherwise they are not aligned.
    assert math.isclose(gold_de, gold_en)

    all_cross_data.extend([
        # sentence 1 German, sentence 2 English
        InputExample(texts=[de_example.texts[0], en_example.texts[1]], label=gold_en),
        # sentence 1 English, sentence 2 German
        InputExample(texts=[en_example.texts[0], de_example.texts[1]], label=gold_en),
    ])

# Two mixed-language pairs per aligned row.
assert len(all_cross_data) == 1379 * 2

In [10]:
# test the models on the English, German and cross-lingual (English/German) STSb datasets

# de 0.5512762459373083 en 0.8180937799030599 cross 0.3422287583269103
#model = SentenceTransformer('distilroberta-base-paraphrase-v1')

# de 0.5018193907428476 en 0.754829679916898 cross 0.2836510709128864
#model = SentenceTransformer('distilroberta-base-msmarco-v1')

# de 0.7420090376116804 en 0.7866227257286015 cross 0.7215647724328543
#model = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

# de 0.7707814937419973 en 0.8061608176978062 cross 0.7451615698379142
#model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# both are the same models
# de 0.7467069304821321 en 0.8075479306663303 cross 0.7520566614202625
#model = SentenceTransformer('distiluse-base-multilingual-cased')
#model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# other xlm models (both are the same models)
# de 0.7877384090793449 en 0.8465088382627035 cross 0.7908134754906919
#model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')
#model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

# de 0.6370540131720054 en 0.863878348620906 cross 0.4109486147725896
#model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

# the xlm model we build on
# de 0.8078971446757308 en 0.8349525108990091 cross 0.798345566174531
#model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

# de 0.5324066326997688 en 0.8699420144792896 cross 0.30267258818332154
#model = SentenceTransformer('paraphrase-mpnet-base-v2')

# our new German sentence embedding:
# see https://huggingface.co/T-Systems-onsite/german-roberta-sentence-transformer-v2
# de 0.8529009675124531 en 0.8633503778062248 cross 0.8414830531625959
#model = SentenceTransformer('T-Systems-onsite/german-roberta-sentence-transformer-v2')

# our best model trained with multilingual finetuning with language-crossing for English and German
# see https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer
# de 0.8549768717756436 en 0.8660333530928567 cross 0.8525445612883897
#model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

#model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
# de 0.835493733814926 en 0.8682218476677823 cross 0.8308535528202963


#model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# de 0.7885019509536335 en 0.8441678852971709 cross 0.782272202630328

#model = SentenceTransformer('gtr-t5-base')
# de 0.705372106282006 en 0.7957576958343425 cross 0.5210933687005789

#model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
# de 0.7467069304821321 en 0.8075470645543675 cross 0.7520566478198119

# de 0.835493733814926 en 0.8682219599128399 cross 0.8308535669697383
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# de 0.7622132514106488 en 0.7852445147258174 cross 0.7427459028377776
#model = SentenceTransformer('sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned')

# German-only sentence pairs.
de_test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    de_test_samples,
    name='sts-test-de',
    main_similarity=SimilarityFunction.COSINE
)
result_de = de_test_evaluator(model)

# English-only sentence pairs.
en_test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    en_test_samples,
    name='sts-test-en',
    main_similarity=SimilarityFunction.COSINE
)
result_en = en_test_evaluator(model)

# Mixed-language sentence pairs. The evaluator name prefixes every metric key
# in the result, so this run gets its own name instead of reusing
# 'sts-test-en', which would make it indistinguishable from the English run.
cross_en_de_test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    all_cross_data,
    name='sts-test-cross-en-de',
    main_similarity=SimilarityFunction.COSINE
)
result_cross_en_de = cross_en_de_test_evaluator(model)

print('######################################################')
print(f'# de {result_de} en {result_en} cross {result_cross_en_de}')
print('######################################################')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

######################################################
# de {'sts-test-de_pearson_cosine': 0.8193534998904709, 'sts-test-de_spearman_cosine': 0.835493733814926, 'sts-test-de_pearson_manhattan': 0.8247196446577835, 'sts-test-de_spearman_manhattan': 0.8250699906185207, 'sts-test-de_pearson_euclidean': 0.8253465261275925, 'sts-test-de_spearman_euclidean': 0.8263680507330718, 'sts-test-de_pearson_dot': 0.7573571814580122, 'sts-test-de_spearman_dot': 0.7400984818898023, 'sts-test-de_pearson_max': 0.8253465261275925, 'sts-test-de_spearman_max': 0.835493733814926} en {'sts-test-en_pearson_cosine': 0.8543448194038268, 'sts-test-en_spearman_cosine': 0.8682219047835629, 'sts-test-en_pearson_manhattan': 0.8634667345554121, 'sts-test-en_spearman_manhattan': 0.8613152809921908, 'sts-test-en_pearson_euclidean': 0.8647595877263146, 'sts-test-en_spearman_euclidean': 0.8630528917520772, 'sts-test-en_pearson_dot': 0.8271525796883319, 'sts-test-en_spearman_dot': 0.8226733139098199, 'sts-test-en_pearson_m