In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]=""

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
import ast, json, numpy as np
import tensorflow_text

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from time import time
from sklearn.preprocessing import normalize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import bert
from sentence_transformers import SentenceTransformer
from laserembeddings import Laser

In [3]:
def get_cosine_sim(sents_1, sents_2):
    sims = []
    for s1, s2 in zip(sents_1, sents_2):
        cs = cosine_similarity(s1.reshape(1, -1), s2.reshape(1, -1))
        sims.append(cs)
    sims = np.array(sims)
    sims = sims.squeeze(1).squeeze(1)
    print(sims.shape)
    return sims

In [4]:
df = pd.read_csv('2017_multilingual_eval_set.csv')
df.head()

Unnamed: 0,s1,s2,l1,l2,score
0,شخص ما يحمل لوح التزلج ليلا على الرصيف.,رجل جالس بمفرده يقرأ على طاولة مستديرة ، خارج ...,ar,ar,0.8
1,تتسابق النساء في سباق الدايتونا 500.,يتسابق بعض الرجال ضمن مسابقة التزلج.,ar,ar,1.0
2,تمشي النساء جنبا إلى جنب.,هناك فتيات يمشين متجاورات,ar,ar,2.6
3,يقفز الرجل ذو القميص الأخضر عاليا على العشب.,يمشي الرجل ذو القميص الأبيض على العشب الطويل م...,ar,ar,2.2
4,رجلان يجلسان على العشب ومعهما موز.,ثلاثة رجال يتسكعون عند فرشة بيع الفاكهة.,ar,ar,1.4


In [5]:


def get_model(model_url, max_seq_length):
  labse_layer = hub.KerasLayer(model_url, trainable=True)

  # Define input.
  input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                         name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                     name="input_mask")
  segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                      name="segment_ids")

  # LaBSE layer.
  pooled_output,  _ = labse_layer([input_word_ids, input_mask, segment_ids])

  # The embedding is l2 normalized.
  pooled_output = tf.keras.layers.Lambda(
      lambda x: tf.nn.l2_normalize(x, axis=1))(pooled_output)

  # Define model.
  return tf.keras.Model(
        inputs=[input_word_ids, input_mask, segment_ids],
        outputs=pooled_output), labse_layer

max_seq_length = 64
labse_model, labse_layer = get_model(
    model_url="https://tfhub.dev/google/LaBSE/1", max_seq_length=max_seq_length)

INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [6]:

vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

def create_input(input_strings, tokenizer, max_seq_length):

  input_ids_all, input_mask_all, segment_ids_all = [], [], []
  for input_string in input_strings:
    # Tokenize input.
    input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    sequence_length = min(len(input_ids), max_seq_length)

    # Padding or truncation.
    if len(input_ids) >= max_seq_length:
      input_ids = input_ids[:max_seq_length]
    else:
      input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

    input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

    input_ids_all.append(input_ids)
    input_mask_all.append(input_mask)
    segment_ids_all.append([0] * max_seq_length)

  return np.array(input_ids_all), np.array(input_mask_all), np.array(segment_ids_all)

def encode(input_text):
  input_ids, input_mask, segment_ids = create_input(
    input_text, tokenizer, max_seq_length)
  return labse_model([input_ids, input_mask, segment_ids])

In [7]:
t0 = time()
sents_1 = np.asarray(df.s1)
sents_1 = encode(sents_1)
sents_2 = np.asarray(df.s2)
sents_2 = encode(sents_2)
print(sents_1.shape, sents_2.shape)
enc_time = time()-t0
print(f'LaBSE Encoding time: {enc_time:.2f}')

(1750, 768) (1750, 768)
LaBSE Encoding time: 135.79


In [8]:
sents_1, sents_2 = np.array(sents_1), np.array(sents_2)

In [9]:
np.save('labse_sent_1.npy', sents_1)
np.save('labse_sent_2.npy', sents_2)
sims = get_cosine_sim(sents_1, sents_2)

(1750,)


In [10]:
df['labse_cosine_sim'] = sims

In [11]:
df.head()

Unnamed: 0,s1,s2,l1,l2,score,labse_cosine_sim
0,شخص ما يحمل لوح التزلج ليلا على الرصيف.,رجل جالس بمفرده يقرأ على طاولة مستديرة ، خارج ...,ar,ar,0.8,0.47278
1,تتسابق النساء في سباق الدايتونا 500.,يتسابق بعض الرجال ضمن مسابقة التزلج.,ar,ar,1.0,0.436365
2,تمشي النساء جنبا إلى جنب.,هناك فتيات يمشين متجاورات,ar,ar,2.6,0.523607
3,يقفز الرجل ذو القميص الأخضر عاليا على العشب.,يمشي الرجل ذو القميص الأبيض على العشب الطويل م...,ar,ar,2.2,0.735741
4,رجلان يجلسان على العشب ومعهما موز.,ثلاثة رجال يتسكعون عند فرشة بيع الفاكهة.,ar,ar,1.4,0.61904


In [12]:
df.score.corr(df.labse_cosine_sim, method='spearman')

0.755018885185218

In [13]:
t0 = time()
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
sents_1 = embed(df.s1)
sents_2 = embed(df.s2)
print(sents_1.shape, sents_2.shape)
enc_time = time()-t0
print(f'mUSE (tf-hub) encoding time: {enc_time:.4f}')

(1750, 512) (1750, 512)
mUSE (tf-hub) encoding time: 4.1700


In [14]:
sents_1, sents_2 = np.array(sents_1), np.array(sents_2)

In [15]:
np.save('muse_sent_1.npy', sents_1)
np.save('muse_sent_2.npy', sents_2)

In [16]:
sims = get_cosine_sim(sents_1, sents_2)

(1750,)


In [17]:
df['muse_cosine_sim'] = sims

In [18]:
df.score.corr(df.muse_cosine_sim, method='spearman')

0.7859717171575162

In [19]:
model = SentenceTransformer('xlm-roberta-base')

Exception when trying to download https://sbert.net/models/xlm-roberta-base.zip. Response 404


In [20]:
t0 = time()
sents_1 = model.encode(df.s1)
sents_2 = model.encode(df.s2)
print(sents_1.shape, sents_2.shape)
enc_time = time()-t0
print(f'XLM RoBERTa (sBERT) encoding time: {enc_time:.2f}')

(1750, 768) (1750, 768)
XLM RoBERTa (sBERT) encoding time: 32.83


In [21]:
sims = get_cosine_sim(sents_1, sents_2)

(1750,)


In [22]:
df['xlmr_cosine_sim'] = sims

In [23]:
model = SentenceTransformer('average_word_embeddings_glove.6B.300d')

In [24]:
t0 = time()
sents_1 = model.encode(df.s1)
sents_2 = model.encode(df.s2)
print(sents_1.shape, sents_2.shape)
enc_time = time()-t0
print(f'Glove (sBERT) encoding time: {enc_time:.2f}')

(1750, 300) (1750, 300)
Glove (sBERT) encoding time: 0.17


In [25]:
np.save('glove_avg_sent_1.npy', sents_1)
np.save('glove_avg_sent_2.npy', sents_2)

In [26]:
sims = get_cosine_sim(sents_1, sents_2)

(1750,)


In [27]:
df['glove_avg_sim'] = sims

In [28]:
# lang_pairs = [('ar','ar'),('ar','en'),('es','es'), ('es','en'), ('en','en'), ('tr','en')]
# for p in lang_pairs:
#     sdf = df[(df.l1 == p[0]) & (df.l2 == p[1])]
#     print(p)
#     print(f'labse: {sdf.score.corr(sdf.labse_cosine_sim)*100:.2f}, muse: {sdf.score.corr(sdf.muse_cosine_sim)*100:.2f},' 
#     f'xlmr: {sdf.score.corr(sdf.xlmr_cosine_sim)*100:.2f}, distil_muse: {sdf.score.corr(sdf.distilmuse_cosine_sim)*100:.2f},'
#     f'xlmr_pt: {sdf.score.corr(sdf.xlmr_senttrans_sim)*100:.2f}, glove_avg: {sdf.score.corr(sdf.glove_avg_sim)*100:.2f}')

In [29]:
# lang_pairs = [('ar','ar'),('ar','en'),('es','es'), ('es','en'), ('en','en'), ('tr','en')]
# for p in lang_pairs:
#     sdf = df[(df.l1 == p[0]) & (df.l2 == p[1])]
#     print(f'{sdf.score.corr(sdf.glove_avg_sim)*100:.2f}')

In [30]:
laser = Laser() 

In [31]:
s1, l1 = df.s1, df.l1
s2, l2 = df.s2, df.l2

In [32]:
t0 = time()
sents_1 = laser.embed_sentences(list(s1), lang=list(l1))
sents_2 = laser.embed_sentences(list(s2), lang=list(l2))
enc_time = time()-t0
print(f'LASER (3rd party lib) encoding time: {enc_time:.2f}')

LASER (3rd party lib) encoding time: 11.208090543746948:.2f


In [33]:
sents_1.shape, sents_2.shape

((1750, 1024), (1750, 1024))

In [34]:
np.save('laser_sent_1.npy', sents_1)
np.save('laser_sent_2.npy', sents_2)

In [35]:
sims = get_cosine_sim(sents_1, sents_2)

(1750,)


In [36]:
df['laser_sim'] = sims
df.tail(5)

Unnamed: 0,s1,s2,l1,l2,score,labse_cosine_sim,muse_cosine_sim,xlmr_cosine_sim,glove_avg_sim,laser_sim
1745,A coach smiles at a player.,"Bir koç, oyuncularıyla birlikte kenarda durur.",tr,en,2.4,0.567708,0.383282,0.992406,-0.171986,0.664306
1746,Six women wearing black jackets and bright red...,Sarı gömlekli üç kadın kameraya gülümsüyor.,tr,en,1.6,0.725851,0.547926,0.995444,-0.110193,0.727363
1747,There are 70 participants on each team on the ...,Sahada iki takım var.,tr,en,2.8,0.637072,0.345799,0.994337,-0.232244,0.659134
1748,A large family poses for a photo.,Fotoğraf çeken bir aile var,tr,en,4.0,0.613873,0.595023,0.99579,-0.181362,0.738873
1749,A girl is inspecting a machine.,Bir kadın bir makineyle çalışıyor.,tr,en,3.4,0.661635,0.580531,0.995913,-0.071709,0.833054


In [37]:
lang_pairs = [('ar','ar'),('ar','en'),('es','es'), ('es','en'), ('en','en'), ('tr','en')]
for p in lang_pairs:
    sdf = df[(df.l1 == p[0]) & (df.l2 == p[1])]
    print(f'{sdf.score.corr(sdf.laser_sim)*100:.2f}')

69.33
65.56
79.73
70.52
77.13
72.14


In [38]:
for c in df.columns:
    if 'sim' not in c:
        continue
    else:
        print(f'{c}, {df.score.corr(df[c], method="spearman")*100:.2f}')

labse_cosine_sim, 75.50
muse_cosine_sim, 78.60
xlmr_cosine_sim, 24.04
glove_avg_sim, 3.01
laser_sim, 73.55


In [39]:
model_names = ['distiluse-base-multilingual-cased', 
               'distilbert-multilingual-nli-stsb-quora-ranking',
               'xlm-r-100langs-bert-base-nli-mean-tokens',
               'xlm-r-100langs-bert-base-nli-stsb-mean-tokens']
for mname in tqdm(model_names):
    t0 = time()
    model = SentenceTransformer(mname)
    sents_1 = model.encode(df.s1)
    sents_2 = model.encode(df.s2)
    enc_time = time() - t0
    enc_time = time()-t0
    print(f'{mname} (sBERT) encoding time: {enc_time:.2f}')
    sims = get_cosine_sim(sents_1, sents_2)
    df[mname+'_sim'] = sims

  0%|          | 0/4 [00:00<?, ?it/s]

distiluse-base-multilingual-cased (sBERT) encoding time: 21.36


 25%|██▌       | 1/4 [00:21<01:05, 21.71s/it]

(1750,)
distilbert-multilingual-nli-stsb-quora-ranking (sBERT) encoding time: 21.93


 50%|█████     | 2/4 [00:43<00:43, 21.88s/it]

(1750,)
xlm-r-100langs-bert-base-nli-mean-tokens (sBERT) encoding time: 47.98


 75%|███████▌  | 3/4 [01:32<00:29, 29.81s/it]

(1750,)
xlm-r-100langs-bert-base-nli-stsb-mean-tokens (sBERT) encoding time: 48.11


100%|██████████| 4/4 [02:20<00:00, 35.19s/it]

(1750,)





In [40]:
cols = [c for c in df.columns if 'sim' in c]

for c in cols:
    lang_pairs = [('ar','ar'),('ar','en'),('es','es'), ('es','en'), ('en','en'), ('tr','en')]
    res = []
    for p in lang_pairs:
        sdf = df[(df.l1 == p[0]) & (df.l2 == p[1])]
        scc = f'{sdf.score.corr(sdf[c], method="spearman")*100:.2f}'
        res.append(scc)
    overall_scc = f'{df.score.corr(df[c], method="spearman")*100:.2f}'
    res.append(overall_scc)
    print(f'{c} {" ".join(res)}')


labse_cosine_sim 69.09 74.51 80.81 68.69 79.37 72.02 75.50
muse_cosine_sim 71.81 74.71 84.01 71.64 85.23 71.74 78.60
xlmr_cosine_sim 25.49 15.71 49.58 44.46 52.17 12.07 24.04
glove_avg_sim 8.12 6.89 49.84 6.39 77.93 2.60 3.01
laser_sim 68.84 66.53 79.69 69.09 77.62 72.19 73.55
distiluse-base-multilingual-cased_sim 75.86 77.55 85.33 69.38 85.37 75.51 80.70
distilbert-multilingual-nli-stsb-quora-ranking_sim 70.77 70.97 78.59 71.04 79.00 62.74 75.72
xlm-r-100langs-bert-base-nli-mean-tokens_sim 75.41 72.52 77.11 74.51 78.20 70.26 77.77
xlm-r-100langs-bert-base-nli-stsb-mean-tokens_sim 78.66 77.39 83.13 75.27 82.40 75.89 81.44
