In [3]:
%pip install sentence-transformers==3.1.1
%pip install datasets
%pip install polars
%pip install torch
%pip install datasets
%pip install transformers==4.45.2
%pip install matplotlib
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Importing Libraries

In [None]:
import polars as pl
from sentence_transformers import SentenceTransformer, losses, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, InputExample
from transformers import TrainerCallback
import matplotlib.pyplot as plt
from datasets import Dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

import os
os.environ["WANDB_DISABLED"] = "true"

  from tqdm.autonotebook import tqdm, trange


# Data Processing 

In [2]:
def remove_punctuation(df, col):
    """Return `df` with column `col` stripped of every character that is
    neither a word character nor whitespace (regex ``[^\\w\\s]``)."""
    not_word_or_space = r"[^\w\s]"
    stripped = pl.col(col).str.replace_all(not_word_or_space, "")
    return df.with_columns(stripped)

In [3]:
def lowercase(df, col):
    """Return `df` with the string column `col` converted to lower case."""
    lowered = pl.col(col).str.to_lowercase()
    return df.with_columns(lowered)

# Reading the Data

In [4]:
def dataset(lang, test=False):
    """Read one split of the SemRel2024 dataset for `lang` from the Hugging Face Hub.

    Args:
        lang: Directory name inside the dataset repository; it is substituted
            verbatim into the path, so a glob such as ``'*'`` is also accepted.
        test: When True, read the ``test-*`` parquet files. Otherwise read
            ``train-*`` and fall back to ``dev-*`` if that read fails.

    Returns:
        The result of ``pl.read_parquet`` for the selected split.

    Raises:
        Whatever ``pl.read_parquet`` raises for the ``test-*`` or ``dev-*``
        read; a failure of the ``train-*`` read triggers the fallback instead.
    """
    base = f"hf://datasets/SemRel/SemRel2024/{lang}"
    if test:
        return pl.read_parquet(f"{base}/test-*")

    try:
        return pl.read_parquet(f"{base}/train-*")
    except Exception:
        # Catch `Exception` rather than everything so Ctrl-C (KeyboardInterrupt)
        # and SystemExit still propagate while a download is in progress.
        # NOTE(review): presumably some languages ship no train split — confirm
        # and narrow this to the specific polars/IO error if possible.
        return pl.read_parquet(f"{base}/dev-*")

In [5]:
# Three-letter language codes to process. Each code is substituted into the
# dataset path by `dataset()`, and the list order is the order in which the
# languages are loaded and, later, trained on.
# NOTE(review): presumably these match the sub-directory names of the
# SemRel2024 repository — confirm against the dataset card.
languages = [
    "afr",
    "arq",
    "amh",
    "eng",
    "hau",
    "ind",
    "hin",
    "kin",
    "mar",
    "arb",
    "ary",
    "pan",
    "esp",
    "tel",
]

def read_datasets(test=False):
    """Load and normalise the data of every language in `languages`.

    For each language the split chosen by `test` is read with `dataset()`,
    then punctuation is removed from and lower-casing applied to both
    sentence columns.

    Returns:
        dict mapping language code -> normalised DataFrame.
    """
    sentence_columns = ('sentence1', 'sentence2')
    frames_by_lang = {}

    for lang in languages:
        # Progress message for the language being loaded
        print("Loading langauge:", lang)

        frame = dataset(lang, test=test)
        for column in sentence_columns:
            frame = remove_punctuation(frame, column)
        for column in sentence_columns:
            frame = lowercase(frame, column)

        frames_by_lang[lang] = frame

    return frames_by_lang

def read_dataset():
    """Load the non-test split for all languages at once and normalise it.

    Passes the glob ``'*'`` as the language to `dataset()`, then removes
    punctuation from and lower-cases both sentence columns.
    """
    sentence_columns = ('sentence1', 'sentence2')

    frame = dataset('*')
    for column in sentence_columns:
        frame = remove_punctuation(frame, column)
    for column in sentence_columns:
        frame = lowercase(frame, column)

    return frame

# Using SBERT

In [6]:
def get_model():
    """Instantiate the multilingual SBERT model, print the device it is on,
    and return it."""
    sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    print(sbert.device)
    return sbert

def train_model(model, train_data, batch_size=32, num_epochs=5):
    """Fine-tune `model` on sentence pairs using a cosine-similarity loss.

    Args:
        model: The SentenceTransformer to train; it is handed to both the loss
            and the trainer.
        train_data: polars DataFrame containing at least the columns
            ``sentence1``, ``sentence2`` and ``label``.
        batch_size: Per-device training batch size.
        num_epochs: Number of training epochs.

    Returns:
        list of the loss values reported during training, one entry per
        logging event that actually carried a ``'loss'`` value.
    """
    # Hand the trainer exactly the two text columns followed by the label, in
    # that order, so any additional columns in `train_data` cannot be mistaken
    # for model inputs.
    pair_columns = ['sentence1', 'sentence2', 'label']
    train_dataset = Dataset.from_pandas(train_data.select(pair_columns).to_pandas())

    # Loss function
    train_loss = losses.CosineSimilarityLoss(model=model)

    args = SentenceTransformerTrainingArguments(
        # Required parameter:
        output_dir="checkpoints",
        logging_dir="logs",
        logging_steps=10,
        # Optional training parameters:
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        fp16=False,  # Mixed precision is off; not every device can run FP16
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
    )

    class LossLogger(TrainerCallback):
        """Trainer callback that records every logged training loss."""

        def __init__(self):
            super().__init__()
            self.loss_values = []

        def on_log(self, args, state, control, logs=None, **kwargs):
            # The end-of-training summary log has no 'loss' key (only
            # 'train_loss' etc.); skip it so no None ends up in the history.
            if logs is not None and 'loss' in logs:
                self.loss_values.append(logs['loss'])
                print(f"Step {state.global_step} - Loss: {logs['loss']}")

    logger = LossLogger()

    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        loss=train_loss,
        callbacks=[logger]
    )
    trainer.train()

    return logger.loss_values

In [7]:
# Load the normalised per-language data once: the train split (or dev as a
# fallback, see `dataset()`) and the test split. `test_datasets` is read as a
# global by `evaluate()`.
train_datasets = read_datasets()
test_datasets = read_datasets(test=True)

Loading langauge: afr
Loading langauge: arq
Loading langauge: amh
Loading langauge: eng
Loading langauge: hau
Loading langauge: ind
Loading langauge: hin
Loading langauge: kin
Loading langauge: mar
Loading langauge: arb
Loading langauge: ary
Loading langauge: pan
Loading langauge: esp
Loading langauge: tel
Loading langauge: afr
Loading langauge: arq
Loading langauge: amh
Loading langauge: eng
Loading langauge: hau
Loading langauge: ind
Loading langauge: hin
Loading langauge: kin
Loading langauge: mar
Loading langauge: arb
Loading langauge: ary
Loading langauge: pan
Loading langauge: esp
Loading langauge: tel


In [14]:
def prepare_test_data(test_data):
    """Pull the two sentence columns and the label column out of `test_data`.

    Returns:
        A 3-tuple of Python lists: (sentence1 values, sentence2 values, labels).
    """
    wanted = ('sentence1', 'sentence2', 'label')
    first, second, gold = (test_data[column].to_list() for column in wanted)
    return first, second, gold

def evaluate(model, eval_datasets=None):
    """Score `model` on each language's test set with an embedding-similarity evaluator.

    Args:
        model: The SentenceTransformer to evaluate.
        eval_datasets: Optional mapping of language code -> DataFrame with
            ``sentence1``, ``sentence2`` and ``label`` columns. Defaults to the
            notebook-level `test_datasets`, which keeps existing
            ``evaluate(model)`` calls working unchanged.

    Returns:
        dict mapping language code -> the score dict returned by the evaluator.
    """
    if eval_datasets is None:
        # Fall back to the global for backward compatibility; passing the
        # mapping explicitly avoids depending on hidden notebook state.
        eval_datasets = test_datasets

    results = {}
    for lang, frame in eval_datasets.items():
        print("Testing lang:", lang)

        sentences1, sentences2, labels = prepare_test_data(frame)

        # Create an evaluator
        evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, labels)

        # Evaluate the model
        # NOTE(review): the recorded notebook output shows one language whose
        # scores are all NaN — presumably its test labels are constant or
        # missing; verify before aggregating these results.
        scores = evaluator(model)

        # Print the evaluation metrics
        print(scores)

        results[lang] = scores

    return results

In [None]:
# Loss values collected from every `train_model` call, in training order.
loss_values = []
# Evaluation scores keyed by stage: 'base' (before any fine-tuning) and then
# one entry per language code, each holding the per-language test scores.
results = {}

model = get_model()
# Baseline scores of the pretrained model on every test set.
results['base'] = evaluate(model)

for lang in languages:
    print("Training lang:", lang)
    # The same `model` object is passed on every iteration, so (assuming the
    # trainer updates it in place) fine-tuning is cumulative: the scores stored
    # under `lang` reflect training on this and all earlier languages.
    loss_values.extend(train_model(model, train_datasets[lang], batch_size=32))

    # Re-evaluate on all test sets after training on `lang`.
    results[lang] = evaluate(model)

mps:0
Training lang: afr


 17%|█▋        | 10/60 [00:04<00:21,  2.38it/s]

Step 10 - Loss: 0.0269
{'loss': 0.0269, 'grad_norm': 0.757507860660553, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.83}


 33%|███▎      | 20/60 [00:09<00:17,  2.27it/s]

Step 20 - Loss: 0.0186
{'loss': 0.0186, 'grad_norm': 0.32401755452156067, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.67}


 50%|█████     | 30/60 [00:13<00:12,  2.31it/s]

Step 30 - Loss: 0.0082
{'loss': 0.0082, 'grad_norm': 0.3484363555908203, 'learning_rate': 2.5e-05, 'epoch': 2.5}


 67%|██████▋   | 40/60 [00:18<00:08,  2.26it/s]

Step 40 - Loss: 0.0061
{'loss': 0.0061, 'grad_norm': 0.2984454333782196, 'learning_rate': 1.6666666666666667e-05, 'epoch': 3.33}


 83%|████████▎ | 50/60 [00:22<00:04,  2.25it/s]

Step 50 - Loss: 0.0047
{'loss': 0.0047, 'grad_norm': 0.23639771342277527, 'learning_rate': 8.333333333333334e-06, 'epoch': 4.17}


100%|██████████| 60/60 [00:27<00:00,  2.24it/s]

Step 60 - Loss: 0.0046
{'loss': 0.0046, 'grad_norm': 0.3911512792110443, 'learning_rate': 0.0, 'epoch': 5.0}


100%|██████████| 60/60 [00:29<00:00,  2.06it/s]

Step 60 - Loss: N/A
{'train_runtime': 29.0763, 'train_samples_per_second': 64.485, 'train_steps_per_second': 2.064, 'train_loss': 0.011523957115908463, 'epoch': 5.0}
Testing lang: afr





{'pearson_cosine': np.float64(0.7868652004012613), 'spearman_cosine': np.float64(0.7665480668308411), 'pearson_manhattan': np.float64(0.7270303139104044), 'spearman_manhattan': np.float64(0.7302457660956317), 'pearson_euclidean': np.float64(0.7290486888153145), 'spearman_euclidean': np.float64(0.7322007304964278), 'pearson_dot': np.float64(0.7057491205775741), 'spearman_dot': np.float64(0.7267207490243766), 'pearson_max': np.float64(0.7868652004012613), 'spearman_max': np.float64(0.7665480668308411)}
Testing lang: arq
{'pearson_cosine': np.float64(0.3223381544647036), 'spearman_cosine': np.float64(0.30014309324638916), 'pearson_manhattan': np.float64(0.24536732832067465), 'spearman_manhattan': np.float64(0.2545254510603962), 'pearson_euclidean': np.float64(0.2400548119614196), 'spearman_euclidean': np.float64(0.25230850837501884), 'pearson_dot': np.float64(0.31411952702687823), 'spearman_dot': np.float64(0.24858982155885004), 'pearson_max': np.float64(0.3223381544647036), 'spearman_max

  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


{'pearson_cosine': np.float64(nan), 'spearman_cosine': nan, 'pearson_manhattan': np.float64(nan), 'spearman_manhattan': nan, 'pearson_euclidean': np.float64(nan), 'spearman_euclidean': nan, 'pearson_dot': np.float64(nan), 'spearman_dot': nan, 'pearson_max': np.float64(nan), 'spearman_max': nan}
Testing lang: tel
{'pearson_cosine': np.float64(0.5032735868086706), 'spearman_cosine': np.float64(0.5310174654957392), 'pearson_manhattan': np.float64(0.5699847684731315), 'spearman_manhattan': np.float64(0.5727854966783053), 'pearson_euclidean': np.float64(0.5679317011403691), 'spearman_euclidean': np.float64(0.5714472079904468), 'pearson_dot': np.float64(0.25157038141694216), 'spearman_dot': np.float64(0.28346108974913736), 'pearson_max': np.float64(0.5699847684731315), 'spearman_max': np.float64(0.5727854966783053)}


{'afr': {'afr': {'pearson_cosine': np.float64(0.7868652004012613),
   'spearman_cosine': np.float64(0.7665480668308411),
   'pearson_manhattan': np.float64(0.7270303139104044),
   'spearman_manhattan': np.float64(0.7302457660956317),
   'pearson_euclidean': np.float64(0.7290486888153145),
   'spearman_euclidean': np.float64(0.7322007304964278),
   'pearson_dot': np.float64(0.7057491205775741),
   'spearman_dot': np.float64(0.7267207490243766),
   'pearson_max': np.float64(0.7868652004012613),
   'spearman_max': np.float64(0.7665480668308411)},
  'arq': {'pearson_cosine': np.float64(0.3223381544647036),
   'spearman_cosine': np.float64(0.30014309324638916),
   'pearson_manhattan': np.float64(0.24536732832067465),
   'spearman_manhattan': np.float64(0.2545254510603962),
   'pearson_euclidean': np.float64(0.2400548119614196),
   'spearman_euclidean': np.float64(0.25230850837501884),
   'pearson_dot': np.float64(0.31411952702687823),
   'spearman_dot': np.float64(0.24858982155885004),
   '

# For Saving Results in Google Colab

In [None]:
# Colab-only cell: mount Google Drive and persist the collected metrics as JSON.
from google.colab import drive
import json

drive.mount('/content/drive')

# NOTE(review): Colab normally exposes personal Drive files under
# '/content/drive/MyDrive/...'; confirm that the target directory below exists.
# Files must be opened for writing, and json.dump takes (obj, fp) in that order.
with open('/content/drive/COS802/results.json', 'w') as file:
    json.dump(results, file)
with open('/content/drive/COS802/loss_values.json', 'w') as file:
    json.dump(loss_values, file)