In [2]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [3]:
# Import necessary libraries
import math
import logging
from sentence_transformers import LoggingHandler
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator, CECorrelationEvaluator
from datasets import load_dataset
from datetime import datetime

# Configure logging settings
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)


In [4]:
# Authenticate with Hugging Face Hub
from huggingface_hub import login
login('hf_your_token')

# Load training and test datasets from Hugging Face Datasets
df_train = load_dataset("your_username/your_dataset_name", split="train")
df_test = load_dataset("your_username/your_dataset_name", split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/969 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/49.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.83M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/241957 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/39359 [00:00<?, ? examples/s]

In [None]:
# Convert dataset into a list of InputExample objects for model training
def convert_dataset(dataset):
    dataset_samples=[]
    for df in dataset:
        inp_example = InputExample(
            texts=[df['text1'], df['text2']],
            label=df['label']
            )
        dataset_samples.append(inp_example)
    return dataset_samples

In [6]:
# Convert datasets into the required format
train_samples = convert_dataset(df_train)
test_samples = convert_dataset(df_test)

# Select appropriate evaluator based on the type of labels in the dataset:
# - If labels are purely binary (0 or 1), use `CEBinaryClassificationEvaluator`.
# - If labels contain continuous values (e.g., STS scores), use `CECorrelationEvaluator`.
# evaluator = CEBinaryClassificationEvaluator.from_input_examples(test_samples, name="CE-binary-dev")
evaluator = CECorrelationEvaluator.from_input_examples(test_samples, name='CE-Corr-dev')
train_batch_size = 16
num_epochs = 4

# Training configuration
model_save_path = f'output/training_crossencoder_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'

In [7]:
# We wrap train_samples (which is a List[InputExample]) into a pytorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# Define the model base; can be changed based on requirements and experiments.
# Here, we use `distilroberta-base`, which outputs a similarity score between 0 and 1.
model_base = "distilroberta-base"
model = CrossEncoder(model_base, num_labels=1)

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
from tqdm.autonotebook import tqdm, trange

# Custom callback function to log training loss and evaluation scores
class LossAndScoreCallback:
    def __call__(self, score, epoch, steps):
        print(f'Epoch {epoch}, Step {steps}, score: {score}')

# Configure the number of warm-up steps (10% of total training steps)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))

# Initialize callback
callback = LossAndScoreCallback()

# Train the model
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=5000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    save_best_model=True,
    callback=callback
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/15123 [00:00<?, ?it/s]

In [None]:
# Evaluate the model on the test set and save the results
evaluator(model, output_path="./")

In [12]:
# Function to evaluate a single data pair using the trained model
# Example: Using the STS-B dataset
# If working with other QA datasets like PIAF or SQuAD-French,
# please refer to reranker-prepare-dataset.ipynb for dataset conversion.

def convert_dataset_stsd(dataset):
    """
    Converts the STS-D dataset into a structured format.

    Args:
        dataset (Dataset): The input dataset containing sentence pairs and similarity scores.

    Returns:
        List[dict]: A list of structured samples with normalized similarity scores.
    """
    dataset_samples=[]
    for df in dataset:
        score = float(df['similarity_score'])/5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(
            texts=[df['sentence1'], df['sentence2']],
            label=score
          )
        dataset_samples.append(inp_example)
    return dataset_samples

# Load STS-B dataset in French for evaluation
df_dev = load_dataset("stsb_multi_mt", name="fr", split="dev")
df_test = load_dataset("stsb_multi_mt", name="fr", split="test")

# Convert the dev set for evaluation
dev_samples = convert_dataset_stsd(df_dev)

# Evaluate the dev set using CECorrelationEvaluator (for continuous similarity scores)
val_evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')
val_evaluator(model, output_path="./")

0.9187334632822911

In [11]:
dev_samples

[{'texts': ['Un homme avec un casque de sécurité est en train de danser.',
   'Un homme portant un casque de sécurité est en train de danser.'],
  'label': 1.0,
  'source': 'stsd-fr'},
 {'texts': ['Un jeune enfant monte à cheval.', 'Un enfant monte à cheval.'],
  'label': 0.95,
  'source': 'stsd-fr'},
 {'texts': ['Un homme donne une souris à un serpent.',
   "L'homme donne une souris au serpent."],
  'label': 1.0,
  'source': 'stsd-fr'},
 {'texts': ['Une femme joue de la guitare.', 'Un homme joue de la guitare.'],
  'label': 0.48000001907348633,
  'source': 'stsd-fr'},
 {'texts': ['Une femme joue de la flûte.', 'Un homme joue de la flûte.'],
  'label': 0.55,
  'source': 'stsd-fr'},
 {'texts': ['Une femme est en train de couper un oignon.',
   'Un homme coupe des oignons.'],
  'label': 0.5230000019073486,
  'source': 'stsd-fr'},
 {'texts': ['Un homme efface un tableau à craie.',
   "L'homme efface le tableau noir."],
  'label': 1.0,
  'source': 'stsd-fr'},
 {'texts': ['Une femme porte u