In [None]:
!pip install sentence_transformers

In [None]:
import os
import logging
from datetime import datetime
from datasets import load_dataset
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator

In [9]:
!huggingface-cli login --token {token}

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 49, in main
    service.run()
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/user.py", line 98, in run
    login(token=self.args.token, add_to_git_credential=self.args.add_to_git_credential)
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/_login.py", line 109, in login
    _login(token, add_to_git_credential=add_to_git_credential, write_permission=write_permission)
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/_login.py", line 305, in _login
    raise ValueError("Invalid token passed!")
ValueError: Invalid token passed!


In [12]:
# Set up logging
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)


# Define paths and model configurations
data_folder = "arabic-msmarco-data"
os.makedirs(data_folder, exist_ok=True)
model_name = "Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
train_batch_size = 8
num_epochs = 1
model_save_path = (
    "output/training_matroV2_1N_arabic_msmarco_cross-encoder-"
    + model_name.replace("/", "-")
    + "-"
    + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

In [13]:

# Load the Arabic MSMARCO triplet dataset for training samples
logging.info("Loading training triplets dataset")
triplet_dataset = load_dataset("oddadmix/arabic-triplets", split="train")


# Set positive-to-negative sample ratio
pos_neg_ratio = 1
max_train_samples = int(1e6)
max_dev_samples = 200  # Number of evaluation samples


# Set up model with a continuous score output
model = CrossEncoder(model_name, num_labels=1, max_length=512)


# Prepare training and development samples
train_samples = []
dev_samples = {}


# Create training samples with positive-to-negative ratio
for i, example in enumerate(triplet_dataset):
    query = example["question"]
    positive_passage = example["positive"]
    negative_passage = example["negative"]


    # Add the positive example
    train_samples.append(InputExample(texts=[query, positive_passage], label=1))


    # Add negative examples (according to pos_neg_ratio)
    for _ in range(pos_neg_ratio):
        train_samples.append(InputExample(texts=[query, negative_passage], label=0))


    # Collect some samples for development evaluation
    if len(dev_samples) < max_dev_samples:
        if query not in dev_samples:
            dev_samples[query] = {"query": query, "positive": set(), "negative": set()}
        dev_samples[query]["positive"].add(positive_passage)
        if len(dev_samples[query]["negative"]) < pos_neg_ratio:
            dev_samples[query]["negative"].add(negative_passage)


    # Limit the number of training samples if necessary
    if len(train_samples) >= max_train_samples:
        break



2024-11-07 02:44:05 - Loading training triplets dataset


config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/761k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

2024-11-07 02:44:20 - Use pytorch device: cuda


In [None]:
train_batch_size = 8
num_epochs = 1


# Create a DataLoader to load training samples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)


# Set up evaluator with development samples for monitoring performance
evaluator = CERerankingEvaluator(dev_samples, name="train-eval")


# Configure the training
warmup_steps = 5000  # Adjust based on dataset size
logging.info(f"Warmup-steps: {warmup_steps}")


# Train the model with evaluation at intervals
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=10000,  # Adjust evaluation frequency as needed
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=True,  # Use automatic mixed precision for faster training,
)


# Save the latest model
model.save(model_save_path + "-latest")

## if need to push to the huggingface repo
#model.model.push_to_hub("oddadmix/arabic-reranker")
#model.tokenizer.push_to_hub("oddadmix/arabic-reranker")


2024-11-07 02:45:01 - Warmup-steps: 5000


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4520 [00:00<?, ?it/s]

In [None]:
##Evaluation

from sentence_transformers import CrossEncoder
model = CrossEncoder(model_save_path + "-latest", max_length=512)


Query = 'كيف يمكن استخدام التعلم العميق في معالجة الصور الطبية؟'
Paragraph1 = 'التعلم العميق يساعد في تحليل الصور الطبية وتشخيص الأمراض'
Paragraph2 = 'الذكاء الاصطناعي يستخدم في تحسين الإنتاجية في الصناعات'
Paragraph3 = 'التعلم العميق يساعد في تحليل الصور الطبية '

scores = model.predict([(Query, Paragraph1), (Query, Paragraph2), (Query, Paragraph3)])

print(scores)