In [None]:
!pip install setfit
!pip install optimum[onnxruntime-gpu] -qqq

In [31]:
from datasets import load_dataset
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset


# Few-shot setup: how many labelled training examples are drawn per class, and
# how many validation rows are reserved for the per-epoch evaluation.
NUM_SAMPLES_PER_CLASS = 5
NUM_EVAL_EXAMPLES = 100

dataset = load_dataset("tweet_eval", "sentiment")
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}
# Remap the labels from integers to strings.
# NOTE(review): the evaluation output recorded later in this notebook still shows
# integer labels (0/1/2) for these rows, so this remap may be getting re-encoded
# by the dataset's label feature -- confirm by inspecting dataset["train"].features.
for split in dataset.keys():
    dataset[split] = dataset[split].map(
        lambda example: {"label": id2label[example["label"]]},
        desc=f"Remapping labels in {split} split"
    )

# Simulate the few-shot regime by sampling NUM_SAMPLES_PER_CLASS examples per class
train_dataset = sample_dataset(
    dataset["train"], label_column="label", num_samples=NUM_SAMPLES_PER_CLASS
)
# The first NUM_EVAL_EXAMPLES validation rows are evaluated during training;
# the remaining validation rows serve as the held-out test set.
eval_dataset = dataset["validation"].select(range(NUM_EVAL_EXAMPLES))
test_dataset = dataset["validation"].select(
    range(NUM_EVAL_EXAMPLES, len(dataset["validation"]))
)

# Load the pretrained sentence-transformers body as a SetFit model; `labels`
# lists the class names in id order.
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
    labels=["negative", "neutral", "positive"],
)

# One epoch, evaluating and checkpointing at the end of it.
args = TrainingArguments(
    batch_size=32,
    num_epochs=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
)

# Train, then score on the held-out test rows
trainer.train()
metrics = trainer.evaluate(test_dataset)
print(metrics)

# Run inference on two hand-written examples
preds = model.predict(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
print(preds)
# Expected: ["positive", "negative"]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 15/15 [00:00<00:00, 5564.21 examples/s]
***** Running training *****
  Num unique pairs = 150
  Batch size = 32
  Num epochs = 1


Epoch,Training Loss,Validation Loss
1,0.2476,0.275839


***** Running evaluation *****


{'accuracy': 0.41526315789473683}
['positive', 'negative']


## Inference Speed

In [32]:
# Measure inference speed of the trained model
import time
import numpy as np

# Prepare a list of sample texts for inference
sample_texts = ["i loved the spiderman movie!"] * 100  # 100 identical samples so every run sees the same batch

# Warm-up run so one-off initialization cost is not included in the timings
_ = model.predict(sample_texts[:5])

# Time repeated predictions over the full batch.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the system
# clock is adjusted, which would corrupt individual samples.
num_runs = 100
times = []

for _ in range(num_runs):
    start_time = time.perf_counter()
    model.predict(sample_texts)
    end_time = time.perf_counter()
    times.append(end_time - start_time)

# Summary statistics over all timed runs
avg_time = np.mean(times)
std_time = np.std(times)
samples_per_second = len(sample_texts) / avg_time

print("Inference speed:")
print(f"- Average time for {len(sample_texts)} samples: {avg_time:.4f} seconds (±{std_time:.4f})")
print(f"- Samples per second: {samples_per_second:.2f}")
print(f"- Time per sample: {(avg_time / len(sample_texts)) * 1000:.2f} ms")


Inference speed:
- Average time for 100 samples: 0.0378 seconds (±0.0039)
- Samples per second: 2647.50
- Time per sample: 0.38 ms


In [33]:
# Write the trained SetFit model to the local "setfit-model" directory; the ONNX
# export command and the tokenizer load further down read from this path.
model.save_pretrained("setfit-model")

In [49]:
# Export the saved "setfit-model" directory to ONNX as a feature-extraction model
# targeting CUDA, writing the result to "setfit-model_opt".
# NOTE(review): the export log below mentions half precision being GPU-only, so the
# O4 optimization level presumably yields an fp16 graph -- confirm in the optimum docs.
!optimum-cli export onnx \
  --model setfit-model \
  --task feature-extraction \
  --optimize O4 \
  --device cuda \
  setfit-model_opt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0;93m2025-04-09 22:30:33.780422069 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 4 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-04-09 22:30:33.784217183 [W:onnxruntime:, session_state.cc:1263 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-04-09 22:30:33.784238531 [W:onnxruntime:, session_state.cc:1265 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
Overridding for_gpu=False to for_gpu=True as half precision is available only on GPU.
[0;93m2025-04-09 22:30:35.221968747 [W:onnxruntime:, session_state.cc:1263 V

In [53]:

import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

# Both artifacts are read from local directories: the tokenizer from the folder
# written by model.save_pretrained(), the ONNX graph from the optimum-cli export output.
tokenizer = AutoTokenizer.from_pretrained("setfit-model", model_max_length=512)
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    "setfit-model_opt",
    provider="CUDAExecutionProvider",
    use_io_binding=False,
)


[0;93m2025-04-09 22:38:35.504044169 [W:onnxruntime:, session_state.cc:1263 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-04-09 22:38:35.504069912 [W:onnxruntime:, session_state.cc:1265 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [59]:
from setfit.exporters.utils import mean_pooling


class OnnxSetFitModel:
    """Combine an ONNX Runtime encoder, its tokenizer and a classification head.

    Inputs are tokenized, run through ``ort_model``, reduced with the
    ``mean_pooling`` helper and finally handed to ``model_head.predict``.
    """

    def __init__(self, ort_model, tokenizer, model_head):
        """Keep references to the three collaborators; nothing else happens here."""
        self.ort_model, self.tokenizer, self.model_head = ort_model, tokenizer, model_head

    def __call__(self, inputs):
        """Calling the object is the same as calling :meth:`predict`."""
        return self.predict(inputs)

    def predict(self, inputs):
        """Return ``model_head.predict`` applied to the pooled encoder output for ``inputs``.

        ``inputs`` is passed to the tokenizer unchanged (with padding and truncation
        enabled, PyTorch tensors requested); the encoded batch is moved to
        ``ort_model.device`` before the encoder is invoked.
        """
        batch = self.tokenizer(
            inputs, padding=True, truncation=True, return_tensors="pt"
        )
        batch = batch.to(self.ort_model.device)

        token_states = self.ort_model(**batch)["last_hidden_state"]
        pooled = mean_pooling(token_states, batch["attention_mask"])
        # The head is given the pooled embeddings after moving them to the CPU.
        return self.model_head.predict(pooled.cpu())

In [60]:
# Tokenize the sample batch and push it through the ONNX encoder directly,
# without pooling or the classification head.
encoded_inputs = tokenizer(
    sample_texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
encoded_inputs = encoded_inputs.to(ort_model.device)

outputs = ort_model(**encoded_inputs)

In [61]:
# Reuse the classification head of the trained SetFit model on top of the ONNX encoder.
onnx_setfit_model = OnnxSetFitModel(
    ort_model=ort_model,
    tokenizer=tokenizer,
    model_head=model.model_head,
)

In [63]:
# Measure inference speed of the trained model
import time
import numpy as np

# Prepare a list of sample texts for inference
sample_texts = ["i loved the spiderman movie!"] * 100  # 100 identical samples so every run sees the same batch

# Warm-up run so one-off initialization cost is not included in the timings
_ = onnx_setfit_model.predict(sample_texts[:5])

# Time repeated predictions over the full batch.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the system
# clock is adjusted, which would corrupt individual samples.
num_runs = 100
times = []

for _ in range(num_runs):
    start_time = time.perf_counter()
    onnx_setfit_model.predict(sample_texts)
    end_time = time.perf_counter()
    times.append(end_time - start_time)

# Summary statistics over all timed runs
avg_time = np.mean(times)
std_time = np.std(times)
samples_per_second = len(sample_texts) / avg_time

print("Inference speed:")
print(f"- Average time for {len(sample_texts)} samples: {avg_time:.4f} seconds (±{std_time:.4f})")
print(f"- Samples per second: {samples_per_second:.2f}")
print(f"- Time per sample: {(avg_time / len(sample_texts)) * 1000:.2f} ms")


Inference speed:
- Average time for 100 samples: 0.0100 seconds (±0.0013)
- Samples per second: 9976.33
- Time per sample: 0.10 ms


In [64]:
# Evaluate the ONNX model on the test dataset
from sklearn.metrics import accuracy_score, classification_report

# Pull the texts and gold labels out of the held-out split, one column at a time
test_texts = list(test_dataset["text"])
test_labels = list(test_dataset["label"])

# Classify the whole split with the ONNX-backed model
predictions = onnx_setfit_model.predict(test_texts)

# Overall accuracy
accuracy = accuracy_score(test_labels, predictions)
print(f"ONNX Model Test Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(test_labels, predictions))

# Print the first few (text, gold, predicted) triples as a quick sanity check
print("\nSample Predictions:")
for text, gold, predicted in zip(test_texts[:5], test_labels[:5], predictions[:5]):
    print(f"Text: {text}")
    print(f"True label: {gold}")
    print(f"Predicted label: {predicted}")
    print("-" * 50)


ONNX Model Test Accuracy: 0.4158

Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.59      0.39       295
           1       0.45      0.51      0.48       830
           2       0.53      0.25      0.34       775

    accuracy                           0.42      1900
   macro avg       0.42      0.45      0.40      1900
weighted avg       0.46      0.42      0.41      1900


Sample Predictions:
Text: @user first meet December 8th in the Murphy center
True label: 1
Predicted label: 1
--------------------------------------------------
Text: "Sir Terry Leahy, the man behind Tesco's success, is speaking at @user in WGC tomorrow:
True label: 1
Predicted label: 1
--------------------------------------------------
Text: @user " Good morning Kerry! Happy Friday! Wishing you a safe and awesome labor day weekend!
True label: 2
Predicted label: 1
--------------------------------------------------
Text: @user There is more Islam in Austr

In [66]:
# Display the array returned by onnx_setfit_model.predict for the test texts
predictions

array([1, 1, 1, ..., 1, 1, 1], shape=(1900,))

## Czech Language

In [9]:
# Load the Czech sentiment dataset and show one training record to inspect its fields
dataset_cz = load_dataset("CZLC/fb_sentiment_balanced")
dataset_cz["train"][1]

{'id': 1,
 'query': 'Mám ji je skvělá!',
 'choices': ['negativní', 'neutrální', 'pozitivní'],
 'gold': 2}

In [28]:
from transformers import AutoModel, AutoTokenizer

# Labelled training examples drawn per class, and the size of each of the two
# evaluation slices taken from the validation split.
CZ_NUM_SAMPLES_PER_CLASS = 50
CZ_EVAL_SLICE_SIZE = 100

# NOTE: this cell rebinds id2label, train_dataset, eval_dataset, test_dataset, args,
# trainer, metrics and preds, which the tweet_eval cells earlier in the notebook also
# define. Re-run those cells before reusing anything that expects the English values.
id2label = {0:'negativní', 1:'neutrální', 2:'pozitivní'}
# Derive the "label" (class name) and "text" columns from the "gold" and "query" fields
for split in dataset_cz.keys():
    dataset_cz[split] = dataset_cz[split].map(
        lambda example: {"label": id2label[example["gold"]],"text":example["query"]},
        desc=f"Remapping labels in {split} split"
    )

# Sample CZ_NUM_SAMPLES_PER_CLASS training examples per class
train_dataset = sample_dataset(
    dataset_cz["train"], label_column="label", num_samples=CZ_NUM_SAMPLES_PER_CLASS
)
# First slice of the validation split is evaluated during training, the second
# slice is the held-out test set.
eval_dataset = dataset_cz["validation"].select(range(CZ_EVAL_SLICE_SIZE))
test_dataset = dataset_cz["validation"].select(
    range(CZ_EVAL_SLICE_SIZE, 2 * CZ_EVAL_SLICE_SIZE)
)

# Load a multilingual encoder as the SetFit body.
# trust_remote_code=True allows code shipped with the checkpoint repository to run
# locally -- only enable it for repositories you trust.
cz_model = SetFitModel.from_pretrained(
    "Alibaba-NLP/gte-multilingual-base",
    labels=['negativní', 'neutrální', 'pozitivní'],
    trust_remote_code=True
)

# One epoch, evaluating and checkpointing at the end of it.
args = TrainingArguments(
    batch_size=32,
    num_epochs=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=cz_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
)

# Train, then score on the held-out slice
trainer.train()
metrics = trainer.evaluate(test_dataset)
print(metrics)

# Run inference on two hand-written Czech sentences
preds = cz_model.predict(["Nepamatuju si z toho vůbec nic!", "Už bych na to znovu nešel"])
print(preds)

Remapping labels in train split: 100%|██████████| 3876/3876 [00:00<00:00, 29404.86 examples/s]
Remapping labels in validation split: 100%|██████████| 597/597 [00:00<00:00, 25536.68 examples/s]
Remapping labels in test split: 100%|██████████| 1500/1500 [00:00<00:00, 28188.16 examples/s]
Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You sh

Epoch,Training Loss,Validation Loss
1,0.0019,0.402759


***** Running evaluation *****


{'accuracy': 0.66}
['negativní' 'negativní']
