In [14]:
import torch
from sentence_transformers.models import Pooling, Transformer
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.losses import ContrastiveLoss, MultipleNegativesRankingLoss, SoftmaxLoss, CoSENTLoss
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sentence_transformers.util import cos_sim
from torch.utils.data import DataLoader

from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

In [3]:
# Define model
## Step 1: use an existing language model
word_embedding_model = Transformer('distilroberta-base')

## Step 2: use a pool function over the token embeddings
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension(), 
                               pooling_mode = 'cls',
                               pooling_mode_cls_token=True, 
                               pooling_mode_mean_tokens = False)

## Join steps 1 and 2 using the modules argument
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# CONFIGURA LORA
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"]
)
model.add_adapter(peft_config)

In [4]:
from datasets import load_dataset

dataset = load_dataset("glue", "mrpc")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [6]:
# Format training data
train_examples = []
for example in dataset['train']:
    train_examples.append(InputExample(texts=[example['sentence1'], example['sentence2']], label=float(example['label'])))

In [7]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

In [8]:
train_loss = ContrastiveLoss(model=model)
# (anchor, positive), (anchor, positive, negative)
mnrl_loss = MultipleNegativesRankingLoss(model)
# (sentence_A, sentence_B) + class
softmax_loss = SoftmaxLoss(model, model.get_sentence_embedding_dimension(), 3)
# (sentence_A, sentence_B) + score
cosent_loss = CoSENTLoss(model)

In [9]:
# Format evaluation data
sentences1 = []
sentences2 = []
scores = []
for example in dataset['validation']:
    sentences1.append(example['sentence1'])
    sentences2.append(example['sentence2'])
    scores.append(float(example['label']))

In [10]:
evaluator = BinaryClassificationEvaluator(sentences1, sentences2, scores)

In [11]:
# Start training
model.fit(
    train_objectives=[(train_dataloader, train_loss)], 
    evaluator=evaluator,
    evaluation_steps=500,
    epochs=1, 
    warmup_steps=0,
    output_path='./sentence_transformer/',
    weight_decay=0.01,
    optimizer_params={'lr': 0.00004},
    save_best_model=True,
    show_progress_bar=True,
)

                                                                     

Step,Training Loss,Validation Loss,Cosine Accuracy,Cosine Accuracy Threshold,Cosine F1,Cosine F1 Threshold,Cosine Precision,Cosine Recall,Cosine Ap,Cosine Mcc
500,0.0306,No log,0.703431,0.939111,0.813609,0.863224,0.692695,0.985663,0.851923,0.114622
917,0.0306,No log,0.70098,0.947623,0.816024,0.885471,0.696203,0.985663,0.863261,0.146749


In [12]:
model.save("output/mrpc-sbert")

In [16]:
# Format evaluation data
sentences1 = []
sentences2 = []
scores = []
for example in dataset['validation']:
    sentences1.append(example['sentence1'])
    sentences2.append(example['sentence2'])
    scores.append(float(example['label']))

test_evaluator = BinaryClassificationEvaluator(sentences1, sentences2, scores)

In [17]:
test_evaluator(model)

{'cosine_accuracy': 0.7009803921568627,
 'cosine_accuracy_threshold': 0.9476230144500732,
 'cosine_f1': 0.8160237388724035,
 'cosine_f1_threshold': 0.8854706287384033,
 'cosine_precision': 0.6962025316455697,
 'cosine_recall': 0.985663082437276,
 'cosine_ap': 0.8632612748196974,
 'cosine_mcc': 0.146749192191174}

In [15]:

correct = 0
for row in dataset['test']:
    u = model.encode(row['sentence1'])
    v = model.encode(row['sentence2'])
    cos_score = cos_sim(u, v)[0].numpy()[0]
    if cos_score > 0.5 and row['label'] == 1:
        correct += 1
    if cos_score <= 0.5 and row['label'] == 0:
        correct += 1

print(correct/100)

11.47


In [56]:
from datasets import load_dataset

dataset = load_dataset("glue", "qqp")

In [57]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 363846
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 40430
    })
    test: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 390965
    })
})

In [58]:
# Format training data
train_examples = []
for example in dataset['train']:
    train_examples.append(InputExample(texts=[example['question1'], example['question2']], label=float(example['label'])))

In [59]:
train_dataloader = DataLoader(train_examples[:5000], shuffle=True, batch_size=4)

In [60]:
# Format evaluation data
sentences1 = []
sentences2 = []
scores = []
for example in dataset['validation']:
    sentences1.append(example['question1'])
    sentences2.append(example['question2'])
    scores.append(float(example['label']))

In [62]:
evaluator = BinaryClassificationEvaluator(sentences1, sentences2, scores)

In [67]:
# Start training
model.fit(
    train_objectives=[(train_dataloader, train_loss)], 
    evaluator=evaluator,
    evaluation_steps=500,
    epochs=1, 
    warmup_steps=0,
    output_path='./sentence_transformer/',
    weight_decay=0.01,
    optimizer_params={'lr': 0.00004},
    save_best_model=True,
    show_progress_bar=True,
)

                                                                     

Step,Training Loss,Validation Loss,Cosine Accuracy,Cosine Accuracy Threshold,Cosine F1,Cosine F1 Threshold,Cosine Precision,Cosine Recall,Cosine Ap,Cosine Mcc
500,0.0377,No log,0.736928,0.91924,0.679202,0.871564,0.566372,0.848169,0.660986,0.455563
1000,0.0264,No log,0.741974,0.916392,0.683243,0.865479,0.571132,0.850118,0.668946,0.463368
1250,0.0264,No log,0.743112,0.921109,0.684717,0.870346,0.573104,0.850319,0.671124,0.466215


In [52]:
# 4. Load several loss functions to train with
# (anchor, positive), (anchor, positive, negative)
mnrl_loss = MultipleNegativesRankingLoss(model)
# (sentence_A, sentence_B) + class
softmax_loss = SoftmaxLoss(model, model.get_sentence_embedding_dimension(), 3)
# (sentence_A, sentence_B) + score
cosent_loss = CoSENTLoss(model)

# Create a mapping with dataset names to loss functions, so the trainer knows which loss to apply where.
# Note that you can also just use one loss if all of your training/evaluation datasets use the same loss
losses = {
    "quora": softmax_loss,
}