In [2]:
import os
# Set before torch / numpy are imported below, so the variable is already in the
# process environment when those libraries load.
# NOTE(review): presumably this disables the Intel Fortran runtime's console
# Ctrl-C handler (so interrupting the kernel on Windows does not abort the
# process) — confirm against the runtime's documentation.
os.environ['FOR_DISABLE_CONSOLE_CTRL_HANDLER'] = '1'

import torch
import pandas as pd
import numpy as np
# from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses, models
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from datasets import Dataset, DatasetDict, concatenate_datasets

## make sure cuda is available
## (bare last expression: the notebook shows the boolean as the cell output;
##  this only displays the flag, it does not stop the run when it is False)
torch.cuda.is_available()

True

In [3]:
## read the saved DatasetDict from disk
dataset_dict = DatasetDict.load_from_disk('../data/train_dataset_dict')

## pull out the two task-specific datasets (they are kept separate, not merged)
dataset_matching, dataset_relation = (
    dataset_dict[task_key] for task_key in ('matching', 'relation')
)

In [4]:
## hold out a small evaluation slice from each task; seed=42 keeps the split repeatable
dataset_matching_split = dataset_matching.train_test_split(seed=42, test_size=0.0005)
dataset_relation_split = dataset_relation.train_test_split(seed=42, test_size=0.0001)

## unpack the two halves of each split
dataset_matching_train, dataset_matching_test = (
    dataset_matching_split['train'],
    dataset_matching_split['test'],
)
dataset_relation_train, dataset_relation_test = (
    dataset_relation_split['train'],
    dataset_relation_split['test'],
)

## group by task name; the same keys are used for the per-task losses later on
train_dataset = dict(
    matching=dataset_matching_train,
    relation=dataset_relation_train,
)
test_dataset = dict(
    matching=dataset_matching_test,
    relation=dataset_relation_test,
)

print(train_dataset)
print(test_dataset)

{'matching': Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 3530014
}), 'relation': Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 14683026
})}
{'matching': Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 1766
}), 'relation': Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 1469
})}


In [5]:
model = SentenceTransformer('../models/all-MiniLM-L6-v2')

## the first module of the pipeline exposes the tokenizer and the underlying model
transformer = model[0]
tokenizer, auto_model = transformer.tokenizer, transformer.auto_model

## register two extra special tokens with the tokenizer
special_tokens_dict = {"additional_special_tokens": ["[MATCHING]", "[RELATION]"]}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

## the embedding table only needs to grow when the vocabulary actually grew
if num_added_tokens > 0:
    auto_model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator

# Evaluator built from the held-out 'matching' split: sentence pairs plus their labels.
dev_evaluator1 = BinaryClassificationEvaluator(
    sentences1=dataset_matching_test["sentence1"],
    sentences2=dataset_matching_test["sentence2"],
    labels=dataset_matching_test["label"],
    name="evaluation1",
)

# Score the model before any fine-tuning. This call is not the last expression of
# the cell, so the notebook would not display its return value; keep it in a
# variable and print it so the pre-training baseline is visible for comparison.
baseline_metrics = dev_evaluator1(model)
print(baseline_metrics)

# One independent ContrastiveLoss instance per task, keyed by the same task names
# as the train/test dataset dictionaries.
train_loss = {
    task_key: losses.ContrastiveLoss(model=model)
    for task_key in ('matching', 'relation')
}

# Directory handed to the trainer as its output location.
output_dir = "../models/fine-tuned2"

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=output_dir,
    # Optional training parameters:
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if your GPU can't handle FP16
    bf16=False,  # Set to True if your GPU supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    logging_steps=500,
    # Name the run after what is actually trained here (all-MiniLM-L6-v2 with
    # contrastive loss on the matching/relation datasets) rather than the
    # library example's "mpnet-base-all-nli-triplet".
    run_name="all-MiniLM-L6-v2-matching-relation-contrastive",  # Used in W&B if `wandb` is installed
)

# Wire the model, the per-task datasets and losses, the training arguments and
# the evaluator into a single trainer object.
trainer = SentenceTransformerTrainer(
    args=args,
    model=model,
    loss=train_loss,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    evaluator=dev_evaluator1,
)

                                                                             

In [None]:
# Start training with the trainer configured in the previous cell.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# `output_dir` already begins with "../", so use it as-is: prefixing another "../"
# would save the final model one directory level above the training output directory.
model.save(f"{output_dir}/final")