# Text Classification Finetuning

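This curated notebook demonstrates fine-tuning a BERT checkpoint with Sentence Transformers on the `stsb_multi_mt` sentence-similarity dataset (`it` configuration), then inspecting the resulting sentence embeddings.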

## Contents
1. Setup
2. Tutorial
3. Next Steps


# Fine-tuning for Text Classification


In [None]:
# TODO: describe cell
!pip uninstall -y sentence-transformers transformers tokenizers huggingface_hub accelerate safetensors torch torchvision torchaudio

!pip install -U pip

# PyTorch CPU
!pip install "torch==2.3.1" "torchvision==0.18.1" --index-url https://download.pytorch.org/whl/cpu

# Stack HF + ST allineato
!pip install "transformers==4.45.2" "sentence-transformers==3.0.1" "datasets>=2.20" "accelerate>=0.33" "safetensors>=0.4.3"

In [None]:
# Imports: Sentence Transformers components, dataset loading, and batching
from sentence_transformers import SentenceTransformer, models, InputExample, evaluation, losses
from datasets import load_dataset
from torch.utils.data import DataLoader

In [None]:
# Model configuration: wrap the chosen pretrained checkpoint in a Transformer
# module, add a pooling module sized to its word-embedding dimension, and
# assemble both into a SentenceTransformer.
model_checkpoint = "dbmdz/bert-base-italian-uncased"
word_embedding_model = models.Transformer(model_checkpoint)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Load the "it" configuration of stsb_multi_mt: train split first, then test.
dataset_train, dataset_test = (
    load_dataset("stsb_multi_mt", name="it", split=split_name)
    for split_name in ("train", "test")
)

# Report the size of each split and preview the first five training rows.
print("train size: ", dataset_train.shape)
print("test size: ", dataset_test.shape)
print("Esempio del dataset: ", dataset_train[:5])

In [None]:
# Convert every training row into the InputExample format used by Sentence
# Transformers: a pair of sentences plus the similarity score divided by 5.
train_samples = [
    InputExample(
        texts=[row["sentence1"], row["sentence2"]],  # the two sentences to compare
        label=float(row["similarity_score"]) / 5.0,
    )
    for row in dataset_train
]

# Shuffled batches of 16 examples for training.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

In [None]:
# Evaluator that checks sentence-pair similarity on the test split.
# Gold scores are divided by 5, the same scaling applied to the training labels.
test_scores = [score / 5.0 for score in dataset_test['similarity_score']]
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=dataset_test['sentence1'],
    sentences2=dataset_test['sentence2'],
    scores=test_scores,
    main_similarity=evaluation.SimilarityFunction.COSINE,
    write_csv=True,
    show_progress_bar=True,
)

# Training loss, constructed around the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
# Training hyperparameters.
num_epochs = 10
evaluation_steps = 500

# Warmup length: 10% of the total number of training steps
# (batches per epoch times the number of epochs), truncated to an int.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_training_steps * 0.1)

In [None]:
# Runtime setup for the training run: environment flag and log verbosity.
import os

# Presumably keeps the trainer from reporting to Weights & Biases — confirm.
os.environ["WANDB_DISABLED"] = "true"

# `transformers.logging` is the `transformers.utils.logging` module.
from transformers.utils import logging

# Switch the transformers logger to INFO verbosity.
logging.set_verbosity_info()

In [None]:
# Training process: fine-tune the model on the (dataloader, loss) objective
# and run the evaluator periodically during training.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    # Hand the configured evaluation interval to fit(); without this argument
    # the `evaluation_steps` value set with the training parameters is unused.
    evaluation_steps=evaluation_steps,
    warmup_steps=warmup_steps,
    # NOTE(review): no output_path is given — confirm save_best_model takes
    # effect without one.
    save_best_model=True
)

# Save the model as it stands once training has finished.
model.save("my-sentence-transforer")

In [None]:
# Optional: sharing the model on the Hugging Face Hub.
# The snippet is wrapped in a string literal, so none of it is executed.
# NOTE(review): before enabling it, replace the 'token' placeholder with a
# credential read from a secure source (never hardcode it), and note that
# `stsb_multi_mt` is written as a bare name that is not defined anywhere in
# this notebook as shown.
"""
from huggingface_hub import login
login('token')
model.save_to_hub("my-sentence-transforer", organization='myOrg', train_datasets=[stsb_multi_mt])
"""

In [None]:
# Smoke test: encode two sentences with the trained model and print the
# resulting embeddings.
sentences = [
    "Una ragazza si pettina",
    "Una ragazza si sta spazzolando",
]
embeddings = model.encode(sentences)
print("Sentence embeddings:", embeddings, sep="\n")

In [None]:
# Complete test: encode several sentences and show their pairwise cosine
# similarity as a labelled table.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Every entry needs its own trailing comma: Python silently concatenates
# adjacent string literals, which would fuse two sentences into one.
sentences = [
    "Una ragazza legge un libro",
    "Una donna legge un romanzo",
    "Un uomo cammina per strada",
    "Una persona legge un saggio",
]

embeddings = model.encode(sentences)

# Square matrix with one row and one column per sentence.
similarity_matrix = cosine_similarity(embeddings)

df = pd.DataFrame(similarity_matrix, columns=sentences, index=sentences)
print("Similarity matrix:")
display(df)