## SentenceBERT model training, contents include
#### 1. Training data preparation
    x: Input features are the tokenized article text from `df.text` (tokenizer outputs such as `input_ids` and `attention_mask`, plus `token_type_ids` where the tokenizer provides them)
    y: The article category from `df.category`, integer-encoded with `pd.factorize`
#### 2. Model training
#### 3. Inference
**Note: refer to the original post for details: https://towardsdatascience.com/sbert-vs-data2vec-on-text-classification-e3c35b19c949**

In [1]:
import torch
import random
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
%load_ext memory_profiler

  from .autonotebook import tqdm as notebook_tqdm
2022-08-17 14:01:13.420187: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-17 14:01:13.420235: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Read the `bbc-text.csv` dataset; later cells use its `text` and `category` columns.
df = pd.read_csv("bbc-text.csv")
# Preview the first three rows as the cell output.
df.head(n=3)

In [3]:
def set_seed(seed: int):
    """Seed every random number generator this notebook may touch.

    Python's ``random`` module and NumPy's global generator are always seeded.
    PyTorch (CPU and all CUDA devices) and TensorFlow are seeded only when
    ``is_torch_available()`` / ``is_tf_available()`` report them as usable.

    Args:
        seed: Value handed to each library's seeding function.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if is_torch_available():
        for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
            seed_fn(seed)

    if is_tf_available():
        # Imported here so TensorFlow is only pulled in when the check passes.
        import tensorflow as tf

        tf.random.set_seed(seed)


# Fix the seed once, before any data splitting or model initialisation.
set_seed(42)

In [4]:
def TextClassification_with_Transformer(model_name: str, Data: pd.Series, Target: pd.Series, test_size: float, max_length: int, num_labels: int, num_epochs: int, metrics_name: str):
    """Fine-tune a pretrained transformer checkpoint for text classification.

    The targets are integer-encoded with ``pd.factorize``, the data is split
    into train/validation parts, tokenized, wrapped in torch datasets and fed
    to a Hugging Face ``Trainer``. The model is evaluated every 200 steps and
    the best checkpoint (according to ``metrics_name``) is reloaded at the end.

    Args:
        model_name: Checkpoint name/path passed to ``AutoTokenizer`` and
            ``AutoModelForSequenceClassification``.
        Data: Series of input texts.
        Target: Series of class labels, aligned with ``Data``.
        test_size: Validation share forwarded to ``train_test_split``.
        max_length: Maximum token length; longer texts are truncated.
        num_labels: Size of the classification head.
        num_epochs: Number of training epochs.
        metrics_name: Metric loaded with ``load_metric`` and also used to
            select the best checkpoint.

    Returns:
        Tuple ``(trainer, model)`` after training and a final evaluation.

    Raises:
        ValueError: If ``num_labels`` is greater than 1 but smaller than the
            number of distinct classes found in ``Target``.

    Note:
        ``train_test_split`` is called without ``random_state``, so the split
        is only reproducible if the NumPy RNG was seeded beforehand
        (e.g. via ``set_seed``).
    """

    # Make data: encode the targets as integer class ids 0..k-1
    X = Data
    y = pd.factorize(Target)[0]

    # Fail early with a readable message instead of an out-of-bounds target
    # error inside the loss. num_labels == 1 (regression head) is left alone.
    observed_classes = int(y.max()) + 1 if y.size else 0
    if num_labels > 1 and observed_classes > num_labels:
        raise ValueError(
            f"Target contains {observed_classes} distinct classes but num_labels={num_labels}"
        )

    # Load Metrics
    metric = load_metric(metrics_name)

    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=test_size)

    # Call the Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

    # Encode the text (padding=True pads to the longest sequence of each split)
    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    class MakeTorchData(torch.utils.data.Dataset):
        """Pairs tokenizer encodings with integer labels, one sample per index."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
            # Scalar label per sample, so a collated batch of labels is 1-D
            # and lines up with the 1-D argmax predictions in compute_metrics.
            item["labels"] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    # convert our tokenized data into a torch Dataset
    train_dataset = MakeTorchData(train_encodings, y_train.ravel())
    valid_dataset = MakeTorchData(valid_encodings, y_test.ravel())

    # Call Model (Refer to "https://stackoverflow.com/questions/67948945/force-bert-transformer-to-use-cuda" to use GPU or CPU)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)  # .to("cuda")

    # `average` is only understood by the averaged metrics; other metrics
    # (e.g. "accuracy") do not take it, so it is passed conditionally.
    averaged_metrics = {"f1", "precision", "recall"}
    metric_kwargs = {"average": "micro"} if metrics_name in averaged_metrics else {}

    # Create Metrics
    def compute_metrics(eval_pred):
        """Turn logits into class ids and score them against the references."""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        # 'micro', 'macro', etc. are averaging modes for multi-class classification.
        # For binary classification, "binary" would be the usual choice for average.
        return metric.compute(predictions=predictions, references=labels, **metric_kwargs)

    # Specify the arguments for the trainer
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=num_epochs,     # total number of training epochs
        per_device_train_batch_size=8,   # batch size per device during training
        per_device_eval_batch_size=20,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        load_best_model_at_end=True,     # load the best model when finished training
        metric_for_best_model=metrics_name,    # metric used to pick the best checkpoint
        logging_steps=200,               # log each logging_steps
        save_steps=200,                  # save weights at the same interval as evaluation
        evaluation_strategy="steps",     # evaluate each `logging_steps`
    )

    # Call the Trainer
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=valid_dataset,          # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    )

    # Train the model
    trainer.train()

    # Final evaluation on the validation split; the returned metrics dict is not used here
    trainer.evaluate()

    return trainer, model

In [None]:
%%time
%%memit
sbert_trainer, sbert_model = TextClassification_with_Transformer(model_name = 'sentence-transformers/all-mpnet-base-v2', 
                                                                 Data = df.text, 
                                                                 Target = df.category, 
                                                                 test_size = 0.33, 
                                                                 max_length = 512, 
                                                                 num_labels = 5, 
                                                                 num_epochs = 5, 
                                                                 metrics_name='f1')

Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a

Step,Training Loss,Validation Loss
