### In this notebook I will do the following:

* Run several experiments that train the SetFit sentence transformer with hard negative/positive sampling instead of the random sampling currently implemented in the SetFit source code. The experiments are repeated across different datasets, numbers of training examples and training seeds in order to obtain reliable results.

In [None]:
import math
from pathlib import Path
from warnings import simplefilter

import numpy as np
import pandas as pd
from datasets import load_dataset
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets, evaluation, util
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader
from tqdm import tqdm

from helper_functions import get_data, preprocess_ade_sent

simplefilter(action='ignore', category=FutureWarning)



In [None]:
# SetFit - Hard negative/positive sampling

def select_pairs(idx_to_cossim_pos, idx_to_cossim_neg, current_sentence, sentences, pairs):
    """Append one hard positive and one hard negative pair for ``current_sentence``.

    Args:
        idx_to_cossim_pos: dict mapping sentence index -> similarity to the current
            sentence, for candidates that should form the positive (label 1.0) pair.
        idx_to_cossim_neg: same kind of dict for candidates that should form the
            negative (label 0.0) pair.
        current_sentence: text of the anchor sentence.
        sentences: indexable collection of all sentence texts.
        pairs: list the two new ``InputExample`` objects are appended to.

    Returns:
        The same ``pairs`` list, now two items longer.

    Both dicts are modified in place: the chosen indices are removed so that a
    later call for the same anchor cannot select them again.
    """
    # Hard positive = the positive candidate with the LOWEST similarity to the anchor.
    hard_pos_idx, _ = min(idx_to_cossim_pos.items(), key=lambda item: item[1])
    pairs.append(InputExample(texts=[current_sentence, sentences[hard_pos_idx]], label=1.0))

    # Hard negative = the negative candidate with the HIGHEST similarity to the anchor.
    hard_neg_idx, _ = max(idx_to_cossim_neg.items(), key=lambda item: item[1])
    pairs.append(InputExample(texts=[current_sentence, sentences[hard_neg_idx]], label=0.0))

    # Drop both picks from their pools only after both pairs have been built.
    del idx_to_cossim_pos[hard_pos_idx]
    del idx_to_cossim_neg[hard_neg_idx]

    return pairs


def sentence_pairs_generation(sent_idx, sentences, labels, embeddings, pairs, num_itr):
    """Build hard positive/negative training pairs for one anchor sentence.

    For the sentence at ``sent_idx`` up to ``num_itr`` rounds are performed; each
    round adds one positive pair (anchor + least similar remaining sentence of the
    same class) and one negative pair (anchor + most similar remaining sentence of
    any other class) via ``select_pairs``.

    Differences from a plain binary implementation:
      * works for any number of classes ("other class" = every label != anchor label);
      * the anchor itself is never used as its own positive partner;
      * the number of rounds is capped by the number of available candidates, so a
        large ``num_itr`` no longer fails on an exhausted candidate pool.

    Args:
        sent_idx: position of the anchor sentence.
        sentences: indexable collection of sentence texts.
        labels: array-like of class labels aligned with ``sentences``.
        embeddings: array/tensor of sentence embeddings aligned with ``sentences``;
            must support indexing with an integer index array.
        pairs: list of already generated examples; new pairs are appended to it.
        num_itr: requested number of rounds (2 pairs per round).

    Returns:
        ``pairs`` including the newly generated examples.
    """
    labels = np.asarray(labels)
    anchor_label = labels[sent_idx]

    # Candidate pools: same class without the anchor itself, and every other class.
    same_class_idxs = np.where(labels == anchor_label)[0]
    same_class_idxs = same_class_idxs[same_class_idxs != sent_idx]
    other_class_idxs = np.where(labels != anchor_label)[0]

    # Every round consumes one candidate from each pool, so the smaller pool bounds the rounds.
    num_rounds = min(num_itr, len(same_class_idxs), len(other_class_idxs))
    if num_rounds <= 0:
        return pairs

    anchor_embedding = embeddings[sent_idx]

    def similarity_by_index(candidate_idxs):
        # One batched call yields a (1, n_candidates) similarity matrix for the anchor.
        scores = util.pytorch_cos_sim(anchor_embedding, embeddings[candidate_idxs])
        return dict(zip(candidate_idxs.tolist(), scores.flatten().tolist()))

    idx_to_cossim_same = similarity_by_index(same_class_idxs)
    idx_to_cossim_other = similarity_by_index(other_class_idxs)

    current_sentence = sentences[sent_idx]
    for _ in range(num_rounds):
        # select_pairs removes the chosen candidates, so every round yields new partners.
        pairs = select_pairs(idx_to_cossim_same, idx_to_cossim_other, current_sentence, sentences, pairs)

    return pairs


def generate_sentence_pairs(sentences, labels, embeddings, num_itr):
    """Generate the hard positive/negative training pairs for every sentence.

    Args:
        sentences: sequence of sentence texts.
        labels: sequence of class labels aligned with ``sentences``.
        embeddings: sentence embeddings aligned with ``sentences``.
        num_itr: number of pair-building rounds per sentence, forwarded to
            ``sentence_pairs_generation``.

    Returns:
        List with the examples produced for all sentences, in sentence order.
    """
    # Convert once up front instead of rebuilding both arrays for every sentence.
    sentence_array = np.array(sentences)
    label_array = np.array(labels)

    train_examples = []
    for sent_idx in range(len(sentence_array)):
        train_examples = sentence_pairs_generation(
            sent_idx, sentence_array, label_array, embeddings, train_examples, num_itr
        )
    return train_examples


def finetune_st_logreg(model_name, train_df, test_df, num_iterations=20):
    """Fine-tune a Sentence Transformer on hard pairs and fit a logistic-regression head.

    Steps: embed the training sentences with the pre-trained model, build hard
    positive/negative pairs from those embeddings, fine-tune the model on the pairs
    with a cosine-similarity loss for one epoch, re-embed train and test sentences
    and train/evaluate a ``LogisticRegression`` classifier on the new embeddings.

    Args:
        model_name: name or path accepted by ``SentenceTransformer``.
        train_df: training data with a ``'text'`` and a ``'label'`` column.
        test_df: test data with a ``'text'`` and a ``'label'`` column.
        num_iterations: number of pair-building rounds per training sentence.

    Returns:
        Tuple ``(st_model, logreg_model, acc_score)``: the fine-tuned sentence
        transformer, the fitted classifier and its accuracy on the test data.

    Raises:
        ValueError: if no training pairs could be generated.
    """
    # Work on plain lists. ``SentenceTransformer.encode`` indexes its input by integer
    # position; on a pandas Series that is a *label* lookup, which raises or silently
    # misaligns embeddings and labels when the index is not 0..n-1 (e.g. after sampling).
    train_sentences = list(train_df['text'])
    train_labels = list(train_df['label'])
    test_sentences = list(test_df['text'])
    test_labels = list(test_df['label'])

    # generate the pre-trained sentence embeddings for the training sentences from ST
    st_model = SentenceTransformer(model_name)
    train_embeddings = st_model.encode(train_sentences)
    # generate the sentence pairs which are needed to fine-tune the ST pre-trained embeddings
    train_examples = generate_sentence_pairs(train_sentences, train_labels, train_embeddings, num_iterations)
    if not train_examples:
        raise ValueError("No training pairs were generated; at least two classes with examples are required.")

    # fine-tune ST pre-trained sentence embeddings
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(st_model)
    # warm up the learning rate over the first 10% of the training steps
    warmup_steps = math.ceil(len(train_dataloader) * 0.1)
    st_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=warmup_steps,
                 show_progress_bar=True, optimizer_params={"lr": 2e-5})

    # encode the training and test sentences using the fine-tuned model
    finetuned_train_embeddings = st_model.encode(train_sentences)
    finetuned_test_embeddings = st_model.encode(test_sentences)
    # train a Logistic Regression using the fine-tuned training embeddings
    logreg_model = LogisticRegression()
    logreg_model.fit(finetuned_train_embeddings, train_labels)

    # evaluate the model performance on test dataset using accuracy
    acc_score = accuracy_score(test_labels, logreg_model.predict(finetuned_test_embeddings))
    return st_model, logreg_model, acc_score

In [None]:
# SetFit - Random Sampling
def train_setfit(model_name, train_data, eval_data, num_iterations=20, column_mapping=None):
    """Fine-tune SetFit with its built-in (random) pair sampling.

    Args:
        model_name: identifier passed to ``SetFitModel.from_pretrained``.
        train_data: dataset used for training.
        eval_data: dataset used for evaluation.
        num_iterations: number of text pairs to generate for contrastive learning.
        column_mapping: optional column mapping forwarded to ``SetFitTrainer``.

    Returns:
        Tuple ``(trainer, accuracy)`` with the trained ``SetFitTrainer`` and the
        ``'accuracy'`` entry of its evaluation metrics.
    """
    trainer_settings = dict(
        model=SetFitModel.from_pretrained(model_name),
        train_dataset=train_data,
        eval_dataset=eval_data,
        loss_class=CosineSimilarityLoss,
        batch_size=16,
        num_iterations=num_iterations,
        num_epochs=1,  # a single epoch of contrastive learning
        column_mapping=column_mapping,
    )
    trainer = SetFitTrainer(**trainer_settings)

    trainer.train()
    return trainer, trainer.evaluate()['accuracy']

* Prepare the datasets for training.

In [None]:
# Load the ADE and SentEval-CR datasets by their identifiers and pass both through the
# project helper preprocess_ade_sent (imported from helper_functions; what it changes
# is not visible in this notebook — TODO document).
ade_df = load_dataset('SetFit/ade_corpus_v2_classification')
sent_df = load_dataset('SetFit/SentEval-CR')
ade_df, sent_df = preprocess_ade_sent(ade_df, sent_df)
# SST2 is used as loaded, without the helper preprocessing.
sst2_df = load_dataset('SetFit/sst2')

# Experiment name -> dataset; the experiment runner iterates over this mapping.
dataset_dic = {'SST2' : sst2_df, 
               'ADE' : ade_df,
               'SENT' : sent_df}


# Seeds handed to get_data (one experiment per seed).
seeds = [10, 20, 30, 40, 50]
# Numbers of training examples per class to try (handed to get_data as nr_example_per_class).
nr_ex_class = [50]

In [None]:
def run_experiments_hardnegative(seeds, dataset_dic, nr_ex_class, model_name='all-mpnet-base-v2', num_iterations=20):
    """Run the hard negative/positive sampling experiment over the full parameter grid.

    One experiment is run for every combination of dataset, number of examples per
    class and seed: the data is drawn with ``get_data`` and a model is trained and
    evaluated with ``finetune_st_logreg``.

    Args:
        seeds: iterable of seeds passed to ``get_data``.
        dataset_dic: dict mapping a dataset name to the dataset object.
        nr_ex_class: iterable with the numbers of training examples per class.
        model_name: sentence-transformers model name (without the
            ``'sentence-transformers/'`` prefix, which is added here).
        num_iterations: number of pair-building rounds per training sentence.

    Returns:
        ``pd.DataFrame`` with one row per experiment and the columns
        ``dataset_name``, ``nr_ex_per_class``, ``seed``, ``model`` (tuple of the
        fine-tuned sentence transformer and the classifier) and ``accuracy``.
        The columns are present even when no experiment was run.
    """
    cols = ['dataset_name', 'nr_ex_per_class', 'seed', 'model', 'accuracy']
    # Collect plain records and build the frame once; concatenating inside the loop
    # copies the whole frame on every iteration.
    records = []

    for dataset_name in tqdm(dataset_dic):
        for nr in nr_ex_class:
            for seed in seeds:
                train_df, test_df = get_data(dataset_dic[dataset_name], seed=seed, nr_example_per_class=nr)
                st_model, logreg_model, acc_score = finetune_st_logreg(
                    'sentence-transformers/' + model_name, train_df, test_df, num_iterations
                )

                # NOTE: every row keeps its trained models alive for the lifetime of the frame.
                row_data = [dataset_name, nr, seed, (st_model, logreg_model), acc_score]
                records.append(dict(zip(cols, row_data)))

    return pd.DataFrame(records, columns=cols)

In [None]:
# Run the hard negative/positive sampling experiments for every dataset,
# number of examples per class and seed configured in the cell above.
result_df_ownapproach = run_experiments_hardnegative(seeds, dataset_dic, nr_ex_class)

In [None]:
# save the results
# DataFrame.to_csv does not create missing folders, so make sure the output directory
# exists before writing; otherwise the results of the whole run would be lost.
results_path = Path('Results_DF') / 'result_df_ownapproach.csv'
results_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE: the 'model' column holds Python objects; the CSV only stores their string form.
result_df_ownapproach.to_csv(results_path, index=False)
