<a href="https://colab.research.google.com/github/Jagoda222/LoLa---group-8/blob/main/run_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets==2.9.0



In [None]:
from datasets import load_dataset, load_metric, concatenate_datasets
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

import numpy as np
import pandas as pd
import os

In [None]:


# Run-wide settings.
# Hub identifier of the pretrained checkpoint that is loaded with from_pretrained.
MODEL_CHECKPOINT = "microsoft/deberta-v3-small"
# Directory for model checkpoints; a persistent location (e.g. Google Drive)
# keeps them around between sessions.
MODEL_DIR = "model_checkpoints"
# Folder that is scanned for the ordering files (*.csv).
DATA_DIRECTORY = "data/data_samples"
# Batch size setting for the run.
BATCH_SIZE = 16

""" Set DATA_DIRECTORY to the folder that contains your ordering files.
Each file is expected to be a CSV with the following columns:

premise hypothesis label
...      ...        ...

For every file, the code uses the data in the given order (curriculum),
automatically reverses that order (anti-curriculum), and also makes a random
shuffle for the baseline measurement. The validation accuracy is calculated
for each of the three orderings.

The output is a CSV file (evaluations.csv) with one row per ordering file:

ordering                 baseline    curriculum  anti-curriculum

sentence_length.csv      0.84           0.82            0.80
etc.
"""

# SNLI data needs to be cleaned as it contains -1s as a label
#for k in snli_data:
    #snli_data[k] = snli_data[k].filter( lambda prob: prob['label'] >= 0 )


# Tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# GLUE metric in its MNLI configuration; used by compute_metrics.
metric = load_metric("glue", "mnli")

# Results table: the first column names the ordering file, the remaining three
# hold one score per training order. It starts empty and gets one row per file.
columns = ["ordering", "baseline", "curriculum", "anti-curriculum"]
out_df = pd.DataFrame(data=None, columns=columns)


# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    """Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        d: Mapping with ``'premise'`` and ``'hypothesis'`` entries. They are
            handed to the tokenizer as its first and second text argument.

    Returns:
        Whatever the tokenizer call returns; truncation is enabled.
    """
    premises = d['premise']
    hypotheses = d['hypothesis']
    return tokenizer(premises, hypotheses, truncation=True)


# Sequence-classification model built from the pretrained checkpoint.
# NOTE(review): four output labels are requested -- confirm this matches the
# label set found in the training CSVs.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=4,
)
# Reuse the config's end-of-sequence token id as its padding token id.
# NOTE(review): verify that eos_token_id is actually set for this checkpoint.
model.config.pad_token_id = model.config.eos_token_id

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced to class indices by taking the argmax along axis 1.

    Returns:
        The result of ``metric.compute`` for the predicted class indices
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)


# The validation split is loaded and tokenized once, outside the training loop,
# and reused as the evaluation set of every run.
snli_validation = load_dataset("snli", split="validation")
# SNLI marks examples without a gold label with -1. No class prediction can ever
# match that value, so keeping them would count each one as an error and lower
# the reported accuracy. Drop them before encoding.
snli_validation = snli_validation.filter(lambda example: example["label"] >= 0)
encoded_snli_validation = snli_validation.map(preprocess_function, batched=True, load_from_cache_file=True)

# Experiment driver.
#
# For every CSV in DATA_DIRECTORY three independent runs are made -- random
# shuffle (baseline), file order (curriculum) and reversed file order
# (anti-curriculum). The validation accuracy of each run goes into one row of
# out_df, which is written to evaluations.csv.
from torch.utils.data import SequentialSampler


class _OrderedTrainer(Trainer):
    """Trainer that presents the training examples in dataset order.

    The stock Trainer draws training batches through a random sampler, which
    would discard the orderings prepared below. Every epoch therefore walks the
    dataset in the same, given order.
    """

    def _get_train_sampler(self, *args, **kwargs):
        # Arguments are accepted and ignored so the override works whether or
        # not the installed transformers version passes the dataset in.
        return SequentialSampler(self.train_dataset)


def _fresh_model():
    """Load an untrained classifier configured like the module-level ``model``.

    Used as ``model_init`` so that every run starts from the pretrained
    checkpoint instead of continuing from the weights of the previous run.
    """
    fresh = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=model.config.num_labels
    )
    fresh.config.pad_token_id = model.config.pad_token_id
    return fresh


# Sorted so the rows of the results table come out in a reproducible order
# (os.listdir makes no ordering guarantee).
data_files = sorted(f for f in os.listdir(DATA_DIRECTORY) if f.endswith('.csv'))

for file_name in data_files:

    file_path = os.path.join(DATA_DIRECTORY, file_name)
    snli_data = load_dataset("csv", data_files=file_path)

    # Keep only the columns that are used, then derive the other two orderings.
    snli_data = snli_data["train"].select_columns(["premise", "hypothesis", "label"])
    snli_data_reversed = snli_data.select(range(len(snli_data) - 1, -1, -1))
    snli_data_random_shuffle = snli_data.shuffle(seed=100)

    # First cell names the ordering file; the accuracies follow in the order of
    # the remaining entries of `columns`: baseline, curriculum, anti-curriculum.
    new_row = [file_name]

    for data in [snli_data_random_shuffle, snli_data, snli_data_reversed]:

        encoded_snli_data = data.map(preprocess_function, batched=True, load_from_cache_file=True)

        args = TrainingArguments(
            output_dir=MODEL_DIR,                    # Directory to save model checkpoints
            evaluation_strategy="steps",             # Evaluate at regular step intervals
            eval_steps=200,                          # Number of steps between evaluations
            save_steps=200,                          # Number of steps between saving checkpoints
            logging_steps=100,                       # Number of steps between logging metrics
            per_device_train_batch_size=BATCH_SIZE,  # Training batch size per device
            per_device_eval_batch_size=BATCH_SIZE,   # Evaluation batch size per device
            learning_rate=5.1e-05,                   # Initial learning rate for the optimizer
            num_train_epochs=4,                      # Total number of training epochs
            weight_decay=0.0074,                     # Weight decay applied by the optimizer
            warmup_steps=211,                        # Warmup steps for the learning rate scheduler
            save_total_limit=2,                      # Cap on kept checkpoints to save disk space
            load_best_model_at_end=True,             # Reload the best checkpoint after training
            metric_for_best_model="accuracy",        # Metric that decides which checkpoint is best
            greater_is_better=True,                  # Higher accuracy is better
            logging_dir="./logs",                    # Directory for TensorBoard logs
            fp16=True,                               # Mixed precision training
            lr_scheduler_type="cosine",              # Cosine learning rate decay
        )
        # model_init (rather than a shared model object) gives every run its own
        # freshly loaded model, so the three orderings are compared fairly.
        trainer = _OrderedTrainer(
            model_init=_fresh_model,
            args=args,
            train_dataset=encoded_snli_data,
            eval_dataset=encoded_snli_validation,
            # You could use "test" here but it will be cheating then
            # to select the model checkpoint which gets highest score on test
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Evaluate the (best) model on the validation set
        eval_results = trainer.evaluate()
        new_row.append(eval_results["eval_accuracy"])

    # Append the evaluations to the table and persist right away, so finished
    # files are not lost if a later run fails.
    out_df.loc[len(out_df)] = new_row
    out_df.to_csv("evaluations.csv", index=False)

# Final write; also produces a header-only file when no CSV was found.
out_df.to_csv("evaluations.csv", index=False)