# Config
The model to train and the dataset to train on are selected by changing the indices into the lists below.
We provide an approximate batch_size for each model, chosen for an NVIDIA A100 with 40 GB of memory.

In [None]:
# Notebook-only cell: `%env` is an IPython magic that sets the environment
# variable PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION to "python".
# Remove this line if the notebook is converted to a plain Python file.
%env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

In [None]:
# Device the model is moved to.
torch_device = "cuda:0"

# Candidate checkpoints grouped by model family; every entry is a
# (checkpoint, approximate batch size) pair.
candidate_models = [
    [("google/t5-v1_1-small", 200), ("google/t5-v1_1-base", 140), ("google/t5-v1_1-large", 16)],
    [("facebook/bart-base", 200), ("facebook/bart-large", 40)],
    [("gpt2-medium", 100), ("gpt2-large", 20)],
]
# Change these two indices to pick a different family / model size.
family_index, size_index = 0, 0
model_checkpoint, batch_size = candidate_models[family_index][size_index]

#batch_size = 70 #force different batchsize if GPU not empty

# Short model name: the checkpoint id without its organisation prefix.
model_name = model_checkpoint.rsplit("/", 1)[-1]
print('model: ', model_name)

# Change the index to pick a different dataset.
dataset_choices = ['para-1-1-small', 'para-1-1', 'idm-small', 'idm']
dataset_name = dataset_choices[3]
print('dataset: ', dataset_name)

# Root directory used for datasets and model outputs.
path = '/media/data3/proj_scisen/'

In [None]:
# Tokenizer settings: sequences are padded/truncated to these lengths
# (in tokens) when the dataset is tokenized.
max_input_length = 100   # length of the tokenized model input
max_target_length = 100  # length of the tokenized target (labels)

In [None]:
# Optimizer hyperparameters; both values are passed to the training arguments
# and are also embedded in the name of the model output directory.
learning_rate = 2e-5  # default 2e-5
weight_decay = 0.001  # default 0.01

# Train model with modified input
https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb

https://huggingface.co/gpt2-large#how-to-get-started-with-the-model

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    GPT2LMHeadModel,
    GPT2Model,
    GPT2Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import torch

# Load the tokenizer and model that belong to the selected checkpoint.
if 'gpt' in model_name:
    tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
    # Use the variant with a language-modelling head: the bare GPT2Model only
    # returns hidden states, so it yields no loss for the trainer and cannot
    # be used for text generation during evaluation.
    model = GPT2LMHeadModel.from_pretrained(model_checkpoint)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

model.to(torch_device)
# Last expression of the cell: displays the parameter count in the notebook.
model.num_parameters()

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of examples for training.

    The "para-1-1" column is tokenized as model input and the "text" column
    as target; both are padded to a fixed length and truncated.

    Args:
        examples: Batch with the columns "para-1-1" and "text".

    Returns:
        The tokenized inputs, with the target token ids stored under "labels".
    """
    if 'gpt' in model_name:
        # Reuse the EOS token for padding (the alternative would be adding a
        # dedicated '[PAD]' token to the tokenizer).
        tokenizer.pad_token = tokenizer.eos_token

    shared_options = {"padding": "max_length", "truncation": True}
    encoded = tokenizer(examples["para-1-1"], max_length=max_input_length, **shared_options)

    # Targets are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(examples["text"], max_length=max_target_length, **shared_options)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
from datasets import load_from_disk

# The tokenized dataset is cached on disk per (dataset, model) combination;
# it is only rebuilt when no cached copy can be found.
tokenized_dir = f'{path}datasets/style/tokenized/{dataset_name}-{model_name}'
try:
    tokenized_datasets = load_from_disk(tokenized_dir)
except FileNotFoundError:
    print('load and tokenize dataset')
    dataset = load_from_disk(f'{path}datasets/style/{dataset_name}')
    if 'idm' in dataset_name:  # TODO change code to avoid this workaround
        # The idm datasets keep their input in an 'idm' column; rename it to
        # the column name the preprocessing expects, then shuffle.
        dataset = dataset.rename_column('idm', 'para-1-1').shuffle(seed=42)
    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    tokenized_datasets.save_to_disk(tokenized_dir)
else:
    print('load already tokenized dataset')

tokenized_datasets

### Hyperparameter

In [None]:
# The output directory name records model, dataset, learning rate and weight decay.
output_dir = f"{path}models/style/{model_name}-finetuned-{dataset_name}-lr-{learning_rate}-wd-{weight_decay}"

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    save_total_limit=3,
    predict_with_generate=True,  # evaluation metrics are computed on generated text
    eval_accumulation_steps=1,
    remove_unused_columns=True,
    auto_find_batch_size=False,
    push_to_hub=False,
)
# Last expression of the cell: displays the device the trainer will use.
args.device

In [None]:
import nltk
import numpy as np
from datasets import load_metric
#https://huggingface.co/metrics

def compute_metrics(eval_pred):
    """Compute generation quality metrics for one evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as passed
            by the trainer. ``predictions`` may also be a tuple whose first
            element holds the generated token ids.

    Returns:
        dict with the keys "bleu", "meteor", "bertscore" (mean F1),
        "perplexity" (mean perplexity of the predictions) and "gen_len"
        (mean number of non-padding tokens per prediction), each rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is used as a padding/ignore index for labels and gathered
    # predictions; it is not a valid token id, so map it to the pad token
    # before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(np.asarray(predictions) != -100, predictions, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Load every metric up front so a missing dependency fails early.
    metric_bleu = load_metric("bleu")
    metric_meteor = load_metric("meteor")
    metric_bertscore = load_metric("bertscore")
    metric_ppl = load_metric("perplexity")

    # Lower-cased texts, plus their space-separated tokens.
    pred_texts = [text.lower() for text in decoded_preds]
    ref_texts = [text.lower() for text in decoded_labels]
    pred_tokens = [text.split(' ') for text in pred_texts]
    ref_tokens = [text.split(' ') for text in ref_texts]

    result = {}

    # BLEU works on token lists, with a list of references per prediction.
    result['bleu'] = metric_bleu.compute(
        predictions=pred_tokens,
        references=[[tokens] for tokens in ref_tokens],
    )['bleu']

    # METEOR, BERTScore and perplexity expect plain strings. Token lists would
    # be coerced to their Python repr ("['a', 'b']") and distort the scores.
    result['meteor'] = metric_meteor.compute(
        predictions=pred_texts,
        references=ref_texts,
    )['meteor']
    bertscore = metric_bertscore.compute(
        predictions=pred_texts,
        references=[[text] for text in ref_texts],
        model_type='allenai/scibert_scivocab_uncased',
    )
    result['bertscore'] = np.mean(bertscore['f1'])

    # Only predictions with more than two tokens are scored for perplexity.
    # NOTE(review): the perplexity metric is normally used with a causal LM;
    # confirm that the SciBERT checkpoint is the intended scoring model.
    ppl_texts = [text for text, tokens in zip(pred_texts, pred_tokens) if len(tokens) > 2]
    if ppl_texts:
        result['perplexity'] = metric_ppl.compute(
            input_texts=ppl_texts,
            model_id='allenai/scibert_scivocab_uncased',
            add_start_token=False,
        )['mean_perplexity']
    else:
        # Nothing long enough to score; report NaN instead of failing.
        result['perplexity'] = float('nan')

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Collator that batches the tokenized examples for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Wire model, arguments, data splits, collator and metrics into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Start training process
The models are saved under "../proj_scisen/models/style/{model_name}-finetuned-{dataset_name}-lr-{learning_rate}-wd-{weight_decay}"

In [None]:
import os
# Run the training loop of the trainer created in the previous cell.
trainer.train()