# Using NLLB for the translation task

# Import Libraries

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline,DataCollatorForSeq2Seq,EarlyStoppingCallback, TrainingArguments,AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,TrainerCallback, Trainer, DataCollatorForLanguageModeling, logging
import pandas as pd
import json
from sklearn.metrics import cohen_kappa_score
import torch
from tqdm.notebook import tqdm
import os
from huggingface_hub import login
from datasets import load_dataset

# Paths 

In [None]:
# Input CSV files (despite the "_folder" suffix, both names hold file paths)
dataset_folder = "inputs/dataset_concatenato.csv"  # concatenated dataset, loaded for fine-tuning
dataset_cleaned_folder = "inputs/dataset.csv"  # cleaned dataset, read at inference time

# Output locations
path_NNLB = "Models/nllb/nllb-finetuned-antico-moderno"  # directory of the fine-tuned NLLB model
path_translations = "outputs"  # directory where the translations are written

# Load Datasets

In [None]:
# Load the concatenated CSV; the single data file is read back through the "train" key
dataset = load_dataset("csv", data_files=dataset_folder)["train"]
# Hold out 30% of the rows for evaluation. The seed is fixed (same value as the
# training seed) so the held-out rows are identical on every run; without it the
# split, and therefore every reported eval loss, changes between runs.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Make sure the model output directory exists (no error if it is already there)
os.makedirs(name=path_NNLB, exist_ok=True)

# Loading model NLLB 200

In [None]:
# Base checkpoint that gets fine-tuned
model_name = "facebook/nllb-200-distilled-600M"

# Source and target are both Italian, so one NLLB language code serves both sides
src_lang = "ita_Latn"
tgt_lang = src_lang

# Pre-trained seq2seq model; device_map="auto" lets the library choose the device placement
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")

# Matching tokenizer, configured with the source language code
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.src_lang = "ita_Latn"

# Vocabulary id of the language-code token, for use as the forced first generated token
forced_bos_token_id = tokenizer.convert_tokens_to_ids("ita_Latn")

# Tokenize source sentences and target translations for the fine-tuning step

In [None]:
max_len = 256  # Maximum number of tokens kept for both source and target sequences


# Function to preprocess the dataset by tokenizing the input text and target translation text
def preprocess(example):
    """Tokenize one example (or a batch) into model inputs plus training labels.

    Args:
        example: Mapping with a "text" entry (source sentence) and a
            "translation" entry (target sentence). Each entry may be a single
            string or, when mapped with batched=True, a list of strings.

    Returns:
        The tokenizer output for "text" with an added "labels" entry holding
        the token ids of "translation", where padding positions are set to -100.
    """
    # The sentence is fed directly to the tokenizer, without any prompt template
    model_inputs = tokenizer(
        example["text"],
        max_length=max_len,
        truncation=True,  # cut sequences longer than max_len
        padding="max_length",  # pad shorter sequences up to max_len
    )
    labels = tokenizer(
        example["translation"],
        max_length=max_len,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # -100 is the label value the loss ignores; without this replacement the
        # model is also trained (and evaluated) on predicting padding tokens.
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of ids per example
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Run `preprocess` over both splits, dropping the original columns so that only
# the values returned by `preprocess` remain
tokenized_dataset = {
    split_name: dataset[split_name].map(
        preprocess,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "test")
}

Map: 100%|██████████| 207/207 [00:00<00:00, 1478.75 examples/s]
Map: 100%|██████████| 89/89 [00:00<00:00, 1420.72 examples/s]


# Setup of Training Arguments

In [None]:
# Trainer callback that frees cached GPU memory at the end of every epoch
class ClearCUDACacheCallback(TrainerCallback):
    """Empty the CUDA cache after each training epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Call torch.cuda.empty_cache() when CUDA is available; otherwise do nothing."""
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()

In [None]:
# Define the training arguments for the Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir=path_NNLB,  # Directory to save the model and training outputs

    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save a checkpoint at the end of each epoch

    # Gradients are accumulated over 4 batches before each optimizer update,
    # i.e. 2 * 4 = 8 examples per update on each device
    gradient_accumulation_steps=4,
    per_device_train_batch_size=2,  # Batch size for training on each device
    per_device_eval_batch_size=2,  # Batch size for evaluation on each device

    learning_rate=2e-4,  # Learning rate for the optimizer
    num_train_epochs=8,  # Upper bound on the number of training epochs
    # NOTE(review): compute_metrics in this notebook returns {}, so sequences
    # generated during evaluation are not scored - confirm generation is wanted here
    predict_with_generate=True,  # Enable generation during prediction
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends

    # Mixed precision only when a CUDA device is present: the notebook falls back
    # to the CPU otherwise, and an unconditional fp16=True is rejected there
    fp16=torch.cuda.is_available(),
    logging_dir=path_NNLB+"/logs",  # Directory to save training logs
    save_total_limit=1,  # Limit the number of kept checkpoints to save disk space

    seed=42,  # Random seed for reproducibility
    report_to="none",  # Do not report to any external experiment tracker
    metric_for_best_model="eval_loss",  # Metric used to pick the best checkpoint
    generation_max_length=max_len,  # Maximum length for generated sequences
    generation_num_beams=4,  # Number of beams for beam search during generation
)
# Seq2seq data collator, handed to the trainer to assemble batches
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Metrics hook for evaluation; deliberately reports no metrics
def compute_metrics(eval_preds):
    """Return an empty metrics dictionary.

    Args:
        eval_preds: Evaluation predictions passed by the trainer (unused).

    Returns:
        A new, empty dict on every call.
    """
    return dict()
# Callbacks: free cached GPU memory after each epoch, and stop training once the
# monitored metric fails to improve for 1 evaluation
trainer_callbacks = [
    ClearCUDACacheCallback(),
    EarlyStoppingCallback(early_stopping_patience=1),
]

# Wire model, arguments, tokenized splits, collator, tokenizer, metrics hook and callbacks together
# NOTE(review): newer transformers releases rename `tokenizer=` to
# `processing_class=` - confirm against the installed version
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=trainer_callbacks,
)

  trainer = Seq2SeqTrainer(


# Training Steps

In [None]:
# Fine-tune the model. The training arguments set load_best_model_at_end=True with
# metric_for_best_model="eval_loss", so the best checkpoint is used after training.
trainer.train() # Start the training process
# Evaluate on the held-out "test" split that was passed to the trainer as eval_dataset
metrics = trainer.evaluate() # Evaluate the model on the test dataset and store the metrics
print(metrics)
# Persist model and tokenizer side by side in path_NNLB so inference can reload both from one directory
model.save_pretrained(path_NNLB) #  Save the trained model to the specified path
tokenizer.save_pretrained(path_NNLB) # Save the tokenizer to the specified path

Epoch,Training Loss,Validation Loss
1,No log,2.86661
2,No log,0.844395
3,No log,0.409088
4,No log,0.419878


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 0.40908780694007874, 'eval_runtime': 157.1696, 'eval_samples_per_second': 0.566, 'eval_steps_per_second': 0.286, 'epoch': 4.0}


('Models/nllb/nllb-finetuned-antico-moderno\\tokenizer_config.json',
 'Models/nllb/nllb-finetuned-antico-moderno\\special_tokens_map.json',
 'Models/nllb/nllb-finetuned-antico-moderno\\sentencepiece.bpe.model',
 'Models/nllb/nllb-finetuned-antico-moderno\\added_tokens.json',
 'Models/nllb/nllb-finetuned-antico-moderno\\tokenizer.json')

# Inference with the fine-tuned model

In [None]:
# Read the cleaned dataset (the sentences to translate) into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_cleaned_folder)

Unnamed: 0,Author,Date,Region,Sentence
0,Brunetto Latini,1260-61,fior.,quella guerra ben fatta l' opera perché etc. E...
1,Bono Giamboni,1292,fior.,"crudele, e di tutte le colpe pigli vendetta, c..."
2,Valerio Massimo (red. V1,1336,fior.,Non d' altra forza d' animo fue ornato Ponzio ...
3,Lucano volg. (ed. Marinoni),1330/40,prat.,Se questo piace a tutti e se 'l tempo hae biso...
4,Brunetto Latini,1260-61,fior.,Officio di questa arte pare che sia dicere app...


In [None]:

# Load the fine-tuned tokenizer and model that were saved under path_NNLB.
# The tokenizer's own padding token is kept as saved: NLLB defines a dedicated
# pad token, so it is not overridden with the EOS token here.
tokenizer = AutoTokenizer.from_pretrained(path_NNLB)

# device_map="auto" already places the weights, so the model is not moved again
# with .to(): that call is redundant on a single device and can warn or fail
# when the weights are spread over several devices.
model = AutoModelForSeq2SeqLM.from_pretrained(path_NNLB, device_map="auto")
model.eval()  # inference mode (disables dropout)

# Add a new column to the DataFrame to store generated translations
df["generated_translation"] = ""

# Function to translate a sequence using the fine-tuned model
def traduci_seq_to_seq(s):
    """Translate one sentence with the fine-tuned model.

    Args:
        s: Sentence to translate. Values that are not strings are converted
            with str() before tokenization.

    Returns:
        The decoded model output with special tokens removed, or "" when `s`
        is None, NaN, or contains only whitespace.
    """
    # Missing values (e.g. empty CSV cells, which pandas reads as NaN) and blank
    # strings have nothing to translate; returning "" keeps one result per input
    # row instead of crashing the tokenizer or generating from an empty input.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    text = s if isinstance(s, str) else str(s)
    if not text.strip():
        return ""

    # Source and target share the same NLLB language code
    lang_code = "ita_Latn"
    tokenizer.src_lang = lang_code

    # Tokenize the input sentence and move the tensors to the model's device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=384).to(model.device)
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(lang_code)

    # NOTE(review): do_sample=True together with num_beams=4 samples within beam
    # search, so outputs can differ between runs - confirm this is intended
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=100,  # upper bound on the number of generated tokens
            temperature=0.7,  # sampling temperature
            top_p=0.9,  # nucleus (top-p) sampling threshold
            do_sample=True,  # sample instead of decoding greedily
            use_cache=True,  # reuse cached key/values during decoding
            num_beams=4,  # number of beams
            forced_bos_token_id=forced_bos_token_id,  # force the target-language token first
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Translate every sentence of the CSV with the fine-tuned model (tqdm shows progress)
results = [traduci_seq_to_seq(s) for s in tqdm(df["Sentence"].tolist())]

df["generated_translation"] = results

# Create the output directory if it doesn't exist
os.makedirs(path_translations, exist_ok=True)

translation_file = path_translations+"/dataset_with_translation_NNLB.csv"
# Add a "score_human" column initialised to 0 (presumably filled in later by
# human evaluation - confirm), then save everything to the CSV file
df["score_human"] = 0
df.to_csv(translation_file, index=False)

100%|██████████| 97/97 [01:11<00:00,  1.35it/s]
