In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
import sacrebleu


model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load your translations.csv
df = pd.read_csv("../dataset/nllb_corpus_train.csv")
eval_df = pd.read_csv("../dataset/nllb_corpus_test.csv")
# Rename columns if needed
df.columns = ['ro', 'rup']
# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(df)
eval_dataset = Dataset.from_pandas(eval_df)


In [3]:
def preprocess_function(examples, max_length=128):
    inputs = examples['ro']
    targets = examples['rup']
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    
    # Tokenize the target (Aromanian) with the target tokenizer
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing to the entire dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)


Map: 100%|██████████| 27033/27033 [00:02<00:00, 9474.01 examples/s] 
Map: 100%|██████████| 3004/3004 [00:00<00:00, 8516.78 examples/s]


In [4]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  
    eval_dataset=tokenized_eval_dataset,  
)

trainer.train()


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  5%|▍         | 500/10140 [02:34<48:50,  3.29it/s]

{'loss': 1.1083, 'grad_norm': 1.3328168392181396, 'learning_rate': 4.7534516765285995e-05, 'epoch': 0.15}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1957, 'grad_norm': 0.7041230797767639, 'learning_rate': 4.5069033530571994e-05, 'epoch': 0.3}


 15%|█▍        | 1500/10140 [07:38<43:13,  3.33it/s]  

{'loss': 0.1755, 'grad_norm': 0.5219510197639465, 'learning_rate': 4.260355029585799e-05, 'epoch': 0.44}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1733, 'grad_norm': 0.5312537550926208, 'learning_rate': 4.0138067061143986e-05, 'epoch': 0.59}


 25%|██▍       | 2500/10140 [12:36<36:57,  3.44it/s]  

{'loss': 0.1759, 'grad_norm': 0.4619382917881012, 'learning_rate': 3.767258382642998e-05, 'epoch': 0.74}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.163, 'grad_norm': 0.4619596600532532, 'learning_rate': 3.520710059171598e-05, 'epoch': 0.89}


                                                      
 33%|███▎      | 3380/10140 [17:39<29:33,  3.81it/s]

{'eval_loss': 0.16245995461940765, 'eval_runtime': 27.9783, 'eval_samples_per_second': 107.369, 'eval_steps_per_second': 13.439, 'epoch': 1.0}


 35%|███▍      | 3500/10140 [18:16<34:06,  3.24it/s]   

{'loss': 0.1574, 'grad_norm': 0.6137378215789795, 'learning_rate': 3.2741617357001976e-05, 'epoch': 1.04}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1455, 'grad_norm': 0.8199504017829895, 'learning_rate': 3.027613412228797e-05, 'epoch': 1.18}


 44%|████▍     | 4500/10140 [23:36<30:09,  3.12it/s]  

{'loss': 0.1447, 'grad_norm': 1.044874906539917, 'learning_rate': 2.7810650887573965e-05, 'epoch': 1.33}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1421, 'grad_norm': 0.706843376159668, 'learning_rate': 2.5345167652859964e-05, 'epoch': 1.48}


 54%|█████▍    | 5500/10140 [29:01<25:00,  3.09it/s]  

{'loss': 0.1372, 'grad_norm': 0.5742181539535522, 'learning_rate': 2.287968441814596e-05, 'epoch': 1.63}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.131, 'grad_norm': 0.694061279296875, 'learning_rate': 2.0414201183431952e-05, 'epoch': 1.78}


 64%|██████▍   | 6500/10140 [34:12<17:32,  3.46it/s]  

{'loss': 0.1348, 'grad_norm': 0.7428207993507385, 'learning_rate': 1.794871794871795e-05, 'epoch': 1.92}


                                                    
 67%|██████▋   | 6760/10140 [35:54<14:22,  3.92it/s]

{'eval_loss': 0.1432987004518509, 'eval_runtime': 27.3978, 'eval_samples_per_second': 109.644, 'eval_steps_per_second': 13.724, 'epoch': 2.0}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1245, 'grad_norm': 0.5974081158638, 'learning_rate': 1.5483234714003947e-05, 'epoch': 2.07}


 74%|███████▍  | 7500/10140 [39:46<13:34,  3.24it/s]  

{'loss': 0.1156, 'grad_norm': 0.6151213049888611, 'learning_rate': 1.3017751479289941e-05, 'epoch': 2.22}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1187, 'grad_norm': 0.5143307447433472, 'learning_rate': 1.0552268244575937e-05, 'epoch': 2.37}


 84%|████████▍ | 8500/10140 [45:00<08:25,  3.24it/s]  

{'loss': 0.1173, 'grad_norm': 0.576835572719574, 'learning_rate': 8.086785009861933e-06, 'epoch': 2.51}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1185, 'grad_norm': 0.5130419135093689, 'learning_rate': 5.621301775147929e-06, 'epoch': 2.66}


 94%|█████████▎| 9500/10140 [50:06<03:08,  3.39it/s]

{'loss': 0.115, 'grad_norm': 0.7469008564949036, 'learning_rate': 3.1558185404339255e-06, 'epoch': 2.81}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1106, 'grad_norm': 0.8834843635559082, 'learning_rate': 6.903353057199211e-07, 'epoch': 2.96}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
                                                     
100%|██████████| 10140/10140 [53:59<00:00,  3.13it/s]

{'eval_loss': 0.13884691894054413, 'eval_runtime': 28.8257, 'eval_samples_per_second': 104.213, 'eval_steps_per_second': 13.044, 'epoch': 3.0}
{'train_runtime': 3239.2661, 'train_samples_per_second': 25.036, 'train_steps_per_second': 3.13, 'train_loss': 0.18929129023053473, 'epoch': 3.0}





TrainOutput(global_step=10140, training_loss=0.18929129023053473, metrics={'train_runtime': 3239.2661, 'train_samples_per_second': 25.036, 'train_steps_per_second': 3.13, 'total_flos': 2.1968752015835136e+16, 'train_loss': 0.18929129023053473, 'epoch': 3.0})

In [7]:
def translate(sentence):
    inputs = tokenizer([sentence], return_tensors="pt", max_length=128, truncation=True)
    inputs = inputs.to(model.device)
    output = model.generate(**inputs)
    return tokenizer.decode(output[0], skip_special_tokens=True)

ro_sentence = "Mahnit adanc, porni spre casa, rusinat de neizbanda lui, de parca se ducea la taierea capului."
print(translate(ro_sentence))  


Mahnit adancu, cari s-dutea,


In [None]:
df_ro_rup_test = pd.read_csv("../dataset/nllb_corpus_test.csv")
df_ro_rup_test['ro_pred'] = ''
df_ro_rup_test['rup_pred'] = ''

test_len = len(df_ro_rup_test)
test_len = 200
for i in tqdm(range(0, test_len)):
    ro_texts = df_ro_rup_test.loc[i, 'ro']
    
    if ro_texts:
        df_ro_rup_test.loc[i, 'rup_pred'] = translate(ro_texts)

In [15]:
bleu_calc = sacrebleu.BLEU()
chrf_calc = sacrebleu.CHRF()
print(bleu_calc.corpus_score(df_ro_rup_test['rup'][:200].to_list(), [df_ro_rup_test['rup_pred'][:200].to_list()]))
print(chrf_calc.corpus_score(df_ro_rup_test['rup'][:200].to_list(), [df_ro_rup_test['rup_pred'][:200].to_list()]))

BLEU = 1.28 9.5/4.7/0.3/0.2 (BP = 1.000 ratio = 1.381 hyp_len = 370 ref_len = 268)
chrF2 = 19.90
