### This notebooks contains a peft training

In [9]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from trl import SFTTrainer, setup_chat_format
import pandas as pd
from datasets import Dataset
import unicodedata
import re
from tqdm import tqdm
import sacrebleu
import bitsandbytes as bnb


In [10]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [11]:
base_model = "meta-llama/Llama-3.2-1B"
new_model = "llama-3.2-1b-rup"

In [12]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

In [13]:

def apply_supplimentary_transformations(df_path):
    df = pd.read_csv(df_path)
    df_transformed = df.applymap(lambda x: ''.join([c for c in unicodedata.normalize('NFKD', x)  if unicodedata.category(c) != 'Mn']) if type(x) == str else x)
    # There are some words that have a -mi at the end, we will eliminate them also
    df_transformed.replace(r'\s*-\s*mi\b', '', regex=True, inplace=True)
    df_transformed.replace(r'\(i\)', 'i', regex=True, inplace=True)
    df_transformed.replace('γ', 'y', regex=True, inplace=True)
    df_transformed.replace(r'’', '', regex=True, inplace=True)
    df_transformed.replace(r'“', '', regex=True, inplace=True)
    df_transformed.replace(r'„', '', regex=True, inplace=True)
    df_transformed.columns = [str(q).strip() for q in df_transformed.columns]
    return df_transformed
# df_transformed.drop(columns=['ro', 'rup', 'translations'], inplace=True)
train_dataset = Dataset.from_pandas(apply_supplimentary_transformations("../dataset/nllb_corpus_train.csv"))
train_dataset = train_dataset.shuffle(seed=42)

test_dataset = Dataset.from_pandas(apply_supplimentary_transformations("../dataset/nllb_corpus_test.csv"))
test_dataset = test_dataset.shuffle(seed=42) 


def generate_prompt(data_point):
    begin_of_text = "<|begin_of_text|>"
    end_of_text = "<|end_of_text|>"
    start_header_id = "<|start_header_id|>"
    end_header_id = "<|end_header_id|>"
    eot_id = "<|eot_id|>"
    return {"text":
    f"""{begin_of_text}
        {start_header_id}system{end_header_id} Tradu urmatorul text din aromana in romana:
        {start_header_id}user{end_header_id} {data_point["rup"]}{eot_id}
        {start_header_id}assistant{end_header_id} {data_point["ro"]}{eot_id}
        {end_of_text}"""}

train_dataset = train_dataset.map(generate_prompt)
test_dataset = test_dataset.map(generate_prompt)

# dataset['text'][3]

  df_transformed = df.applymap(lambda x: ''.join([c for c in unicodedata.normalize('NFKD', x)  if unicodedata.category(c) != 'Mn']) if type(x) == str else x)
  df_transformed = df.applymap(lambda x: ''.join([c for c in unicodedata.normalize('NFKD', x)  if unicodedata.category(c) != 'Mn']) if type(x) == str else x)
Map: 100%|██████████| 27033/27033 [00:00<00:00, 27793.41 examples/s]
Map: 100%|██████████| 3004/3004 [00:00<00:00, 29451.81 examples/s]


In [14]:

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: 
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [15]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [16]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=100,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
)

In [17]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 27033/27033 [00:00<00:00, 52555.12 examples/s]
Map: 100%|██████████| 3004/3004 [00:00<00:00, 51091.77 examples/s]


In [18]:
trainer.train()

  1%|          | 100/13516 [00:29<59:04,  3.79it/s] 

{'loss': 3.111, 'grad_norm': 2.412323236465454, 'learning_rate': 0.00019866725899600178, 'epoch': 0.01}


  1%|▏         | 200/13516 [00:57<59:49,  3.71it/s]  

{'loss': 2.3463, 'grad_norm': 1.1858011484146118, 'learning_rate': 0.00019718643565822599, 'epoch': 0.01}


  2%|▏         | 300/13516 [01:24<59:08,  3.72it/s]  

{'loss': 2.3304, 'grad_norm': 2.8915157318115234, 'learning_rate': 0.00019570561232045016, 'epoch': 0.02}


  3%|▎         | 400/13516 [01:53<1:08:08,  3.21it/s]

{'loss': 2.2884, 'grad_norm': 3.1434037685394287, 'learning_rate': 0.0001942247889826744, 'epoch': 0.03}


  4%|▎         | 500/13516 [02:22<59:14,  3.66it/s]  

{'loss': 2.2828, 'grad_norm': 1.260321855545044, 'learning_rate': 0.0001927439656448986, 'epoch': 0.04}


  4%|▍         | 600/13516 [02:57<56:15,  3.83it/s]  

{'loss': 2.2234, 'grad_norm': 1.14825439453125, 'learning_rate': 0.00019126314230712277, 'epoch': 0.04}


  5%|▌         | 700/13516 [03:29<54:33,  3.92it/s]  

{'loss': 2.1878, 'grad_norm': 1.4314203262329102, 'learning_rate': 0.00018978231896934697, 'epoch': 0.05}


  6%|▌         | 800/13516 [03:56<56:39,  3.74it/s]  

{'loss': 2.215, 'grad_norm': 1.5863232612609863, 'learning_rate': 0.00018830149563157117, 'epoch': 0.06}


  7%|▋         | 900/13516 [04:23<58:06,  3.62it/s]  

{'loss': 2.1851, 'grad_norm': 1.0402705669403076, 'learning_rate': 0.00018682067229379537, 'epoch': 0.07}


  7%|▋         | 1000/13516 [04:50<56:23,  3.70it/s] 

{'loss': 2.2561, 'grad_norm': 1.375089406967163, 'learning_rate': 0.00018533984895601955, 'epoch': 0.07}


  8%|▊         | 1100/13516 [05:19<53:44,  3.85it/s]  

{'loss': 2.1795, 'grad_norm': 1.7213143110275269, 'learning_rate': 0.00018385902561824375, 'epoch': 0.08}


  9%|▉         | 1200/13516 [05:45<52:51,  3.88it/s]

{'loss': 2.1928, 'grad_norm': 1.1495988368988037, 'learning_rate': 0.00018237820228046795, 'epoch': 0.09}


 10%|▉         | 1300/13516 [06:12<54:43,  3.72it/s]

{'loss': 2.2004, 'grad_norm': 1.224915623664856, 'learning_rate': 0.00018089737894269215, 'epoch': 0.1}


 10%|█         | 1400/13516 [06:40<55:00,  3.67it/s]  

{'loss': 2.173, 'grad_norm': 2.104116439819336, 'learning_rate': 0.00017941655560491635, 'epoch': 0.1}


 11%|█         | 1500/13516 [07:07<57:31,  3.48it/s]  

{'loss': 2.119, 'grad_norm': 1.3905526399612427, 'learning_rate': 0.00017793573226714053, 'epoch': 0.11}


 12%|█▏        | 1600/13516 [07:38<50:15,  3.95it/s]  

{'loss': 2.1862, 'grad_norm': 1.2139753103256226, 'learning_rate': 0.00017645490892936473, 'epoch': 0.12}


 13%|█▎        | 1700/13516 [08:03<48:53,  4.03it/s]

{'loss': 2.0955, 'grad_norm': 1.0088883638381958, 'learning_rate': 0.00017497408559158893, 'epoch': 0.13}


 13%|█▎        | 1800/13516 [08:28<51:28,  3.79it/s]

{'loss': 2.187, 'grad_norm': 1.096885085105896, 'learning_rate': 0.00017349326225381314, 'epoch': 0.13}


 14%|█▍        | 1900/13516 [08:55<50:52,  3.81it/s]

{'loss': 2.1722, 'grad_norm': 1.3302189111709595, 'learning_rate': 0.00017201243891603734, 'epoch': 0.14}


 15%|█▍        | 2000/13516 [09:21<49:43,  3.86it/s]

{'loss': 2.156, 'grad_norm': 1.1915398836135864, 'learning_rate': 0.0001705316155782615, 'epoch': 0.15}


 16%|█▌        | 2100/13516 [09:50<47:27,  4.01it/s]  

{'loss': 2.1539, 'grad_norm': 1.2696499824523926, 'learning_rate': 0.00016905079224048571, 'epoch': 0.16}


 16%|█▋        | 2200/13516 [10:18<51:22,  3.67it/s]  

{'loss': 2.1306, 'grad_norm': 2.435851812362671, 'learning_rate': 0.00016756996890270992, 'epoch': 0.16}


 17%|█▋        | 2300/13516 [10:46<51:08,  3.65it/s]  

{'loss': 2.1573, 'grad_norm': 1.147026777267456, 'learning_rate': 0.00016608914556493412, 'epoch': 0.17}


 18%|█▊        | 2400/13516 [11:12<47:40,  3.89it/s]

{'loss': 2.0874, 'grad_norm': 2.2396349906921387, 'learning_rate': 0.00016460832222715832, 'epoch': 0.18}


 18%|█▊        | 2500/13516 [11:41<54:03,  3.40it/s]  

{'loss': 2.098, 'grad_norm': 1.8497337102890015, 'learning_rate': 0.0001631274988893825, 'epoch': 0.18}


 19%|█▉        | 2600/13516 [12:12<53:05,  3.43it/s]  

{'loss': 2.1409, 'grad_norm': 1.1692627668380737, 'learning_rate': 0.0001616466755516067, 'epoch': 0.19}


 20%|█▉        | 2700/13516 [12:41<50:20,  3.58it/s]  

{'loss': 2.1008, 'grad_norm': 1.5931237936019897, 'learning_rate': 0.0001601658522138309, 'epoch': 0.2}


 20%|██        | 2704/13516 [12:42<50:23,  3.58it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
                                                    
 20%|██        | 2704/13516 [15:30<50:23,  3.58it/s]

{'eval_loss': 2.1027841567993164, 'eval_runtime': 168.1427, 'eval_samples_per_second': 17.866, 'eval_steps_per_second': 17.866, 'epoch': 0.2}


 21%|██        | 2800/13516 [15:56<48:51,  3.66it/s]    

{'loss': 2.1299, 'grad_norm': 1.0155231952667236, 'learning_rate': 0.0001586850288760551, 'epoch': 0.21}


 21%|██▏       | 2900/13516 [16:28<48:23,  3.66it/s]  

{'loss': 2.0416, 'grad_norm': 1.0933787822723389, 'learning_rate': 0.00015720420553827928, 'epoch': 0.21}


 22%|██▏       | 3000/13516 [16:56<52:55,  3.31it/s]  

{'loss': 2.0668, 'grad_norm': 1.0882338285446167, 'learning_rate': 0.00015572338220050348, 'epoch': 0.22}


 23%|██▎       | 3100/13516 [17:33<1:04:38,  2.69it/s]

{'loss': 2.0967, 'grad_norm': 0.8817012906074524, 'learning_rate': 0.00015424255886272768, 'epoch': 0.23}


 24%|██▎       | 3200/13516 [17:59<46:14,  3.72it/s]  

{'loss': 2.0997, 'grad_norm': 1.2558766603469849, 'learning_rate': 0.00015276173552495188, 'epoch': 0.24}


 24%|██▍       | 3300/13516 [18:29<46:11,  3.69it/s]  

{'loss': 2.1025, 'grad_norm': 1.2644444704055786, 'learning_rate': 0.00015128091218717608, 'epoch': 0.24}


 25%|██▌       | 3400/13516 [18:58<46:58,  3.59it/s]  

{'loss': 2.0966, 'grad_norm': 1.2914115190505981, 'learning_rate': 0.00014980008884940026, 'epoch': 0.25}


 26%|██▌       | 3500/13516 [19:27<43:01,  3.88it/s]

{'loss': 2.0901, 'grad_norm': 1.174456238746643, 'learning_rate': 0.00014831926551162446, 'epoch': 0.26}


 27%|██▋       | 3600/13516 [19:59<44:29,  3.71it/s]  

{'loss': 2.0907, 'grad_norm': 0.8369913101196289, 'learning_rate': 0.00014683844217384866, 'epoch': 0.27}


 27%|██▋       | 3700/13516 [20:28<50:22,  3.25it/s]

{'loss': 2.0922, 'grad_norm': 1.0528473854064941, 'learning_rate': 0.00014535761883607286, 'epoch': 0.27}


 28%|██▊       | 3800/13516 [20:56<43:13,  3.75it/s]

{'loss': 2.0401, 'grad_norm': 1.1378284692764282, 'learning_rate': 0.00014387679549829707, 'epoch': 0.28}


 29%|██▉       | 3900/13516 [21:25<49:54,  3.21it/s]

{'loss': 2.0955, 'grad_norm': 1.2590655088424683, 'learning_rate': 0.00014239597216052124, 'epoch': 0.29}


 30%|██▉       | 4000/13516 [21:54<42:30,  3.73it/s]

{'loss': 2.0749, 'grad_norm': 0.8890972137451172, 'learning_rate': 0.00014091514882274547, 'epoch': 0.3}


 30%|███       | 4100/13516 [22:29<42:01,  3.73it/s]  

{'loss': 2.0823, 'grad_norm': 1.037575602531433, 'learning_rate': 0.00013943432548496964, 'epoch': 0.3}


 31%|███       | 4200/13516 [22:55<40:29,  3.83it/s]

{'loss': 2.0424, 'grad_norm': 1.3972703218460083, 'learning_rate': 0.00013795350214719385, 'epoch': 0.31}


 32%|███▏      | 4300/13516 [23:21<39:55,  3.85it/s]

{'loss': 2.0765, 'grad_norm': 1.7601792812347412, 'learning_rate': 0.00013647267880941805, 'epoch': 0.32}


 33%|███▎      | 4400/13516 [23:47<39:34,  3.84it/s]

{'loss': 2.0326, 'grad_norm': 3.012953519821167, 'learning_rate': 0.00013499185547164222, 'epoch': 0.33}


 33%|███▎      | 4500/13516 [24:14<38:06,  3.94it/s]

{'loss': 2.1102, 'grad_norm': 1.2102667093276978, 'learning_rate': 0.00013351103213386645, 'epoch': 0.33}


 34%|███▍      | 4600/13516 [24:44<39:02,  3.81it/s]  

{'loss': 2.0551, 'grad_norm': 1.8702739477157593, 'learning_rate': 0.00013203020879609063, 'epoch': 0.34}


 35%|███▍      | 4700/13516 [25:10<37:54,  3.88it/s]

{'loss': 2.038, 'grad_norm': 0.8918963670730591, 'learning_rate': 0.00013054938545831483, 'epoch': 0.35}


 36%|███▌      | 4800/13516 [25:36<36:40,  3.96it/s]

{'loss': 2.0783, 'grad_norm': 1.9435195922851562, 'learning_rate': 0.000129068562120539, 'epoch': 0.36}


 36%|███▋      | 4900/13516 [26:03<36:59,  3.88it/s]  

{'loss': 2.0213, 'grad_norm': 1.1007624864578247, 'learning_rate': 0.00012758773878276323, 'epoch': 0.36}


 37%|███▋      | 5000/13516 [26:30<37:11,  3.82it/s]

{'loss': 2.0078, 'grad_norm': 1.2996464967727661, 'learning_rate': 0.00012610691544498744, 'epoch': 0.37}


 38%|███▊      | 5100/13516 [27:01<37:17,  3.76it/s]  

{'loss': 2.0115, 'grad_norm': 0.8055060505867004, 'learning_rate': 0.0001246260921072116, 'epoch': 0.38}


 38%|███▊      | 5200/13516 [27:28<38:19,  3.62it/s]

{'loss': 2.0645, 'grad_norm': 1.1875742673873901, 'learning_rate': 0.0001231452687694358, 'epoch': 0.38}


 39%|███▉      | 5300/13516 [27:56<38:43,  3.54it/s]

{'loss': 2.0405, 'grad_norm': 1.2375236749649048, 'learning_rate': 0.00012166444543166, 'epoch': 0.39}


 40%|███▉      | 5400/13516 [28:23<44:22,  3.05it/s]

{'loss': 2.0725, 'grad_norm': 1.0479308366775513, 'learning_rate': 0.0001201836220938842, 'epoch': 0.4}


                                                    
 40%|████      | 5408/13516 [31:05<37:26,  3.61it/s]

{'eval_loss': 2.0427889823913574, 'eval_runtime': 160.0984, 'eval_samples_per_second': 18.763, 'eval_steps_per_second': 18.763, 'epoch': 0.4}


 41%|████      | 5500/13516 [31:31<35:15,  3.79it/s]    

{'loss': 2.0708, 'grad_norm': 2.5461952686309814, 'learning_rate': 0.00011870279875610839, 'epoch': 0.41}


 41%|████▏     | 5600/13516 [32:03<33:24,  3.95it/s]  

{'loss': 2.0674, 'grad_norm': 1.178473949432373, 'learning_rate': 0.00011722197541833259, 'epoch': 0.41}


 42%|████▏     | 5700/13516 [32:30<38:57,  3.34it/s]

{'loss': 2.0585, 'grad_norm': 1.1190996170043945, 'learning_rate': 0.00011574115208055681, 'epoch': 0.42}


 43%|████▎     | 5800/13516 [32:56<33:05,  3.89it/s]

{'loss': 2.018, 'grad_norm': 1.491291880607605, 'learning_rate': 0.00011426032874278098, 'epoch': 0.43}


 44%|████▎     | 5900/13516 [33:24<35:04,  3.62it/s]

{'loss': 2.0054, 'grad_norm': 1.3244954347610474, 'learning_rate': 0.0001127795054050052, 'epoch': 0.44}


 44%|████▍     | 6000/13516 [33:51<35:47,  3.50it/s]

{'loss': 2.0373, 'grad_norm': 1.1604071855545044, 'learning_rate': 0.00011129868206722937, 'epoch': 0.44}


 45%|████▌     | 6100/13516 [34:20<31:42,  3.90it/s]  

{'loss': 1.9981, 'grad_norm': 2.3408429622650146, 'learning_rate': 0.00010981785872945358, 'epoch': 0.45}


 46%|████▌     | 6200/13516 [34:46<30:34,  3.99it/s]

{'loss': 2.055, 'grad_norm': 1.1543606519699097, 'learning_rate': 0.00010833703539167779, 'epoch': 0.46}


 47%|████▋     | 6300/13516 [35:15<31:28,  3.82it/s]  

{'loss': 2.0186, 'grad_norm': 2.0293033123016357, 'learning_rate': 0.00010685621205390197, 'epoch': 0.47}


 47%|████▋     | 6400/13516 [35:42<30:50,  3.85it/s]

{'loss': 2.0167, 'grad_norm': 3.809053897857666, 'learning_rate': 0.00010537538871612618, 'epoch': 0.47}


 48%|████▊     | 6500/13516 [36:09<30:40,  3.81it/s]

{'loss': 2.0467, 'grad_norm': 1.7611063718795776, 'learning_rate': 0.00010389456537835036, 'epoch': 0.48}


 49%|████▉     | 6600/13516 [36:38<31:13,  3.69it/s]  

{'loss': 2.0021, 'grad_norm': 0.8990897536277771, 'learning_rate': 0.00010241374204057457, 'epoch': 0.49}


 50%|████▉     | 6700/13516 [37:04<28:34,  3.98it/s]

{'loss': 1.9932, 'grad_norm': 1.5459896326065063, 'learning_rate': 0.00010093291870279875, 'epoch': 0.5}


 50%|█████     | 6800/13516 [37:31<29:37,  3.78it/s]

{'loss': 2.0008, 'grad_norm': 0.8017933964729309, 'learning_rate': 9.945209536502296e-05, 'epoch': 0.5}


 51%|█████     | 6900/13516 [37:57<28:58,  3.81it/s]

{'loss': 2.0368, 'grad_norm': 1.5895897150039673, 'learning_rate': 9.797127202724715e-05, 'epoch': 0.51}


 52%|█████▏    | 7000/13516 [38:23<28:55,  3.76it/s]

{'loss': 2.0129, 'grad_norm': 1.614537239074707, 'learning_rate': 9.649044868947135e-05, 'epoch': 0.52}


 53%|█████▎    | 7100/13516 [38:52<28:12,  3.79it/s]  

{'loss': 1.9829, 'grad_norm': 1.8790290355682373, 'learning_rate': 9.500962535169555e-05, 'epoch': 0.53}


 53%|█████▎    | 7200/13516 [39:19<26:45,  3.93it/s]

{'loss': 1.9787, 'grad_norm': 2.255636692047119, 'learning_rate': 9.352880201391974e-05, 'epoch': 0.53}


 54%|█████▍    | 7300/13516 [39:45<27:56,  3.71it/s]

{'loss': 2.0586, 'grad_norm': 1.4887460470199585, 'learning_rate': 9.204797867614395e-05, 'epoch': 0.54}


 55%|█████▍    | 7400/13516 [40:11<25:46,  3.95it/s]

{'loss': 2.0389, 'grad_norm': 2.6482436656951904, 'learning_rate': 9.056715533836813e-05, 'epoch': 0.55}


 55%|█████▌    | 7500/13516 [40:38<26:05,  3.84it/s]

{'loss': 1.985, 'grad_norm': 1.0881457328796387, 'learning_rate': 8.908633200059234e-05, 'epoch': 0.55}


 56%|█████▌    | 7600/13516 [41:07<25:58,  3.80it/s]  

{'loss': 2.0254, 'grad_norm': 1.4006227254867554, 'learning_rate': 8.760550866281652e-05, 'epoch': 0.56}


 57%|█████▋    | 7700/13516 [41:34<25:59,  3.73it/s]

{'loss': 2.0473, 'grad_norm': 1.9796605110168457, 'learning_rate': 8.612468532504073e-05, 'epoch': 0.57}


 58%|█████▊    | 7800/13516 [42:01<24:25,  3.90it/s]

{'loss': 2.0119, 'grad_norm': 1.9064167737960815, 'learning_rate': 8.464386198726493e-05, 'epoch': 0.58}


 58%|█████▊    | 7900/13516 [42:28<27:24,  3.41it/s]

{'loss': 1.9946, 'grad_norm': 1.2862344980239868, 'learning_rate': 8.316303864948912e-05, 'epoch': 0.58}


 59%|█████▉    | 8000/13516 [42:56<23:17,  3.95it/s]

{'loss': 1.9681, 'grad_norm': 1.1008754968643188, 'learning_rate': 8.168221531171332e-05, 'epoch': 0.59}


 60%|█████▉    | 8100/13516 [43:26<23:55,  3.77it/s]  

{'loss': 1.9679, 'grad_norm': 1.6383261680603027, 'learning_rate': 8.02013919739375e-05, 'epoch': 0.6}


                                                    
 60%|██████    | 8112/13516 [46:07<25:17,  3.56it/s]

{'eval_loss': 1.9928597211837769, 'eval_runtime': 156.9714, 'eval_samples_per_second': 19.137, 'eval_steps_per_second': 19.137, 'epoch': 0.6}


 61%|██████    | 8200/13516 [46:29<23:09,  3.83it/s]   

{'loss': 1.9743, 'grad_norm': 1.1610630750656128, 'learning_rate': 7.872056863616171e-05, 'epoch': 0.61}


 61%|██████▏   | 8300/13516 [46:55<23:22,  3.72it/s]

{'loss': 2.0292, 'grad_norm': 1.2612155675888062, 'learning_rate': 7.72397452983859e-05, 'epoch': 0.61}


 62%|██████▏   | 8400/13516 [47:20<21:30,  3.97it/s]

{'loss': 2.0106, 'grad_norm': 1.2549808025360107, 'learning_rate': 7.575892196061011e-05, 'epoch': 0.62}


 63%|██████▎   | 8500/13516 [47:45<20:56,  3.99it/s]

{'loss': 1.9629, 'grad_norm': 1.2736836671829224, 'learning_rate': 7.42780986228343e-05, 'epoch': 0.63}


 64%|██████▎   | 8600/13516 [48:13<20:55,  3.92it/s]  

{'loss': 2.0052, 'grad_norm': 1.2897534370422363, 'learning_rate': 7.27972752850585e-05, 'epoch': 0.64}


 64%|██████▍   | 8700/13516 [48:39<20:40,  3.88it/s]

{'loss': 1.966, 'grad_norm': 1.3071428537368774, 'learning_rate': 7.131645194728269e-05, 'epoch': 0.64}


 65%|██████▌   | 8800/13516 [49:05<20:16,  3.88it/s]

{'loss': 2.0342, 'grad_norm': 1.0707776546478271, 'learning_rate': 6.98356286095069e-05, 'epoch': 0.65}


 66%|██████▌   | 8900/13516 [49:29<19:07,  4.02it/s]

{'loss': 1.9732, 'grad_norm': 1.4192588329315186, 'learning_rate': 6.835480527173108e-05, 'epoch': 0.66}


 67%|██████▋   | 9000/13516 [49:54<19:03,  3.95it/s]

{'loss': 1.9562, 'grad_norm': 1.7135225534439087, 'learning_rate': 6.687398193395528e-05, 'epoch': 0.67}


 67%|██████▋   | 9100/13516 [50:22<19:12,  3.83it/s]

{'loss': 1.9818, 'grad_norm': 1.5292508602142334, 'learning_rate': 6.539315859617949e-05, 'epoch': 0.67}


 68%|██████▊   | 9200/13516 [50:48<19:58,  3.60it/s]

{'loss': 2.0053, 'grad_norm': 2.398350238800049, 'learning_rate': 6.391233525840367e-05, 'epoch': 0.68}


 69%|██████▉   | 9300/13516 [51:14<18:45,  3.74it/s]

{'loss': 1.9332, 'grad_norm': 1.4831699132919312, 'learning_rate': 6.243151192062788e-05, 'epoch': 0.69}


 70%|██████▉   | 9400/13516 [51:40<17:15,  3.97it/s]

{'loss': 1.9978, 'grad_norm': 1.1958141326904297, 'learning_rate': 6.0950688582852065e-05, 'epoch': 0.7}


 70%|███████   | 9500/13516 [52:05<17:27,  3.83it/s]

{'loss': 1.999, 'grad_norm': 1.8207091093063354, 'learning_rate': 5.946986524507626e-05, 'epoch': 0.7}


 71%|███████   | 9600/13516 [52:35<17:03,  3.83it/s]  

{'loss': 1.9921, 'grad_norm': 1.0972926616668701, 'learning_rate': 5.7989041907300455e-05, 'epoch': 0.71}


 72%|███████▏  | 9700/13516 [53:00<15:44,  4.04it/s]

{'loss': 1.958, 'grad_norm': 1.3895002603530884, 'learning_rate': 5.6508218569524664e-05, 'epoch': 0.72}


 73%|███████▎  | 9800/13516 [53:26<15:36,  3.97it/s]

{'loss': 1.982, 'grad_norm': 1.0897735357284546, 'learning_rate': 5.502739523174886e-05, 'epoch': 0.73}


 73%|███████▎  | 9900/13516 [53:52<15:09,  3.98it/s]

{'loss': 1.9432, 'grad_norm': 1.837407112121582, 'learning_rate': 5.3546571893973054e-05, 'epoch': 0.73}


 74%|███████▍  | 10000/13516 [54:17<15:17,  3.83it/s]

{'loss': 1.9758, 'grad_norm': 1.5680663585662842, 'learning_rate': 5.206574855619725e-05, 'epoch': 0.74}


 75%|███████▍  | 10100/13516 [54:46<14:52,  3.83it/s]

{'loss': 1.9462, 'grad_norm': 1.7713606357574463, 'learning_rate': 5.0584925218421444e-05, 'epoch': 0.75}


 75%|███████▌  | 10200/13516 [55:12<14:58,  3.69it/s]

{'loss': 2.0063, 'grad_norm': 1.1422772407531738, 'learning_rate': 4.910410188064564e-05, 'epoch': 0.75}


 76%|███████▌  | 10300/13516 [55:39<13:52,  3.86it/s]

{'loss': 1.9787, 'grad_norm': 1.1926109790802002, 'learning_rate': 4.7623278542869835e-05, 'epoch': 0.76}


 77%|███████▋  | 10400/13516 [56:05<12:18,  4.22it/s]

{'loss': 1.9677, 'grad_norm': 2.0516750812530518, 'learning_rate': 4.614245520509403e-05, 'epoch': 0.77}


 78%|███████▊  | 10500/13516 [56:31<13:15,  3.79it/s]

{'loss': 1.9266, 'grad_norm': 1.355449914932251, 'learning_rate': 4.466163186731823e-05, 'epoch': 0.78}


 78%|███████▊  | 10600/13516 [56:59<12:31,  3.88it/s]

{'loss': 1.9692, 'grad_norm': 1.1972086429595947, 'learning_rate': 4.318080852954243e-05, 'epoch': 0.78}


 79%|███████▉  | 10700/13516 [57:25<13:03,  3.59it/s]

{'loss': 1.9523, 'grad_norm': 1.2707912921905518, 'learning_rate': 4.169998519176662e-05, 'epoch': 0.79}


 80%|███████▉  | 10800/13516 [57:51<11:23,  3.97it/s]

{'loss': 1.9542, 'grad_norm': 1.192301630973816, 'learning_rate': 4.0219161853990824e-05, 'epoch': 0.8}


                                                     
 80%|████████  | 10816/13516 [1:00:30<11:22,  3.96it/s]

{'eval_loss': 1.9493467807769775, 'eval_runtime': 154.8356, 'eval_samples_per_second': 19.401, 'eval_steps_per_second': 19.401, 'epoch': 0.8}


 81%|████████  | 10900/13516 [1:00:52<12:42,  3.43it/s]   

{'loss': 1.9454, 'grad_norm': 1.4599419832229614, 'learning_rate': 3.873833851621502e-05, 'epoch': 0.81}


 81%|████████▏ | 11000/13516 [1:01:18<10:42,  3.92it/s]

{'loss': 1.9721, 'grad_norm': 1.3066242933273315, 'learning_rate': 3.7257515178439215e-05, 'epoch': 0.81}


 82%|████████▏ | 11100/13516 [1:01:48<10:26,  3.86it/s]

{'loss': 1.9321, 'grad_norm': 1.511890172958374, 'learning_rate': 3.577669184066341e-05, 'epoch': 0.82}


 83%|████████▎ | 11200/13516 [1:02:14<09:59,  3.86it/s]

{'loss': 1.9939, 'grad_norm': 1.1289749145507812, 'learning_rate': 3.4295868502887605e-05, 'epoch': 0.83}


 84%|████████▎ | 11300/13516 [1:02:40<10:08,  3.64it/s]

{'loss': 1.9466, 'grad_norm': 1.3162590265274048, 'learning_rate': 3.28150451651118e-05, 'epoch': 0.84}


 84%|████████▍ | 11400/13516 [1:03:07<09:22,  3.76it/s]

{'loss': 1.9419, 'grad_norm': 1.2363221645355225, 'learning_rate': 3.1334221827336e-05, 'epoch': 0.84}


 85%|████████▌ | 11500/13516 [1:03:34<08:39,  3.88it/s]

{'loss': 1.9154, 'grad_norm': 1.906964898109436, 'learning_rate': 2.9853398489560197e-05, 'epoch': 0.85}


 86%|████████▌ | 11600/13516 [1:04:02<08:19,  3.84it/s]

{'loss': 1.9799, 'grad_norm': 1.7263284921646118, 'learning_rate': 2.8372575151784393e-05, 'epoch': 0.86}


 87%|████████▋ | 11700/13516 [1:04:28<07:49,  3.87it/s]

{'loss': 1.944, 'grad_norm': 1.2244606018066406, 'learning_rate': 2.689175181400859e-05, 'epoch': 0.87}


 87%|████████▋ | 11800/13516 [1:04:54<07:04,  4.05it/s]

{'loss': 1.9574, 'grad_norm': 1.1685798168182373, 'learning_rate': 2.5410928476232786e-05, 'epoch': 0.87}


 88%|████████▊ | 11900/13516 [1:05:21<06:54,  3.90it/s]

{'loss': 1.9587, 'grad_norm': 1.9525651931762695, 'learning_rate': 2.393010513845698e-05, 'epoch': 0.88}


 89%|████████▉ | 12000/13516 [1:05:47<06:33,  3.86it/s]

{'loss': 1.9452, 'grad_norm': 1.0687081813812256, 'learning_rate': 2.244928180068118e-05, 'epoch': 0.89}


 90%|████████▉ | 12100/13516 [1:06:15<05:53,  4.01it/s]

{'loss': 1.9132, 'grad_norm': 1.6398255825042725, 'learning_rate': 2.096845846290538e-05, 'epoch': 0.9}


 90%|█████████ | 12200/13516 [1:06:41<05:29,  4.00it/s]

{'loss': 1.938, 'grad_norm': 1.0928758382797241, 'learning_rate': 1.948763512512957e-05, 'epoch': 0.9}


 91%|█████████ | 12300/13516 [1:07:07<05:13,  3.88it/s]

{'loss': 1.9409, 'grad_norm': 1.1311906576156616, 'learning_rate': 1.800681178735377e-05, 'epoch': 0.91}


 92%|█████████▏| 12400/13516 [1:07:33<04:37,  4.03it/s]

{'loss': 1.9385, 'grad_norm': 1.8453664779663086, 'learning_rate': 1.6525988449577968e-05, 'epoch': 0.92}


 92%|█████████▏| 12500/13516 [1:07:58<04:13,  4.01it/s]

{'loss': 1.9293, 'grad_norm': 1.4330172538757324, 'learning_rate': 1.5045165111802161e-05, 'epoch': 0.92}


 93%|█████████▎| 12600/13516 [1:08:26<04:03,  3.76it/s]

{'loss': 1.9482, 'grad_norm': 1.4439367055892944, 'learning_rate': 1.356434177402636e-05, 'epoch': 0.93}


 94%|█████████▍| 12700/13516 [1:08:52<03:40,  3.70it/s]

{'loss': 1.9275, 'grad_norm': 1.4612382650375366, 'learning_rate': 1.2083518436250557e-05, 'epoch': 0.94}


 95%|█████████▍| 12800/13516 [1:09:18<03:10,  3.77it/s]

{'loss': 1.9508, 'grad_norm': 1.4471145868301392, 'learning_rate': 1.0602695098474752e-05, 'epoch': 0.95}


 95%|█████████▌| 12900/13516 [1:09:44<02:44,  3.74it/s]

{'loss': 1.9387, 'grad_norm': 2.3404409885406494, 'learning_rate': 9.12187176069895e-06, 'epoch': 0.95}


 96%|█████████▌| 13000/13516 [1:10:10<02:14,  3.85it/s]

{'loss': 1.9151, 'grad_norm': 1.465808629989624, 'learning_rate': 7.641048422923146e-06, 'epoch': 0.96}


 97%|█████████▋| 13100/13516 [1:10:40<02:59,  2.31it/s]

{'loss': 1.9406, 'grad_norm': 1.23361337184906, 'learning_rate': 6.160225085147342e-06, 'epoch': 0.97}


 98%|█████████▊| 13200/13516 [1:11:06<01:25,  3.69it/s]

{'loss': 1.919, 'grad_norm': 1.5034652948379517, 'learning_rate': 4.679401747371539e-06, 'epoch': 0.98}


 98%|█████████▊| 13300/13516 [1:11:35<00:54,  3.94it/s]

{'loss': 1.9297, 'grad_norm': 1.205229640007019, 'learning_rate': 3.1985784095957353e-06, 'epoch': 0.98}


 99%|█████████▉| 13400/13516 [1:12:02<00:28,  4.01it/s]

{'loss': 1.925, 'grad_norm': 2.311004638671875, 'learning_rate': 1.717755071819932e-06, 'epoch': 0.99}


100%|█████████▉| 13500/13516 [1:12:28<00:04,  3.70it/s]

{'loss': 1.9589, 'grad_norm': 1.4190388917922974, 'learning_rate': 2.3693173404412857e-07, 'epoch': 1.0}


100%|██████████| 13516/13516 [1:12:38<00:00,  3.10it/s]

{'train_runtime': 4358.1605, 'train_samples_per_second': 6.203, 'train_steps_per_second': 3.101, 'train_loss': 2.0453084658927274, 'epoch': 1.0}





TrainOutput(global_step=13516, training_loss=2.0453084658927274, metrics={'train_runtime': 4358.1605, 'train_samples_per_second': 6.203, 'train_steps_per_second': 3.101, 'total_flos': 7099737032134656.0, 'train_loss': 2.0453084658927274, 'epoch': 0.9999630081751932})

In [None]:
messages = [
    {
        "role": "user",
        "content": "kiro"
    }
]
begin_of_text = "<|begin_of_text|>"
end_of_text = "<|end_of_text|>"
start_header_id = "<|start_header_id|>"
end_header_id = "<|end_header_id|>"
eot_id = "<|eot_id|>"
prompt =f"""{begin_of_text}
{start_header_id}system{end_header_id} Tradu această propoziție din aromână în română.
{start_header_id}user{end_header_id} Te s-hiba, greaste tata-su al Teatire, - ficiorlu-a meu easte!{eot_id}
{start_header_id}assistant{end_header_id}"""

inputs = tokenizer(prompt, return_tensors='pt', padding=True, 
                   truncation=True).to("cuda")

outputs = model.generate(**inputs, max_length=150, 
                         num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

In [20]:
def translate(text, src_lang='rup', tgt_lang='ro', max_length=256):
    """Translate a text or list of texts"""
    if isinstance(text, str):
        text = [text]
    
    results = []
    for sentence in text:
        prompt = f"""{begin_of_text}
        {start_header_id}system{end_header_id} Tradu această propoziție din aromână în română.
        {start_header_id}user{end_header_id} {sentence}{eot_id}
        {start_header_id}assistant{end_header_id}"""
        inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
        
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1, temperature=0.4, top_k=50, top_p=0.95)
        
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translation = result.split("assistant")[-1].strip()
        results.append(translation)
    
    return results

# Example usage
t = 'Ma, a lor la si paru c-amintara na vasilie-ntreaga'
print(translate(t, 'aromanian', 'romanian'))



['A fost, de-aici, si paru ca au trecut inainte un bine intreaga - pe cineva \n        \n        \n        \n        \n        атися\n        \n        \n        аракт\n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        lásil\n        \n        \n        \n        ЎыџNЎыџN\n        \n        \n         ForCanBeConverted\n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n        \n         ForCanBeConvertedToF']


In [None]:

df_ro_rup_test = pd.read_csv("../dataset/nllb_corpus_test.csv")
df_ro_rup_test['ro_pred'] = ''
df_ro_rup_test['rup_pred'] = ''

test_len = len(df_ro_rup_test)
test_len = 200
for i in tqdm(range(0, test_len)):
    rup_texts = df_ro_rup_test.loc[i, 'rup']
    
    if rup_texts:
        df_ro_rup_test.loc[i, 'ro_pred'] = translate(rup_texts, 'romanian', 'aromanian')



In [45]:
bleu_calc = sacrebleu.BLEU()
chrf_calc = sacrebleu.CHRF()
df_ro_rup_test_t = [re.sub(r'[\n:/]', '', el[0]) for el in df_ro_rup_test['ro_pred'].tolist() if el]

# df_ro_rup_test_t = [el[0] for el in df_ro_rup_test['ro_pred'].tolist() if el]

print("Aromanian to Romanian BLEU:", bleu_calc.corpus_score(df_ro_rup_test['ro'][:200].tolist(), [df_ro_rup_test_t][:200]))


Aromanian to Romanian BLEU: BLEU = 0.00 17.4/3.6/2.6/2.0 (BP = 0.000 ratio = 0.060 hyp_len = 478 ref_len = 8019)


In [46]:
df_ro_rup_test_v = []
for ref, pred in zip(df_ro_rup_test['ro'][:200].tolist(), df_ro_rup_test_t):
    if pred:
        df_ro_rup_test_v.append(ref)

print(len(df_ro_rup_test_v))
df_ro_rup_test_t = [el[:40] for el in df_ro_rup_test_t if el]
print(len(df_ro_rup_test_t))
print("Aromanian to Romanian BLEU:", bleu_calc.corpus_score(df_ro_rup_test_v, [df_ro_rup_test_t]))
print("Aromanian to Romanian CHRF:", chrf_calc.corpus_score(df_ro_rup_test_v, [df_ro_rup_test_t]))

200
200
Aromanian to Romanian BLEU: BLEU = 0.59 11.5/2.2/1.3/0.5 (BP = 0.292 ratio = 0.448 hyp_len = 478 ref_len = 1066)
Aromanian to Romanian CHRF: chrF2 = 8.83
