### This notebooks contains a full-parameter training --- (not feasible with limited resources)

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = "meta-llama/Llama-3.2-1B"
new_model = "llama-3.2-1b-chat-aromanian_v3"
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(base_model)

In [3]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [5]:
import pandas as pd
from datasets import Dataset
import unicodedata
def apply_supplimentary_transformations(df_path):
    df = pd.read_csv(df_path)
    df_transformed = df.applymap(lambda x: ''.join([c for c in unicodedata.normalize('NFKD', x)  if unicodedata.category(c) != 'Mn']) if type(x) == str else x)
    # There are some words that have a -mi at the end, we will eliminate them also
    df_transformed.replace(r'\s*-\s*mi\b', '', regex=True, inplace=True)
    df_transformed.replace(r'\(i\)', 'i', regex=True, inplace=True)
    df_transformed.replace('γ', 'y', regex=True, inplace=True)
    df_transformed.replace(r'’', '', regex=True, inplace=True)
    df_transformed.replace(r'“', '', regex=True, inplace=True)
    df_transformed.replace(r'„', '', regex=True, inplace=True)
    df_transformed.columns = [str(q).strip() for q in df_transformed.columns]
    return df_transformed
# df_transformed.drop(columns=['ro', 'rup', 'translations'], inplace=True)
train_dataset = Dataset.from_pandas(apply_supplimentary_transformations("../dataset/nllb_corpus_train.csv"))
train_dataset = train_dataset.shuffle(seed=42)

test_dataset = Dataset.from_pandas(apply_supplimentary_transformations("../dataset/nllb_corpus_test.csv"))
test_dataset = test_dataset.shuffle(seed=42) 


def generate_prompt(data_point):
    begin_of_text = "<|begin_of_text|>"
    end_of_text = "<|end_of_text|>"
    start_header_id = "<|start_header_id|>"
    end_header_id = "<|end_header_id|>"
    eot_id = "<|eot_id|>"
    return {"text":
    f"""{begin_of_text}
        {start_header_id}system{end_header_id} Tradu urmatorul text din aromana in romana:
        {start_header_id}user{end_header_id} {data_point["rup"]}{eot_id}
        {start_header_id}assistant{end_header_id} {data_point["ro"]}{eot_id}
        {end_of_text}"""}

train_dataset = train_dataset.map(generate_prompt)
test_dataset = test_dataset.map(generate_prompt)

# dataset['text'][3]

  df_transformed = df.applymap(lambda x: ''.join([c for c in unicodedata.normalize('NFKD', x)  if unicodedata.category(c) != 'Mn']) if type(x) == str else x)
  df_transformed = df.applymap(lambda x: ''.join([c for c in unicodedata.normalize('NFKD', x)  if unicodedata.category(c) != 'Mn']) if type(x) == str else x)
Map: 100%|██████████| 27033/27033 [00:00<00:00, 28669.40 examples/s]
Map: 100%|██████████| 3004/3004 [00:00<00:00, 27573.27 examples/s]


In [6]:
def tokenize_function(examples):
    tokenized_inputs = {"input_ids": [], "attention_mask": [], "labels": []}

    for text in examples['text']:
        tokenized_input = tokenizer(text, truncation=True, padding='max_length', max_length=512)

        labels = tokenized_input["input_ids"].copy()
        
        assistant_idx = text.find("<|start_header_id|> assistant")
        assistant_token_idx = len(tokenizer(text[:assistant_idx])["input_ids"])

        labels[:assistant_token_idx] = [-100] * assistant_token_idx
        
        tokenized_inputs["input_ids"].append(tokenized_input["input_ids"])
        tokenized_inputs["attention_mask"].append(tokenized_input["attention_mask"])
        tokenized_inputs["labels"].append(labels)
    
    return tokenized_inputs

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 27033/27033 [00:08<00:00, 3177.02 examples/s]
Map: 100%|██████████| 3004/3004 [00:00<00:00, 3183.00 examples/s]


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,   
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 10/27033 [03:08<150:17:48, 20.02s/it]

{'loss': 1.1984, 'grad_norm': 1.93274971138635e-07, 'learning_rate': 1.9992601635038658e-05, 'epoch': 0.0}


  0%|          | 19/27033 [07:10<225:52:54, 30.10s/it]