In [1]:
from datasets import load_dataset
# Important note: datasets version 3.6.0 will work if you got an error like: runtimeerror: dataset scripts are no longer supported, but found amazon_reviews_multi.py

en_ds = load_dataset("abisee/cnn_dailymail", "2.0.0")

tr_ds = load_dataset("reciTAL/mlsum", "tu") # 'tu' is the code for Turkish

In [2]:
def show_samples(dataset, num_samples=3, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> text: {example['text']}'")
        print(f"'>> summary: {example['summary']}'")

In [3]:
tr_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'topic', 'url', 'title', 'date'],
        num_rows: 249277
    })
    validation: Dataset({
        features: ['text', 'summary', 'topic', 'url', 'title', 'date'],
        num_rows: 11565
    })
    test: Dataset({
        features: ['text', 'summary', 'topic', 'url', 'title', 'date'],
        num_rows: 12775
    })
})

In [4]:
tr_ds = tr_ds.select_columns(['text', 'summary'])
tr_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 249277
    })
    validation: Dataset({
        features: ['text', 'summary'],
        num_rows: 11565
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 12775
    })
})

In [5]:
en_ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [6]:
en_ds = en_ds.select_columns(["article","highlights"])
column_mapping = {
    "article": "text",
    "highlights": "summary"
}
en_ds = en_ds.rename_columns(column_mapping)
en_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['text', 'summary'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 11490
    })
})

In [7]:
en_ds = en_ds.filter(lambda x: len(x['summary'].split()) > 5)
tr_ds = tr_ds.filter(lambda x: len(x['summary'].split()) > 5)

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
import torch

In [None]:
max_input_length = 512
max_target_length = 30 #i should check this too, because probably most of the labels didin't stop i just cut them.

def filter_long_inputs(example):
    input_length = len(tokenizer(example["text"], truncation=False)["input_ids"])
    return input_length <= max_input_length

filtered_ds_en = en_ds.filter(filter_long_inputs)
filtered_ds_tr = tr_ds.filter(filter_long_inputs)
original_columns = filtered_ds_tr['train'].column_names

def preprocess_function(examples):
    
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_input_length,
        truncation=True, #this is be sure that there is no any row contain more then 512 tokens
    )
    labels = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

filtered_ds_en = filtered_ds_en.map(preprocess_function, batched=True, remove_columns=original_columns)
filtered_ds_tr = filtered_ds_tr.map(preprocess_function, batched=True,remove_columns=original_columns)

In [1]:
from datasets import load_from_disk

#filtered_ds_en.save_to_disk("tokenized_english_dataset")
#filtered_ds_tr.save_to_disk("tokenized_turkish_dataset")

filtered_ds_en = load_from_disk("tokenized_english_dataset")
filtered_ds_tr = load_from_disk("tokenized_turkish_dataset")

In [2]:
filtered_ds_tr

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 152689
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8133
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9134
    })
})

In [3]:
filtered_ds_en

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 33189
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1952
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1574
    })
})

In [4]:
filtered_ds_tr['train'] = filtered_ds_tr['train'].shuffle(seed=42).select(range(33189))
filtered_ds_tr['validation'] = filtered_ds_tr['validation'].shuffle(seed=42).select(range(1952))
filtered_ds_tr['test'] = filtered_ds_tr['test'].shuffle(seed=42).select(range(1574))

In [5]:
from datasets import concatenate_datasets, DatasetDict

total_ds = DatasetDict()
for split in filtered_ds_tr.keys():
    total_ds[split] = concatenate_datasets(
        [filtered_ds_tr[split], filtered_ds_en[split]]
    )
    total_ds[split] = total_ds[split].shuffle(seed = 42)

In [6]:
total_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 66378
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3904
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3148
    })
})

# Model

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

model_id = "google/mt5-small"

# QLoRA configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat
    bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for training to calculate weights but don't use fp16= True on args with this.
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model in 4-bit
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"  # This will automatically put the model on your RTX 3050
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,  # The "rank" of the new adapters (higher r = more params, 8 or 16 is a good start)
    lora_alpha=32,  # A scaling factor (often 2x r)
    # For mt5, we target the query ("q") and value ("v") layers in the attention blocks
    target_modules=["q", "v"], 
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"  # This is CRITICAL for T5/MT5 models
)
# This wraps your frozen, quantized model with the new, trainable LoRA adapters
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


trainable params: 688,128 || all params: 300,864,896 || trainable%: 0.2287


`Recall` measures what percentage of the words in the human-written reference summary the model can capture. It looks at how much the reference summary "covers."

`Precision` measures how accurately the model extracts some of the words from its own context (i.e., the reference summary is also included). It penalizes how much "nonsense" or unnecessary words the model uses.

In [8]:
import evaluate

rouge_score = evaluate.load("rouge")

In [9]:
reference_summary =  "The flight is delayed due to bad weather."
generated_summary = "The flight is canceled due to weather."

In [10]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': 0.7999999999999999,
 'rouge2': 0.4615384615384615,
 'rougeL': 0.7999999999999999,
 'rougeLsum': 0.7999999999999999}

* rouge1: Unigram Overlap ---> This score looks at the overlap of individual words (unigrams) between your generated summary and the reference summary.
* rouge2: Bigram Overlap ---> This score looks at the overlap of pairs of adjacent words (bigrams). It's a better measure of fluency and sentence structure than rouge1
* rougeL: Longest Common Subsequence ---> This score finds the Longest Common Subsequence (LCS) between the two summaries. An LCS is the longest sequence of words that appears in both summaries in the same order, but not necessarily right next to each other
* rougeLsum: Summary-Level LCS ---> This is the same as rougeL but applied at the summary level (it looks for the LCS across the entire text).

In [11]:
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [12]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/gokhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
import nltk
import evaluate
import numpy as np

rouge_metric = evaluate.load("rouge")
bertscore_metric = evaluate.load("bertscore")


def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    #predictions = np.argmax(predictions, axis=-1) we already use predict_with_generate 

    if torch.is_tensor(predictions):
        predictions = predictions.cpu().numpy()
    if torch.is_tensor(labels):
        labels = labels.cpu().numpy()


    predictions = np.nan_to_num(predictions, nan=tokenizer.pad_token_id, # on train mode it turning nan, inf values we avoid them with this lines.
                                 posinf=tokenizer.pad_token_id, 
                                 neginf=tokenizer.pad_token_id)
    
    labels = np.nan_to_num(labels, nan=tokenizer.pad_token_id) 

    predictions[predictions == -100] = tokenizer.pad_token_id
    labels[labels == -100] = tokenizer.pad_token_id

    predictions = predictions.astype(np.int64) #It guaranteed that we would get an array of type int (integer), which is exactly the format it expected, and it resolved the OverflowError error once and for all.
    labels = labels.astype(np.int64)
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) #i got error for this line.
    #the error line was: OverflowError: out of range integral type conversion attempted
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    
    decoded_preds_nltk = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels_nltk = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    rouge_result = rouge_metric.compute(predictions=decoded_preds_nltk,
                                      references=decoded_labels_nltk,
                                      use_stemmer=True)
    
    bert_result = bertscore_metric.compute(predictions=decoded_preds,
                                         references=decoded_labels,
                                         model_type="bert-base-multilingual-cased",
                                         device="cpu")

    
    result = {k: v for k, v in rouge_result.items()}

    result["bertscore_f1"] = np.mean(bert_result["f1"])
    
    # Add generation length to see if your model is too verbose/short
    gen_len = np.mean([np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions])
    result["gen_len"] = gen_len

    return {k: round(v, 4) for k, v in result.items()}

In [17]:
print("testing compute_metrics function...")

try:
    print("Caching BERTScore model (downloading if necessary)...")
    bertscore_metric.compute(predictions=["test"], references=["test"], 
                             model_type="bert-base-multilingual-cased", device="cpu")
    print("BERTScore modeli hazır.")
except Exception as e:
    print(f"BERTScore model download failed, metric will return 0. Error: {e}")



mock_labels_text = [
    "Türkiye Büyük Millet Meclisi Ankara'da bulunmaktadır.", # Turkish referans
    "The cat sat on the mat.", # english referans
    "Bu üçüncü bir referans cümlesidir."
]

mock_preds_text = [
    "TBMM Ankara'dadır.", # best preds
    "",
    "Bu üçüncü bir cümledir."
]

max_len = 30 

labels_tokenized = tokenizer(mock_labels_text, max_length=max_len, padding="max_length", truncation=True)
mock_labels_ids = np.array(labels_tokenized['input_ids'])
mock_labels_ids[mock_labels_ids == tokenizer.pad_token_id] = -100

preds_tokenized = tokenizer(mock_preds_text, max_length=max_len, padding="max_length", truncation=True)
mock_preds_ids = np.array(preds_tokenized['input_ids'], dtype=float) 

# OverflowError'u testing 
mock_preds_ids[1, 5] = np.inf 
mock_preds_ids[1, 6] = np.nan

print("\n--- Test Data (Token IDs) ---")
print("Mock Predictions (contains NaN/Inf):\n", mock_preds_ids)
print("\nMock Labels (contains -100):\n", mock_labels_ids)


mock_eval_preds = (mock_preds_ids, mock_labels_ids)

try:
    print("\n--- compute_metrics calling ---")
    metrics = compute_metrics(mock_eval_preds)
    print("\n--- Succesfull ---")
    print("calculated matrics:")
    print(metrics)

    # Sonuçları Doğrula
    assert "rouge1" in metrics
    assert "bertscore_f1" in metrics
    assert "gen_len" in metrics
    assert metrics["gen_len"] > 0 # gen_len 0 olmamalı
    print("\n all of the metrics are exist.")
    
except Exception as e:
    print("\n--- fail  ---")
    print(f"An error occurred while the function was running: {e}")
    import traceback
    traceback.print_exc()

testing compute_metrics function...
Caching BERTScore model (downloading if necessary)...
BERTScore modeli hazır.

--- Test Data (Token IDs) ---
Mock Predictions (contains NaN/Inf):
 [[4.17530e+04 1.14120e+04 2.17630e+04 2.77000e+02 1.53624e+05 2.60000e+02
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00         inf
          nan 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [1.15300e+03 2.59000e+02 6.73700e+03 5.03830e+04 6.98000e+02 3.17000e+02
 



In [16]:
from huggingface_hub import login,whoami

login(token="hf_XXXXXXXXXXXXXXXXXXXXXXXXXX")

try:
    user_info = whoami()
    print("User Name:", user_info['name'])
except Exception as e:
    print(" Failed to log in", e)

User Name: gokhanErgul


What was the Issue? The error was caused by the fp16=True setting in Seq2SeqTrainingArguments. This setting forced LoRA adapters and gradient calculations to be in the unstable float16 format, which resulted in a NaN/OverflowError.

In [19]:
from transformers import Seq2SeqTrainingArguments

# This is the batch size for one device (your GPU)
per_device_batch_size = 8 
# This is the number of steps to accumulate gradients for
# This gives you a larger *effective* batch size
gradient_accumulation_steps = 8
# This is your true, effective batch size
batch_size = per_device_batch_size * gradient_accumulation_steps

logging_steps = len(total_ds["train"]) // batch_size
logging_steps = 250
model_name = model_id.split("/")[-1]
args = Seq2SeqTrainingArguments(
    output_dir= f"{model_name}-finetuned-tr-en",
    
    eval_strategy="steps",
    eval_steps=250,
    save_strategy="steps",
    learning_rate=5.6e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8, # it was 3 before but i didn't like the performance.
    logging_steps=logging_steps,
    max_grad_norm=1.0, #this will ensure that there is no nan. The magnitude of the learning signal (gradient) can never exceed 1.0."
    
    per_device_train_batch_size=per_device_batch_size,

    gradient_accumulation_steps=gradient_accumulation_steps,
    
    per_device_eval_batch_size=16,
    
    #fp16=True, this caused to nan values at validation loss on traning 
    bf16=True,
    
    dataloader_num_workers=8,
    
    # torch_compile=True,  it doesn't work on me.
    
    predict_with_generate=True,
    push_to_hub=False, #i trained it withoud internet.
    report_to="none",
)


In [18]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
features = [total_ds["train"][i] for i in range(2)]
data_collator(features)

{'input_ids': tensor([[  1641,    715,  71213,    276,    261,  36592,  29209,   2186,   5061,
          45215,   5522,  26618,  56304, 105132,    261,    313,  84301,    270,
          88025,  57990,  60811,  46829,   1099,   3436,  26584,    314,    261,
            390,   7942,    529,  33564, 114716,    263,    620,  67027,   2222,
          66716,  87181, 127915, 196210,  15109,    648,    529,  87202,    259,
          76323,   2172,    261,  89721,    259,  39557,   6400,   2346,  90749,
         213212,    259,  90008,    293,   2667,  54006, 200473,    330,  40357,
           2941,  28686,    293,    407, 153062,    314,    260,    419,    259,
            318,  96264,  75060,    272,   6789, 105132,   3143,   8377,  22796,
          34059,    259,  76323,  18434,    261,  46386,  23468,    698,   1099,
           9437,  57938,    259,   1986,  69277,    529,  24111,  41798,   2222,
          15846,    266,  43600,    266,  43780, 224921,  40373,  42278, 105612,
          1145

In [22]:
train_subset_size = 1600  # 1600 / 8 = 200 train steps
val_test_subset_size = 320 # 320 / 16 = 20 eval steps

debug_ds = DatasetDict({
    "train": total_ds["train"].select(range(train_subset_size)),
    "validation": total_ds["validation"].select(range(val_test_subset_size)),
    "test": total_ds["test"].select(range(val_test_subset_size))
})

In [23]:
debug_args = Seq2SeqTrainingArguments(
    output_dir="./debug_run",          
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",               
    learning_rate=5.6e-5,            
    per_device_train_batch_size=8,   
    per_device_eval_batch_size=16,   
    gradient_accumulation_steps=8,   
    bf16=True,                       
    num_train_epochs=2,              
    
    logging_steps=10,                  
    
    predict_with_generate=True, 
    
    push_to_hub=False                  
)

Issue: When the model is loaded with QLoRA (4-bit quantization) and the fp16=True setting is used in Seq2SeqTrainingArguments, the Validation Loss appears as nan (Not a Number), and training crashes.

Cause: Training a model reduced to 4-bit with QLoRA using the fp16 (float16) format results in numerical instability. The dynamic range of fp16 is not sufficient to handle the very small or very large gradient values ​​generated during calculations with 4-bit weights.

This results in nan or inf (infinite) values ​​in the calculations, rendering the model loss mathematically undefined.

In [24]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = model,
    args= debug_args,
    train_dataset=debug_ds['train'],
    eval_dataset=debug_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    
)
trainer.train()

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Currently logged in as: [33mgokhannergull[0m ([33mgokhannergull-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bertscore F1,Gen Len
10,23.752,13.589259,0.0279,0.0057,0.0267,0.0273,0.5674,5.9125
20,23.4059,13.356882,0.028,0.0059,0.0267,0.0273,0.5685,5.9031
30,23.6669,13.126556,0.0274,0.0059,0.0262,0.0267,0.5681,6.0031
40,23.9037,12.920822,0.0281,0.006,0.0268,0.0274,0.5685,5.9719
50,22.6662,12.978966,0.0271,0.0051,0.0257,0.0262,0.5682,6.0031


TrainOutput(global_step=50, training_loss=23.478935546875, metrics={'train_runtime': 292.6429, 'train_samples_per_second': 10.935, 'train_steps_per_second': 0.171, 'total_flos': 1581293418872832.0, 'train_loss': 23.478935546875, 'epoch': 2.0})

In [22]:
import torch
from transformers import DataCollatorForSeq2Seq
print("Starting STRESS TEST with real 'validation' data...")

try:
    batch_size = 16
    eval_sample = total_ds["validation"].select(range(batch_size))
    
    collated_batch = data_collator(eval_sample)
    
    device = model.device
    input_ids = collated_batch["input_ids"].to(device)
    attention_mask = collated_batch["attention_mask"].to(device)
    labels = collated_batch["labels"].to(device)
    
    print(f"Successfully received {batch_size} rows of real data and moved it to the GPU.")

 
    print("Running model.generate() (This will occur if there is instability)...")
    
    with torch.no_grad(): 
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            min_length=10, 
            max_length=30
        )
    
    print("model.generate() done.")

    predictions_np = generated_ids.cpu().numpy()
    labels_np = labels.cpu().numpy()

    eval_preds = (predictions_np, labels_np)

    print("\n--- CALLING compute_metrics (With Actual Model Output)) ---")
    metrics = compute_metrics(eval_preds)
    
    print("\n--- STRESS TEST succesfull ---")
    print("calculated metrics:")
    print(metrics)
    
    print(f"If you got here without getting an 'OverflowError',")
    print(f"Your compute_metrics function is the model's output (including inf/nan)")
    print(f"100% robust against all unstable outputs.")
    print(f"It's normal for ROUGE scores to be very low (model untrained).")
    print(f"It's also normal for gen_len to be ~10.0 (we forced it with min_length=10).")

except Exception as e:
    print("\n--- STRESS TEST fail ---")
    print(f"An error occurred during testing: {e}")
    import traceback
    traceback.print_exc()

Starting STRESS TEST with real 'validation' data...
Successfully received 16 rows of real data and moved it to the GPU.
Running model.generate() (This will occur if there is instability)...
model.generate() done.

--- CALLING compute_metrics (With Actual Model Output)) ---

--- STRESS TEST succesfull ---
calculated metrics:
{'rouge1': 0.032, 'rouge2': 0.0089, 'rougeL': 0.0282, 'rougeLsum': 0.0323, 'bertscore_f1': 0.5826, 'gen_len': 11.5625}
If you got here without getting an 'OverflowError',
Your compute_metrics function is the model's output (including inf/nan)
100% robust against all unstable outputs.
It's normal for ROUGE scores to be very low (model untrained).
It's also normal for gen_len to be ~10.0 (we forced it with min_length=10).


In [23]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = model,
    args= args,
    train_dataset=total_ds['train'],
    eval_dataset=total_ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    
)
trainer.train(resume_from_checkpoint = "mt5-small-finetuned-tr-en/checkpoint-3000")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss


TrainOutput(global_step=3114, training_loss=0.1522744631415081, metrics={'train_runtime': 383.1088, 'train_samples_per_second': 519.784, 'train_steps_per_second': 8.128, 'total_flos': 9.89673909037056e+16, 'train_loss': 0.1522744631415081, 'epoch': 3.0})

In [24]:
evaluation_results = trainer.evaluate()
evaluation_results

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 3.309802293777466,
 'eval_rouge1': 0.2514,
 'eval_rouge2': 0.1323,
 'eval_rougeL': 0.2171,
 'eval_rougeLsum': 0.2322,
 'eval_bertscore_f1': 0.6773,
 'eval_gen_len': 19.8591,
 'eval_runtime': 387.8001,
 'eval_samples_per_second': 10.067,
 'eval_steps_per_second': 0.629,
 'epoch': 3.0}

In [18]:
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# i am gonna run the model withoud internet.
evaluate.load("rouge")
bertscore_metric = evaluate.load("bertscore")

try:
    bertscore_metric.compute(
        predictions=["hello"], 
        references=["world"], 
        model_type="bert-base-multilingual-cased"
    )
except Exception as e:
    pass

In [19]:
import os
os.environ["HF_HUB_OFFLINE"] = "1"

os.environ["WANDB_DISABLED"] = "true"

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [22]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = model,
    args= args,
    train_dataset=total_ds['train'],
    eval_dataset=total_ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    
)
trainer.train(resume_from_checkpoint = "mt5-small-finetuned-tr-en/checkpoint-3000")

  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bertscore F1,Gen Len
3250,4.1478,3.298348,0.2537,0.135,0.219,0.2345,0.6779,19.822
3500,4.1147,3.278165,0.2478,0.1305,0.2136,0.2285,0.6763,19.814
3750,4.0672,3.263816,0.254,0.1349,0.2195,0.2351,0.6782,19.8317
4000,4.0476,3.251562,0.2561,0.1374,0.2226,0.2372,0.6785,19.8253
4250,4.0266,3.252216,0.2562,0.1382,0.2228,0.2372,0.6786,19.8064
4500,4.0228,3.225316,0.2571,0.1386,0.2237,0.2382,0.6789,19.7631
4750,4.0066,3.218869,0.2525,0.1345,0.2197,0.2338,0.6779,19.739
5000,4.0033,3.210943,0.2532,0.1348,0.2206,0.2345,0.6779,19.7754
5250,3.9924,3.195982,0.254,0.1363,0.2216,0.2352,0.6779,19.7597
5500,3.9818,3.208672,0.2562,0.1379,0.2237,0.2371,0.6788,19.7392


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=8304, training_loss=2.546525927637354, metrics={'train_runtime': 23927.9026, 'train_samples_per_second': 22.193, 'train_steps_per_second': 0.347, 'total_flos': 2.6389096594855526e+17, 'train_loss': 2.546525927637354, 'epoch': 8.0})

In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import PeftModel

model_id = "google/mt5-small"
adapter_path = "mt5-small-finetuned-tr-en/checkpoint-8304" 

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, adapter_path)

model = model.merge_and_unload() 

model.eval()



MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear4bit(in_features=512, out_features=384, bias=False)
              (k): Linear4bit(in_features=512, out_features=384, bias=False)
              (v): Linear4bit(in_features=512, out_features=384, bias=False)
              (o): Linear4bit(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear4bit(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear4bit(in_features=512, out_features=1024

In [24]:
model.push_to_hub('gokhanErgul/mt5-small-finetuned-tr-en')

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/gokhanErgul/mt5-small-finetuned-tr-en/commit/d7315a8379f279fcef0e1af206fe6a0733b7df78', commit_message='Upload MT5ForConditionalGeneration', commit_description='', oid='d7315a8379f279fcef0e1af206fe6a0733b7df78', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gokhanErgul/mt5-small-finetuned-tr-en', endpoint='https://huggingface.co', repo_type='model', repo_id='gokhanErgul/mt5-small-finetuned-tr-en'), pr_revision=None, pr_num=None)

In [25]:
tokenizer.push_to_hub('gokhanErgul/mt5-small-finetuned-tr-en')

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/gokhanErgul/mt5-small-finetuned-tr-en/commit/56589b7288945821740ba6d2ea842f8b993b364f', commit_message='Upload tokenizer', commit_description='', oid='56589b7288945821740ba6d2ea842f8b993b364f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gokhanErgul/mt5-small-finetuned-tr-en', endpoint='https://huggingface.co', repo_type='model', repo_id='gokhanErgul/mt5-small-finetuned-tr-en'), pr_revision=None, pr_num=None)

In [10]:
from transformers import pipeline
pipe = pipeline('summarization', model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [11]:
text_tr = """
Türkiye, resmî adıyla Türkiye Cumhuriyeti, topraklarının büyük bölümü Batı Asya'da Anadolu'da, diğer bir bölümü ise Güneydoğu Avrupa'nın uzantısı Doğu Trakya'da olan kıtalararası bir ülkedir. Batıda Bulgaristan ve Yunanistan, doğuda Gürcistan, Ermenistan, İran ve Azerbaycan, güneyde ise Irak ve Suriye ile sınır komşusudur. Güneyini Kıbrıs ve Akdeniz, batısını Ege Denizi, kuzeyini ise Karadeniz çevreler. Marmara Denizi ise İstanbul Boğazı ve Çanakkale Boğazı ile birlikte Anadolu'yu Trakya'dan, yani Asya'yı Avrupa'dan ayırır. Resmî olarak laik bir devlet olan Türkiye'de nüfusun çoğunluğu Müslümandır. Ankara, Türkiye'nin başkenti ve ikinci en kalabalık şehri; İstanbul ise, Türkiye'nin en kalabalık şehri, ekonomik merkezi ve aynı zamanda Avrupa'nın en kalabalık şehridir.Türkiye toprakları üzerinde bulunan ilk yerleşmeler Yontma Taş Devri'nde başlar. Doğu Trakya'da Traklar olmak üzere, Hititler, Frigler, Lidyalılar ve Dor istilası sonucu Yunanistan'dan kaçan Akalar tarafından kurulan İyon medeniyeti gibi çeşitli eski Anadolu medeniyetlerinin ardından, Makedonya kralı Büyük İskender'in egemenliğiyle ve fetihleriyle birlikte Helenistik Dönem başladı. Daha sonra, sırasıyla Roma İmparatorluğu ve Anadolu'nun Hristiyanlaştığı Bizans dönemleri yaşandı. Selçuklu Türklerinin 1071 yılında Bizans'a karşı kazandığı Malazgirt Meydan Muharebesi ile Anadolu'daki Bizans üstünlüğü büyük ölçüde kırılarak Anadolu, kısa süre içerisinde Selçuklulara bağlı Türk beyleri tarafından ele geçirildi ve Anadolu toprakları üzerinde İslamlaşma ve Türkleşme faaliyetleri başladı.
"""

text_en = """
Fine-tuning in machine learning is the process of adapting a pre-trained model for specific tasks or use cases. It has become a fundamental deep learning technique, particularly in the training process of foundation models used for generative AI.
Fine-tuning could be considered a subset of the broader technique of transfer learning: the practice of leveraging knowledge an existing model has already learned as the starting point for learning new tasks.
The intuition behind fine-tuning is that, essentially, it’s easier and cheaper to hone the capabilities of a pre-trained base model that has already acquired broad learnings relevant to the task at hand than it is to train a new model from scratch for that specific purpose. This is especially true for deep learning models with millions or even billions of parameters, like the large language models (LLMs) that have risen to prominence in the field of natural language processing (NLP) or the complex convolutional neural networks (CNNs) and vision transformers (ViTs) used for computer vision tasks like image classification, object detection or image segmentation.By leveraging prior model training through transfer learning, fine-tuning can reduce the amount of expensive computing power and labeled data needed to obtain large models tailored to niche use cases and business needs. For example, fine-tuning can be used to simply adjust the conversational tone of a pre-trained LLM or the illustration style of a pre-trained image generation model; it could also be used to supplement learnings from a model’s original training dataset with proprietary data or specialized, domain-specific knowledge.Fine-tuning thus plays an important role in the real-world application of machine learning models, helping democratize access to and customization of sophisticated models.
Conversely, fine-tuning entails techniques to further train a model whose weights have already been updated through prior training. Using the base model’s previous knowledge as a starting point, fine-tuning tailors the model by training it on a smaller, task-specific dataset.While that task-specific dataset could theoretically have been used for the initial training, training a large model from scratch on a small dataset risks overfitting: the model might learn to perform well on the training examples, but generalize poorly to new data. This would render the model ill-suited to its given task and defeat the purpose of model training.Fine-tuning thus provides the best of both worlds: leveraging the broad knowledge and stability gained from pre-training on a massive set of data and honing the model’s understanding of more detailed, specific concepts. 
"""

Your model involves generating thousands of "half-lived," "truncated" sentences over 8 epochs.

Your model has "summary production" knowledge, but it has NEVER learned to "finish the summary and stop" (</s>).

He produces a few meaningful words (which is why your BERTScore is high and your ROUGE is low).

Then he panics because he doesn't know how to "stop."

Your Broken Fine-Tuning Task: You tried to teach this model to "summarize" for 8 epochs. BUT, because 99% of the labels were clipped without the </s> (stop) token, this summarization training failed and was "broken."

In [27]:
tr_output = pipe(
    [text_tr],
    max_new_tokens=30,  
    
    min_length=10,
    num_beams=4,
    truncation=True,
    
    no_repeat_ngram_size=3,  
    early_stopping=True      
)

print(tr_output[0]['summary_text'])

<extra_id_0>, Türkiye, resmî adıyla Türkiye Cumhuriyeti, Türkiye'nin en kalabalık şehri, ekonomik merkezidir. Türkiye, Türkiye


In [26]:
en_output = pipe(
    [text_en],
    max_new_tokens = 30,
    truncation = True,
    no_repeat_ngram_size = 3,
    min_length = 10,
    num_beams = 4
)


en_output[0]['summary_text']

'<extra_id_0> could be considered a subset of the broader technique of transfer learning. Fine-tuning is the process of adapting a'

# This is the error i got first but i fixed it.
## I wanted to keep the error.

In [20]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = model,
    args= args,
    train_dataset=total_ds['train'],
    eval_dataset=total_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    
)
trainer.train()

  trainer = Seq2SeqTrainer(
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


OverflowError: out of range integral type conversion attempted

In [22]:
!nvidia-smi

Mon Oct 20 16:57:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.65                 Driver Version: 577.03         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   43C    P8              6W /   70W |    2393MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                