In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
import nltk


  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [2]:

nltk.download('punkt')

# Load dataset
dset = load_dataset("SEACrowd/liputan6", trust_remote_code=True)

# Use subsets of the dataset
train_data = dset["train"].select(range(10000))
val_data = dset["validation"].select(range(2000))
test_data = dset["test"].select(range(2000))

# Load tokenizer and model
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:

# Preprocessing function
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize dataset
tokenized_train = train_data.map(preprocess_function, batched=True, remove_columns=train_data.column_names)
tokenized_val = val_data.map(preprocess_function, batched=True, remove_columns=val_data.column_names)
tokenized_test = test_data.map(preprocess_function, batched=True, remove_columns=test_data.column_names)


Map: 100%|██████████| 10000/10000 [00:12<00:00, 798.64 examples/s]
Map: 100%|██████████| 2000/2000 [00:02<00:00, 762.67 examples/s]
Map: 100%|██████████| 2000/2000 [00:02<00:00, 789.70 examples/s]


In [6]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [7]:

# Fine-tune the model
trainer.train()


                                                   
  0%|          | 6/7500 [02:29<17:32:26,  8.43s/it]

{'loss': 41.2839, 'grad_norm': 1699.947021484375, 'learning_rate': 1.9920000000000002e-05, 'epoch': 0.0}


                                                   
  0%|          | 6/7500 [03:48<17:32:26,  8.43s/it]

{'loss': 40.8443, 'grad_norm': 2572.601806640625, 'learning_rate': 1.9840000000000003e-05, 'epoch': 0.01}


                                                   
  0%|          | 6/7500 [05:06<17:32:26,  8.43s/it]

{'loss': 37.6702, 'grad_norm': 1204.8099365234375, 'learning_rate': 1.976e-05, 'epoch': 0.01}


                                                   
  0%|          | 6/7500 [06:26<17:32:26,  8.43s/it]

{'loss': 37.4363, 'grad_norm': 3518.0849609375, 'learning_rate': 1.968e-05, 'epoch': 0.02}


                                                   
  0%|          | 6/7500 [07:45<17:32:26,  8.43s/it]

{'loss': 35.4291, 'grad_norm': 2676.642578125, 'learning_rate': 1.9600000000000002e-05, 'epoch': 0.02}


                                                   
  0%|          | 6/7500 [09:05<17:32:26,  8.43s/it]

{'loss': 37.0985, 'grad_norm': 1759.005126953125, 'learning_rate': 1.9520000000000003e-05, 'epoch': 0.02}


                                                   
  0%|          | 6/7500 [10:23<17:32:26,  8.43s/it]

{'loss': 34.4522, 'grad_norm': 9726.7744140625, 'learning_rate': 1.944e-05, 'epoch': 0.03}


                                                   
  0%|          | 6/7500 [11:46<17:32:26,  8.43s/it]

{'loss': 34.1453, 'grad_norm': 1339.78369140625, 'learning_rate': 1.936e-05, 'epoch': 0.03}


                                                   
  0%|          | 6/7500 [13:05<17:32:26,  8.43s/it]

{'loss': 33.1727, 'grad_norm': 2060.73486328125, 'learning_rate': 1.9280000000000002e-05, 'epoch': 0.04}


                                                   
  0%|          | 6/7500 [14:22<17:32:26,  8.43s/it] 

{'loss': 32.162, 'grad_norm': 2462.172119140625, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.04}


                                                   
  0%|          | 6/7500 [15:40<17:32:26,  8.43s/it] 

{'loss': 31.9432, 'grad_norm': 1852.0137939453125, 'learning_rate': 1.912e-05, 'epoch': 0.04}


                                                   
  0%|          | 6/7500 [16:58<17:32:26,  8.43s/it] 

{'loss': 31.7824, 'grad_norm': 12942.162109375, 'learning_rate': 1.904e-05, 'epoch': 0.05}


                                                   
  0%|          | 6/7500 [18:15<17:32:26,  8.43s/it] 

{'loss': 31.4428, 'grad_norm': 998.2999267578125, 'learning_rate': 1.896e-05, 'epoch': 0.05}


                                                   
  0%|          | 6/7500 [19:34<17:32:26,  8.43s/it] 

{'loss': 30.7354, 'grad_norm': 4799.044921875, 'learning_rate': 1.8880000000000002e-05, 'epoch': 0.06}


                                                   
  0%|          | 6/7500 [20:53<17:32:26,  8.43s/it] 

{'loss': 30.7985, 'grad_norm': 1245.0245361328125, 'learning_rate': 1.88e-05, 'epoch': 0.06}


                                                   
  0%|          | 6/7500 [22:12<17:32:26,  8.43s/it] 

{'loss': 29.6316, 'grad_norm': 4244.875, 'learning_rate': 1.8720000000000004e-05, 'epoch': 0.06}


                                                   
  0%|          | 6/7500 [23:29<17:32:26,  8.43s/it] 

{'loss': 29.7006, 'grad_norm': 5738.58447265625, 'learning_rate': 1.864e-05, 'epoch': 0.07}


                                                   
  0%|          | 6/7500 [24:47<17:32:26,  8.43s/it] 

{'loss': 28.4562, 'grad_norm': 5632.013671875, 'learning_rate': 1.8560000000000002e-05, 'epoch': 0.07}


                                                   
  0%|          | 6/7500 [26:05<17:32:26,  8.43s/it] 

{'loss': 29.2505, 'grad_norm': 997.3229370117188, 'learning_rate': 1.8480000000000003e-05, 'epoch': 0.08}


                                                   
  0%|          | 6/7500 [27:23<17:32:26,  8.43s/it] 

{'loss': 27.1263, 'grad_norm': 2253.961181640625, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.08}


                                                   
  0%|          | 6/7500 [28:40<17:32:26,  8.43s/it] 

{'loss': 25.5574, 'grad_norm': 770.4049072265625, 'learning_rate': 1.832e-05, 'epoch': 0.08}


                                                   
  0%|          | 6/7500 [29:57<17:32:26,  8.43s/it] 

{'loss': 26.1183, 'grad_norm': 3455.976318359375, 'learning_rate': 1.824e-05, 'epoch': 0.09}


                                                   
  0%|          | 6/7500 [31:15<17:32:26,  8.43s/it] 

{'loss': 25.4525, 'grad_norm': 1818.9271240234375, 'learning_rate': 1.8160000000000002e-05, 'epoch': 0.09}


                                                   
  0%|          | 6/7500 [32:33<17:32:26,  8.43s/it] 

{'loss': 25.3642, 'grad_norm': 3101.0126953125, 'learning_rate': 1.8080000000000003e-05, 'epoch': 0.1}


                                                   
  0%|          | 6/7500 [33:50<17:32:26,  8.43s/it] 

{'loss': 24.8705, 'grad_norm': 1299.878173828125, 'learning_rate': 1.8e-05, 'epoch': 0.1}


                                                   
  0%|          | 6/7500 [35:09<17:32:26,  8.43s/it] 

{'loss': 25.1122, 'grad_norm': 4287.37841796875, 'learning_rate': 1.792e-05, 'epoch': 0.1}


                                                   
  0%|          | 6/7500 [36:28<17:32:26,  8.43s/it] 

{'loss': 24.8595, 'grad_norm': 1730.5113525390625, 'learning_rate': 1.7840000000000002e-05, 'epoch': 0.11}


                                                   
  0%|          | 6/7500 [37:49<17:32:26,  8.43s/it] 

{'loss': 24.6616, 'grad_norm': 2426.045654296875, 'learning_rate': 1.7760000000000003e-05, 'epoch': 0.11}


                                                   
  0%|          | 6/7500 [39:08<17:32:26,  8.43s/it] 

{'loss': 23.0286, 'grad_norm': 1551.8365478515625, 'learning_rate': 1.768e-05, 'epoch': 0.12}


                                                   
  0%|          | 6/7500 [40:26<17:32:26,  8.43s/it] 

{'loss': 22.2743, 'grad_norm': 719.4118041992188, 'learning_rate': 1.76e-05, 'epoch': 0.12}


                                                   
  0%|          | 6/7500 [41:44<17:32:26,  8.43s/it] 

{'loss': 23.3362, 'grad_norm': 1228.757568359375, 'learning_rate': 1.752e-05, 'epoch': 0.12}


                                                   
  0%|          | 6/7500 [43:02<17:32:26,  8.43s/it] 

{'loss': 23.4166, 'grad_norm': 2378.04052734375, 'learning_rate': 1.7440000000000002e-05, 'epoch': 0.13}


                                                   
  0%|          | 6/7500 [44:20<17:32:26,  8.43s/it] 

{'loss': 22.6987, 'grad_norm': 2447.351806640625, 'learning_rate': 1.736e-05, 'epoch': 0.13}


                                                   
  0%|          | 6/7500 [45:38<17:32:26,  8.43s/it] 

{'loss': 22.3035, 'grad_norm': 4241.8681640625, 'learning_rate': 1.728e-05, 'epoch': 0.14}


                                                   
  0%|          | 6/7500 [46:56<17:32:26,  8.43s/it] 

{'loss': 22.3144, 'grad_norm': 1034.4288330078125, 'learning_rate': 1.72e-05, 'epoch': 0.14}


                                                   
  0%|          | 6/7500 [48:14<17:32:26,  8.43s/it] 

{'loss': 21.278, 'grad_norm': 1754.3931884765625, 'learning_rate': 1.7120000000000002e-05, 'epoch': 0.14}


                                                   
  0%|          | 6/7500 [49:32<17:32:26,  8.43s/it] 

{'loss': 20.2212, 'grad_norm': 7689.17041015625, 'learning_rate': 1.704e-05, 'epoch': 0.15}


                                                   
  0%|          | 6/7500 [50:51<17:32:26,  8.43s/it] 

{'loss': 20.9753, 'grad_norm': 592.1343994140625, 'learning_rate': 1.696e-05, 'epoch': 0.15}


                                                   
  0%|          | 6/7500 [52:10<17:32:26,  8.43s/it] 

{'loss': 19.8277, 'grad_norm': 1003.9406127929688, 'learning_rate': 1.688e-05, 'epoch': 0.16}


                                                   
  0%|          | 6/7500 [53:29<17:32:26,  8.43s/it] 

{'loss': 20.4984, 'grad_norm': 6641.44970703125, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.16}


                                                   
  0%|          | 6/7500 [54:48<17:32:26,  8.43s/it] 

{'loss': 21.0572, 'grad_norm': 1068.7742919921875, 'learning_rate': 1.672e-05, 'epoch': 0.16}


                                                   
  0%|          | 6/7500 [56:06<17:32:26,  8.43s/it] 

{'loss': 21.4193, 'grad_norm': 6534.4990234375, 'learning_rate': 1.664e-05, 'epoch': 0.17}


                                                   
  0%|          | 6/7500 [57:25<17:32:26,  8.43s/it] 

{'loss': 19.6513, 'grad_norm': 1836.2559814453125, 'learning_rate': 1.656e-05, 'epoch': 0.17}


                                                   
  0%|          | 6/7500 [58:43<17:32:26,  8.43s/it] 

{'loss': 19.9622, 'grad_norm': 509.830810546875, 'learning_rate': 1.648e-05, 'epoch': 0.18}


                                                   
  0%|          | 6/7500 [1:00:02<17:32:26,  8.43s/it]

{'loss': 19.1081, 'grad_norm': 4113.916015625, 'learning_rate': 1.64e-05, 'epoch': 0.18}


                                                     
  0%|          | 6/7500 [1:01:20<17:32:26,  8.43s/it] 

{'loss': 18.7832, 'grad_norm': 4734.01904296875, 'learning_rate': 1.632e-05, 'epoch': 0.18}


                                                     
  0%|          | 6/7500 [1:02:38<17:32:26,  8.43s/it] 

{'loss': 18.7586, 'grad_norm': 3434.83251953125, 'learning_rate': 1.6240000000000004e-05, 'epoch': 0.19}


                                                     
  0%|          | 6/7500 [1:03:56<17:32:26,  8.43s/it] 

{'loss': 18.5635, 'grad_norm': 1468.2576904296875, 'learning_rate': 1.616e-05, 'epoch': 0.19}


                                                     
  0%|          | 6/7500 [1:05:13<17:32:26,  8.43s/it] 

{'loss': 18.6405, 'grad_norm': 604.7415771484375, 'learning_rate': 1.6080000000000002e-05, 'epoch': 0.2}


                                                     
  0%|          | 6/7500 [1:06:31<17:32:26,  8.43s/it] 

{'loss': 17.9248, 'grad_norm': 1388.180419921875, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.2}


                                                     
  0%|          | 6/7500 [1:07:49<17:32:26,  8.43s/it] 

{'loss': 17.9138, 'grad_norm': 1443.2022705078125, 'learning_rate': 1.5920000000000003e-05, 'epoch': 0.2}


                                                     
  0%|          | 6/7500 [1:09:06<17:32:26,  8.43s/it] 

{'loss': 17.0875, 'grad_norm': 1105.114013671875, 'learning_rate': 1.584e-05, 'epoch': 0.21}


                                                     
  0%|          | 6/7500 [1:10:22<17:32:26,  8.43s/it] 

{'loss': 17.7066, 'grad_norm': 441.9109191894531, 'learning_rate': 1.576e-05, 'epoch': 0.21}


                                                     
  0%|          | 6/7500 [1:11:40<17:32:26,  8.43s/it] 

{'loss': 17.2254, 'grad_norm': 1288.018310546875, 'learning_rate': 1.5680000000000002e-05, 'epoch': 0.22}


                                                     
  0%|          | 6/7500 [1:12:57<17:32:26,  8.43s/it] 

{'loss': 17.0542, 'grad_norm': 3257.052001953125, 'learning_rate': 1.5600000000000003e-05, 'epoch': 0.22}


                                                     
  0%|          | 6/7500 [1:14:15<17:32:26,  8.43s/it] 

{'loss': 16.6557, 'grad_norm': 674.9089965820312, 'learning_rate': 1.552e-05, 'epoch': 0.22}


                                                     
  0%|          | 6/7500 [1:15:34<17:32:26,  8.43s/it] 

{'loss': 16.184, 'grad_norm': 1345.370361328125, 'learning_rate': 1.544e-05, 'epoch': 0.23}


                                                     
  0%|          | 6/7500 [1:16:54<17:32:26,  8.43s/it] 

{'loss': 16.4652, 'grad_norm': 2048.18505859375, 'learning_rate': 1.5360000000000002e-05, 'epoch': 0.23}


                                                     
  0%|          | 6/7500 [1:18:13<17:32:26,  8.43s/it] 

{'loss': 16.9308, 'grad_norm': 3614.949462890625, 'learning_rate': 1.5280000000000003e-05, 'epoch': 0.24}


                                                     
  0%|          | 6/7500 [1:19:30<17:32:26,  8.43s/it] 

{'loss': 15.8135, 'grad_norm': 1022.4423828125, 'learning_rate': 1.5200000000000002e-05, 'epoch': 0.24}


                                                     
  0%|          | 6/7500 [1:20:48<17:32:26,  8.43s/it] 

{'loss': 15.8214, 'grad_norm': 1351.6552734375, 'learning_rate': 1.5120000000000001e-05, 'epoch': 0.24}


                                                     
  0%|          | 6/7500 [1:22:06<17:32:26,  8.43s/it] 

{'loss': 15.3448, 'grad_norm': 951.930908203125, 'learning_rate': 1.5040000000000002e-05, 'epoch': 0.25}


                                                     
  0%|          | 6/7500 [1:23:25<17:32:26,  8.43s/it] 

{'loss': 15.0997, 'grad_norm': 1079.362548828125, 'learning_rate': 1.496e-05, 'epoch': 0.25}


                                                     
  0%|          | 6/7500 [1:24:43<17:32:26,  8.43s/it] 

{'loss': 15.4738, 'grad_norm': 466.77093505859375, 'learning_rate': 1.4880000000000002e-05, 'epoch': 0.26}


                                                     
  0%|          | 6/7500 [1:26:01<17:32:26,  8.43s/it] 

{'loss': 15.3558, 'grad_norm': 6836.259765625, 'learning_rate': 1.48e-05, 'epoch': 0.26}


                                                     
  0%|          | 6/7500 [1:27:18<17:32:26,  8.43s/it] 

{'loss': 14.3511, 'grad_norm': 921.91259765625, 'learning_rate': 1.4720000000000001e-05, 'epoch': 0.26}


                                                     
  0%|          | 6/7500 [1:28:36<17:32:26,  8.43s/it] 

{'loss': 14.1813, 'grad_norm': 427.8635559082031, 'learning_rate': 1.464e-05, 'epoch': 0.27}


                                                     
  0%|          | 6/7500 [1:29:54<17:32:26,  8.43s/it] 

{'loss': 14.7385, 'grad_norm': 1107.185791015625, 'learning_rate': 1.4560000000000001e-05, 'epoch': 0.27}


                                                     
  0%|          | 6/7500 [1:31:12<17:32:26,  8.43s/it] 

{'loss': 14.3286, 'grad_norm': 2264.27392578125, 'learning_rate': 1.448e-05, 'epoch': 0.28}


                                                     
  0%|          | 6/7500 [1:32:30<17:32:26,  8.43s/it] 

{'loss': 14.6418, 'grad_norm': 495.48870849609375, 'learning_rate': 1.4400000000000001e-05, 'epoch': 0.28}


                                                     
  0%|          | 6/7500 [1:33:48<17:32:26,  8.43s/it] 

{'loss': 13.709, 'grad_norm': 575.5645751953125, 'learning_rate': 1.432e-05, 'epoch': 0.28}


                                                     
  0%|          | 6/7500 [1:35:06<17:32:26,  8.43s/it] 

{'loss': 14.4812, 'grad_norm': 1339.8121337890625, 'learning_rate': 1.4240000000000001e-05, 'epoch': 0.29}


                                                     
  0%|          | 6/7500 [1:36:24<17:32:26,  8.43s/it] 

{'loss': 13.7096, 'grad_norm': 1653.6785888671875, 'learning_rate': 1.416e-05, 'epoch': 0.29}


                                                     
  0%|          | 6/7500 [1:37:41<17:32:26,  8.43s/it] 

{'loss': 13.3608, 'grad_norm': 1513.1954345703125, 'learning_rate': 1.408e-05, 'epoch': 0.3}


                                                     
  0%|          | 6/7500 [1:39:00<17:32:26,  8.43s/it] 

{'loss': 13.0755, 'grad_norm': 999.4191284179688, 'learning_rate': 1.4e-05, 'epoch': 0.3}


                                                     
  0%|          | 6/7500 [1:40:17<17:32:26,  8.43s/it] 

{'loss': 13.7237, 'grad_norm': 446.1192321777344, 'learning_rate': 1.392e-05, 'epoch': 0.3}


                                                     
  0%|          | 6/7500 [1:41:34<17:32:26,  8.43s/it] 

{'loss': 12.7479, 'grad_norm': 368.8188781738281, 'learning_rate': 1.384e-05, 'epoch': 0.31}


                                                     
  0%|          | 6/7500 [1:42:51<17:32:26,  8.43s/it] 

{'loss': 12.3271, 'grad_norm': 423.5849609375, 'learning_rate': 1.376e-05, 'epoch': 0.31}


                                                     
  0%|          | 6/7500 [1:44:07<17:32:26,  8.43s/it] 

{'loss': 12.397, 'grad_norm': 331.99822998046875, 'learning_rate': 1.3680000000000003e-05, 'epoch': 0.32}


                                                     
  0%|          | 6/7500 [1:45:25<17:32:26,  8.43s/it] 

{'loss': 12.394, 'grad_norm': 1179.2147216796875, 'learning_rate': 1.3600000000000002e-05, 'epoch': 0.32}


                                                     
  0%|          | 6/7500 [1:46:42<17:32:26,  8.43s/it] 

{'loss': 12.1939, 'grad_norm': 1178.8292236328125, 'learning_rate': 1.3520000000000003e-05, 'epoch': 0.32}


                                                     
  0%|          | 6/7500 [1:47:59<17:32:26,  8.43s/it] 

{'loss': 12.3399, 'grad_norm': 779.6805419921875, 'learning_rate': 1.3440000000000002e-05, 'epoch': 0.33}


                                                     
  0%|          | 6/7500 [1:49:16<17:32:26,  8.43s/it] 

{'loss': 12.3347, 'grad_norm': 269.21148681640625, 'learning_rate': 1.3360000000000003e-05, 'epoch': 0.33}


                                                     
  0%|          | 6/7500 [1:50:33<17:32:26,  8.43s/it] 

{'loss': 11.9208, 'grad_norm': 645.361328125, 'learning_rate': 1.3280000000000002e-05, 'epoch': 0.34}


                                                     
  0%|          | 6/7500 [1:51:52<17:32:26,  8.43s/it] 

{'loss': 11.7556, 'grad_norm': 338.7598876953125, 'learning_rate': 1.3200000000000002e-05, 'epoch': 0.34}


                                                     
  0%|          | 6/7500 [1:53:10<17:32:26,  8.43s/it] 

{'loss': 11.6089, 'grad_norm': 700.8136596679688, 'learning_rate': 1.3120000000000001e-05, 'epoch': 0.34}


                                                     
  0%|          | 6/7500 [1:54:27<17:32:26,  8.43s/it] 

{'loss': 11.5793, 'grad_norm': 1437.9871826171875, 'learning_rate': 1.3040000000000002e-05, 'epoch': 0.35}


                                                     
  0%|          | 6/7500 [1:55:45<17:32:26,  8.43s/it] 

{'loss': 11.4873, 'grad_norm': 262.7764892578125, 'learning_rate': 1.2960000000000001e-05, 'epoch': 0.35}


                                                     
  0%|          | 6/7500 [1:57:04<17:32:26,  8.43s/it] 

{'loss': 11.0406, 'grad_norm': 655.39013671875, 'learning_rate': 1.2880000000000002e-05, 'epoch': 0.36}


                                                     
  0%|          | 6/7500 [1:58:23<17:32:26,  8.43s/it] 

{'loss': 10.8982, 'grad_norm': 584.7565307617188, 'learning_rate': 1.2800000000000001e-05, 'epoch': 0.36}


                                                     
  0%|          | 6/7500 [1:59:40<17:32:26,  8.43s/it] 

{'loss': 10.9171, 'grad_norm': 1198.3580322265625, 'learning_rate': 1.2720000000000002e-05, 'epoch': 0.36}


                                                     
  0%|          | 6/7500 [2:00:57<17:32:26,  8.43s/it] 

{'loss': 11.2355, 'grad_norm': 1224.8868408203125, 'learning_rate': 1.2640000000000001e-05, 'epoch': 0.37}


                                                     
  0%|          | 6/7500 [2:02:14<17:32:26,  8.43s/it] 

{'loss': 10.9011, 'grad_norm': 5371.4970703125, 'learning_rate': 1.2560000000000002e-05, 'epoch': 0.37}


                                                     
  0%|          | 6/7500 [2:03:32<17:32:26,  8.43s/it] 

{'loss': 10.5679, 'grad_norm': 283.0589599609375, 'learning_rate': 1.248e-05, 'epoch': 0.38}


                                                     
  0%|          | 6/7500 [2:04:49<17:32:26,  8.43s/it] 

{'loss': 10.4441, 'grad_norm': 409.1608581542969, 'learning_rate': 1.2400000000000002e-05, 'epoch': 0.38}


                                                     
  0%|          | 6/7500 [2:06:07<17:32:26,  8.43s/it] 

{'loss': 10.4043, 'grad_norm': 1007.6096801757812, 'learning_rate': 1.232e-05, 'epoch': 0.38}


                                                     
  0%|          | 6/7500 [2:07:25<17:32:26,  8.43s/it] 

{'loss': 10.3868, 'grad_norm': 624.1979370117188, 'learning_rate': 1.2240000000000001e-05, 'epoch': 0.39}


                                                     
  0%|          | 6/7500 [2:08:42<17:32:26,  8.43s/it] 

{'loss': 10.3934, 'grad_norm': 247.43118286132812, 'learning_rate': 1.216e-05, 'epoch': 0.39}


                                                     
  0%|          | 6/7500 [2:10:00<17:32:26,  8.43s/it] 

{'loss': 10.4353, 'grad_norm': 1934.412353515625, 'learning_rate': 1.2080000000000001e-05, 'epoch': 0.4}


                                                     
  0%|          | 6/7500 [2:11:17<17:32:26,  8.43s/it]  

{'loss': 10.0785, 'grad_norm': 524.4196166992188, 'learning_rate': 1.2e-05, 'epoch': 0.4}


                                                     
  0%|          | 6/7500 [2:12:34<17:32:26,  8.43s/it]  

{'loss': 9.8074, 'grad_norm': 561.4884643554688, 'learning_rate': 1.1920000000000001e-05, 'epoch': 0.4}


                                                     
  0%|          | 6/7500 [2:13:51<17:32:26,  8.43s/it]  

{'loss': 10.1737, 'grad_norm': 389.45880126953125, 'learning_rate': 1.184e-05, 'epoch': 0.41}


                                                     
  0%|          | 6/7500 [2:15:08<17:32:26,  8.43s/it]  

{'loss': 9.9435, 'grad_norm': 443.2567443847656, 'learning_rate': 1.1760000000000001e-05, 'epoch': 0.41}


                                                     
  0%|          | 6/7500 [2:16:40<17:32:26,  8.43s/it]  

{'loss': 9.5347, 'grad_norm': 338.6741027832031, 'learning_rate': 1.168e-05, 'epoch': 0.42}


                                                     
  0%|          | 6/7500 [2:18:12<17:32:26,  8.43s/it]  

{'loss': 9.8522, 'grad_norm': 513.2328491210938, 'learning_rate': 1.16e-05, 'epoch': 0.42}


                                                     
  0%|          | 6/7500 [3:37:40<17:32:26,  8.43s/it]   

{'loss': 9.7707, 'grad_norm': 481.00299072265625, 'learning_rate': 1.152e-05, 'epoch': 0.42}


                                                     
  0%|          | 6/7500 [3:39:12<17:32:26,  8.43s/it]  

{'loss': 9.4917, 'grad_norm': 3740.028564453125, 'learning_rate': 1.144e-05, 'epoch': 0.43}


                                                     
  0%|          | 6/7500 [3:40:46<17:32:26,  8.43s/it]  

{'loss': 9.4348, 'grad_norm': 445.414306640625, 'learning_rate': 1.136e-05, 'epoch': 0.43}


                                                     
  0%|          | 6/7500 [3:42:21<17:32:26,  8.43s/it]  

{'loss': 9.1066, 'grad_norm': 722.503173828125, 'learning_rate': 1.128e-05, 'epoch': 0.44}


                                                     
  0%|          | 6/7500 [3:43:55<17:32:26,  8.43s/it]  

{'loss': 9.5531, 'grad_norm': 303.5098876953125, 'learning_rate': 1.1200000000000001e-05, 'epoch': 0.44}


                                                     
  0%|          | 6/7500 [3:45:29<17:32:26,  8.43s/it]  

{'loss': 9.3415, 'grad_norm': 398.78887939453125, 'learning_rate': 1.1120000000000002e-05, 'epoch': 0.44}


                                                     
  0%|          | 6/7500 [3:47:01<17:32:26,  8.43s/it]  

{'loss': 8.8703, 'grad_norm': 1123.004150390625, 'learning_rate': 1.1040000000000001e-05, 'epoch': 0.45}


                                                     
  0%|          | 6/7500 [3:48:34<17:32:26,  8.43s/it]  

{'loss': 9.1563, 'grad_norm': 362.2789306640625, 'learning_rate': 1.0960000000000002e-05, 'epoch': 0.45}


                                                     
  0%|          | 6/7500 [3:50:07<17:32:26,  8.43s/it]  

{'loss': 8.5854, 'grad_norm': 1545.5245361328125, 'learning_rate': 1.0880000000000001e-05, 'epoch': 0.46}


                                                     
  0%|          | 6/7500 [3:51:40<17:32:26,  8.43s/it]  

{'loss': 8.8714, 'grad_norm': 342.2921142578125, 'learning_rate': 1.0800000000000002e-05, 'epoch': 0.46}


                                                     
  0%|          | 6/7500 [3:53:13<17:32:26,  8.43s/it]  

{'loss': 9.012, 'grad_norm': 3883.854248046875, 'learning_rate': 1.072e-05, 'epoch': 0.46}


                                                     
  0%|          | 6/7500 [3:54:46<17:32:26,  8.43s/it]  

{'loss': 8.8698, 'grad_norm': 659.8582153320312, 'learning_rate': 1.0640000000000001e-05, 'epoch': 0.47}


                                                     
  0%|          | 6/7500 [3:56:19<17:32:26,  8.43s/it]  

{'loss': 8.8656, 'grad_norm': 363.3125305175781, 'learning_rate': 1.056e-05, 'epoch': 0.47}


                                                     
  0%|          | 6/7500 [3:57:52<17:32:26,  8.43s/it]  

{'loss': 8.8556, 'grad_norm': 1320.576416015625, 'learning_rate': 1.0480000000000001e-05, 'epoch': 0.48}


                                                     
  0%|          | 6/7500 [3:59:26<17:32:26,  8.43s/it]  

{'loss': 8.7485, 'grad_norm': 806.7913818359375, 'learning_rate': 1.04e-05, 'epoch': 0.48}


                                                     
  0%|          | 6/7500 [4:01:00<17:32:26,  8.43s/it]  

{'loss': 8.7196, 'grad_norm': 486.7948913574219, 'learning_rate': 1.0320000000000001e-05, 'epoch': 0.48}


                                                     
  0%|          | 6/7500 [4:02:34<17:32:26,  8.43s/it]  

{'loss': 8.2513, 'grad_norm': 568.4244995117188, 'learning_rate': 1.024e-05, 'epoch': 0.49}


                                                     
  0%|          | 6/7500 [4:03:57<17:32:26,  8.43s/it]  

{'loss': 8.4949, 'grad_norm': 344.3664855957031, 'learning_rate': 1.0160000000000001e-05, 'epoch': 0.49}


                                                     
  0%|          | 6/7500 [4:05:08<17:32:26,  8.43s/it]  

{'loss': 8.1216, 'grad_norm': 653.5921630859375, 'learning_rate': 1.008e-05, 'epoch': 0.5}


                                                     
  0%|          | 6/7500 [4:06:18<17:32:26,  8.43s/it]  

{'loss': 8.2717, 'grad_norm': 316.9374694824219, 'learning_rate': 1e-05, 'epoch': 0.5}


                                                     
  0%|          | 6/7500 [4:07:28<17:32:26,  8.43s/it]  

{'loss': 7.8252, 'grad_norm': 268.9579162597656, 'learning_rate': 9.920000000000002e-06, 'epoch': 0.5}


                                                     
  0%|          | 6/7500 [4:08:39<17:32:26,  8.43s/it]  

{'loss': 8.1921, 'grad_norm': 362.0890197753906, 'learning_rate': 9.84e-06, 'epoch': 0.51}


                                                     
  0%|          | 6/7500 [4:09:52<17:32:26,  8.43s/it]  

{'loss': 8.1186, 'grad_norm': 472.3583679199219, 'learning_rate': 9.760000000000001e-06, 'epoch': 0.51}


                                                     
  0%|          | 6/7500 [4:11:04<17:32:26,  8.43s/it]  

{'loss': 7.831, 'grad_norm': 368.4115905761719, 'learning_rate': 9.68e-06, 'epoch': 0.52}


                                                     
  0%|          | 6/7500 [4:12:15<17:32:26,  8.43s/it]  

{'loss': 7.7369, 'grad_norm': 1986.5787353515625, 'learning_rate': 9.600000000000001e-06, 'epoch': 0.52}


                                                     
  0%|          | 6/7500 [4:13:26<17:32:26,  8.43s/it]  

{'loss': 8.1639, 'grad_norm': 99.16883850097656, 'learning_rate': 9.52e-06, 'epoch': 0.52}


                                                     
  0%|          | 6/7500 [4:14:37<17:32:26,  8.43s/it]  

{'loss': 7.5862, 'grad_norm': 1298.030029296875, 'learning_rate': 9.440000000000001e-06, 'epoch': 0.53}


                                                     
  0%|          | 6/7500 [4:15:48<17:32:26,  8.43s/it]  

{'loss': 7.8341, 'grad_norm': 499.1407165527344, 'learning_rate': 9.360000000000002e-06, 'epoch': 0.53}


                                                     
  0%|          | 6/7500 [4:16:59<17:32:26,  8.43s/it]  

{'loss': 7.6509, 'grad_norm': 286.2748718261719, 'learning_rate': 9.280000000000001e-06, 'epoch': 0.54}


                                                     
  0%|          | 6/7500 [4:18:10<17:32:26,  8.43s/it]  

{'loss': 7.4513, 'grad_norm': 506.72760009765625, 'learning_rate': 9.200000000000002e-06, 'epoch': 0.54}


                                                     
  0%|          | 6/7500 [4:19:22<17:32:26,  8.43s/it]  

{'loss': 7.4673, 'grad_norm': 1641.9501953125, 'learning_rate': 9.12e-06, 'epoch': 0.54}


                                                     
  0%|          | 6/7500 [4:20:33<17:32:26,  8.43s/it]  

{'loss': 7.4201, 'grad_norm': 552.9197387695312, 'learning_rate': 9.040000000000002e-06, 'epoch': 0.55}


                                                     
  0%|          | 6/7500 [4:21:44<17:32:26,  8.43s/it]  

{'loss': 7.9843, 'grad_norm': 378.48712158203125, 'learning_rate': 8.96e-06, 'epoch': 0.55}


                                                     
  0%|          | 6/7500 [4:22:56<17:32:26,  8.43s/it]  

{'loss': 7.4325, 'grad_norm': 236.40420532226562, 'learning_rate': 8.880000000000001e-06, 'epoch': 0.56}


                                                     
  0%|          | 6/7500 [4:24:07<17:32:26,  8.43s/it]  

{'loss': 7.6977, 'grad_norm': 1531.528564453125, 'learning_rate': 8.8e-06, 'epoch': 0.56}


                                                     
  0%|          | 6/7500 [4:25:18<17:32:26,  8.43s/it]  

{'loss': 7.482, 'grad_norm': 546.7688598632812, 'learning_rate': 8.720000000000001e-06, 'epoch': 0.56}


                                                     
  0%|          | 6/7500 [4:26:29<17:32:26,  8.43s/it]  

{'loss': 7.3829, 'grad_norm': 163.6239776611328, 'learning_rate': 8.64e-06, 'epoch': 0.57}


                                                     
  0%|          | 6/7500 [4:27:40<17:32:26,  8.43s/it]  

{'loss': 7.1764, 'grad_norm': 259.2082214355469, 'learning_rate': 8.560000000000001e-06, 'epoch': 0.57}


                                                     
  0%|          | 6/7500 [4:28:50<17:32:26,  8.43s/it]  

{'loss': 7.4715, 'grad_norm': 1399.6409912109375, 'learning_rate': 8.48e-06, 'epoch': 0.58}


                                                     
  0%|          | 6/7500 [4:30:01<17:32:26,  8.43s/it]  

{'loss': 7.3404, 'grad_norm': 4966.10205078125, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.58}


                                                     
  0%|          | 6/7500 [4:31:12<17:32:26,  8.43s/it]  

{'loss': 7.0874, 'grad_norm': 1444.624267578125, 'learning_rate': 8.32e-06, 'epoch': 0.58}


                                                     
  0%|          | 6/7500 [4:32:23<17:32:26,  8.43s/it]  

{'loss': 7.15, 'grad_norm': 142.67562866210938, 'learning_rate': 8.24e-06, 'epoch': 0.59}


                                                     
  0%|          | 6/7500 [4:33:33<17:32:26,  8.43s/it]  

{'loss': 7.1644, 'grad_norm': 744.0586547851562, 'learning_rate': 8.16e-06, 'epoch': 0.59}


                                                     
  0%|          | 6/7500 [4:34:44<17:32:26,  8.43s/it]  

{'loss': 6.9612, 'grad_norm': 591.82275390625, 'learning_rate': 8.08e-06, 'epoch': 0.6}


                                                     
  0%|          | 6/7500 [4:35:56<17:32:26,  8.43s/it]  

{'loss': 7.0076, 'grad_norm': 4747.7041015625, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.6}


                                                     
  0%|          | 6/7500 [4:37:07<17:32:26,  8.43s/it]  

{'loss': 6.9241, 'grad_norm': 173.6011199951172, 'learning_rate': 7.92e-06, 'epoch': 0.6}


                                                     
  0%|          | 6/7500 [4:38:19<17:32:26,  8.43s/it]  

{'loss': 7.4288, 'grad_norm': 136.71627807617188, 'learning_rate': 7.840000000000001e-06, 'epoch': 0.61}


                                                     
  0%|          | 6/7500 [4:39:30<17:32:26,  8.43s/it]  

{'loss': 6.7204, 'grad_norm': 342.0278015136719, 'learning_rate': 7.76e-06, 'epoch': 0.61}


                                                     
  0%|          | 6/7500 [4:40:41<17:32:26,  8.43s/it]  

{'loss': 6.8169, 'grad_norm': 197.17825317382812, 'learning_rate': 7.680000000000001e-06, 'epoch': 0.62}


                                                     
  0%|          | 6/7500 [4:41:51<17:32:26,  8.43s/it]  

{'loss': 6.7625, 'grad_norm': 94.64596557617188, 'learning_rate': 7.600000000000001e-06, 'epoch': 0.62}


                                                     
  0%|          | 6/7500 [4:43:02<17:32:26,  8.43s/it]  

{'loss': 6.8837, 'grad_norm': 721.543701171875, 'learning_rate': 7.520000000000001e-06, 'epoch': 0.62}


                                                     
  0%|          | 6/7500 [4:44:13<17:32:26,  8.43s/it]  

{'loss': 6.7664, 'grad_norm': 344.3021240234375, 'learning_rate': 7.440000000000001e-06, 'epoch': 0.63}


                                                     
  0%|          | 6/7500 [4:45:24<17:32:26,  8.43s/it]  

{'loss': 6.8166, 'grad_norm': 481.5422668457031, 'learning_rate': 7.360000000000001e-06, 'epoch': 0.63}


                                                     
  0%|          | 6/7500 [4:46:35<17:32:26,  8.43s/it]  

{'loss': 6.4522, 'grad_norm': 964.45947265625, 'learning_rate': 7.280000000000001e-06, 'epoch': 0.64}


                                                     
  0%|          | 6/7500 [4:47:45<17:32:26,  8.43s/it]  

{'loss': 6.3257, 'grad_norm': 134.69337463378906, 'learning_rate': 7.2000000000000005e-06, 'epoch': 0.64}


                                                     
  0%|          | 6/7500 [4:48:56<17:32:26,  8.43s/it]  

{'loss': 6.399, 'grad_norm': 12622.8681640625, 'learning_rate': 7.1200000000000004e-06, 'epoch': 0.64}


                                                     
  0%|          | 6/7500 [4:50:07<17:32:26,  8.43s/it]  

{'loss': 6.3509, 'grad_norm': 144.90658569335938, 'learning_rate': 7.04e-06, 'epoch': 0.65}


                                                     
  0%|          | 6/7500 [4:51:18<17:32:26,  8.43s/it]  

{'loss': 6.3003, 'grad_norm': 475.7489013671875, 'learning_rate': 6.96e-06, 'epoch': 0.65}


                                                     
  0%|          | 6/7500 [4:52:29<17:32:26,  8.43s/it]  

{'loss': 6.5192, 'grad_norm': 1695.178955078125, 'learning_rate': 6.88e-06, 'epoch': 0.66}


                                                     
  0%|          | 6/7500 [4:53:42<17:32:26,  8.43s/it]  

{'loss': 6.5165, 'grad_norm': 1039.5093994140625, 'learning_rate': 6.800000000000001e-06, 'epoch': 0.66}


                                                     
  0%|          | 6/7500 [4:54:53<17:32:26,  8.43s/it]  

{'loss': 6.4084, 'grad_norm': 396.11700439453125, 'learning_rate': 6.720000000000001e-06, 'epoch': 0.66}


                                                     
  0%|          | 6/7500 [4:56:05<17:32:26,  8.43s/it]  

{'loss': 6.5748, 'grad_norm': 1477.0062255859375, 'learning_rate': 6.640000000000001e-06, 'epoch': 0.67}


                                                     
  0%|          | 6/7500 [4:57:17<17:32:26,  8.43s/it]  

{'loss': 6.5498, 'grad_norm': 220.80548095703125, 'learning_rate': 6.560000000000001e-06, 'epoch': 0.67}


                                                     
  0%|          | 6/7500 [4:58:27<17:32:26,  8.43s/it]  

{'loss': 6.2488, 'grad_norm': 139.1355438232422, 'learning_rate': 6.480000000000001e-06, 'epoch': 0.68}


                                                     
  0%|          | 6/7500 [4:59:37<17:32:26,  8.43s/it]  

{'loss': 6.459, 'grad_norm': 362.7652893066406, 'learning_rate': 6.4000000000000006e-06, 'epoch': 0.68}


                                                     
  0%|          | 6/7500 [5:00:48<17:32:26,  8.43s/it]  

{'loss': 6.063, 'grad_norm': 183.5489501953125, 'learning_rate': 6.3200000000000005e-06, 'epoch': 0.68}


                                                     
  0%|          | 6/7500 [5:01:59<17:32:26,  8.43s/it]  

{'loss': 6.3277, 'grad_norm': 423.57440185546875, 'learning_rate': 6.24e-06, 'epoch': 0.69}


                                                     
  0%|          | 6/7500 [5:03:10<17:32:26,  8.43s/it]  

{'loss': 6.2476, 'grad_norm': 313.7919616699219, 'learning_rate': 6.16e-06, 'epoch': 0.69}


                                                     
  0%|          | 6/7500 [5:04:20<17:32:26,  8.43s/it]  

{'loss': 6.1997, 'grad_norm': 354.1591796875, 'learning_rate': 6.08e-06, 'epoch': 0.7}


                                                     
  0%|          | 6/7500 [5:05:31<17:32:26,  8.43s/it]  

{'loss': 6.1602, 'grad_norm': 439.1802673339844, 'learning_rate': 6e-06, 'epoch': 0.7}


                                                     
  0%|          | 6/7500 [5:06:42<17:32:26,  8.43s/it]  

{'loss': 6.0074, 'grad_norm': 882.0795288085938, 'learning_rate': 5.92e-06, 'epoch': 0.7}


                                                     
  0%|          | 6/7500 [5:07:53<17:32:26,  8.43s/it]  

{'loss': 6.302, 'grad_norm': 276.34417724609375, 'learning_rate': 5.84e-06, 'epoch': 0.71}


                                                     
  0%|          | 6/7500 [5:09:07<17:32:26,  8.43s/it]  

{'loss': 6.4137, 'grad_norm': 1228.124267578125, 'learning_rate': 5.76e-06, 'epoch': 0.71}


                                                     
  0%|          | 6/7500 [5:10:18<17:32:26,  8.43s/it]  

{'loss': 6.3294, 'grad_norm': 439.0770568847656, 'learning_rate': 5.68e-06, 'epoch': 0.72}


                                                     
  0%|          | 6/7500 [5:11:28<17:32:26,  8.43s/it]  

{'loss': 5.983, 'grad_norm': 354.6218566894531, 'learning_rate': 5.600000000000001e-06, 'epoch': 0.72}


                                                     
  0%|          | 6/7500 [5:12:39<17:32:26,  8.43s/it]  

{'loss': 5.9201, 'grad_norm': 572.608154296875, 'learning_rate': 5.5200000000000005e-06, 'epoch': 0.72}


                                                     
  0%|          | 6/7500 [5:13:49<17:32:26,  8.43s/it]  

{'loss': 6.013, 'grad_norm': 106.43063354492188, 'learning_rate': 5.4400000000000004e-06, 'epoch': 0.73}


                                                     
  0%|          | 6/7500 [5:15:00<17:32:26,  8.43s/it]  

{'loss': 6.3401, 'grad_norm': 190.42901611328125, 'learning_rate': 5.36e-06, 'epoch': 0.73}


                                                     
  0%|          | 6/7500 [5:16:11<17:32:26,  8.43s/it]  

{'loss': 6.1295, 'grad_norm': 789.1180419921875, 'learning_rate': 5.28e-06, 'epoch': 0.74}


                                                     
  0%|          | 6/7500 [5:17:22<17:32:26,  8.43s/it]  

{'loss': 5.8508, 'grad_norm': 580.4296875, 'learning_rate': 5.2e-06, 'epoch': 0.74}


                                                     
  0%|          | 6/7500 [5:18:33<17:32:26,  8.43s/it]  

{'loss': 5.8946, 'grad_norm': 768.1133422851562, 'learning_rate': 5.12e-06, 'epoch': 0.74}


                                                     
  0%|          | 6/7500 [5:19:44<17:32:26,  8.43s/it]  

{'loss': 5.7328, 'grad_norm': 109.09242248535156, 'learning_rate': 5.04e-06, 'epoch': 0.75}


                                                     
  0%|          | 6/7500 [5:20:55<17:32:26,  8.43s/it]  

{'loss': 5.8949, 'grad_norm': 113.39321899414062, 'learning_rate': 4.960000000000001e-06, 'epoch': 0.75}


                                                     
  0%|          | 6/7500 [5:22:06<17:32:26,  8.43s/it]  

{'loss': 5.7925, 'grad_norm': 959.2523193359375, 'learning_rate': 4.880000000000001e-06, 'epoch': 0.76}


                                                     
  0%|          | 6/7500 [5:23:17<17:32:26,  8.43s/it]  

{'loss': 5.8611, 'grad_norm': 449.69830322265625, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.76}


                                                     
  0%|          | 6/7500 [5:24:28<17:32:26,  8.43s/it]  

{'loss': 5.6424, 'grad_norm': 158.4562530517578, 'learning_rate': 4.7200000000000005e-06, 'epoch': 0.76}


                                                     
  0%|          | 6/7500 [5:25:39<17:32:26,  8.43s/it]  

{'loss': 5.7484, 'grad_norm': 465.1369323730469, 'learning_rate': 4.6400000000000005e-06, 'epoch': 0.77}


                                                     
  0%|          | 6/7500 [5:26:50<17:32:26,  8.43s/it]  

{'loss': 5.505, 'grad_norm': 190.24954223632812, 'learning_rate': 4.56e-06, 'epoch': 0.77}


                                                     
  0%|          | 6/7500 [5:28:02<17:32:26,  8.43s/it]  

{'loss': 5.7092, 'grad_norm': 897.302734375, 'learning_rate': 4.48e-06, 'epoch': 0.78}


                                                     
  0%|          | 6/7500 [5:29:12<17:32:26,  8.43s/it]  

{'loss': 6.0855, 'grad_norm': 399.28167724609375, 'learning_rate': 4.4e-06, 'epoch': 0.78}


                                                     
  0%|          | 6/7500 [5:30:22<17:32:26,  8.43s/it]  

{'loss': 5.82, 'grad_norm': 228.3208465576172, 'learning_rate': 4.32e-06, 'epoch': 0.78}


                                                     
  0%|          | 6/7500 [5:31:32<17:32:26,  8.43s/it]  

{'loss': 5.7057, 'grad_norm': 388.3545837402344, 'learning_rate': 4.24e-06, 'epoch': 0.79}


                                                     
  0%|          | 6/7500 [5:32:42<17:32:26,  8.43s/it]  

{'loss': 5.7483, 'grad_norm': 262.9902038574219, 'learning_rate': 4.16e-06, 'epoch': 0.79}


                                                     
  0%|          | 6/7500 [5:33:52<17:32:26,  8.43s/it]

{'loss': 5.8634, 'grad_norm': 450.1901550292969, 'learning_rate': 4.08e-06, 'epoch': 0.8}


                                                     
  0%|          | 6/7500 [5:35:03<17:32:26,  8.43s/it]

{'loss': 5.5214, 'grad_norm': 592.58056640625, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}


                                                     
  0%|          | 6/7500 [5:36:13<17:32:26,  8.43s/it]

{'loss': 5.9614, 'grad_norm': 65.30148315429688, 'learning_rate': 3.920000000000001e-06, 'epoch': 0.8}


                                                     
  0%|          | 6/7500 [5:37:24<17:32:26,  8.43s/it]

{'loss': 5.4653, 'grad_norm': 511.77191162109375, 'learning_rate': 3.8400000000000005e-06, 'epoch': 0.81}


                                                     
  0%|          | 6/7500 [5:38:34<17:32:26,  8.43s/it]

{'loss': 5.7029, 'grad_norm': 327.1967468261719, 'learning_rate': 3.7600000000000004e-06, 'epoch': 0.81}


                                                     
  0%|          | 6/7500 [5:39:44<17:32:26,  8.43s/it]

{'loss': 5.8313, 'grad_norm': 89.7340316772461, 'learning_rate': 3.6800000000000003e-06, 'epoch': 0.82}


                                                     
  0%|          | 6/7500 [5:40:55<17:32:26,  8.43s/it]

{'loss': 5.4142, 'grad_norm': 180.5078887939453, 'learning_rate': 3.6000000000000003e-06, 'epoch': 0.82}


                                                     
  0%|          | 6/7500 [5:42:06<17:32:26,  8.43s/it]

{'loss': 5.6139, 'grad_norm': 598.855224609375, 'learning_rate': 3.52e-06, 'epoch': 0.82}


                                                     
  0%|          | 6/7500 [5:43:17<17:32:26,  8.43s/it]

{'loss': 5.5375, 'grad_norm': 231.72998046875, 'learning_rate': 3.44e-06, 'epoch': 0.83}


                                                     
  0%|          | 6/7500 [5:44:27<17:32:26,  8.43s/it]

{'loss': 5.4455, 'grad_norm': 567.0284423828125, 'learning_rate': 3.3600000000000004e-06, 'epoch': 0.83}


                                                     
  0%|          | 6/7500 [5:45:37<17:32:26,  8.43s/it]

{'loss': 5.4977, 'grad_norm': 463.4721984863281, 'learning_rate': 3.2800000000000004e-06, 'epoch': 0.84}


                                                     
  0%|          | 6/7500 [5:46:48<17:32:26,  8.43s/it]

{'loss': 5.6077, 'grad_norm': 452.6423034667969, 'learning_rate': 3.2000000000000003e-06, 'epoch': 0.84}


                                                     
  0%|          | 6/7500 [5:47:59<17:32:26,  8.43s/it]

{'loss': 5.8072, 'grad_norm': 195.8623504638672, 'learning_rate': 3.12e-06, 'epoch': 0.84}


                                                     
  0%|          | 6/7500 [5:49:11<17:32:26,  8.43s/it]

{'loss': 5.7893, 'grad_norm': 230.51622009277344, 'learning_rate': 3.04e-06, 'epoch': 0.85}


                                                     
  0%|          | 6/7500 [5:50:21<17:32:26,  8.43s/it]

{'loss': 5.5257, 'grad_norm': 148.3167266845703, 'learning_rate': 2.96e-06, 'epoch': 0.85}


                                                     
  0%|          | 6/7500 [5:51:32<17:32:26,  8.43s/it]

{'loss': 5.6556, 'grad_norm': 525.6694946289062, 'learning_rate': 2.88e-06, 'epoch': 0.86}


                                                     
  0%|          | 6/7500 [5:52:43<17:32:26,  8.43s/it]

{'loss': 5.6011, 'grad_norm': 207.94882202148438, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.86}


                                                     
  0%|          | 6/7500 [5:53:54<17:32:26,  8.43s/it]

{'loss': 5.3572, 'grad_norm': 227.95193481445312, 'learning_rate': 2.7200000000000002e-06, 'epoch': 0.86}


                                                     
  0%|          | 6/7500 [5:55:04<17:32:26,  8.43s/it]

{'loss': 5.3457, 'grad_norm': 277.1990966796875, 'learning_rate': 2.64e-06, 'epoch': 0.87}


                                                     
  0%|          | 6/7500 [5:56:15<17:32:26,  8.43s/it]

{'loss': 5.2763, 'grad_norm': 143.98486328125, 'learning_rate': 2.56e-06, 'epoch': 0.87}


                                                     
  0%|          | 6/7500 [5:57:26<17:32:26,  8.43s/it]

{'loss': 5.5115, 'grad_norm': 192.4591522216797, 'learning_rate': 2.4800000000000004e-06, 'epoch': 0.88}


                                                     
  0%|          | 6/7500 [5:58:36<17:32:26,  8.43s/it]

{'loss': 5.4969, 'grad_norm': 236.53680419921875, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.88}


                                                     
  0%|          | 6/7500 [5:59:46<17:32:26,  8.43s/it]

{'loss': 5.1673, 'grad_norm': 154.8863525390625, 'learning_rate': 2.3200000000000002e-06, 'epoch': 0.88}


                                                     
  0%|          | 6/7500 [6:00:56<17:32:26,  8.43s/it]

{'loss': 5.179, 'grad_norm': 461.9427185058594, 'learning_rate': 2.24e-06, 'epoch': 0.89}


                                                     
  0%|          | 6/7500 [6:02:06<17:32:26,  8.43s/it]

{'loss': 5.5395, 'grad_norm': 682.6221313476562, 'learning_rate': 2.16e-06, 'epoch': 0.89}


                                                     
  0%|          | 6/7500 [6:03:17<17:32:26,  8.43s/it]

{'loss': 5.4047, 'grad_norm': 192.13902282714844, 'learning_rate': 2.08e-06, 'epoch': 0.9}


                                                     
  0%|          | 6/7500 [6:04:28<17:32:26,  8.43s/it]

{'loss': 5.5548, 'grad_norm': 226.24327087402344, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.9}


                                                     
  0%|          | 6/7500 [6:05:38<17:32:26,  8.43s/it]

{'loss': 5.3984, 'grad_norm': 123.1158218383789, 'learning_rate': 1.9200000000000003e-06, 'epoch': 0.9}


                                                     
  0%|          | 6/7500 [6:06:49<17:32:26,  8.43s/it]

{'loss': 5.4548, 'grad_norm': 124.35997009277344, 'learning_rate': 1.8400000000000002e-06, 'epoch': 0.91}


                                                     
  0%|          | 6/7500 [6:07:59<17:32:26,  8.43s/it]

{'loss': 5.5458, 'grad_norm': 113.87542724609375, 'learning_rate': 1.76e-06, 'epoch': 0.91}


                                                     
  0%|          | 6/7500 [6:09:12<17:32:26,  8.43s/it]

{'loss': 5.3443, 'grad_norm': 207.53843688964844, 'learning_rate': 1.6800000000000002e-06, 'epoch': 0.92}


                                                     
  0%|          | 6/7500 [6:10:23<17:32:26,  8.43s/it]

{'loss': 5.469, 'grad_norm': 285.478271484375, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.92}


                                                     
  0%|          | 6/7500 [6:11:34<17:32:26,  8.43s/it]

{'loss': 5.3202, 'grad_norm': 107.69458770751953, 'learning_rate': 1.52e-06, 'epoch': 0.92}


                                                     
  0%|          | 6/7500 [6:12:45<17:32:26,  8.43s/it]

{'loss': 5.6493, 'grad_norm': 3720.46435546875, 'learning_rate': 1.44e-06, 'epoch': 0.93}


                                                     
  0%|          | 6/7500 [6:13:55<17:32:26,  8.43s/it]

{'loss': 5.6185, 'grad_norm': 237.9912872314453, 'learning_rate': 1.3600000000000001e-06, 'epoch': 0.93}


                                                     
  0%|          | 6/7500 [6:15:06<17:32:26,  8.43s/it]

{'loss': 5.2465, 'grad_norm': 322.2344970703125, 'learning_rate': 1.28e-06, 'epoch': 0.94}


                                                     
  0%|          | 6/7500 [6:16:16<17:32:26,  8.43s/it]

{'loss': 5.4277, 'grad_norm': 1575.4910888671875, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.94}


                                                     
  0%|          | 6/7500 [6:17:27<17:32:26,  8.43s/it]

{'loss': 5.4858, 'grad_norm': 215.68251037597656, 'learning_rate': 1.12e-06, 'epoch': 0.94}


                                                     
  0%|          | 6/7500 [6:18:38<17:32:26,  8.43s/it]

{'loss': 5.2827, 'grad_norm': 2666.75927734375, 'learning_rate': 1.04e-06, 'epoch': 0.95}


                                                     
  0%|          | 6/7500 [6:19:49<17:32:26,  8.43s/it]

{'loss': 5.5265, 'grad_norm': 445.7021484375, 'learning_rate': 9.600000000000001e-07, 'epoch': 0.95}


                                                     
  0%|          | 6/7500 [6:20:59<17:32:26,  8.43s/it]

{'loss': 5.2202, 'grad_norm': 72.6478271484375, 'learning_rate': 8.8e-07, 'epoch': 0.96}


                                                     
  0%|          | 6/7500 [6:22:09<17:32:26,  8.43s/it]

{'loss': 5.3097, 'grad_norm': 220.7271728515625, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.96}


                                                     
  0%|          | 6/7500 [6:23:20<17:32:26,  8.43s/it]

{'loss': 5.4037, 'grad_norm': 232.9355010986328, 'learning_rate': 7.2e-07, 'epoch': 0.96}


                                                     
  0%|          | 6/7500 [6:24:31<17:32:26,  8.43s/it]

{'loss': 5.1651, 'grad_norm': 131.2239227294922, 'learning_rate': 6.4e-07, 'epoch': 0.97}


                                                     
  0%|          | 6/7500 [6:25:41<17:32:26,  8.43s/it]

{'loss': 5.3666, 'grad_norm': 850.5905151367188, 'learning_rate': 5.6e-07, 'epoch': 0.97}


                                                     
  0%|          | 6/7500 [6:26:52<17:32:26,  8.43s/it]

{'loss': 5.1876, 'grad_norm': 413.99420166015625, 'learning_rate': 4.800000000000001e-07, 'epoch': 0.98}


                                                     
  0%|          | 6/7500 [6:28:03<17:32:26,  8.43s/it]

{'loss': 5.3682, 'grad_norm': 842.7271118164062, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.98}


                                                     
  0%|          | 6/7500 [6:29:13<17:32:26,  8.43s/it]

{'loss': 5.2372, 'grad_norm': 132.62741088867188, 'learning_rate': 3.2e-07, 'epoch': 0.98}


                                                     
  0%|          | 6/7500 [6:30:23<17:32:26,  8.43s/it]

{'loss': 5.5774, 'grad_norm': 905.8253784179688, 'learning_rate': 2.4000000000000003e-07, 'epoch': 0.99}


                                                     
  0%|          | 6/7500 [6:31:34<17:32:26,  8.43s/it]

{'loss': 5.1779, 'grad_norm': 374.45428466796875, 'learning_rate': 1.6e-07, 'epoch': 0.99}


                                                     
  0%|          | 6/7500 [6:32:44<17:32:26,  8.43s/it]

{'loss': 5.468, 'grad_norm': 120.15302276611328, 'learning_rate': 8e-08, 'epoch': 1.0}


                                                     
  0%|          | 6/7500 [6:33:54<17:32:26,  8.43s/it]

{'loss': 5.2736, 'grad_norm': 126.8198013305664, 'learning_rate': 0.0, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'eval_loss': 5.535392761230469, 'eval_runtime': 736.839, 'eval_samples_per_second': 2.714, 'eval_steps_per_second': 0.679, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
                                                     
100%|██████████| 2500/2500 [6:45:18<00:00,  9.73s/it]


{'train_runtime': 24319.1993, 'train_samples_per_second': 0.411, 'train_steps_per_second': 0.103, 'train_loss': 11.98397053527832, 'epoch': 1.0}


TrainOutput(global_step=2500, training_loss=11.98397053527832, metrics={'train_runtime': 24319.1993, 'train_samples_per_second': 0.411, 'train_steps_per_second': 0.103, 'total_flos': 5287496908800000.0, 'train_loss': 11.98397053527832, 'epoch': 1.0})

In [8]:
from nltk.translate.bleu_score import sentence_bleu

# Evaluate BLEU score
def compute_bleu(data):
    references = [[word_tokenize(summary)] for summary in data["summary"]]
    predictions = []
    for article in data["document"]:
        inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
        pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(word_tokenize(pred_summary))
    return corpus_bleu(references, predictions)

average_bleu = compute_bleu(test_data)
print(f"Average BLEU Score: {average_bleu}")

# Predict on test data and print BLEU score for each sample
for i in range(10):  # Predict and display summaries for first 10 examples
    article = test_data[i]["document"]
    reference_summary = test_data[i]["summary"]

    # Generate prediction
    inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Tokenize references and prediction
    reference_tokens = word_tokenize(reference_summary)
    prediction_tokens = word_tokenize(pred_summary)

    # Compute BLEU score for the sample
    sample_bleu = sentence_bleu([reference_tokens], prediction_tokens)

    print(f"Article: {article}...")
    print(f"Reference Summary: {reference_summary}")
    print(f"Predicted Summary: {pred_summary}")
    print(f"BLEU Score: {sample_bleu}\n")


Average BLEU Score: 0.0006279388331626488


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Article: Liputan6 . com , Bangka : Kapal patroli Angkatan Laut Republik Indonesia , Belinyu , baru-baru ini , menangkap tiga kapal nelayan berbendera Thailand , yakni KM Binatama , KM Sumber Jaya II , dan KM Mataram di Perairan Belitung Utara . Ketiga kapal itu ditangkap karena melanggar zona ekonomi ekslusif Indonesia . Saat ini , kapal-kapal itu diamankan di Pos Lanal Pelabuhan Pangkalan Balam , Bangka-Belitung . Menurut Komandan Pangkalan TNI AL Bangka Letnan Kolonel Laut Fredy Egam , selain menangkap tiga kapal , ALRI juga memeriksa 43 anak buah kapal . Mereka disergap saat sedang mengangkat jaring pukat harimau di Perairan Belitung Utara . Dari jumlah itu , hanya enam orang yang dijadikan tersangka , yakni tiga nahkoda dan tiga kepala kamar mesin kapal . Sedangkan ABK yang lain akan dideportasi ke negara asalnya . Meski berhasil menahan enam tersangka , TNI AL gagal mengamankan ikan tangkapan nelayan Thailand tersebut . Sebab , sebelum patroli datang , mereka telah memindahkan pul

In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Evaluate BLEU score
def compute_bleu(data):
    references = [[word_tokenize(summary)] for summary in data["summary"]]
    predictions = []
    for article in data["document"]:
        inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
        pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(word_tokenize(pred_summary))
    return corpus_bleu(references, predictions)

average_bleu = compute_bleu(test_data)
print(f"Average BLEU Score: {average_bleu}")

# Predict on test data and print BLEU score for each sample
for i in range(10):  # Predict and display summaries for first 10 examples
    article = test_data[i]["document"]
    reference_summary = test_data[i]["summary"]

    # Generate prediction
    inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Tokenize references and prediction
    reference_tokens = word_tokenize(reference_summary)
    prediction_tokens = word_tokenize(pred_summary)

    # Compute BLEU score for the sample
    sample_bleu = sentence_bleu([reference_tokens], prediction_tokens)

    print(f"Article: {article}...")
    print(f"Reference Summary: {reference_summary}")
    print(f"Predicted Summary: {pred_summary}")
    print(f"BLEU Score: {sample_bleu}\n")


Average BLEU Score: 0.0004225577576027379
Article: Liputan6 . com , Bangka : Kapal patroli Angkatan Laut Republik Indonesia , Belinyu , baru-baru ini , menangkap tiga kapal nelayan berbendera Thailand , yakni KM Binatama , KM Sumber Jaya II , dan KM Mataram di Perairan Belitung Utara . Ketiga kapal itu ditangkap karena melanggar zona ekonomi ekslusif Indonesia . Saat ini , kapal-kapal itu diamankan di Pos Lanal Pelabuhan Pangkalan Balam , Bangka-Belitung . Menurut Komandan Pangkalan TNI AL Bangka Letnan Kolonel Laut Fredy Egam , selain menangkap tiga kapal , ALRI juga memeriksa 43 anak buah kapal . Mereka disergap saat sedang mengangkat jaring pukat harimau di Perairan Belitung Utara . Dari jumlah itu , hanya enam orang yang dijadikan tersangka , yakni tiga nahkoda dan tiga kepala kamar mesin kapal . Sedangkan ABK yang lain akan dideportasi ke negara asalnya . Meski berhasil menahan enam tersangka , TNI AL gagal mengamankan ikan tangkapan nelayan Thailand tersebut . Sebab , sebelum pat