In [1]:
import os
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
from prompts import prompt, system

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
# Short compliance-class codes -> human-readable labels.
# formatting_prompts_func looks answers up here before rendering the prompt.
cls2name = dict(
    FC="Fully compliant",
    LC="Largely compliant",
    PC="Partially compliant",
    NC="Not compliant",
)

In [3]:
# Load the base checkpoint and its tokenizer through Unsloth.
#
# - max_seq_length is the same value (2048) that is later passed to SFTTrainer.
# - dtype=None leaves the dtype choice to the library.
# - load_in_4bit=True requests the 4-bit loading path.
# - The Hugging Face access token is read from the HF_TOKEN environment variable
#   instead of being pasted into the notebook, so no secret ends up in the saved
#   file. When HF_TOKEN is unset this passes token=None, assumed to be the
#   library's default for this argument (TODO confirm against the installed
#   Unsloth version).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="mistralai/Mistral-Nemo-Instruct-2407",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    token=os.environ.get("HF_TOKEN"),
)

==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla V100-PCIE-32GB. Max memory: 31.739 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.24+cu118. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 297.02it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [04:28<00:00, 134.37s/it]


In [4]:
# Attach LoRA adapters to the loaded base model.
# NOTE: this rebinds `model` to the wrapped result, so re-running this cell
# without re-running the loading cell would wrap an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    # Which projection layers receive adapters.
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Adapter shape / scaling.
    r=16,
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Memory / reproducibility.
    use_gradient_checkpointing="unsloth",
    random_state=56,
)

Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [5]:
# End-of-sequence marker taken from the loaded tokenizer; formatting_prompts_func
# appends it to every rendered training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into training strings.

    Args:
        examples: Batched mapping with the columns "task", "name", "uc_text",
            "ssts_text" and "answer", each a sequence of per-row values. Rows
            are paired positionally with ``zip``, so iteration stops at the
            shortest column.

    Returns:
        ``{"text": [...]}`` with one string per row: ``prompt`` filled with
        ``system[task]``, name, uc_text, ssts_text, task and the answer,
        followed by ``EOS_TOKEN``.

    Raises:
        KeyError: if a column is missing, if ``task`` is not a key of
            ``system``, or if a compliance-level answer is not a key of
            ``cls2name``.
    """
    columns = ("task", "name", "uc_text", "ssts_text", "answer")
    rendered = []
    for task, name, uc_text, ssts_text, answer in zip(*(examples[col] for col in columns)):
        # Compliance-level rows store a short class code; expand it to its label.
        # NOTE(review): "Complience" (sic) is compared against the row's task value
        # and that same value indexes `system`, so the spelling must stay in sync
        # with the data and prompts module — verify before correcting it.
        label = cls2name[answer] if task == "Complience Level" else answer
        filled = prompt.format(system[task], name, uc_text, ssts_text, task, label)
        rendered.append(filled + EOS_TOKEN)
    return {"text": rendered}

In [6]:
# Read the training CSV as the "train" split, then add the rendered "text"
# column by running formatting_prompts_func over the rows in batches.
data_files = dict(train="./train data/dataset.csv")
dataset = load_dataset("csv", data_files=data_files).map(
    formatting_prompts_func,
    batched=True,
)

Map: 100%|██████████| 33/33 [00:00<00:00, 581.20 examples/s]


In [7]:
# Build the training configuration first, then hand it to the trainer.
train_split = dataset["train"]

training_args = TrainingArguments(
    output_dir="outputs",
    seed=56,
    # Schedule.
    num_train_epochs=6,
    per_device_train_batch_size=1,  # because there is little data
    gradient_accumulation_steps=1,
    # Optimisation.
    optim="adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.0025,
    # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    # Logging / checkpointing.
    logging_steps=1,
    save_strategy="steps",
    # Save interval = number of training rows; with batch size 1 and no gradient
    # accumulation that is one checkpoint per pass over the data.
    save_steps=train_split.num_rows,
    # Evaluation is switched off, so the settings below look inert here —
    # NOTE(review): kept unchanged; confirm before removing.
    eval_strategy="no",
    eval_steps=train_split.num_rows,
    load_best_model_at_end=False,
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_split,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=2048,  # same value used when loading the model
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Map (num_proc=2): 100%|██████████| 33/33 [00:01<00:00, 18.68 examples/s]
  super().__init__(


In [8]:
# Run the fine-tuning loop; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()
# Rebind `model` to the trainer's model so the saving cell below writes that object.
model = trainer.model

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 33 | Num Epochs = 6
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 198
 "-____-"     Number of trainable parameters = 57,016,320
  1%|          | 1/198 [00:17<56:34, 17.23s/it]

{'loss': 2.1397, 'grad_norm': 6.9088544845581055, 'learning_rate': 5e-06, 'epoch': 0.03}


  1%|          | 2/198 [00:18<26:10,  8.01s/it]

{'loss': 2.4268, 'grad_norm': 10.900650978088379, 'learning_rate': 1e-05, 'epoch': 0.06}


  2%|▏         | 3/198 [00:20<16:47,  5.17s/it]

{'loss': 1.9608, 'grad_norm': 5.369039535522461, 'learning_rate': 1.5e-05, 'epoch': 0.09}


  2%|▏         | 4/198 [00:22<13:03,  4.04s/it]

{'loss': 2.0831, 'grad_norm': 4.163241386413574, 'learning_rate': 2e-05, 'epoch': 0.12}


  3%|▎         | 5/198 [00:24<09:45,  3.03s/it]

{'loss': 2.3774, 'grad_norm': 5.4531426429748535, 'learning_rate': 2.5e-05, 'epoch': 0.15}


  3%|▎         | 6/198 [00:26<08:55,  2.79s/it]

{'loss': 2.0886, 'grad_norm': 3.14402437210083, 'learning_rate': 3e-05, 'epoch': 0.18}


  4%|▎         | 7/198 [00:27<07:27,  2.34s/it]

{'loss': 2.3192, 'grad_norm': 3.6562325954437256, 'learning_rate': 3.5e-05, 'epoch': 0.21}


  4%|▍         | 8/198 [00:29<06:20,  2.00s/it]

{'loss': 2.0936, 'grad_norm': 4.617311000823975, 'learning_rate': 4e-05, 'epoch': 0.24}


  5%|▍         | 9/198 [00:30<05:41,  1.81s/it]

{'loss': 1.7653, 'grad_norm': 3.6722664833068848, 'learning_rate': 4.5e-05, 'epoch': 0.27}


  5%|▌         | 10/198 [00:32<05:23,  1.72s/it]

{'loss': 1.8676, 'grad_norm': 3.3992819786071777, 'learning_rate': 5e-05, 'epoch': 0.3}


  6%|▌         | 11/198 [00:33<05:02,  1.62s/it]

{'loss': 1.7149, 'grad_norm': 3.9256796836853027, 'learning_rate': 4.973404255319149e-05, 'epoch': 0.33}


  6%|▌         | 12/198 [00:34<04:56,  1.59s/it]

{'loss': 1.6135, 'grad_norm': 4.962081432342529, 'learning_rate': 4.946808510638298e-05, 'epoch': 0.36}


  7%|▋         | 13/198 [00:36<05:14,  1.70s/it]

{'loss': 1.6538, 'grad_norm': 3.8974881172180176, 'learning_rate': 4.920212765957447e-05, 'epoch': 0.39}


  7%|▋         | 14/198 [00:38<05:12,  1.70s/it]

{'loss': 1.4527, 'grad_norm': 4.598281383514404, 'learning_rate': 4.893617021276596e-05, 'epoch': 0.42}


  8%|▊         | 15/198 [00:40<05:07,  1.68s/it]

{'loss': 1.3872, 'grad_norm': 4.1066765785217285, 'learning_rate': 4.8670212765957454e-05, 'epoch': 0.45}


  8%|▊         | 16/198 [00:42<05:21,  1.77s/it]

{'loss': 1.364, 'grad_norm': 3.121248245239258, 'learning_rate': 4.840425531914894e-05, 'epoch': 0.48}


  9%|▊         | 17/198 [00:43<05:15,  1.74s/it]

{'loss': 1.3217, 'grad_norm': 4.571457862854004, 'learning_rate': 4.8138297872340426e-05, 'epoch': 0.52}


  9%|▉         | 18/198 [00:45<05:16,  1.76s/it]

{'loss': 1.1054, 'grad_norm': 3.6112632751464844, 'learning_rate': 4.787234042553192e-05, 'epoch': 0.55}


 10%|▉         | 19/198 [00:47<05:11,  1.74s/it]

{'loss': 1.0495, 'grad_norm': 4.536927700042725, 'learning_rate': 4.7606382978723405e-05, 'epoch': 0.58}


 10%|█         | 20/198 [00:48<04:46,  1.61s/it]

{'loss': 1.626, 'grad_norm': 4.658405303955078, 'learning_rate': 4.734042553191489e-05, 'epoch': 0.61}


 11%|█         | 21/198 [00:50<04:33,  1.55s/it]

{'loss': 1.3913, 'grad_norm': 5.469171047210693, 'learning_rate': 4.7074468085106385e-05, 'epoch': 0.64}


 11%|█         | 22/198 [00:51<04:31,  1.54s/it]

{'loss': 1.0355, 'grad_norm': 3.903778553009033, 'learning_rate': 4.680851063829788e-05, 'epoch': 0.67}


 12%|█▏        | 23/198 [00:53<04:22,  1.50s/it]

{'loss': 0.9899, 'grad_norm': 3.952552556991577, 'learning_rate': 4.6542553191489364e-05, 'epoch': 0.7}


 12%|█▏        | 24/198 [00:54<04:30,  1.56s/it]

{'loss': 1.1382, 'grad_norm': 4.9673027992248535, 'learning_rate': 4.627659574468085e-05, 'epoch': 0.73}


 13%|█▎        | 25/198 [00:56<04:36,  1.60s/it]

{'loss': 0.873, 'grad_norm': 3.7296271324157715, 'learning_rate': 4.601063829787234e-05, 'epoch': 0.76}


 13%|█▎        | 26/198 [00:58<04:44,  1.65s/it]

{'loss': 0.8258, 'grad_norm': 3.1372616291046143, 'learning_rate': 4.574468085106383e-05, 'epoch': 0.79}


 14%|█▎        | 27/198 [00:59<04:28,  1.57s/it]

{'loss': 0.7833, 'grad_norm': 5.443823337554932, 'learning_rate': 4.547872340425532e-05, 'epoch': 0.82}


 14%|█▍        | 28/198 [01:00<04:18,  1.52s/it]

{'loss': 0.5917, 'grad_norm': 4.610968589782715, 'learning_rate': 4.5212765957446815e-05, 'epoch': 0.85}


 15%|█▍        | 29/198 [01:02<04:17,  1.52s/it]

{'loss': 0.9321, 'grad_norm': 5.278853893280029, 'learning_rate': 4.49468085106383e-05, 'epoch': 0.88}


 15%|█▌        | 30/198 [01:04<04:47,  1.71s/it]

{'loss': 1.3917, 'grad_norm': 6.668214321136475, 'learning_rate': 4.468085106382979e-05, 'epoch': 0.91}


 16%|█▌        | 31/198 [01:05<04:23,  1.58s/it]

{'loss': 0.7696, 'grad_norm': 5.6665120124816895, 'learning_rate': 4.441489361702128e-05, 'epoch': 0.94}


 16%|█▌        | 32/198 [01:07<04:19,  1.56s/it]

{'loss': 0.865, 'grad_norm': 4.446552276611328, 'learning_rate': 4.414893617021277e-05, 'epoch': 0.97}


 17%|█▋        | 33/198 [01:09<04:24,  1.60s/it]

{'loss': 0.5504, 'grad_norm': 3.9654247760772705, 'learning_rate': 4.388297872340425e-05, 'epoch': 1.0}


 17%|█▋        | 34/198 [01:13<06:52,  2.52s/it]

{'loss': 0.5273, 'grad_norm': 4.600104808807373, 'learning_rate': 4.3617021276595746e-05, 'epoch': 1.03}


 18%|█▊        | 35/198 [01:15<05:49,  2.14s/it]

{'loss': 0.3265, 'grad_norm': 5.489748001098633, 'learning_rate': 4.335106382978724e-05, 'epoch': 1.06}


 18%|█▊        | 36/198 [01:17<05:38,  2.09s/it]

{'loss': 0.9063, 'grad_norm': 5.6071391105651855, 'learning_rate': 4.3085106382978725e-05, 'epoch': 1.09}


 19%|█▊        | 37/198 [01:18<05:02,  1.88s/it]

{'loss': 0.6694, 'grad_norm': 8.118706703186035, 'learning_rate': 4.281914893617022e-05, 'epoch': 1.12}


 19%|█▉        | 38/198 [01:20<05:13,  1.96s/it]

{'loss': 1.1145, 'grad_norm': 5.766993999481201, 'learning_rate': 4.2553191489361704e-05, 'epoch': 1.15}


 20%|█▉        | 39/198 [01:22<04:51,  1.83s/it]

{'loss': 0.3865, 'grad_norm': 4.368535995483398, 'learning_rate': 4.228723404255319e-05, 'epoch': 1.18}


 20%|██        | 40/198 [01:23<04:35,  1.74s/it]

{'loss': 0.347, 'grad_norm': 5.5002217292785645, 'learning_rate': 4.2021276595744684e-05, 'epoch': 1.21}


 21%|██        | 41/198 [01:25<04:35,  1.75s/it]

{'loss': 0.4458, 'grad_norm': 4.055555820465088, 'learning_rate': 4.175531914893617e-05, 'epoch': 1.24}


 21%|██        | 42/198 [01:27<05:00,  1.93s/it]

{'loss': 0.7952, 'grad_norm': 4.462283611297607, 'learning_rate': 4.148936170212766e-05, 'epoch': 1.27}


 22%|██▏       | 43/198 [01:29<04:52,  1.88s/it]

{'loss': 0.3256, 'grad_norm': 3.6693642139434814, 'learning_rate': 4.1223404255319156e-05, 'epoch': 1.3}


 22%|██▏       | 44/198 [01:31<04:34,  1.78s/it]

{'loss': 0.2207, 'grad_norm': 6.122828483581543, 'learning_rate': 4.095744680851064e-05, 'epoch': 1.33}


 23%|██▎       | 45/198 [01:32<04:33,  1.79s/it]

{'loss': 0.5725, 'grad_norm': 6.415865421295166, 'learning_rate': 4.069148936170213e-05, 'epoch': 1.36}


 23%|██▎       | 46/198 [01:34<04:09,  1.64s/it]

{'loss': 0.9095, 'grad_norm': 11.890031814575195, 'learning_rate': 4.0425531914893614e-05, 'epoch': 1.39}


 24%|██▎       | 47/198 [01:35<04:10,  1.66s/it]

{'loss': 0.0636, 'grad_norm': 3.881798028945923, 'learning_rate': 4.015957446808511e-05, 'epoch': 1.42}


 24%|██▍       | 48/198 [01:37<04:23,  1.76s/it]

{'loss': 0.3114, 'grad_norm': 5.485233306884766, 'learning_rate': 3.9893617021276594e-05, 'epoch': 1.45}


 25%|██▍       | 49/198 [01:39<04:12,  1.69s/it]

{'loss': 0.3083, 'grad_norm': 8.286267280578613, 'learning_rate': 3.962765957446809e-05, 'epoch': 1.48}


 25%|██▌       | 50/198 [01:41<04:10,  1.70s/it]

{'loss': 0.4184, 'grad_norm': 9.507805824279785, 'learning_rate': 3.936170212765958e-05, 'epoch': 1.52}


 26%|██▌       | 51/198 [01:42<03:55,  1.60s/it]

{'loss': 0.5106, 'grad_norm': 5.767796516418457, 'learning_rate': 3.9095744680851066e-05, 'epoch': 1.55}


 26%|██▋       | 52/198 [01:44<03:50,  1.58s/it]

{'loss': 0.1015, 'grad_norm': 4.741504669189453, 'learning_rate': 3.882978723404255e-05, 'epoch': 1.58}


 27%|██▋       | 53/198 [01:45<03:54,  1.62s/it]

{'loss': 0.7193, 'grad_norm': 10.057308197021484, 'learning_rate': 3.8563829787234045e-05, 'epoch': 1.61}


 27%|██▋       | 54/198 [01:48<04:23,  1.83s/it]

{'loss': 0.4063, 'grad_norm': 6.158865451812744, 'learning_rate': 3.829787234042553e-05, 'epoch': 1.64}


 28%|██▊       | 55/198 [01:49<04:15,  1.79s/it]

{'loss': 0.5312, 'grad_norm': 5.089329242706299, 'learning_rate': 3.8031914893617024e-05, 'epoch': 1.67}


 28%|██▊       | 56/198 [01:51<04:09,  1.76s/it]

{'loss': 0.271, 'grad_norm': 4.980171203613281, 'learning_rate': 3.776595744680852e-05, 'epoch': 1.7}


 29%|██▉       | 57/198 [01:53<04:05,  1.74s/it]

{'loss': 0.3206, 'grad_norm': 4.760906219482422, 'learning_rate': 3.7500000000000003e-05, 'epoch': 1.73}


 29%|██▉       | 58/198 [01:54<04:05,  1.75s/it]

{'loss': 0.0851, 'grad_norm': 4.377508640289307, 'learning_rate': 3.723404255319149e-05, 'epoch': 1.76}


 30%|██▉       | 59/198 [01:56<03:49,  1.65s/it]

{'loss': 0.3942, 'grad_norm': 7.116857528686523, 'learning_rate': 3.696808510638298e-05, 'epoch': 1.79}


 30%|███       | 60/198 [01:57<03:35,  1.56s/it]

{'loss': 0.3255, 'grad_norm': 11.032469749450684, 'learning_rate': 3.670212765957447e-05, 'epoch': 1.82}


 31%|███       | 61/198 [01:59<03:27,  1.52s/it]

{'loss': 0.1869, 'grad_norm': 6.396059036254883, 'learning_rate': 3.6436170212765955e-05, 'epoch': 1.85}


 31%|███▏      | 62/198 [02:00<03:20,  1.47s/it]

{'loss': 0.242, 'grad_norm': 5.636672496795654, 'learning_rate': 3.617021276595745e-05, 'epoch': 1.88}


 32%|███▏      | 63/198 [02:01<03:16,  1.46s/it]

{'loss': 0.1621, 'grad_norm': 3.9139769077301025, 'learning_rate': 3.590425531914894e-05, 'epoch': 1.91}


 32%|███▏      | 64/198 [02:03<03:18,  1.48s/it]

{'loss': 0.2887, 'grad_norm': 5.721384048461914, 'learning_rate': 3.563829787234043e-05, 'epoch': 1.94}


 33%|███▎      | 65/198 [02:04<03:07,  1.41s/it]

{'loss': 0.2675, 'grad_norm': 11.957317352294922, 'learning_rate': 3.537234042553192e-05, 'epoch': 1.97}


 33%|███▎      | 66/198 [02:06<03:06,  1.41s/it]

{'loss': 0.0413, 'grad_norm': 3.91524338722229, 'learning_rate': 3.5106382978723407e-05, 'epoch': 2.0}


 34%|███▍      | 67/198 [02:10<05:13,  2.40s/it]

{'loss': 0.1046, 'grad_norm': 6.185946941375732, 'learning_rate': 3.484042553191489e-05, 'epoch': 2.03}


 34%|███▍      | 68/198 [02:12<04:43,  2.18s/it]

{'loss': 0.1188, 'grad_norm': 7.963841438293457, 'learning_rate': 3.4574468085106386e-05, 'epoch': 2.06}


 35%|███▍      | 69/198 [02:14<04:27,  2.07s/it]

{'loss': 0.2071, 'grad_norm': 9.557926177978516, 'learning_rate': 3.430851063829787e-05, 'epoch': 2.09}


 35%|███▌      | 70/198 [02:15<03:58,  1.86s/it]

{'loss': 0.0746, 'grad_norm': 2.5866711139678955, 'learning_rate': 3.4042553191489365e-05, 'epoch': 2.12}


 36%|███▌      | 71/198 [02:17<03:39,  1.73s/it]

{'loss': 0.041, 'grad_norm': 1.649294137954712, 'learning_rate': 3.377659574468085e-05, 'epoch': 2.15}


 36%|███▋      | 72/198 [02:18<03:24,  1.62s/it]

{'loss': 0.0968, 'grad_norm': 5.633889198303223, 'learning_rate': 3.3510638297872344e-05, 'epoch': 2.18}


 37%|███▋      | 73/198 [02:19<03:09,  1.51s/it]

{'loss': 0.0681, 'grad_norm': 4.504541397094727, 'learning_rate': 3.324468085106383e-05, 'epoch': 2.21}


 37%|███▋      | 74/198 [02:21<03:14,  1.57s/it]

{'loss': 0.1052, 'grad_norm': 3.8763341903686523, 'learning_rate': 3.2978723404255317e-05, 'epoch': 2.24}


 38%|███▊      | 75/198 [02:23<03:18,  1.61s/it]

{'loss': 0.0863, 'grad_norm': 4.145663738250732, 'learning_rate': 3.271276595744681e-05, 'epoch': 2.27}


 38%|███▊      | 76/198 [02:24<03:22,  1.66s/it]

{'loss': 0.0817, 'grad_norm': 4.872596740722656, 'learning_rate': 3.2446808510638296e-05, 'epoch': 2.3}


 39%|███▉      | 77/198 [02:26<03:25,  1.70s/it]

{'loss': 0.0575, 'grad_norm': 2.2525041103363037, 'learning_rate': 3.218085106382979e-05, 'epoch': 2.33}


 39%|███▉      | 78/198 [02:28<03:34,  1.78s/it]

{'loss': 0.0712, 'grad_norm': 5.186206817626953, 'learning_rate': 3.191489361702128e-05, 'epoch': 2.36}


 40%|███▉      | 79/198 [02:30<03:29,  1.76s/it]

{'loss': 0.0465, 'grad_norm': 2.4868922233581543, 'learning_rate': 3.164893617021277e-05, 'epoch': 2.39}


 40%|████      | 80/198 [02:31<03:15,  1.65s/it]

{'loss': 0.0373, 'grad_norm': 5.562924385070801, 'learning_rate': 3.1382978723404254e-05, 'epoch': 2.42}


 41%|████      | 81/198 [02:33<03:09,  1.62s/it]

{'loss': 0.1472, 'grad_norm': 11.671944618225098, 'learning_rate': 3.111702127659575e-05, 'epoch': 2.45}


 41%|████▏     | 82/198 [02:34<03:04,  1.59s/it]

{'loss': 0.185, 'grad_norm': 11.556520462036133, 'learning_rate': 3.085106382978723e-05, 'epoch': 2.48}


 42%|████▏     | 83/198 [02:36<03:01,  1.58s/it]

{'loss': 0.0431, 'grad_norm': 1.6747602224349976, 'learning_rate': 3.0585106382978726e-05, 'epoch': 2.52}


 42%|████▏     | 84/198 [02:37<02:58,  1.56s/it]

{'loss': 0.127, 'grad_norm': 4.271610736846924, 'learning_rate': 3.0319148936170216e-05, 'epoch': 2.55}


 43%|████▎     | 85/198 [02:39<02:50,  1.50s/it]

{'loss': 0.0662, 'grad_norm': 3.9540133476257324, 'learning_rate': 3.0053191489361706e-05, 'epoch': 2.58}


 43%|████▎     | 86/198 [02:41<03:04,  1.65s/it]

{'loss': 0.0798, 'grad_norm': 3.042069435119629, 'learning_rate': 2.9787234042553192e-05, 'epoch': 2.61}


 44%|████▍     | 87/198 [02:42<02:54,  1.58s/it]

{'loss': 0.0188, 'grad_norm': 1.0735986232757568, 'learning_rate': 2.9521276595744685e-05, 'epoch': 2.64}


 44%|████▍     | 88/198 [02:44<02:51,  1.56s/it]

{'loss': 0.0657, 'grad_norm': 8.731338500976562, 'learning_rate': 2.925531914893617e-05, 'epoch': 2.67}


 45%|████▍     | 89/198 [02:45<02:39,  1.47s/it]

{'loss': 0.1163, 'grad_norm': 8.695021629333496, 'learning_rate': 2.898936170212766e-05, 'epoch': 2.7}


 45%|████▌     | 90/198 [02:46<02:35,  1.44s/it]

{'loss': 0.0883, 'grad_norm': 4.795970439910889, 'learning_rate': 2.8723404255319154e-05, 'epoch': 2.73}


 46%|████▌     | 91/198 [02:48<02:36,  1.46s/it]

{'loss': 0.0221, 'grad_norm': 1.6006938219070435, 'learning_rate': 2.845744680851064e-05, 'epoch': 2.76}


 46%|████▋     | 92/198 [02:50<02:42,  1.54s/it]

{'loss': 0.0305, 'grad_norm': 2.243584394454956, 'learning_rate': 2.819148936170213e-05, 'epoch': 2.79}


 47%|████▋     | 93/198 [02:51<02:46,  1.59s/it]

{'loss': 0.0671, 'grad_norm': 5.8436455726623535, 'learning_rate': 2.7925531914893616e-05, 'epoch': 2.82}


 47%|████▋     | 94/198 [02:53<02:35,  1.49s/it]

{'loss': 0.0102, 'grad_norm': 0.6999393701553345, 'learning_rate': 2.765957446808511e-05, 'epoch': 2.85}


 48%|████▊     | 95/198 [02:55<02:54,  1.69s/it]

{'loss': 0.3513, 'grad_norm': 8.063093185424805, 'learning_rate': 2.7393617021276595e-05, 'epoch': 2.88}


 48%|████▊     | 96/198 [02:57<03:12,  1.89s/it]

{'loss': 0.2283, 'grad_norm': 5.214378833770752, 'learning_rate': 2.7127659574468084e-05, 'epoch': 2.91}


 49%|████▉     | 97/198 [02:59<03:23,  2.02s/it]

{'loss': 0.1403, 'grad_norm': 3.8067386150360107, 'learning_rate': 2.6861702127659577e-05, 'epoch': 2.94}


 49%|████▉     | 98/198 [03:01<03:15,  1.95s/it]

{'loss': 0.0317, 'grad_norm': 4.458873748779297, 'learning_rate': 2.6595744680851064e-05, 'epoch': 2.97}


 50%|█████     | 99/198 [03:03<02:57,  1.79s/it]

{'loss': 0.0414, 'grad_norm': 6.5904083251953125, 'learning_rate': 2.6329787234042553e-05, 'epoch': 3.0}


 51%|█████     | 100/198 [03:07<04:21,  2.67s/it]

{'loss': 0.0165, 'grad_norm': 1.097118854522705, 'learning_rate': 2.6063829787234046e-05, 'epoch': 3.03}


 51%|█████     | 101/198 [03:09<03:50,  2.38s/it]

{'loss': 0.0372, 'grad_norm': 2.8986775875091553, 'learning_rate': 2.5797872340425532e-05, 'epoch': 3.06}


 52%|█████▏    | 102/198 [03:11<03:31,  2.20s/it]

{'loss': 0.0313, 'grad_norm': 1.208497166633606, 'learning_rate': 2.5531914893617022e-05, 'epoch': 3.09}


 52%|█████▏    | 103/198 [03:13<03:17,  2.08s/it]

{'loss': 0.0107, 'grad_norm': 0.49542760848999023, 'learning_rate': 2.5265957446808515e-05, 'epoch': 3.12}


 53%|█████▎    | 104/198 [03:15<03:17,  2.10s/it]

{'loss': 0.1298, 'grad_norm': 5.385983943939209, 'learning_rate': 2.5e-05, 'epoch': 3.15}


 53%|█████▎    | 105/198 [03:16<02:54,  1.88s/it]

{'loss': 0.0401, 'grad_norm': 3.6569063663482666, 'learning_rate': 2.473404255319149e-05, 'epoch': 3.18}


 54%|█████▎    | 106/198 [03:18<02:43,  1.77s/it]

{'loss': 0.0441, 'grad_norm': 3.7052700519561768, 'learning_rate': 2.446808510638298e-05, 'epoch': 3.21}


 54%|█████▍    | 107/198 [03:19<02:30,  1.66s/it]

{'loss': 0.0421, 'grad_norm': 2.9388303756713867, 'learning_rate': 2.420212765957447e-05, 'epoch': 3.24}


 55%|█████▍    | 108/198 [03:21<02:37,  1.75s/it]

{'loss': 0.0721, 'grad_norm': 6.667715072631836, 'learning_rate': 2.393617021276596e-05, 'epoch': 3.27}


 55%|█████▌    | 109/198 [03:23<02:51,  1.93s/it]

{'loss': 0.0336, 'grad_norm': 2.1439900398254395, 'learning_rate': 2.3670212765957446e-05, 'epoch': 3.3}


 56%|█████▌    | 110/198 [03:25<02:43,  1.86s/it]

{'loss': 0.0207, 'grad_norm': 1.0125291347503662, 'learning_rate': 2.340425531914894e-05, 'epoch': 3.33}


 56%|█████▌    | 111/198 [03:26<02:26,  1.69s/it]

{'loss': 0.0084, 'grad_norm': 0.5962375402450562, 'learning_rate': 2.3138297872340425e-05, 'epoch': 3.36}


 57%|█████▋    | 112/198 [03:28<02:14,  1.56s/it]

{'loss': 0.0474, 'grad_norm': 2.8947417736053467, 'learning_rate': 2.2872340425531915e-05, 'epoch': 3.39}


 57%|█████▋    | 113/198 [03:29<02:05,  1.48s/it]

{'loss': 0.0903, 'grad_norm': 6.950456619262695, 'learning_rate': 2.2606382978723408e-05, 'epoch': 3.42}


 58%|█████▊    | 114/198 [03:30<02:02,  1.46s/it]

{'loss': 0.0058, 'grad_norm': 0.42648622393608093, 'learning_rate': 2.2340425531914894e-05, 'epoch': 3.45}


 58%|█████▊    | 115/198 [03:32<02:06,  1.53s/it]

{'loss': 0.0615, 'grad_norm': 3.3892462253570557, 'learning_rate': 2.2074468085106383e-05, 'epoch': 3.48}


 59%|█████▊    | 116/198 [03:33<01:58,  1.45s/it]

{'loss': 0.0581, 'grad_norm': 3.6277213096618652, 'learning_rate': 2.1808510638297873e-05, 'epoch': 3.52}


 59%|█████▉    | 117/198 [03:35<01:59,  1.48s/it]

{'loss': 0.0355, 'grad_norm': 4.084924697875977, 'learning_rate': 2.1542553191489363e-05, 'epoch': 3.55}


 60%|█████▉    | 118/198 [03:36<02:03,  1.54s/it]

{'loss': 0.0437, 'grad_norm': 2.831000328063965, 'learning_rate': 2.1276595744680852e-05, 'epoch': 3.58}


 60%|██████    | 119/198 [03:38<02:01,  1.54s/it]

{'loss': 0.0212, 'grad_norm': 2.697206735610962, 'learning_rate': 2.1010638297872342e-05, 'epoch': 3.61}


 61%|██████    | 120/198 [03:40<02:05,  1.61s/it]

{'loss': 0.033, 'grad_norm': 2.7717952728271484, 'learning_rate': 2.074468085106383e-05, 'epoch': 3.64}


 61%|██████    | 121/198 [03:41<02:06,  1.64s/it]

{'loss': 0.0475, 'grad_norm': 4.484803199768066, 'learning_rate': 2.047872340425532e-05, 'epoch': 3.67}


 62%|██████▏   | 122/198 [03:43<02:05,  1.66s/it]

{'loss': 0.0079, 'grad_norm': 1.0975624322891235, 'learning_rate': 2.0212765957446807e-05, 'epoch': 3.7}


 62%|██████▏   | 123/198 [03:45<01:57,  1.57s/it]

{'loss': 0.0559, 'grad_norm': 4.662158012390137, 'learning_rate': 1.9946808510638297e-05, 'epoch': 3.73}


 63%|██████▎   | 124/198 [03:46<02:01,  1.65s/it]

{'loss': 0.0436, 'grad_norm': 3.580444574356079, 'learning_rate': 1.968085106382979e-05, 'epoch': 3.76}


 63%|██████▎   | 125/198 [03:49<02:15,  1.85s/it]

{'loss': 0.025, 'grad_norm': 2.5484180450439453, 'learning_rate': 1.9414893617021276e-05, 'epoch': 3.79}


 64%|██████▎   | 126/198 [03:50<02:03,  1.72s/it]

{'loss': 0.0194, 'grad_norm': 1.8302993774414062, 'learning_rate': 1.9148936170212766e-05, 'epoch': 3.82}


 64%|██████▍   | 127/198 [03:52<02:07,  1.80s/it]

{'loss': 0.0151, 'grad_norm': 1.7138370275497437, 'learning_rate': 1.888297872340426e-05, 'epoch': 3.85}


 65%|██████▍   | 128/198 [03:54<02:00,  1.72s/it]

{'loss': 0.0756, 'grad_norm': 2.9254636764526367, 'learning_rate': 1.8617021276595745e-05, 'epoch': 3.88}


 65%|██████▌   | 129/198 [03:55<01:52,  1.62s/it]

{'loss': 0.0218, 'grad_norm': 1.3163985013961792, 'learning_rate': 1.8351063829787234e-05, 'epoch': 3.91}


 66%|██████▌   | 130/198 [03:57<01:48,  1.59s/it]

{'loss': 0.0133, 'grad_norm': 1.5377591848373413, 'learning_rate': 1.8085106382978724e-05, 'epoch': 3.94}


 66%|██████▌   | 131/198 [03:58<01:42,  1.53s/it]

{'loss': 0.0263, 'grad_norm': 1.046928882598877, 'learning_rate': 1.7819148936170214e-05, 'epoch': 3.97}


 67%|██████▋   | 132/198 [03:59<01:41,  1.53s/it]

{'loss': 0.0244, 'grad_norm': 0.8592835068702698, 'learning_rate': 1.7553191489361703e-05, 'epoch': 4.0}


 67%|██████▋   | 133/198 [04:04<02:41,  2.49s/it]

{'loss': 0.0088, 'grad_norm': 0.3856964707374573, 'learning_rate': 1.7287234042553193e-05, 'epoch': 4.03}


 68%|██████▊   | 134/198 [04:06<02:24,  2.25s/it]

{'loss': 0.0065, 'grad_norm': 1.1101787090301514, 'learning_rate': 1.7021276595744682e-05, 'epoch': 4.06}


 68%|██████▊   | 135/198 [04:08<02:12,  2.11s/it]

{'loss': 0.0176, 'grad_norm': 2.5035524368286133, 'learning_rate': 1.6755319148936172e-05, 'epoch': 4.09}


 69%|██████▊   | 136/198 [04:09<01:54,  1.85s/it]

{'loss': 0.0412, 'grad_norm': 2.754216194152832, 'learning_rate': 1.6489361702127658e-05, 'epoch': 4.12}


 69%|██████▉   | 137/198 [04:10<01:44,  1.72s/it]

{'loss': 0.0094, 'grad_norm': 0.5756649374961853, 'learning_rate': 1.6223404255319148e-05, 'epoch': 4.15}


 70%|██████▉   | 138/198 [04:12<01:40,  1.67s/it]

{'loss': 0.0096, 'grad_norm': 0.4211764335632324, 'learning_rate': 1.595744680851064e-05, 'epoch': 4.18}


 70%|███████   | 139/198 [04:14<01:38,  1.67s/it]

{'loss': 0.0184, 'grad_norm': 2.2160727977752686, 'learning_rate': 1.5691489361702127e-05, 'epoch': 4.21}


 71%|███████   | 140/198 [04:15<01:37,  1.68s/it]

{'loss': 0.0063, 'grad_norm': 0.3347245752811432, 'learning_rate': 1.5425531914893617e-05, 'epoch': 4.24}


 71%|███████   | 141/198 [04:17<01:38,  1.72s/it]

{'loss': 0.0161, 'grad_norm': 1.276038408279419, 'learning_rate': 1.5159574468085108e-05, 'epoch': 4.27}


 72%|███████▏  | 142/198 [04:18<01:31,  1.63s/it]

{'loss': 0.0248, 'grad_norm': 3.6792612075805664, 'learning_rate': 1.4893617021276596e-05, 'epoch': 4.3}


 72%|███████▏  | 143/198 [04:20<01:25,  1.56s/it]

{'loss': 0.0078, 'grad_norm': 0.7755721807479858, 'learning_rate': 1.4627659574468085e-05, 'epoch': 4.33}


 73%|███████▎  | 144/198 [04:22<01:28,  1.63s/it]

{'loss': 0.0062, 'grad_norm': 0.33763909339904785, 'learning_rate': 1.4361702127659577e-05, 'epoch': 4.36}


 73%|███████▎  | 145/198 [04:23<01:22,  1.56s/it]

{'loss': 0.0091, 'grad_norm': 0.7188576459884644, 'learning_rate': 1.4095744680851065e-05, 'epoch': 4.39}


 74%|███████▎  | 146/198 [04:24<01:18,  1.51s/it]

{'loss': 0.0097, 'grad_norm': 0.5150034427642822, 'learning_rate': 1.3829787234042554e-05, 'epoch': 4.42}


 74%|███████▍  | 147/198 [04:26<01:17,  1.52s/it]

{'loss': 0.0212, 'grad_norm': 3.999324083328247, 'learning_rate': 1.3563829787234042e-05, 'epoch': 4.45}


 75%|███████▍  | 148/198 [04:28<01:16,  1.52s/it]

{'loss': 0.0218, 'grad_norm': 6.395025253295898, 'learning_rate': 1.3297872340425532e-05, 'epoch': 4.48}


 75%|███████▌  | 149/198 [04:30<01:23,  1.71s/it]

{'loss': 0.049, 'grad_norm': 4.691069602966309, 'learning_rate': 1.3031914893617023e-05, 'epoch': 4.52}


 76%|███████▌  | 150/198 [04:31<01:17,  1.61s/it]

{'loss': 0.0053, 'grad_norm': 1.0850037336349487, 'learning_rate': 1.2765957446808511e-05, 'epoch': 4.55}


 76%|███████▋  | 151/198 [04:33<01:14,  1.59s/it]

{'loss': 0.006, 'grad_norm': 0.3837309181690216, 'learning_rate': 1.25e-05, 'epoch': 4.58}


 77%|███████▋  | 152/198 [04:34<01:10,  1.53s/it]

{'loss': 0.0264, 'grad_norm': 2.2927937507629395, 'learning_rate': 1.223404255319149e-05, 'epoch': 4.61}


 77%|███████▋  | 153/198 [04:36<01:11,  1.58s/it]

{'loss': 0.0183, 'grad_norm': 1.6262975931167603, 'learning_rate': 1.196808510638298e-05, 'epoch': 4.64}


 78%|███████▊  | 154/198 [04:37<01:11,  1.62s/it]

{'loss': 0.0083, 'grad_norm': 0.4860614538192749, 'learning_rate': 1.170212765957447e-05, 'epoch': 4.67}


 78%|███████▊  | 155/198 [04:39<01:14,  1.73s/it]

{'loss': 0.0068, 'grad_norm': 1.3728102445602417, 'learning_rate': 1.1436170212765957e-05, 'epoch': 4.7}


 79%|███████▉  | 156/198 [04:42<01:20,  1.91s/it]

{'loss': 0.0076, 'grad_norm': 0.7337515950202942, 'learning_rate': 1.1170212765957447e-05, 'epoch': 4.73}


 79%|███████▉  | 157/198 [04:43<01:10,  1.72s/it]

{'loss': 0.0181, 'grad_norm': 0.9895786643028259, 'learning_rate': 1.0904255319148937e-05, 'epoch': 4.76}


 80%|███████▉  | 158/198 [04:45<01:11,  1.79s/it]

{'loss': 0.023, 'grad_norm': 0.9253844618797302, 'learning_rate': 1.0638297872340426e-05, 'epoch': 4.79}


 80%|████████  | 159/198 [04:46<01:04,  1.64s/it]

{'loss': 0.0084, 'grad_norm': 0.8015117645263672, 'learning_rate': 1.0372340425531916e-05, 'epoch': 4.82}


 81%|████████  | 160/198 [04:48<01:04,  1.69s/it]

{'loss': 0.0259, 'grad_norm': 3.03987193107605, 'learning_rate': 1.0106382978723404e-05, 'epoch': 4.85}


 81%|████████▏ | 161/198 [04:50<01:09,  1.88s/it]

{'loss': 0.0094, 'grad_norm': 0.48129913210868835, 'learning_rate': 9.840425531914895e-06, 'epoch': 4.88}


 82%|████████▏ | 162/198 [04:52<01:03,  1.78s/it]

{'loss': 0.0073, 'grad_norm': 0.48436084389686584, 'learning_rate': 9.574468085106383e-06, 'epoch': 4.91}


 82%|████████▏ | 163/198 [04:53<00:56,  1.62s/it]

{'loss': 0.0067, 'grad_norm': 0.3341325521469116, 'learning_rate': 9.308510638297872e-06, 'epoch': 4.94}


 83%|████████▎ | 164/198 [04:55<00:54,  1.60s/it]

{'loss': 0.005, 'grad_norm': 0.24795368313789368, 'learning_rate': 9.042553191489362e-06, 'epoch': 4.97}


 83%|████████▎ | 165/198 [04:56<00:53,  1.63s/it]

{'loss': 0.0075, 'grad_norm': 0.4128953814506531, 'learning_rate': 8.776595744680852e-06, 'epoch': 5.0}


 84%|████████▍ | 166/198 [05:01<01:22,  2.59s/it]

{'loss': 0.0062, 'grad_norm': 0.3544166386127472, 'learning_rate': 8.510638297872341e-06, 'epoch': 5.03}


 84%|████████▍ | 167/198 [05:03<01:12,  2.35s/it]

{'loss': 0.0046, 'grad_norm': 0.9356147050857544, 'learning_rate': 8.244680851063829e-06, 'epoch': 5.06}


 85%|████████▍ | 168/198 [05:04<01:00,  2.02s/it]

{'loss': 0.0211, 'grad_norm': 1.5683048963546753, 'learning_rate': 7.97872340425532e-06, 'epoch': 5.09}


 85%|████████▌ | 169/198 [05:06<00:55,  1.92s/it]

{'loss': 0.005, 'grad_norm': 1.6329779624938965, 'learning_rate': 7.712765957446808e-06, 'epoch': 5.12}


 86%|████████▌ | 170/198 [05:07<00:48,  1.73s/it]

{'loss': 0.0055, 'grad_norm': 0.26289665699005127, 'learning_rate': 7.446808510638298e-06, 'epoch': 5.15}


 86%|████████▋ | 171/198 [05:09<00:46,  1.71s/it]

{'loss': 0.0114, 'grad_norm': 1.4385648965835571, 'learning_rate': 7.180851063829788e-06, 'epoch': 5.18}


 87%|████████▋ | 172/198 [05:10<00:42,  1.62s/it]

{'loss': 0.0065, 'grad_norm': 0.32637977600097656, 'learning_rate': 6.914893617021277e-06, 'epoch': 5.21}


 87%|████████▋ | 173/198 [05:12<00:41,  1.67s/it]

{'loss': 0.0084, 'grad_norm': 1.3563005924224854, 'learning_rate': 6.648936170212766e-06, 'epoch': 5.24}


 88%|████████▊ | 174/198 [05:14<00:41,  1.72s/it]

{'loss': 0.0081, 'grad_norm': 1.005731463432312, 'learning_rate': 6.3829787234042555e-06, 'epoch': 5.27}


 88%|████████▊ | 175/198 [05:15<00:37,  1.62s/it]

{'loss': 0.0062, 'grad_norm': 0.4067169725894928, 'learning_rate': 6.117021276595745e-06, 'epoch': 5.3}


 89%|████████▉ | 176/198 [05:18<00:40,  1.83s/it]

{'loss': 0.0045, 'grad_norm': 0.17648029327392578, 'learning_rate': 5.851063829787235e-06, 'epoch': 5.33}


 89%|████████▉ | 177/198 [05:20<00:39,  1.88s/it]

{'loss': 0.0043, 'grad_norm': 0.2738938629627228, 'learning_rate': 5.5851063829787235e-06, 'epoch': 5.36}


 90%|████████▉ | 178/198 [05:21<00:34,  1.73s/it]

{'loss': 0.0076, 'grad_norm': 1.1787018775939941, 'learning_rate': 5.319148936170213e-06, 'epoch': 5.39}


 90%|█████████ | 179/198 [05:22<00:30,  1.62s/it]

{'loss': 0.0038, 'grad_norm': 0.2162531167268753, 'learning_rate': 5.053191489361702e-06, 'epoch': 5.42}


 91%|█████████ | 180/198 [05:24<00:28,  1.56s/it]

{'loss': 0.0046, 'grad_norm': 0.2476741224527359, 'learning_rate': 4.787234042553191e-06, 'epoch': 5.45}


 91%|█████████▏| 181/198 [05:25<00:26,  1.55s/it]

{'loss': 0.0063, 'grad_norm': 0.3547848165035248, 'learning_rate': 4.521276595744681e-06, 'epoch': 5.48}


 92%|█████████▏| 182/198 [05:27<00:25,  1.60s/it]

{'loss': 0.0163, 'grad_norm': 0.9913724064826965, 'learning_rate': 4.255319148936171e-06, 'epoch': 5.52}


 92%|█████████▏| 183/198 [05:29<00:23,  1.58s/it]

{'loss': 0.0046, 'grad_norm': 0.41194552183151245, 'learning_rate': 3.98936170212766e-06, 'epoch': 5.55}


 93%|█████████▎| 184/198 [05:30<00:22,  1.62s/it]

{'loss': 0.0057, 'grad_norm': 0.28684312105178833, 'learning_rate': 3.723404255319149e-06, 'epoch': 5.58}


 93%|█████████▎| 185/198 [05:32<00:22,  1.73s/it]

{'loss': 0.0086, 'grad_norm': 0.5813469290733337, 'learning_rate': 3.4574468085106386e-06, 'epoch': 5.61}


 94%|█████████▍| 186/198 [05:34<00:19,  1.63s/it]

{'loss': 0.0206, 'grad_norm': 4.833148002624512, 'learning_rate': 3.1914893617021277e-06, 'epoch': 5.64}


 94%|█████████▍| 187/198 [05:35<00:17,  1.61s/it]

{'loss': 0.0073, 'grad_norm': 0.707857608795166, 'learning_rate': 2.9255319148936174e-06, 'epoch': 5.67}


 95%|█████████▍| 188/198 [05:37<00:16,  1.64s/it]

{'loss': 0.0054, 'grad_norm': 0.30262085795402527, 'learning_rate': 2.6595744680851065e-06, 'epoch': 5.7}


 95%|█████████▌| 189/198 [05:38<00:14,  1.57s/it]

{'loss': 0.0063, 'grad_norm': 0.32007724046707153, 'learning_rate': 2.3936170212765957e-06, 'epoch': 5.73}


 96%|█████████▌| 190/198 [05:40<00:13,  1.64s/it]

{'loss': 0.0038, 'grad_norm': 0.1913178563117981, 'learning_rate': 2.1276595744680853e-06, 'epoch': 5.76}


 96%|█████████▋| 191/198 [05:42<00:11,  1.61s/it]

{'loss': 0.0106, 'grad_norm': 3.53623628616333, 'learning_rate': 1.8617021276595745e-06, 'epoch': 5.79}


 97%|█████████▋| 192/198 [05:43<00:09,  1.58s/it]

{'loss': 0.0056, 'grad_norm': 1.5241271257400513, 'learning_rate': 1.5957446808510639e-06, 'epoch': 5.82}


 97%|█████████▋| 193/198 [05:44<00:07,  1.49s/it]

{'loss': 0.0042, 'grad_norm': 0.29021331667900085, 'learning_rate': 1.3297872340425533e-06, 'epoch': 5.85}


 98%|█████████▊| 194/198 [05:47<00:06,  1.69s/it]

{'loss': 0.0179, 'grad_norm': 1.8681721687316895, 'learning_rate': 1.0638297872340427e-06, 'epoch': 5.88}


 98%|█████████▊| 195/198 [05:49<00:05,  1.88s/it]

{'loss': 0.0053, 'grad_norm': 0.2604987919330597, 'learning_rate': 7.978723404255319e-07, 'epoch': 5.91}


 99%|█████████▉| 196/198 [05:50<00:03,  1.77s/it]

{'loss': 0.0046, 'grad_norm': 0.2631043791770935, 'learning_rate': 5.319148936170213e-07, 'epoch': 5.94}


 99%|█████████▉| 197/198 [05:52<00:01,  1.65s/it]

{'loss': 0.0066, 'grad_norm': 0.46007052063941956, 'learning_rate': 2.6595744680851066e-07, 'epoch': 5.97}


100%|██████████| 198/198 [05:54<00:00,  1.67s/it]

{'loss': 0.0039, 'grad_norm': 0.29465770721435547, 'learning_rate': 0.0, 'epoch': 6.0}


100%|██████████| 198/198 [05:57<00:00,  1.81s/it]

{'train_runtime': 357.4326, 'train_samples_per_second': 0.554, 'train_steps_per_second': 0.554, 'train_loss': 0.33379795959553293, 'epoch': 6.0}





In [9]:
# Write the model and the tokenizer into the same output directory.
# NOTE(review): `model` is the PEFT-wrapped model, so this presumably stores the
# LoRA adapter rather than merged full weights — verify before deploying.
save_dir = "./Mistral-finetuned"
model.save_pretrained(save_dir)
# Kept as the last expression so the cell displays the tokenizer files written.
tokenizer.save_pretrained(save_dir)

('./Mistral-finetuned/tokenizer_config.json',
 './Mistral-finetuned/special_tokens_map.json',
 './Mistral-finetuned/tokenizer.json')