In [9]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from datasets import Dataset

In [6]:
df = pd.read_csv('Conversation.csv')

In [7]:
# Combine question-answer pairs into a dialogue format for training
df['input_text'] = df['question'] + " " + df['answer']
dataset = Dataset.from_pandas(df[['input_text']])

In [10]:
# Initialize the tokenizer and model
model_name = "gpt2"  # Choose a small model like distilbert, bert, or gpt2
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [12]:
# Set padding token
tokenizer.pad_token = tokenizer.eos_token

In [13]:
# Resize model embeddings to account for the added padding token
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [14]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=64)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 3725/3725 [00:00<00:00, 6064.91 examples/s]


In [15]:
# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [16]:
# Set up LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank decomposition
    lora_alpha=32,  # Alpha scaling factor
    target_modules=["c_attn", "c_proj"],  # Apply LoRA to attention layers in GPT-2
    lora_dropout=0.1,  # Dropout rate
)

In [17]:
# Apply LoRA to the model
lora_model = get_peft_model(model, lora_config)



In [18]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./lora-finetuned-model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
)

In [19]:
# Initialize Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [20]:
# Start training
trainer.train()

  1%|          | 10/1398 [00:22<46:22,  2.00s/it] 

{'loss': 4.1873, 'grad_norm': 0.9092060327529907, 'learning_rate': 4.964234620886982e-05, 'epoch': 0.02}


  1%|▏         | 20/1398 [00:44<49:48,  2.17s/it]

{'loss': 4.1091, 'grad_norm': 1.2194467782974243, 'learning_rate': 4.928469241773963e-05, 'epoch': 0.04}


  2%|▏         | 30/1398 [01:00<34:52,  1.53s/it]

{'loss': 4.156, 'grad_norm': 1.172675609588623, 'learning_rate': 4.8927038626609446e-05, 'epoch': 0.06}


  3%|▎         | 40/1398 [01:16<35:34,  1.57s/it]

{'loss': 3.9594, 'grad_norm': 1.2112587690353394, 'learning_rate': 4.856938483547926e-05, 'epoch': 0.09}


  4%|▎         | 50/1398 [01:32<36:12,  1.61s/it]

{'loss': 3.7932, 'grad_norm': 1.1277062892913818, 'learning_rate': 4.8211731044349076e-05, 'epoch': 0.11}


  4%|▍         | 60/1398 [01:48<35:44,  1.60s/it]

{'loss': 3.5902, 'grad_norm': 1.4214569330215454, 'learning_rate': 4.785407725321889e-05, 'epoch': 0.13}


  5%|▌         | 70/1398 [02:04<34:51,  1.57s/it]

{'loss': 3.6281, 'grad_norm': 1.8246862888336182, 'learning_rate': 4.74964234620887e-05, 'epoch': 0.15}


  6%|▌         | 80/1398 [02:21<36:12,  1.65s/it]

{'loss': 3.5588, 'grad_norm': 1.828236699104309, 'learning_rate': 4.713876967095851e-05, 'epoch': 0.17}


  6%|▋         | 90/1398 [02:38<40:23,  1.85s/it]

{'loss': 3.478, 'grad_norm': 2.4205355644226074, 'learning_rate': 4.678111587982833e-05, 'epoch': 0.19}


  7%|▋         | 100/1398 [02:55<37:30,  1.73s/it]

{'loss': 3.5082, 'grad_norm': 1.4839125871658325, 'learning_rate': 4.642346208869814e-05, 'epoch': 0.21}


  8%|▊         | 110/1398 [03:13<37:27,  1.75s/it]

{'loss': 3.4222, 'grad_norm': 1.8698281049728394, 'learning_rate': 4.606580829756796e-05, 'epoch': 0.24}


  9%|▊         | 120/1398 [03:30<35:25,  1.66s/it]

{'loss': 3.4284, 'grad_norm': 1.6848185062408447, 'learning_rate': 4.570815450643777e-05, 'epoch': 0.26}


  9%|▉         | 130/1398 [03:46<34:59,  1.66s/it]

{'loss': 3.4171, 'grad_norm': 1.9011300802230835, 'learning_rate': 4.5350500715307586e-05, 'epoch': 0.28}


 10%|█         | 140/1398 [04:03<35:18,  1.68s/it]

{'loss': 3.1921, 'grad_norm': 1.7150545120239258, 'learning_rate': 4.49928469241774e-05, 'epoch': 0.3}


 11%|█         | 150/1398 [04:21<37:14,  1.79s/it]

{'loss': 3.3786, 'grad_norm': 1.893955111503601, 'learning_rate': 4.4635193133047216e-05, 'epoch': 0.32}


 11%|█▏        | 160/1398 [04:39<37:27,  1.82s/it]

{'loss': 3.37, 'grad_norm': 2.5419111251831055, 'learning_rate': 4.4277539341917024e-05, 'epoch': 0.34}


 12%|█▏        | 170/1398 [04:57<38:18,  1.87s/it]

{'loss': 3.2869, 'grad_norm': 1.74984872341156, 'learning_rate': 4.391988555078684e-05, 'epoch': 0.36}


 13%|█▎        | 180/1398 [05:16<38:34,  1.90s/it]

{'loss': 3.4988, 'grad_norm': 1.7831679582595825, 'learning_rate': 4.356223175965665e-05, 'epoch': 0.39}


 14%|█▎        | 190/1398 [05:35<36:40,  1.82s/it]

{'loss': 3.4048, 'grad_norm': 1.7854557037353516, 'learning_rate': 4.320457796852647e-05, 'epoch': 0.41}


 14%|█▍        | 200/1398 [05:53<36:03,  1.81s/it]

{'loss': 3.2474, 'grad_norm': 2.058101177215576, 'learning_rate': 4.284692417739628e-05, 'epoch': 0.43}


 15%|█▌        | 210/1398 [06:12<36:55,  1.86s/it]

{'loss': 3.3426, 'grad_norm': 3.000382661819458, 'learning_rate': 4.24892703862661e-05, 'epoch': 0.45}


 16%|█▌        | 220/1398 [06:31<36:46,  1.87s/it]

{'loss': 3.2032, 'grad_norm': 1.7072010040283203, 'learning_rate': 4.213161659513591e-05, 'epoch': 0.47}


 16%|█▋        | 230/1398 [06:50<37:17,  1.92s/it]

{'loss': 3.3649, 'grad_norm': 1.921324610710144, 'learning_rate': 4.1773962804005726e-05, 'epoch': 0.49}


 17%|█▋        | 240/1398 [07:10<39:39,  2.05s/it]

{'loss': 3.281, 'grad_norm': 2.074822187423706, 'learning_rate': 4.1416309012875534e-05, 'epoch': 0.52}


 18%|█▊        | 250/1398 [07:31<39:59,  2.09s/it]

{'loss': 3.3093, 'grad_norm': 1.915819764137268, 'learning_rate': 4.105865522174535e-05, 'epoch': 0.54}


 19%|█▊        | 260/1398 [07:53<40:58,  2.16s/it]

{'loss': 3.3691, 'grad_norm': 2.947751998901367, 'learning_rate': 4.0701001430615164e-05, 'epoch': 0.56}


 19%|█▉        | 270/1398 [08:14<39:28,  2.10s/it]

{'loss': 3.1628, 'grad_norm': 1.8204374313354492, 'learning_rate': 4.034334763948498e-05, 'epoch': 0.58}


 20%|██        | 280/1398 [08:36<39:34,  2.12s/it]

{'loss': 3.273, 'grad_norm': 2.095668315887451, 'learning_rate': 3.998569384835479e-05, 'epoch': 0.6}


 21%|██        | 290/1398 [08:57<39:52,  2.16s/it]

{'loss': 3.2262, 'grad_norm': 1.9892082214355469, 'learning_rate': 3.962804005722461e-05, 'epoch': 0.62}


 21%|██▏       | 300/1398 [09:18<39:00,  2.13s/it]

{'loss': 3.1904, 'grad_norm': 2.1545400619506836, 'learning_rate': 3.927038626609442e-05, 'epoch': 0.64}


 22%|██▏       | 310/1398 [09:40<40:57,  2.26s/it]

{'loss': 3.2311, 'grad_norm': 1.823108196258545, 'learning_rate': 3.891273247496424e-05, 'epoch': 0.67}


 23%|██▎       | 320/1398 [10:02<37:48,  2.10s/it]

{'loss': 3.2787, 'grad_norm': 1.962561845779419, 'learning_rate': 3.855507868383405e-05, 'epoch': 0.69}


 24%|██▎       | 330/1398 [10:23<38:41,  2.17s/it]

{'loss': 3.0756, 'grad_norm': 1.7926772832870483, 'learning_rate': 3.8197424892703866e-05, 'epoch': 0.71}


 24%|██▍       | 340/1398 [10:45<37:03,  2.10s/it]

{'loss': 3.1387, 'grad_norm': 2.2201812267303467, 'learning_rate': 3.783977110157368e-05, 'epoch': 0.73}


 25%|██▌       | 350/1398 [11:05<36:07,  2.07s/it]

{'loss': 3.2927, 'grad_norm': 2.332338571548462, 'learning_rate': 3.7482117310443496e-05, 'epoch': 0.75}


 26%|██▌       | 360/1398 [11:27<36:57,  2.14s/it]

{'loss': 3.2301, 'grad_norm': 2.4191033840179443, 'learning_rate': 3.712446351931331e-05, 'epoch': 0.77}


 26%|██▋       | 370/1398 [11:48<36:36,  2.14s/it]

{'loss': 3.1928, 'grad_norm': 1.7739166021347046, 'learning_rate': 3.676680972818312e-05, 'epoch': 0.79}


 27%|██▋       | 380/1398 [12:11<38:34,  2.27s/it]

{'loss': 3.2061, 'grad_norm': 1.8932071924209595, 'learning_rate': 3.640915593705293e-05, 'epoch': 0.82}


 28%|██▊       | 390/1398 [12:33<37:14,  2.22s/it]

{'loss': 3.2865, 'grad_norm': 2.3869006633758545, 'learning_rate': 3.605150214592275e-05, 'epoch': 0.84}


 29%|██▊       | 400/1398 [12:57<41:36,  2.50s/it]

{'loss': 3.3198, 'grad_norm': 2.0679585933685303, 'learning_rate': 3.569384835479256e-05, 'epoch': 0.86}


 29%|██▉       | 410/1398 [13:20<36:55,  2.24s/it]

{'loss': 3.1244, 'grad_norm': 2.624908924102783, 'learning_rate': 3.533619456366238e-05, 'epoch': 0.88}


 30%|███       | 420/1398 [13:43<36:00,  2.21s/it]

{'loss': 3.2267, 'grad_norm': 2.03485107421875, 'learning_rate': 3.497854077253219e-05, 'epoch': 0.9}


 31%|███       | 430/1398 [14:05<36:35,  2.27s/it]

{'loss': 3.3289, 'grad_norm': 2.0667741298675537, 'learning_rate': 3.4620886981402006e-05, 'epoch': 0.92}


 31%|███▏      | 440/1398 [14:27<35:17,  2.21s/it]

{'loss': 3.1631, 'grad_norm': 2.288820505142212, 'learning_rate': 3.426323319027182e-05, 'epoch': 0.94}


 32%|███▏      | 450/1398 [14:49<33:15,  2.10s/it]

{'loss': 3.2831, 'grad_norm': 2.1724250316619873, 'learning_rate': 3.3905579399141636e-05, 'epoch': 0.97}


 33%|███▎      | 460/1398 [15:10<33:57,  2.17s/it]

{'loss': 3.1631, 'grad_norm': 2.5007262229919434, 'learning_rate': 3.354792560801145e-05, 'epoch': 0.99}


 34%|███▎      | 470/1398 [15:30<30:55,  2.00s/it]

{'loss': 3.1578, 'grad_norm': 3.984278440475464, 'learning_rate': 3.3190271816881265e-05, 'epoch': 1.01}


 34%|███▍      | 480/1398 [15:51<31:01,  2.03s/it]

{'loss': 3.2185, 'grad_norm': 2.1906678676605225, 'learning_rate': 3.283261802575107e-05, 'epoch': 1.03}


 35%|███▌      | 490/1398 [16:12<31:29,  2.08s/it]

{'loss': 3.1435, 'grad_norm': 2.0619940757751465, 'learning_rate': 3.247496423462089e-05, 'epoch': 1.05}


 36%|███▌      | 500/1398 [16:33<30:26,  2.03s/it]

{'loss': 3.2403, 'grad_norm': 1.8071168661117554, 'learning_rate': 3.21173104434907e-05, 'epoch': 1.07}


 36%|███▋      | 510/1398 [16:56<31:50,  2.15s/it]

{'loss': 3.235, 'grad_norm': 2.9036903381347656, 'learning_rate': 3.175965665236052e-05, 'epoch': 1.09}


 37%|███▋      | 520/1398 [17:17<30:09,  2.06s/it]

{'loss': 3.2562, 'grad_norm': 2.483210802078247, 'learning_rate': 3.140200286123033e-05, 'epoch': 1.12}


 38%|███▊      | 530/1398 [17:38<30:56,  2.14s/it]

{'loss': 3.1778, 'grad_norm': 2.407302141189575, 'learning_rate': 3.104434907010014e-05, 'epoch': 1.14}


 39%|███▊      | 540/1398 [17:59<29:06,  2.04s/it]

{'loss': 3.1384, 'grad_norm': 2.07947039604187, 'learning_rate': 3.0686695278969954e-05, 'epoch': 1.16}


 39%|███▉      | 550/1398 [18:20<28:59,  2.05s/it]

{'loss': 3.2344, 'grad_norm': 2.8624625205993652, 'learning_rate': 3.0329041487839772e-05, 'epoch': 1.18}


 40%|████      | 560/1398 [18:40<28:57,  2.07s/it]

{'loss': 3.1062, 'grad_norm': 2.545910358428955, 'learning_rate': 2.9971387696709587e-05, 'epoch': 1.2}


 41%|████      | 570/1398 [19:01<28:57,  2.10s/it]

{'loss': 3.2932, 'grad_norm': 2.39279842376709, 'learning_rate': 2.9613733905579398e-05, 'epoch': 1.22}


 41%|████▏     | 580/1398 [19:22<27:32,  2.02s/it]

{'loss': 3.1457, 'grad_norm': 2.4817678928375244, 'learning_rate': 2.9256080114449213e-05, 'epoch': 1.24}


 42%|████▏     | 590/1398 [19:43<27:22,  2.03s/it]

{'loss': 3.1528, 'grad_norm': 2.364473819732666, 'learning_rate': 2.8898426323319027e-05, 'epoch': 1.27}


 43%|████▎     | 600/1398 [20:03<27:01,  2.03s/it]

{'loss': 3.1464, 'grad_norm': 2.695836305618286, 'learning_rate': 2.8540772532188842e-05, 'epoch': 1.29}


 44%|████▎     | 610/1398 [20:24<27:03,  2.06s/it]

{'loss': 3.1867, 'grad_norm': 2.232842206954956, 'learning_rate': 2.8183118741058657e-05, 'epoch': 1.31}


 44%|████▍     | 620/1398 [20:45<28:01,  2.16s/it]

{'loss': 3.1521, 'grad_norm': 2.531153440475464, 'learning_rate': 2.782546494992847e-05, 'epoch': 1.33}


 45%|████▌     | 630/1398 [21:07<28:17,  2.21s/it]

{'loss': 3.1714, 'grad_norm': 2.5243072509765625, 'learning_rate': 2.7467811158798286e-05, 'epoch': 1.35}


 46%|████▌     | 640/1398 [21:32<29:58,  2.37s/it]

{'loss': 3.1519, 'grad_norm': 2.1176271438598633, 'learning_rate': 2.7110157367668097e-05, 'epoch': 1.37}


 46%|████▋     | 650/1398 [21:53<26:25,  2.12s/it]

{'loss': 3.0867, 'grad_norm': 2.32248854637146, 'learning_rate': 2.6752503576537912e-05, 'epoch': 1.39}


 47%|████▋     | 660/1398 [22:14<25:29,  2.07s/it]

{'loss': 3.1416, 'grad_norm': 2.537095308303833, 'learning_rate': 2.6394849785407727e-05, 'epoch': 1.42}


 48%|████▊     | 670/1398 [22:35<25:06,  2.07s/it]

{'loss': 3.2717, 'grad_norm': 2.063969850540161, 'learning_rate': 2.603719599427754e-05, 'epoch': 1.44}


 49%|████▊     | 680/1398 [22:56<24:24,  2.04s/it]

{'loss': 3.2163, 'grad_norm': 2.6541450023651123, 'learning_rate': 2.5679542203147356e-05, 'epoch': 1.46}


 49%|████▉     | 690/1398 [23:17<24:22,  2.07s/it]

{'loss': 3.1734, 'grad_norm': 2.898845672607422, 'learning_rate': 2.532188841201717e-05, 'epoch': 1.48}


 50%|█████     | 700/1398 [23:38<24:28,  2.10s/it]

{'loss': 3.1073, 'grad_norm': 2.419450283050537, 'learning_rate': 2.4964234620886985e-05, 'epoch': 1.5}


 51%|█████     | 710/1398 [23:59<23:34,  2.06s/it]

{'loss': 3.1542, 'grad_norm': 2.011422634124756, 'learning_rate': 2.4606580829756797e-05, 'epoch': 1.52}


 52%|█████▏    | 720/1398 [24:20<23:19,  2.06s/it]

{'loss': 3.1416, 'grad_norm': 2.7828636169433594, 'learning_rate': 2.4248927038626608e-05, 'epoch': 1.55}


 52%|█████▏    | 730/1398 [24:42<24:00,  2.16s/it]

{'loss': 3.2392, 'grad_norm': 2.201263189315796, 'learning_rate': 2.3891273247496423e-05, 'epoch': 1.57}


 53%|█████▎    | 740/1398 [25:04<24:22,  2.22s/it]

{'loss': 3.1622, 'grad_norm': 2.475813150405884, 'learning_rate': 2.3533619456366237e-05, 'epoch': 1.59}


 54%|█████▎    | 750/1398 [25:25<22:22,  2.07s/it]

{'loss': 3.1359, 'grad_norm': 2.4546046257019043, 'learning_rate': 2.3175965665236052e-05, 'epoch': 1.61}


 54%|█████▍    | 760/1398 [25:45<21:37,  2.03s/it]

{'loss': 3.2877, 'grad_norm': 2.6536760330200195, 'learning_rate': 2.2818311874105867e-05, 'epoch': 1.63}


 55%|█████▌    | 770/1398 [26:06<21:59,  2.10s/it]

{'loss': 3.2174, 'grad_norm': 2.6719419956207275, 'learning_rate': 2.246065808297568e-05, 'epoch': 1.65}


 56%|█████▌    | 780/1398 [26:28<21:37,  2.10s/it]

{'loss': 3.2109, 'grad_norm': 2.1571223735809326, 'learning_rate': 2.2103004291845496e-05, 'epoch': 1.67}


 57%|█████▋    | 790/1398 [26:49<20:59,  2.07s/it]

{'loss': 3.0623, 'grad_norm': 2.177504539489746, 'learning_rate': 2.1745350500715307e-05, 'epoch': 1.7}


 57%|█████▋    | 800/1398 [27:10<21:32,  2.16s/it]

{'loss': 3.193, 'grad_norm': 2.424550771713257, 'learning_rate': 2.1387696709585122e-05, 'epoch': 1.72}


 58%|█████▊    | 810/1398 [27:32<21:08,  2.16s/it]

{'loss': 3.179, 'grad_norm': 2.362187147140503, 'learning_rate': 2.1030042918454937e-05, 'epoch': 1.74}


 59%|█████▊    | 820/1398 [27:53<20:27,  2.12s/it]

{'loss': 3.119, 'grad_norm': 2.8569765090942383, 'learning_rate': 2.067238912732475e-05, 'epoch': 1.76}


 59%|█████▉    | 830/1398 [28:14<19:26,  2.05s/it]

{'loss': 3.1706, 'grad_norm': 2.390244960784912, 'learning_rate': 2.0314735336194566e-05, 'epoch': 1.78}


 60%|██████    | 840/1398 [28:35<19:21,  2.08s/it]

{'loss': 3.0978, 'grad_norm': 2.0463016033172607, 'learning_rate': 1.995708154506438e-05, 'epoch': 1.8}


 61%|██████    | 850/1398 [28:56<18:46,  2.06s/it]

{'loss': 3.121, 'grad_norm': 2.682586908340454, 'learning_rate': 1.9599427753934195e-05, 'epoch': 1.82}


 62%|██████▏   | 860/1398 [29:17<19:05,  2.13s/it]

{'loss': 3.118, 'grad_norm': 2.122577428817749, 'learning_rate': 1.9241773962804007e-05, 'epoch': 1.85}


 62%|██████▏   | 870/1398 [29:38<18:32,  2.11s/it]

{'loss': 3.0913, 'grad_norm': 2.3199470043182373, 'learning_rate': 1.888412017167382e-05, 'epoch': 1.87}


 63%|██████▎   | 880/1398 [30:00<18:51,  2.18s/it]

{'loss': 3.1115, 'grad_norm': 2.598256826400757, 'learning_rate': 1.8526466380543633e-05, 'epoch': 1.89}


 64%|██████▎   | 890/1398 [30:21<17:54,  2.12s/it]

{'loss': 3.0785, 'grad_norm': 2.379957437515259, 'learning_rate': 1.8168812589413447e-05, 'epoch': 1.91}


 64%|██████▍   | 900/1398 [30:42<17:29,  2.11s/it]

{'loss': 3.1451, 'grad_norm': 2.4322636127471924, 'learning_rate': 1.7811158798283262e-05, 'epoch': 1.93}


 65%|██████▌   | 910/1398 [31:04<17:33,  2.16s/it]

{'loss': 3.1344, 'grad_norm': 2.2428627014160156, 'learning_rate': 1.7453505007153077e-05, 'epoch': 1.95}


 66%|██████▌   | 920/1398 [31:25<16:29,  2.07s/it]

{'loss': 3.1415, 'grad_norm': 2.3197739124298096, 'learning_rate': 1.709585121602289e-05, 'epoch': 1.97}


 67%|██████▋   | 930/1398 [31:46<16:36,  2.13s/it]

{'loss': 3.2287, 'grad_norm': 2.355725049972534, 'learning_rate': 1.6738197424892706e-05, 'epoch': 2.0}


 67%|██████▋   | 940/1398 [32:08<16:52,  2.21s/it]

{'loss': 3.0498, 'grad_norm': 2.6425132751464844, 'learning_rate': 1.6380543633762517e-05, 'epoch': 2.02}


 68%|██████▊   | 950/1398 [32:30<16:24,  2.20s/it]

{'loss': 3.0768, 'grad_norm': 2.88859224319458, 'learning_rate': 1.6022889842632332e-05, 'epoch': 2.04}


 69%|██████▊   | 960/1398 [32:52<16:25,  2.25s/it]

{'loss': 3.1989, 'grad_norm': 2.6884937286376953, 'learning_rate': 1.5665236051502147e-05, 'epoch': 2.06}


 69%|██████▉   | 970/1398 [33:15<16:07,  2.26s/it]

{'loss': 3.2652, 'grad_norm': 2.342850923538208, 'learning_rate': 1.530758226037196e-05, 'epoch': 2.08}


 70%|███████   | 980/1398 [33:38<15:45,  2.26s/it]

{'loss': 3.0576, 'grad_norm': 2.644789934158325, 'learning_rate': 1.4949928469241776e-05, 'epoch': 2.1}


 71%|███████   | 990/1398 [34:00<14:55,  2.19s/it]

{'loss': 3.1854, 'grad_norm': 2.8885679244995117, 'learning_rate': 1.4592274678111589e-05, 'epoch': 2.12}


 72%|███████▏  | 1000/1398 [34:22<14:22,  2.17s/it]

{'loss': 3.0226, 'grad_norm': 2.399096965789795, 'learning_rate': 1.4234620886981404e-05, 'epoch': 2.15}


 72%|███████▏  | 1010/1398 [34:47<15:22,  2.38s/it]

{'loss': 3.1131, 'grad_norm': 2.5744569301605225, 'learning_rate': 1.3876967095851218e-05, 'epoch': 2.17}


 73%|███████▎  | 1020/1398 [35:09<13:51,  2.20s/it]

{'loss': 3.1358, 'grad_norm': 2.3127846717834473, 'learning_rate': 1.3519313304721031e-05, 'epoch': 2.19}


 74%|███████▎  | 1030/1398 [35:31<13:34,  2.21s/it]

{'loss': 3.2224, 'grad_norm': 2.5278961658477783, 'learning_rate': 1.3161659513590846e-05, 'epoch': 2.21}


 74%|███████▍  | 1040/1398 [35:54<13:57,  2.34s/it]

{'loss': 3.1462, 'grad_norm': 2.222583532333374, 'learning_rate': 1.2804005722460657e-05, 'epoch': 2.23}


 75%|███████▌  | 1050/1398 [36:17<13:00,  2.24s/it]

{'loss': 3.1815, 'grad_norm': 2.4959163665771484, 'learning_rate': 1.2446351931330473e-05, 'epoch': 2.25}


 76%|███████▌  | 1060/1398 [36:39<12:44,  2.26s/it]

{'loss': 3.1417, 'grad_norm': 2.204700231552124, 'learning_rate': 1.2088698140200286e-05, 'epoch': 2.27}


 77%|███████▋  | 1070/1398 [37:02<12:33,  2.30s/it]

{'loss': 3.0349, 'grad_norm': 2.7366626262664795, 'learning_rate': 1.1731044349070101e-05, 'epoch': 2.3}


 77%|███████▋  | 1080/1398 [37:24<11:39,  2.20s/it]

{'loss': 3.2354, 'grad_norm': 2.5602877140045166, 'learning_rate': 1.1373390557939914e-05, 'epoch': 2.32}


 78%|███████▊  | 1090/1398 [37:46<11:13,  2.19s/it]

{'loss': 3.1134, 'grad_norm': 2.7212233543395996, 'learning_rate': 1.1015736766809729e-05, 'epoch': 2.34}


 79%|███████▊  | 1100/1398 [38:08<11:05,  2.23s/it]

{'loss': 3.0169, 'grad_norm': 2.462503433227539, 'learning_rate': 1.0658082975679542e-05, 'epoch': 2.36}


 79%|███████▉  | 1110/1398 [38:30<10:37,  2.21s/it]

{'loss': 3.1614, 'grad_norm': 2.5701096057891846, 'learning_rate': 1.0300429184549356e-05, 'epoch': 2.38}


 80%|████████  | 1120/1398 [38:52<09:57,  2.15s/it]

{'loss': 3.1178, 'grad_norm': 2.233114004135132, 'learning_rate': 9.942775393419171e-06, 'epoch': 2.4}


 81%|████████  | 1130/1398 [39:15<10:02,  2.25s/it]

{'loss': 3.0708, 'grad_norm': 2.3588638305664062, 'learning_rate': 9.585121602288986e-06, 'epoch': 2.42}


 82%|████████▏ | 1140/1398 [39:37<09:39,  2.24s/it]

{'loss': 3.1581, 'grad_norm': 2.6464743614196777, 'learning_rate': 9.227467811158799e-06, 'epoch': 2.45}


 82%|████████▏ | 1150/1398 [39:58<08:46,  2.12s/it]

{'loss': 3.0475, 'grad_norm': 2.3537940979003906, 'learning_rate': 8.869814020028613e-06, 'epoch': 2.47}


 83%|████████▎ | 1160/1398 [40:19<08:19,  2.10s/it]

{'loss': 3.175, 'grad_norm': 2.6668131351470947, 'learning_rate': 8.512160228898426e-06, 'epoch': 2.49}


 84%|████████▎ | 1170/1398 [40:40<08:04,  2.13s/it]

{'loss': 3.0607, 'grad_norm': 2.256556272506714, 'learning_rate': 8.154506437768241e-06, 'epoch': 2.51}


 84%|████████▍ | 1180/1398 [41:02<07:55,  2.18s/it]

{'loss': 3.1269, 'grad_norm': 2.6444849967956543, 'learning_rate': 7.796852646638054e-06, 'epoch': 2.53}


 85%|████████▌ | 1190/1398 [41:24<07:29,  2.16s/it]

{'loss': 3.087, 'grad_norm': 1.9795804023742676, 'learning_rate': 7.439198855507869e-06, 'epoch': 2.55}


 86%|████████▌ | 1200/1398 [41:45<06:52,  2.08s/it]

{'loss': 3.2155, 'grad_norm': 2.4713921546936035, 'learning_rate': 7.0815450643776825e-06, 'epoch': 2.58}


 87%|████████▋ | 1210/1398 [42:06<06:34,  2.10s/it]

{'loss': 3.1978, 'grad_norm': 2.1823360919952393, 'learning_rate': 6.723891273247497e-06, 'epoch': 2.6}


 87%|████████▋ | 1220/1398 [42:28<06:32,  2.20s/it]

{'loss': 3.0672, 'grad_norm': 2.0741870403289795, 'learning_rate': 6.366237482117311e-06, 'epoch': 2.62}


 88%|████████▊ | 1230/1398 [42:49<05:58,  2.14s/it]

{'loss': 3.0045, 'grad_norm': 2.71649432182312, 'learning_rate': 6.008583690987125e-06, 'epoch': 2.64}


 89%|████████▊ | 1240/1398 [43:10<05:26,  2.07s/it]

{'loss': 3.1535, 'grad_norm': 2.3460934162139893, 'learning_rate': 5.650929899856939e-06, 'epoch': 2.66}


 89%|████████▉ | 1250/1398 [43:31<05:12,  2.11s/it]

{'loss': 3.1984, 'grad_norm': 2.495405673980713, 'learning_rate': 5.293276108726753e-06, 'epoch': 2.68}


 90%|█████████ | 1260/1398 [43:53<05:16,  2.30s/it]

{'loss': 3.1467, 'grad_norm': 2.270580291748047, 'learning_rate': 4.935622317596566e-06, 'epoch': 2.7}


 91%|█████████ | 1270/1398 [44:16<04:44,  2.23s/it]

{'loss': 3.1889, 'grad_norm': 2.3568432331085205, 'learning_rate': 4.577968526466381e-06, 'epoch': 2.73}


 92%|█████████▏| 1280/1398 [44:39<04:32,  2.31s/it]

{'loss': 3.1303, 'grad_norm': 2.387542724609375, 'learning_rate': 4.220314735336195e-06, 'epoch': 2.75}


 92%|█████████▏| 1290/1398 [45:02<04:08,  2.30s/it]

{'loss': 3.221, 'grad_norm': 2.877251148223877, 'learning_rate': 3.8626609442060095e-06, 'epoch': 2.77}


 93%|█████████▎| 1300/1398 [45:26<04:02,  2.47s/it]

{'loss': 3.1494, 'grad_norm': 2.247800827026367, 'learning_rate': 3.5050071530758225e-06, 'epoch': 2.79}


 94%|█████████▎| 1310/1398 [45:48<03:17,  2.24s/it]

{'loss': 3.0435, 'grad_norm': 2.1629819869995117, 'learning_rate': 3.1473533619456367e-06, 'epoch': 2.81}


 94%|█████████▍| 1320/1398 [46:10<02:45,  2.12s/it]

{'loss': 3.2012, 'grad_norm': 2.6417076587677, 'learning_rate': 2.789699570815451e-06, 'epoch': 2.83}


 95%|█████████▌| 1330/1398 [46:32<02:34,  2.27s/it]

{'loss': 3.1854, 'grad_norm': 2.615497589111328, 'learning_rate': 2.432045779685265e-06, 'epoch': 2.85}


 96%|█████████▌| 1340/1398 [46:54<02:10,  2.24s/it]

{'loss': 3.0997, 'grad_norm': 2.732083320617676, 'learning_rate': 2.074391988555079e-06, 'epoch': 2.88}


 97%|█████████▋| 1350/1398 [47:16<01:42,  2.14s/it]

{'loss': 3.1797, 'grad_norm': 2.1206674575805664, 'learning_rate': 1.7167381974248929e-06, 'epoch': 2.9}


 97%|█████████▋| 1360/1398 [47:38<01:24,  2.21s/it]

{'loss': 3.1774, 'grad_norm': 1.8321667909622192, 'learning_rate': 1.3590844062947067e-06, 'epoch': 2.92}


 98%|█████████▊| 1370/1398 [48:00<01:03,  2.26s/it]

{'loss': 3.1093, 'grad_norm': 2.323324680328369, 'learning_rate': 1.0014306151645207e-06, 'epoch': 2.94}


 99%|█████████▊| 1380/1398 [48:22<00:38,  2.15s/it]

{'loss': 3.1723, 'grad_norm': 2.257047176361084, 'learning_rate': 6.437768240343348e-07, 'epoch': 2.96}


 99%|█████████▉| 1390/1398 [48:44<00:18,  2.27s/it]

{'loss': 3.1544, 'grad_norm': 2.168179512023926, 'learning_rate': 2.861230329041488e-07, 'epoch': 2.98}


100%|██████████| 1398/1398 [49:02<00:00,  2.10s/it]

{'train_runtime': 2942.4181, 'train_samples_per_second': 3.798, 'train_steps_per_second': 0.475, 'train_loss': 3.2285721237226275, 'epoch': 3.0}





TrainOutput(global_step=1398, training_loss=3.2285721237226275, metrics={'train_runtime': 2942.4181, 'train_samples_per_second': 3.798, 'train_steps_per_second': 0.475, 'total_flos': 371952702259200.0, 'train_loss': 3.2285721237226275, 'epoch': 3.0})

In [21]:
# Save the fine-tuned model and tokenizer
lora_model.save_pretrained("./lora-finetuned-model")
tokenizer.save_pretrained("./lora-finetuned-model")


('./lora-finetuned-model\\tokenizer_config.json',
 './lora-finetuned-model\\special_tokens_map.json',
 './lora-finetuned-model\\vocab.json',
 './lora-finetuned-model\\merges.txt',
 './lora-finetuned-model\\added_tokens.json',
 './lora-finetuned-model\\tokenizer.json')