In [2]:
import time
from IPython.display import display, Javascript

In [3]:
def prevent_disconnect():
    display(Javascript('''
        function preventColabDisconnect() {
            setInterval(() => {
                document.querySelector("colab-toolbar-button#connect").click();
            }, 30000);
        }
        preventColabDisconnect();
    '''))

prevent_disconnect()

<IPython.core.display.Javascript object>

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
output_dir = "/content/drive/MyDrive/samrudha_llama_finetuned"

In [None]:
import torch
import os
import json
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from dotenv import load_dotenv
load_dotenv()

In [8]:
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=os.getenv("HF_TOKEN"))
tokenizer.pad_token = tokenizer.eos_token

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_8bit=True,
    token=os.getenv("HF_TOKEN")
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

In [12]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,587,520 || all params: 3,217,337,344 || trainable%: 0.1426


In [13]:
dataset_path = "/content/drive/MyDrive/agriculture_dataset.json"

In [14]:
dataset = load_dataset("json", data_files=dataset_path)

In [15]:
def preprocess_function(examples):
    texts = [
        prompt + tokenizer.eos_token + response
        for prompt, response in zip(examples["prompt"], examples["response"])
    ]
    tokenized = tokenizer(
        texts, truncation=True, padding="max_length", max_length=512
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

In [16]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
output_dir = "/content/drive/MyDrive/llama_finetuned"

In [18]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    fp16=True,
    eval_strategy="no",
    report_to="none"
)

In [19]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

In [21]:
trainer.train()

Step,Training Loss
10,3.7524
20,3.6563
30,3.5442
40,3.3504
50,3.2227
60,3.0467
70,2.8101
80,2.6754
90,2.4701
100,2.3398


TrainOutput(global_step=375, training_loss=1.8686954358418784, metrics={'train_runtime': 1503.563, 'train_samples_per_second': 1.995, 'train_steps_per_second': 0.249, 'total_flos': 2.6019854548992e+16, 'train_loss': 1.8686954358418784, 'epoch': 3.0})

In [22]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/llama_finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/llama_finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/llama_finetuned/tokenizer.json')