In [1]:


!pip install transformers==4.41.2
!pip install peft==0.10.0
!pip install accelerate==0.30.1


Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
Found existing installation: peft 0.10.0
Uninstalling peft-0.10.0:
  Successfully uninstalled peft-0.10.0
Found existing installation: accelerate 0.30.1
Uninstalling accelerate-0.30.1:
  Successfully uninstalled accelerate-0.30.1
Collecting transformers==4.41.2
  Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Installing collected packages: transformers
Successfully installed transformers-4.41.2
Collecting peft==0.10.0
  Using cached peft-0.10.0-py3-none-any.whl (199 kB)
Collecting accelerate>=0.21.0
  Using cached accelerate-1.12.0-py3-none-any.whl (380 kB)
Installing collected packages: accelerate, peft
Successfully installed accelerate-1.12.0 peft-0.10.0
Collecting accelerate==0.30.1
  Using cached accelerate-0.30.1-py3-none-any.whl (302 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installa

In [2]:
import transformers
import peft
import accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM

transformers.__version__, peft.__version__, accelerate.__version__


  from .autonotebook import tqdm as notebook_tqdm


('4.41.2', '0.10.0', '0.30.1')

In [3]:
BASE_MODEL = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)


In [4]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="/home/moni/Desktop/ai-knowledge-assistant/backend/training/training_data.jsonl",
    split="train"
)

dataset


Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 90
})

In [5]:
def format_example(example):
    text = f"Instruction: {example['instruction']}\n"
    if example['input']:
        text += f"Input: {example['input']}\n"
    text += f"Answer: {example['output']}"
    return {"text": text}

dataset = dataset.map(format_example)
dataset[0]


{'instruction': 'How many paid leaves am I entitled to in a year?',
 'input': '',
 'output': 'Employees are entitled to 24 paid leaves per year, which includes 18 casual/sick leaves and 6 privilege leaves. Casual leaves can be taken with prior approval from your reporting manager, while privilege leaves require at least 7 days advance notice. Unused casual leaves lapse at year-end, but up to 15 privilege leaves can be carried forward to the next calendar year.',
 'text': 'Instruction: How many paid leaves am I entitled to in a year?\nAnswer: Employees are entitled to 24 paid leaves per year, which includes 18 casual/sick leaves and 6 privilege leaves. Casual leaves can be taken with prior approval from your reporting manager, while privilege leaves require at least 7 days advance notice. Unused casual leaves lapse at year-end, but up to 15 privilege leaves can be carried forward to the next calendar year.'}

In [6]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


In [7]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.remove_columns(["text"])
tokenized = tokenized.with_format("torch")
tokenized


Map: 100%|██████████| 90/90 [00:00<00:00, 901.04 examples/s]


Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 90
})

In [8]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],   # GPT-2 attention
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()




trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.23643136409814364


In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="lora_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=False,
)


In [17]:
from transformers import DataCollatorForLanguageModeling, Trainer

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,

)


In [18]:
trainer.train()


 15%|█▌        | 10/66 [02:12<11:25, 12.25s/it]

{'loss': 4.0084, 'grad_norm': 0.4147673547267914, 'learning_rate': 0.00016969696969696972, 'epoch': 0.44}


 30%|███       | 20/66 [04:09<09:01, 11.77s/it]

{'loss': 3.946, 'grad_norm': 0.6604386568069458, 'learning_rate': 0.0001393939393939394, 'epoch': 0.89}


 45%|████▌     | 30/66 [06:05<06:58, 11.63s/it]

{'loss': 3.8491, 'grad_norm': 0.5183394551277161, 'learning_rate': 0.00010909090909090909, 'epoch': 1.33}


 61%|██████    | 40/66 [08:05<05:15, 12.14s/it]

{'loss': 3.773, 'grad_norm': 0.6252368688583374, 'learning_rate': 7.878787878787879e-05, 'epoch': 1.78}


 76%|███████▌  | 50/66 [10:03<03:08, 11.80s/it]

{'loss': 3.7203, 'grad_norm': 0.7110324501991272, 'learning_rate': 4.848484848484849e-05, 'epoch': 2.22}


 91%|█████████ | 60/66 [12:01<01:09, 11.63s/it]

{'loss': 3.7346, 'grad_norm': 0.7061592936515808, 'learning_rate': 1.8181818181818182e-05, 'epoch': 2.67}


100%|██████████| 66/66 [13:11<00:00, 12.00s/it]

{'train_runtime': 791.8907, 'train_samples_per_second': 0.341, 'train_steps_per_second': 0.083, 'train_loss': 3.8265199372262666, 'epoch': 2.93}





TrainOutput(global_step=66, training_loss=3.8265199372262666, metrics={'train_runtime': 791.8907, 'train_samples_per_second': 0.341, 'train_steps_per_second': 0.083, 'total_flos': 34610136219648.0, 'train_loss': 3.8265199372262666, 'epoch': 2.9333333333333336})

In [20]:
trainer.save_model("training/lora_output")
tokenizer.save_pretrained("training/lora_output")
model.save_pretrained("training/lora_output")
