In [2]:
!pip install -q transformers datasets peft accelerate bitsandbytes

In [5]:
import random
import torch

from datasets import load_dataset, Dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model

In [6]:
print("CUDA:", torch.cuda.is_available())

CUDA: True


In [7]:
dolly = load_dataset("databricks/databricks-dolly-15k", split="train")
print("Total Dolly samples:", len(dolly))

README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Total Dolly samples: 15011


In [8]:
print(dolly[0].keys())
print("\n--- SAMPLE ---")
print(dolly[0])

dict_keys(['instruction', 'context', 'response', 'category'])

--- SAMPLE ---
{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}


In [9]:
def format_dolly(example):
    instruction = example["instruction"].strip()
    context = example["context"]
    response = example["response"].strip()

    text = f"### Instruction:\n{instruction}\n\n"

    if context and context.strip():
        text += f"### Input:\n{context.strip()}\n\n"

    text += f"### Response:\n{response}"
    return {"text": text}

In [10]:
processed = [format_dolly(ex) for ex in dolly]

random.shuffle(processed)
processed = processed[:8000]   # still ideal

train_dataset = Dataset.from_list(processed)
print(train_dataset)

Dataset({
    features: ['text'],
    num_rows: 8000
})


In [11]:
print(train_dataset[0]["text"])

### Instruction:
Identify which instrument is string or percussion: Daf, Bandura

### Response:
Bandura is string, Daf is percussion.


In [12]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [13]:
def tokenize(batch):
    tokens = tokenizer(
        batch["text"],
        truncation=True,
        max_length=256,
        padding="max_length"
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_ds = train_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"]
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [14]:
print(tokenized_ds[0].keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [15]:
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 1024)

In [18]:
lora_config = LoraConfig(
    r=16,                  # ↑ from 8
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_attn", "c_proj"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,325,376 || all params: 359,148,544 || trainable%: 1.2043


In [22]:
training_args = TrainingArguments(
    output_dir="./gpt2-medium-dolly-lora",
    per_device_train_batch_size=4,     # ↓
    gradient_accumulation_steps=4,     # ↑ effective batch = 16
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    label_names=["labels"]
)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds
)

In [24]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.5484
100,1.4315
150,1.3193
200,1.3837
250,1.3113
300,1.3576
350,1.3188
400,1.3519
450,1.313
500,1.3732


TrainOutput(global_step=1500, training_loss=1.3567216873168946, metrics={'train_runtime': 2177.5205, 'train_samples_per_second': 11.022, 'train_steps_per_second': 0.689, 'total_flos': 1.13038589952e+16, 'train_loss': 1.3567216873168946, 'epoch': 3.0})

In [25]:
model.save_pretrained("./gpt2-medium-dolly-lora")
tokenizer.save_pretrained("./gpt2-medium-dolly-lora")

('./gpt2-medium-dolly-lora/tokenizer_config.json',
 './gpt2-medium-dolly-lora/special_tokens_map.json',
 './gpt2-medium-dolly-lora/vocab.json',
 './gpt2-medium-dolly-lora/merges.txt',
 './gpt2-medium-dolly-lora/added_tokens.json')

In [26]:
def generate_answer(model, tokenizer, prompt, max_new_tokens=150):
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.4,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [31]:
prompt = """### Instruction:
Who is Elon musk?

### Response:
"""

print(generate_answer(model, tokenizer, prompt))

### Instruction:
Who is Elon musk?

### Response:
Elon Musk (born August 25, 1954) is a British entrepreneur and inventor. He founded the Tesla Motors Company in California and has since become an investor in several other companies including SolarCity, SpaceX , and SpaceX.


In [39]:
prompt = """### Instruction:
I have cold, What should i do?

### Response:
"""

print(generate_answer(model, tokenizer, prompt))

### Instruction:
I have cold, What should i do?

### Response:
You can take a shower and cool down with a hot cup of tea.  You can also use a warm bath to help you feel better.


In [36]:
!zip -r gpt2_medium_dolly_lora_full.zip ./gpt2-medium-dolly-lora

  adding: gpt2-medium-dolly-lora/ (stored 0%)
  adding: gpt2-medium-dolly-lora/README.md (deflated 66%)
  adding: gpt2-medium-dolly-lora/adapter_model.safetensors (deflated 7%)
  adding: gpt2-medium-dolly-lora/adapter_config.json (deflated 56%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/ (stored 0%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/README.md (deflated 66%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/rng_state.pth (deflated 25%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/training_args.bin (deflated 52%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/adapter_model.safetensors (deflated 7%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/trainer_state.json (deflated 76%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/adapter_config.json (deflated 56%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/scaler.pt (deflated 60%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/optimizer.pt (deflated 9%)
  adding: gpt2-medium-dolly-lora/checkpoint-1000/sche