Install required packages

In [1]:
!pip install transformers datasets peft accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3


Import model and dataset 

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Load GPT-2 model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

SUBSET_SIZE = 5000
# Load dataset (e.g., wikitext-103-v1)
dataset = load_dataset("databricks/databricks-dolly-15k", split="train").select(range(SUBSET_SIZE))

config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

First, let's check model's performance without any training 

In [4]:
input_text = f"Who wrote harry potter?",
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Who wrote harry potter? A. Rowling B. Rowling C. Rowling D. Rowling
Answer:
A

Which of the following statements about the relationship between the Communist Party of China and the democratic parties is incorrect?
A. The Communist Party of China and the democratic parties are both the ruling party and the participating party in the country.
B. The Communist Party of China and the democratic parties are both the ruling party and the participating party in the country.
C. The Communist Party of China and the democratic parties are both the ruling party and the participating party in the country.
D. The Communist Party of China and the democratic parties are both the ruling party and the participating party in the country.
Answer:
B

The main reason for the formation of the 'Three Gorges Dam' is ____
A. The terrain is high and the river is long
B. The terrain is low and the river is short
C. The terrain is high and the river is short
D. The terrain is low and the river is long
Answer:
A



We can see the model's performance is not that good. It's not giving any precise or accurate answer. It's showing too many garbage values.

In [5]:
def tokenize_function(examples):
    # Combine instruction, context, and response for each example in the batch
    combined_text = [
        f"{instruction} \n{context} \n {response}"
        for instruction, context, response in zip(examples["instruction"], examples["context"], examples["response"])
    ]
    # Tokenize the combined text
    tokenized_output = tokenizer(combined_text, truncation=True, padding="max_length", max_length=512)
    # Add labels for causal language modeling
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["instruction", "context", "response", "category"])

# Split the tokenized dataset into train and eval sets
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
print(train_dataset.shape)
print(eval_dataset.shape)

(4000, 3)
(1000, 3)


In [7]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [8]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
peft_config = LoraConfig(
    r=8,  # Rank of the low-rank matrices
    lora_alpha=16,  # Scaling factor
    target_modules=["k_proj","v_proj"],  # Target attention layers in GPT-2
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none",  # No bias for LoRA
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

In [9]:
print(model.print_trainable_parameters())


trainable params: 393,216 || all params: 494,425,984 || trainable%: 0.0795
None


In [10]:
model

PeftModel(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): Linear(in_features=896, out_features=896, bias=True)
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=128, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=128, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): Mod

In [13]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="content/qwen2.5-lora-dolly",
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir="logs",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    fp16=True,  # Use mixed precision for faster training
    report_to = "none",
    remove_unused_columns=False,  # Keep all columns in the dataset

)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss
500,0.6959,No log
1000,0.6817,No log
1500,0.6844,No log
2000,0.6456,No log
2500,0.7116,No log
3000,0.689,No log


TrainOutput(global_step=3000, training_loss=0.9405069402058919, metrics={'train_runtime': 2919.259, 'train_samples_per_second': 4.111, 'train_steps_per_second': 1.028, 'total_flos': 1.3208051515392e+16, 'train_loss': 0.9405069402058919, 'epoch': 3.0})

In [19]:
# cont = 0
# for example in dataset:
#     if cont >=5:
#       break
#     print(example)
#     cont += 1

Now, as the training is done, let's try the same question as before. 

In [18]:
input_text = f"Who wrote harry potter?",
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Who wrote harry potter? 
 
 J.K. Rowling


We can see, now the models shows clear and concise answer. With just 3 epochs and 5000 data, the model is giving a good enough answer.

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
repo_id = "mustafij48/qwen2.5-0.5B_on_databricks_dolly_4k_datasize_3_epoch"
model.push_to_hub(repo_id=repo_id)
tokenizer.push_to_hub(repo_id=repo_id)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/mustafij48/qwen2.5-0.5B_on_databricks_dolly_4k_datasize_3_epoch/commit/a8b17c9e83be0a796521bf71b08115a4f4b78880', commit_message='Upload tokenizer', commit_description='', oid='a8b17c9e83be0a796521bf71b08115a4f4b78880', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mustafij48/qwen2.5-0.5B_on_databricks_dolly_4k_datasize_3_epoch', endpoint='https://huggingface.co', repo_type='model', repo_id='mustafij48/qwen2.5-0.5B_on_databricks_dolly_4k_datasize_3_epoch'), pr_revision=None, pr_num=None)