In [1]:
!pip install transformers datasets peft bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.2


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [3]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [4]:
dataset = load_dataset("tatsu-lab/alpaca", split="train[:1%]")
def tokenize(example):
    text = f"### Instruction:\n{example['instruction']}\n### Response:\n"

    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256
    )

    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [5]:
dataset = dataset.map(tokenize)
dataset.set_format("torch")


Map:   0%|          | 0/520 [00:00<?, ? examples/s]

In [6]:
def tokenize(example):
    text = f"### Instruction:\n{example['instruction']}\n### Response:\n"

    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256
    )

    tokens["labels"] = tokens["input_ids"].copy()
    return tokens




In [7]:
dataset = dataset.map(tokenize)
dataset.set_format("torch")


Map:   0%|          | 0/520 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./lora-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    logging_steps=10,
    fp16=True,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()


Step,Training Loss
10,16.945872
20,15.167786
30,12.673177
40,9.661459
50,7.460767
60,6.430657


TrainOutput(global_step=65, training_loss=10.968617541973407, metrics={'train_runtime': 96.4031, 'train_samples_per_second': 5.394, 'train_steps_per_second': 0.674, 'total_flos': 827185409556480.0, 'train_loss': 10.968617541973407, 'epoch': 1.0})

In [9]:
model.save_pretrained("adapter_A")
tokenizer.save_pretrained("adapter_A")

('adapter_A/tokenizer_config.json',
 'adapter_A/chat_template.jinja',
 'adapter_A/tokenizer.json')

In [10]:
from datasets import Dataset

data2 = {
    "instruction": [
        "Summarize: Artificial intelligence is the simulation of human intelligence in machines.",
        "Summarize: Deep learning uses neural networks with many layers.",
        "Summarize: Machine learning allows systems to learn from data."
    ]
}

dataset2 = Dataset.from_dict(data2)

def tokenize2(example):
    tokens = tokenizer(
        example["instruction"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

dataset2 = dataset2.map(tokenize2)
dataset2.set_format("torch")


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model = get_peft_model(model, lora_config)


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [12]:
model.save_pretrained("adapter_B")

In [13]:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [14]:
model_A = PeftModel.from_pretrained(base_model, "adapter_A")

In [15]:
from peft import PeftModel
from transformers import AutoModelForCausalLM

# load base model again
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# load adapter A
model_A = PeftModel.from_pretrained(base_model, "adapter_A")

# load adapter B
model_B = PeftModel.from_pretrained(base_model, "adapter_B")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]



In [16]:
state_dict_A = model_A.state_dict()
state_dict_B = model_B.state_dict()


In [17]:
merged_state_dict = {}

for key in state_dict_A:
    if key in state_dict_B and "lora" in key:
        merged_state_dict[key] = (state_dict_A[key] + state_dict_B[key]) / 2
    else:
        merged_state_dict[key] = state_dict_A[key]


In [18]:
model_A.load_state_dict(merged_state_dict, strict=False)


_IncompatibleKeys(missing_keys=[], unexpected_keys=['base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.absmax', 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_map', 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.nested_absmax', 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.nested_quant_map', 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'base_model.model.model.layers.0.self_attn.k_proj.weight.absmax', 'base_model.model.model.layers.0.self_attn.k_proj.weight.quant_map', 'base_model.model.model.layers.0.self_attn.k_proj.weight.nested_absmax', 'base_model.model.model.layers.0.self_attn.k_proj.weight.nested_quant_map', 'base_model.model.model.layers.0.self_attn.k_proj.weight.quant_state.bitsandbytes__nf4', 'base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight.absmax', 'base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight.qu

In [19]:
model_A.load_state_dict(merged_state_dict, strict=False)
model_A.save_pretrained("adapter_merged")

In [20]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [21]:
from peft import PeftModel

model_A = PeftModel.from_pretrained(base_model, "adapter_A")
model_A.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [22]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model_B = PeftModel.from_pretrained(base_model, "adapter_B")
model_B.eval()


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [23]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model_A = PeftModel.from_pretrained(base_model, "adapter_A")
model_A.eval()

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [24]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model_B = PeftModel.from_pretrained(base_model, "adapter_B")
model_B.eval()

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [25]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [26]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model_merged = PeftModel.from_pretrained(base_model, "adapter_merged")
model_merged.eval()

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [27]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [28]:
prompt_A = "Explain artificial intelligence."
prompt_B = "Summarize: Deep learning uses neural networks."


In [29]:
def generate_output(model, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=120,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [30]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [31]:
from peft import PeftModel

# Load base model once
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Adapter A
model_A = PeftModel.from_pretrained(base_model, "adapter_A")
model_A.eval()

# Adapter B
model_B = PeftModel.from_pretrained(base_model, "adapter_B")
model_B.eval()

# Merged Adapter
model_merged = PeftModel.from_pretrained(base_model, "adapter_merged")
model_merged.eval()

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [33]:
print("Adapter A → Task A")
print(generate_output(model_A, prompt_A))

print("\nAdapter B → Task A")
print(generate_output(model_B, prompt_A))

print("\nMerged → Task A")
print(generate_output(model_merged, prompt_A))


Adapter A → Task A
Explain artificial intelligence.

Adapter B → Task A
Explain artificial intelligence. How does it work and what are its potential applications in medicine, transportation, and other fields?

Merged → Task A
Explain artificial intelligence. How is it different from traditional data analysis?
