## Loading Model & Dataset

In [1]:
#from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM
import torch
import datasets

In [2]:
from transformers import(
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer
)
from peft import LoraConfig,PeftModel,get_peft_model,prepare_model_for_kbit_training
from trl import SFTTrainer
import os ,wandb
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [3]:
model_path = "gemma2-2b"
dataset_path = 'databricks-dolly-15k'

In [34]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float32,
    bnb_4bit_use_double_quant = False,
)

In [39]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map={"":0},
    torch_dtype=torch.float32
)

model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [40]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["context"]
    outputs      = examples["response"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + tokenizer.eos_token
        #if len(text) <= 10000:
        texts.append(text)
    return { "text" : texts, }
pass

In [41]:
dataset = datasets.load_dataset(dataset_path,split = 'train')
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [42]:
def filter_long_samples(example, max_length):
    return len(example['text']) <= max_length


filtered_dataset = dataset.filter(lambda example: filter_long_samples(example, max_length=3000))

In [9]:
len(filtered_dataset['text'])

14423

## Wandb Setting

In [10]:
wandb.login(key = '')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
run = wandb.init(
    project = 'Gemma-Lora',
    job_type = 'training'
)

In [12]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _,param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"Params need train:{trainable_params} || Total Params:{all_param} || percentage:{100*(trainable_params/all_param)}%"
    )

## Lora & Hyperparamters

In [13]:
#Lora Config
peft_config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
    target_modules = ["q_proj","v_proj"]
)

In [14]:
#Hyperparamters
training_arguments = TrainingArguments(
    output_dir = '/root/autodl-tmp/output',
    num_train_epochs = 1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 2,
    optim = "paged_adamw_8bit",
    save_steps = 600,
    logging_steps = 200,
    save_total_limit = 3,
    learning_rate = 2e-4,
    weight_decay = 0.001,
    fp16 = False,
    bf16 = False,
    max_grad_norm = 0.3,
    max_steps = -1,
    warmup_ratio = 0.05,
    group_by_length = True,
    lr_scheduler_type = "linear",
    report_to = "wandb",
)

## Model Hyperparam

In [15]:
trainer = SFTTrainer(
    model = model,
    train_dataset = filtered_dataset,
    peft_config = peft_config,
    tokenizer = tokenizer,
    args = training_arguments,
    packing = False,
    max_seq_length=32768,
    dataset_text_field="text"
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla V100-PCIE-32GB. Max memory = 31.739 GB.
4.457 GB of memory reserved.


In [17]:
trainer.train()

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
200,3.2083
400,3.379
600,4.064
800,2.1635
1000,2.5714
1200,3.5577
1400,3.4603
1600,5.2616
1800,2.0894
2000,3.1437




TrainOutput(global_step=3606, training_loss=3.0195249509824627, metrics={'train_runtime': 4860.0294, 'train_samples_per_second': 2.968, 'train_steps_per_second': 0.742, 'total_flos': 3.218520668603904e+16, 'train_loss': 3.0195249509824627, 'epoch': 1.0})

In [18]:
model = get_peft_model(model,peft_config)

print_trainable_parameters(model)

Params need train:1597440 || Total Params:1603801344 || percentage:0.09960335835708092%


## Saving Model

In [19]:
trainer.model.save_pretrained('lora-model')

wandb.finish()
model.config.use_cache = True
model.eval()

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███
train/grad_norm,█ ▂ ▁
train/learning_rate,██▇▇▆▆▆▅▅▄▄▃▃▃▂▂▁▁
train/loss,▄▄▆▂▃▅▅█▂▄▁▃▂▃▄▂▄▅

0,1
total_flos,3.218520668603904e+16
train/epoch,1.0
train/global_step,3606.0
train/grad_norm,
train/learning_rate,0.0
train/loss,3.5886
train_loss,3.01952
train_runtime,4860.0294
train_samples_per_second,2.968
train_steps_per_second,0.742


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2SdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
      

## Inference

In [36]:
def stream(user_input):
    device = "cuda:0"
    system_prompt ='Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.'
    B_INST,R_INST = "###Instruction:\n","###Response:\n"
    prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{R_INST}"
    inputs = tokenizer([prompt],return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
    _ = model.generate(**inputs,streamer = streamer,max_new_tokens=128)

In [44]:
#Base Model
stream("Give me some info about the Great Wall")

The Great Wall of China is a series of fortifications built across the northern borders of China to protect the country from invasions. The Great Wall was built over a period of 2,000 years, with the first sections built in the 3rd century BC. The Great Wall is a series of fortifications built across the northern borders of China to protect the country from invasions. The Great Wall was built over a period of 2,000 years, with the first sections built in the 3rd century BC. The Great Wall is a series of fortifications built across the northern borders of China to protect the country from invasions.


In [52]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map={"":0},
    torch_dtype=torch.float32,
    low_cpu_mem_usage = True,
    return_dict = True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [53]:
fintuned_model = PeftModel.from_pretrained(model,"lora-model")

In [54]:
merged_model = fintuned_model.merge_and_unload()

In [55]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

In [60]:
def new_stream(user_input):
    device = "cuda:0"
    system_prompt ='Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.'
    B_INST,R_INST = "###Instruction:\n","###Response:\n"
    prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{R_INST}"
    inputs = tokenizer([prompt],return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer,skip_prompt=True,skip_special_tokens=True)
    _ =merged_model.generate(**inputs,streamer = streamer,max_new_tokens=128)

In [None]:
new_stream("Give me some info about the Great Wall")