In [1]:
import torch, gc, os
import peft
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM, PeftConfig, prepare_model_for_kbit_training
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import TrainingArguments
from trl import SFTTrainer

In [2]:
gc.collect()
torch.cuda.empty_cache()

In [3]:
max_seq_length = 1024
dtype = None # None for auto detection
load_in_4bit = True # Use 4bit quantization to reduce memory usage.

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
MODEL = "meta-llama/Meta-Llama-3-8B"
device_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(MODEL, quantization_config=bnb_config, low_cpu_mem_usage=True, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Get the data to train
dataset = load_dataset("wikimedia/wikipedia", "20231101.hi", split = "train",)
# Use only 10% of data
dataset = dataset.train_test_split(train_size = 0.1)["train"]

In [7]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    bias="none",
    task_type="CAUSAL_LM"
)

training_args = TrainingArguments(
    output_dir="outputs",
    #auto_find_batch_size=True, # Find a correct bvatch size that fits the size of Data.
    learning_rate= 5e-5, # Higher learning rate than full fine-tuning.
    max_steps = 240,
    warmup_steps = 20,
    # do_eval=True,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    seed=3047,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=1,
    fp16=False,
    bf16=True,
    lr_scheduler_type="linear",
    save_steps=20,
    # eval_steps=20,
    logging_steps=20,
    # gradient_accumulation_steps=4,
    # num_train_epochs=5
)

trainer = SFTTrainer(
    model = model,
    args=training_args,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_num_proc = 2,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False
)
tokenizer.pad_token = tokenizer.eos_token

Map (num_proc=2):   0%|          | 0/16309 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.41 GB.
10.588 GB of memory reserved.


In [9]:
trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
20,1.4068
40,1.5234
60,1.3438
80,1.0916
100,1.2673
120,1.2589
140,1.1596
160,1.2666
180,1.2444
200,1.5582




In [10]:
trainer_stats

TrainOutput(global_step=240, training_loss=1.3082326253255208, metrics={'train_runtime': 141.3068, 'train_samples_per_second': 1.698, 'train_steps_per_second': 1.698, 'total_flos': 4292421559050240.0, 'train_loss': 1.3082326253255208, 'epoch': 0.014715801091421914})

In [12]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

141.3068 seconds used for training.
2.36 minutes used for training.
Peak reserved memory = 10.588 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 45.229 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [11]:
import os
working_dir = './'

output_directory = os.path.join(working_dir, "peft_pre-outputs_llama3")
# peft_model_path = os.path.join(output_directory, f"lora_model")

In [15]:
trainer.model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

('./peft_pre-outputs_llama3/tokenizer_config.json',
 './peft_pre-outputs_llama3/special_tokens_map.json',
 './peft_pre-outputs_llama3/tokenizer.json')

# Inference

In [9]:
bnb_config2 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [17]:

base_with_adapters_model = AutoPeftModelForCausalLM.from_pretrained(output_directory)
tokenizer = AutoTokenizer.from_pretrained(output_directory)

# loaded_model = AutoPeftModelForCausalLM.from_pretrained(
#                                         output_directory,
#                                         torch_dtype=torch.bfloat16,
#                                         #load_in_4bit=True,
#                                         quantization_config=bnb_config2,
#                                         device_map = 'cuda')


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# Merge with base model, Note that this creates a full f32 model!
model = base_with_adapters_model.merge_and_unload()


In [12]:
dir = os.path.join(working_dir, "peft_pre-outputs_llama3_full")

In [20]:
model.save_pretrained(dir)
tokenizer.save_pretrained(dir)

('./peft_pre-outputs_llama3_full/tokenizer_config.json',
 './peft_pre-outputs_llama3_full/special_tokens_map.json',
 './peft_pre-outputs_llama3_full/tokenizer.json')

In [21]:
#this function returns the outputs from the model received, and inputs.
def get_outputs(model, inputs, max_new_tokens=100):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5, #Avoid repetition.
        early_stopping=False, #The model can stop before reach the max_length
        eos_token_id=tokenizer.eos_token_id,
    )
    return outputs

In [22]:
gc.collect()
torch.cuda.empty_cache()

In [13]:
# Load the merged model
device_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(dir, torch_dtype=dtype, quantization_config=bnb_config2, device_map=device_map)
tokenizer = AutoTokenizer.from_pretrained(dir)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
input_sentences = tokenizer("फाइबोनैचि अनुक्रम जारी रखें: 1, 1, 2, 3, 5, 8,", return_tensors="pt").to('cuda')
foundational_outputs_sentence = get_outputs(model, input_sentences, max_new_tokens=128)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['फाइबोनैचि अनुक्रम जारी रखें: 1, 1, 2, 3, 5, 8, … (A000045). यह एक संख्या के लिए उसके, और उससे पहले आनेवाली, दो मूल गुणकों पर आधरित है।\n\nगणितज्ञ फ़िलिप व्हीलेर नामांकन प्रणीत करताहुए कहा, "मैं इस आइडियाको, इसकेकोई खास रूप नहीं बनान्देख रहहूँ।"\n\nइन सम्बन्धनविशेष\nपृष्ठ शुरू करें\nअधिकृत वर्णिमापद्-सूची\nश्र']


# FineTuning

In [25]:
gc.collect()
torch.cuda.empty_cache()

In [4]:
from datasets import load_dataset
alpaca_dataset = load_dataset("FreedomIntelligence/alpaca-gpt4-hindi", split = "train")

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
print(alpaca_dataset[0])

{'conversations': [{'from': 'human', 'value': 'कुछ एक रीसाइक्लिंग अभियान के लिए एक नारा सुझाव दें।\n'}, {'from': 'gpt', 'value': '1. "ग्रीन भविष्य के लिए एक साथ: कम करें, पुन: उपयोग करें, रीसाइकल करें।"\n2. "एक बेहतर कल के लिए आज ही रीसाइकल करें।"\n3. "अपने कचरे को खजाना बनाएं - रीसाइकल करें!"\n4. "जीवन के चक्र के लिए रीसाइकल करें।"\n5. "संसाधन बचाएं, अधिक रीसाइकल करें।"'}], 'id': '23712'}


In [6]:
alpaca_hindi_prompt="""नीचे एक निर्देश है जो किसी कार्य का वर्णन करता है, जिसे एक इनपुट के साथ जोड़ा गया है जो आगे का संदर्भ प्रदान करता है। ऐसा उत्तर लिखें जो अनुरोध को उचित रूप से पूरा करता हो।

### निर्देश:
{}

### इनपुट:
{}

### प्रतिक्रिया:
{}"""

In [14]:
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(conversations):
    texts = []
    conversations = conversations["conversations"]
    for convo in conversations:
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        # Careful Aya Dataset does not have an input!
        text = alpaca_hindi_prompt.format(convo[0]["value"], "", convo[1]["value"]) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched = True,)

In [15]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=128,
    lora_alpha=32,
    lora_dropout=0,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    bias="none",
    task_type="CAUSAL_LM"
)

# base_model_with_new_adapter = get_peft_model(model, peft_config)

training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    output_dir="outputs_instrc",
    # auto_find_batch_size=True, # Find a correct batch size that fits the size of Data.
    learning_rate= 5e-5, # Higher learning rate than full fine-tuning.
    max_steps = 240,
    warmup_steps = 20,
    optim = "adamw_8bit",
    weight_decay = 0.00,
    seed=3047,
    fp16=False,
    bf16=True,
    lr_scheduler_type="linear",
    save_steps=20,
    # eval_steps=20,
    logging_steps=20
    # num_train_epochs=5
)

trainer_instrc = SFTTrainer(
    model = model,
    args=training_args,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_num_proc = 8,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    #packing=True
)
tokenizer.pad_token = tokenizer.eos_token

Map (num_proc=8):   0%|          | 0/49969 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [16]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.41 GB.
10.549 GB of memory reserved.


In [17]:
trainer_instrc_stats = trainer_instrc.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,1.2206
40,0.8945
60,0.8765
80,0.8637
100,0.8179
120,0.8056
140,0.8316
160,0.8198
180,0.8155
200,0.7955




In [18]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_instrc_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_instrc_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1012.1207 seconds used for training.
16.87 minutes used for training.
Peak reserved memory = 15.244 GB.
Peak reserved memory for training = 4.695 GB.
Peak reserved memory % of max memory = 65.117 %.
Peak reserved memory for training % of max memory = 20.056 %.


In [19]:
trainer_instrc.model.save_pretrained("peft_instrc_unopt")
tokenizer.save_pretrained("peft_instrc_unopt")



('peft_instrc_unopt/tokenizer_config.json',
 'peft_instrc_unopt/special_tokens_map.json',
 'peft_instrc_unopt/tokenizer.json')

# Inference on Instructions

In [20]:
gc.collect()
torch.cuda.empty_cache()

In [21]:
directory = os.path.join(working_dir, "peft_instrc_unopt")

In [22]:
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
                                        directory,
                                        #torch_dtype=torch.bfloat16,
                                        #is_trainable=False,
                                        #load_in_4bit=True,
                                        quantization_config=bnb_config2,
                                        device_map = 'cuda')
tokenizer = AutoTokenizer.from_pretrained(directory)


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
inputs = tokenizer(
[
    alpaca_hindi_prompt.format(
        # "Describe the planet Earth extensively.", # instruction
        # "कुछ एक रीसाइक्लिंग अभियान के लिए एक नारा सुझाव दें",
        "पृथ्वी ग्रह का विस्तृत वर्णन करें",
        "", # input
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")

In [31]:
outputs = loaded_model.generate(**inputs, max_new_tokens = 256, use_cache = True, repetition_penalty = 0.9, temperature = 0.7, early_stopping=False, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['नीचे एक निर्देश है जो किसी कार्य का वर्णन करता है, जिसे एक इनपुट के साथ जोड़ा गया है जो आगे का संदर्भ प्रदान करता है। ऐसा उत्तर लिखें जो अनुरोध को उचित रूप से पूरा करता हो।\n\n### निर्देश:\nपृथ्वी ग्रह का विस्तृत वर्णन करें\n\n### इनपुट:\n\n\n### प्रतिक्रिया:\nपृथ्वी एक ग्रह है जो सूर्य की कक्षा में सूर्य के करीब है। यह सूर्य से 149.6 मिलियन किलोमीटर दूर है, जो सूर्य की कक्षा का एक तृतीयांश है। पृथ्वी का व्यास 12,742 किलोमीटर है, जो सूर्य से दूरी के त्रुटि के अनुरूप है। पृथ्वी के वार्षिक दो चक्र होते हैं जो कि 12 घंटे के दो चक्र से मिलकर बने होते हैं। पृथ्वी का दो घंटे का एक दिन होता है जो कि सूर्य के पास जाने के लिए एक सूर्य की कक्षा में एक चक्र के लिए लिया जाता है। पृथ्वी का एक साल होता है जो कि पृथ्वी क']
