In [5]:
import torch, gc, os
import peft
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM, PeftConfig, prepare_model_for_kbit_training
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import TrainingArguments
from trl import SFTTrainer

In [6]:
gc.collect()
torch.cuda.empty_cache()

In [7]:
max_seq_length = 1024
dtype = None # None for auto detection
load_in_4bit = True # Use 4bit quantization to reduce memory usage.

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
MODEL = "meta-llama/Meta-Llama-3-8B"
device_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(MODEL, quantization_config=bnb_config, low_cpu_mem_usage=True, device_map="auto", torch_dtype=torch.bfloat16, use_flash_attention_2=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Get the data to train
dataset = load_dataset("wikimedia/wikipedia", "20231101.hi", split = "train",)
# Use only 10% of data
dataset = dataset.train_test_split(train_size = 0.1)["train"]

In [7]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    bias="none",
    task_type="CAUSAL_LM"
)

training_args = TrainingArguments(
    output_dir="outputs",
    #auto_find_batch_size=True, # Find a correct bvatch size that fits the size of Data.
    learning_rate= 5e-5, # Higher learning rate than full fine-tuning.
    max_steps = 240,
    # do_eval=True,
    optim = "adamw_8bit",
    seed=3047,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=1,
    fp16=True,
    lr_scheduler_type="linear",
    save_steps=20,
    # eval_steps=20,
    logging_steps=20,
    # gradient_accumulation_steps=4,
    # num_train_epochs=5
)

trainer = SFTTrainer(
    model = model,
    args=training_args,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_num_proc = 2,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=True
)
tokenizer.pad_token = tokenizer.eos_token

Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.41 GB.
9.344 GB of memory reserved.


In [9]:
trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


Step,Training Loss
20,1.5816
40,1.5826
60,1.4434
80,1.4746
100,1.497
120,1.5005
140,1.4908
160,1.4265
180,1.4677
200,1.5721




In [10]:
trainer_stats

TrainOutput(global_step=240, training_loss=1.5002679506937662, metrics={'train_runtime': 237.1245, 'train_samples_per_second': 1.012, 'train_steps_per_second': 1.012, 'total_flos': 1.156124195684352e+16, 'train_loss': 1.5002679506937662, 'epoch': 0.018393623543838136})

In [9]:
import os
working_dir = './'

output_directory = os.path.join(working_dir, "peft_pre-outputs_llama3")
# peft_model_path = os.path.join(output_directory, f"lora_model")

In [12]:
trainer.model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)



('./peft_pre-outputs_llama3/tokenizer_config.json',
 './peft_pre-outputs_llama3/special_tokens_map.json',
 './peft_pre-outputs_llama3/tokenizer.json')

# Inference

In [14]:
bnb_config2 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [14]:

base_with_adapters_model = AutoPeftModelForCausalLM.from_pretrained(output_directory)
tokenizer = AutoTokenizer.from_pretrained(output_directory)

# loaded_model = AutoPeftModelForCausalLM.from_pretrained(
#                                         output_directory,
#                                         torch_dtype=torch.bfloat16,
#                                         #load_in_4bit=True,
#                                         quantization_config=bnb_config2,
#                                         device_map = 'cuda')


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Merge with base model
model = base_with_adapters_model.merge_and_unload()


In [10]:
dir = os.path.join(working_dir, "peft_pre-outputs_llama3_full")

In [None]:
model.save_pretrained(dir)
tokenizer.save_pretrained(dir)

In [11]:
#this function returns the outputs from the model received, and inputs.
def get_outputs(model, inputs, max_new_tokens=100):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5, #Avoid repetition.
        early_stopping=False, #The model can stop before reach the max_length
        eos_token_id=tokenizer.eos_token_id,
    )
    return outputs

In [15]:
# Load the merged model
device_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(dir, torch_dtype=dtype, quantization_config=bnb_config2, device_map=device_map)
tokenizer = AutoTokenizer.from_pretrained(dir)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
input_sentences = tokenizer("फाइबोनैचि अनुक्रम जारी रखें: 1, 1, 2, 3, 5, 8,", return_tensors="pt").to('cuda')
foundational_outputs_sentence = get_outputs(model, input_sentences, max_new_tokens=128)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['फाइबोनैचि अनुक्रम जारी रखें: 1, 1, 2, 3, 5, 8,... और इस तरह। फ़िरॉनाचिस संख्या के लिए एक सरल परिभाषित करो:\nfibs(n) = fibs( n - 4 ) +fib (n-3)\nप्रयोग करनेवालों में, यह भूलना, चिंताजनक है । बेशक, हमेशा\nएक प्राथमिक रूप निर्दिष्ट करते, यद्यपि, यथासंभव आसान\nहै, ताकि आप इसके सम्बन्ध कुछ अंश देख सकें।\n\nउदाहरण:\n\n``']


# FineTuning

In [17]:
from datasets import load_dataset
alpaca_dataset = load_dataset("FreedomIntelligence/alpaca-gpt4-hindi", split = "train")

Repo card metadata block was not found. Setting CardData to empty.


In [18]:
print(alpaca_dataset[0])

{'conversations': [{'from': 'human', 'value': 'कुछ एक रीसाइक्लिंग अभियान के लिए एक नारा सुझाव दें।\n'}, {'from': 'gpt', 'value': '1. "ग्रीन भविष्य के लिए एक साथ: कम करें, पुन: उपयोग करें, रीसाइकल करें।"\n2. "एक बेहतर कल के लिए आज ही रीसाइकल करें।"\n3. "अपने कचरे को खजाना बनाएं - रीसाइकल करें!"\n4. "जीवन के चक्र के लिए रीसाइकल करें।"\n5. "संसाधन बचाएं, अधिक रीसाइकल करें।"'}], 'id': '23712'}


In [19]:
alpaca_hindi_prompt="""नीचे एक निर्देश है जो किसी कार्य का वर्णन करता है, जिसे एक इनपुट के साथ जोड़ा गया है जो आगे का संदर्भ प्रदान करता है। ऐसा उत्तर लिखें जो अनुरोध को उचित रूप से पूरा करता हो।

### निर्देश:
{}

### इनपुट:
{}

### प्रतिक्रिया:
{}"""

In [20]:
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(conversations):
    texts = []
    conversations = conversations["conversations"]
    for convo in conversations:
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        # Careful Aya Dataset does not have an input!
        text = alpaca_hindi_prompt.format(convo[0]["value"], "", convo[1]["value"]) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched = True,)

In [21]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=128,
    lora_alpha=32,
    lora_dropout=0,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    bias="none",
    task_type="CAUSAL_LM"
)

# base_model_with_new_adapter = get_peft_model(model, peft_config)

training_args = TrainingArguments(
    output_dir="outputs",
    auto_find_batch_size=True, # Find a correct batch size that fits the size of Data.
    learning_rate= 5e-5, # Higher learning rate than full fine-tuning.
    max_steps = 240,
    optim = "adamw_8bit",
    seed=3047,
    fp16=True,
    lr_scheduler_type="linear",
    save_steps=20,
    # eval_steps=20,
    logging_steps=20
    # num_train_epochs=5
)

trainer_instrc = SFTTrainer(
    model = model,
    args=training_args,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_num_proc = 2,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    #packing=True
)
tokenizer.pad_token = tokenizer.eos_token

Map (num_proc=2):   0%|          | 0/49969 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [23]:
trainer_instrc_stats = trainer_instrc.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,1.0589
40,0.8644
60,0.8863
80,0.8349
100,0.8625
120,0.8721
140,0.8474
160,0.887
180,0.8271
200,0.8334




In [24]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

NameError: name 'start_gpu_memory' is not defined

In [28]:
trainer_instrc.model.save_pretrained("peft_instrc_unopt")
tokenizer.save_pretrained("peft_instrc_unopt")



('peft_instrc_unopt/tokenizer_config.json',
 'peft_instrc_unopt/special_tokens_map.json',
 'peft_instrc_unopt/tokenizer.json')

# Inference on Instructions

In [29]:
directory = os.path.join(working_dir, "peft_instrc_unopt")

In [30]:
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
                                        directory,
                                        #torch_dtype=torch.bfloat16,
                                        #is_trainable=False,
                                        #load_in_4bit=True,
                                        quantization_config=bnb_config2,
                                        device_map = 'cuda')
tokenizer = AutoTokenizer.from_pretrained(directory)


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
inputs = tokenizer(
[
    alpaca_hindi_prompt.format(
        # "Describe the planet Earth extensively.", # instruction
        # "कुछ एक रीसाइक्लिंग अभियान के लिए एक नारा सुझाव दें",
        "पृथ्वी ग्रह का विस्तृत वर्णन करें",
        "", # input
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")

In [34]:
# input_sentences = tokenizer("फाइबोनैचि अनुक्रम जारी रखें: 1, 1, 2, 3, 5, 8,", return_tensors="pt").to('cuda')
# foundational_outputs_sentence = get_outputs(loaded_model, input_sentences, max_new_tokens=50)

outputs = loaded_model.generate(**inputs, max_new_tokens = 256, use_cache = True, repetition_penalty = 1.5, temperature = 0.7, early_stopping=False, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['नीचे एक निर्देश है जो किसी कार्य का वर्णन करता है, जिसे एक इनपुट के साथ जोड़ा गया है जो आगे का संदर्भ प्रदान करता है। ऐसा उत्तर लिखें जो अनुरोध को उचित रूप से पूरा करता हो।\n\n### निर्देश:\nपृथ्वी ग्रह का विस्तृत वर्णन करें\n\n### इनपुट:\n\n\n### प्रतिक्रिया:\nएक अस्थायी, परिवर्तनशील और बहुत अध्ययन योग्यतापूण, हम सभीज्ञात चंद्रमा, मंगलग्लोबल तंतुओं (जैसेकि भूमि, जल एवम् एअर) दिए गए हर वस्तविकतामय आकाशीय बिंदु। यह सबसेबहतरक्षततम धर्मों-विशेषकर शीतोष्ण कटिबंधी-सब्जाओंकासंस्करण करनेवालेरेगियन्धानमुक्तकृषिविहारकौच्चागुप्तमध्यक्षणभूतधरादुनिन्नगरिमाधानकेन्मकड़क्रस्वर्गजन्मन्हीं बनाए रखनेलोकाभिगमनसमुच्छेदसन्दर्शिभावएंवपरिणामकोटिप्रयोजना-इन सब तक पहुँचनेसंभवतः उन महानताईँओनलेकर सक्षकुण्डली। \n\nइस टेप्स्टीरियलिज़्मकुंटोनेटीविधिसमूहमेजूदागराज्यमोहारताकैरवराहीनखिलफसलेभोजांउष्ठाबुधाननिदूषीकोर्णहर']
