In [7]:
!pip install transformers peft bitsandbytes trl deepeval



In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig
from trl import SFTTrainer

In [10]:
# setup quantization config
compute_dtype = getattr(torch, 'float16')
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant= False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [11]:
from google.colab import userdata
userdata.get('Llama3')

'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'

In [13]:
# Ensure local directory does not exist
!rm -rf meta-llama/Meta-Llama-3-8B

# Load base model with access token
base_model_name = 'meta-llama/Meta-Llama-3-8B'
access_token = "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"
llama_3 = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    token=access_token,
    quantization_config=quant_config,
    device_map={"": 0},
)

# Load tokenizer with access token
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    token=access_token,
    trust_remote_code=True,
)

# Set padding token and side
tokenizer.pad_token = tokenizer.eos_token # this model doesnt requires separate padding token
tokenizer.padding_side = "right" #padding should be added to the end (right side) of the sequences

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
#load dataset
train_dataset_name = "mlabonne/guanaco-llama2-1k"
train_dataset = load_dataset(train_dataset_name, split = "train")

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
# load LoRA configuration for PEFT
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [16]:
# Set training argument
training_arguments = TrainingArguments(
    output_dir="./tuning_results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)

#set SFT Parameter
trainer = SFTTrainer(
    model=llama_3,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
trainer.train()

Step,Training Loss
25,1.2391
50,1.6746
75,1.1877
100,1.574
125,1.1064
150,1.522
175,1.3347
200,1.4302
225,1.235
250,1.3274



403 Forbidden: Authorization error..
Cannot access content at: https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
If you are trying to create or update content,make sure you have a token with the `write` role. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.

403 Forbidden: Authorization error..
Cannot access content at: https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
If you are trying to create or update content,make sure you have a token with the `write` role. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.

403 Forbidden: Authorization error..
Cannot access content at: https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
If you are trying to create or update content,make sure you have a token with the `write` role. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.

403 Forbidden: Authorization e

TrainOutput(global_step=500, training_loss=1.3434509239196777, metrics={'train_runtime': 322.4447, 'train_samples_per_second': 3.101, 'train_steps_per_second': 1.551, 'total_flos': 1.6556230510313472e+16, 'train_loss': 1.3434509239196777, 'epoch': 1.0})

In [18]:
torch.cuda.memory_summary()



In [19]:
new_model = "tuned_llama-3-8b"
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)




403 Forbidden: Authorization error..
Cannot access content at: https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
If you are trying to create or update content,make sure you have a token with the `write` role. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.


('tuned_llama-3-8b/tokenizer_config.json',
 'tuned_llama-3-8b/special_tokens_map.json',
 'tuned_llama-3-8b/tokenizer.json')

In [20]:
prompt = "what is a large language model?"
pipe = pipeline(
    "text-generation",
    model=llama_3,
    tokenizer= tokenizer,
    max_length = 200
)

result = pipe(f"[s][INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[s][INST] what is a large language model? [/INST] A large language model is a type of artificial intelligence that is trained to generate human-like text. These models are trained on vast amounts of text data, and they can generate text that is coherent and grammatically correct. Large language models are often used for tasks such as summarizing articles, generating marketing content, and answering customer questions. [/INST]
