In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import numpy as np
import torch


In [2]:
from unsloth import FastLanguageModel
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.999 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




In [3]:
baseline_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
baseline_model = FastLanguageModel.for_inference(baseline_model)

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.999 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.9.post3 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [10]:
from unsloth.chat_templates import get_chat_template

dataset = load_dataset("sentence-transformers/eli5", split = "train")

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    questions = examples['question']
    answers = examples['answer']
    convos = []
    for question, answer in zip(questions, answers):
        convos.append([{"role": "user", "content": question}, {"role": "assistant", "content": answer}])
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }





In [11]:
from unsloth.chat_templates import standardize_sharegpt
# dataset = standardize_sharegpt(dataset['train'])
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [12]:
dataset = dataset.train_test_split(test_size=0.2)

In [13]:
dataset['test'][2000]

{'question': 'How were certain Greek letters (like delta and pi) chosen to represent what they do in math?',
 'answer': "Usually it's simply the first letter of the word the symbol is standing in for (in either Greek or English, but often they're the same since so much science and maths terminology comes from Greek). So delta is δ because it is the first letter of difference (Greek: *διαφορά*). Pi was originally referred to as π/δ, i.e. periphery (*περιφέρεια*, meaning circumference) over diameter (*διάμετρος*), which is exactly what it is, and later shortened to just π.",
 'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow were certain Greek letters (like delta and pi) chosen to represent what they do in math?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nUsually it's simply the first letter of the word the symbol is standing in 

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 0.01,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        save_steps=0.1,
        save_total_limit=5,
        load_best_model_at_end=True,
        eval_steps=0.1,
        evaluation_strategy="steps",
        resume_from_checkpoint="./outputs/checkpoint-3663/"
    ),
)



In [17]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/260380 [00:00<?, ? examples/s]

Map:   0%|          | 0/65095 [00:00<?, ? examples/s]

In [18]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAre humans considered tetraploid (4n) during anaphase and telophase?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe -ploidy classification for a cell would be for its normal state. Humans have a variety of cells that operate in different states of polyploidy. Most human cells are diploid, gametes are haploid, some liver, heart, and bone cells are polyploidic. Humans are considered to be diploids, because that is what most of our cells are most of the time. A change in a single cell doesn't change that. Especially when tripoidy and tetraploidy exists as real syndromes in humans.<|eot_id|>"

In [12]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[0]["labels"]])

"                                                     \n\nThe -ploidy classification for a cell would be for its normal state. Humans have a variety of cells that operate in different states of polyploidy. Most human cells are diploid, gametes are haploid, some liver, heart, and bone cells are polyploidic. Humans are considered to be diploids, because that is what most of our cells are most of the time. A change in a single cell doesn't change that. Especially when tripoidy and tetraploidy exists as real syndromes in humans.<|eot_id|>"

In [13]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 260,380 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 8
\        /    Total batch size = 64 | Total steps = 4,068
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss,Validation Loss
407,2.7385,2.756239
814,2.7129,2.725481
1221,2.6901,2.711261
1628,2.6671,2.700244
2035,2.678,2.694318
2442,2.6669,2.696275
2849,2.6754,2.691411
3256,2.6635,2.686543
3663,2.6527,2.682074


In [15]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

def stream_answer(question, model=model):
    messages = [
        {"role": "user", "content": question},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    
    
    answer = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 512,
                       use_cache = True)
    return answer

for question, test_answer in zip(dataset['test'][:100]['question'], dataset['test'][:100]['answer']):
    print("Question :", question)
    print("Answer: \n", test_answer)
    print("Baseline: \n")
    answer = stream_answer(question, baseline_model)
    print("Finetune: \n")
    answer = stream_answer(question)
    print("="*10)

Question : Why is the "Black Death" mentioned so often in western history but not Chinese history even though it originated there?
Answer: 
 I have an unpublished paper on this subject that sheds some light. I will summarize:  We have some interesting demographic data from the early 14th century in China showing the outbreak of various epidemics. These were very significant, and were certainly contributing factors to the demise of the Yuen dynasty. I say various because it is unclear from the documents whether the outbreaks were the same disease. Many of the surviving documents are brief, reporting only deaths.  So, we cannot say definitively whether the black death existed in China or not, but there was/were disease(s) at the right time, and it/they knocked out a significant chunk of the population.  EDIT: I realize this is ambiguous - the paper is not mine, my thesis advisor from grad school wrote it.
Baseline: 

The term "Black Death" is often mentioned in Western history, but not i

In [45]:
dataset['test'][:10]

{'question': ['Why is the "Black Death" mentioned so often in western history but not Chinese history even though it originated there?',
  'How historically accurate is the show The Last Kingdom on Netflix?',
  'If there was a lack of food supply in the world, could we survive of of just vitamins and a high concentration of calories? Or do we HAVE to eat in order to live?',
  'If our DNA varies from person to person, how do we know how closely human DNA resembles that of orangutans?',
  'Why would a company spend millions on a long Superbowl commercial, only to leak it online a week before the Superbowl?',
  "Question about Popes and alliances in 1500's Europe...",
  '[20th Century Russia] I\'m about done with my first read of "War and Peace", and I\'m curious how the book was regarded by the Soviets.',
  'Why are advertisements for colognes and perfumes so strange?',
  'Friday Free-for-All | April 07, 2017',
  'Do lie detectors work?'],
 'answer': ['I have an unpublished paper on this

In [15]:
model, to = FastLanguageModel.from_pretrained("./outputs/checkpoint-3663/")

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.999 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




In [17]:
model[0].save_pretrained_gguf("eli5", tokenizer, quantization_method = "q8_0")

make: Entering directory '/home/khanh/capstone/llama.cpp'
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -DGGML_USE_AMX  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML

100%|███████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 17.14it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at eli5 into q8_0 GGUF format.
The output location will be ./eli5/unsloth.Q8_0.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: eli5
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> Q8_0, shape = {2048, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {2

  gb_found = re.match("([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
  mb_found = re.match("([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
  f"   \\\   /|    [0] Installing llama.cpp will take 3 minutes.\n"\
  f"O^O/ \_/ \\    [1] Converting HF to GGUF 16bits will take 3 minutes.\n"\
  f"\        /    [2] Converting GGUF 16bits to {quantization_method} will take 10 minutes each.\n"\


KeyError: '"name"'