In [None]:
# Prints a fixed message; presumably a quick check that the kernel is executing cells.
print("Jupyter running!")

In [None]:
import sys
import logging

# Open (or create) nb.log in append mode. The handle is kept in a notebook
# global and is never closed explicitly in the visible cells.
nblog = open("nb.log", "a+")
# NOTE(review): presumably relies on ipykernel's output streams honouring an
# `echo` attribute to mirror cell output into the file — confirm against the
# installed ipykernel version.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the first handler of the IPython application logger at the same file.
# Assumes that logger has at least one handler and that it exposes a
# `.stream` attribute — TODO confirm.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

In [None]:
from transformers import TrainerCallback

class PrinterCallback(TrainerCallback):
    """Trainer callback that prints each logging payload to stdout.

    The ``total_flos`` entry is left out of what is printed, and printing
    only happens when ``state.is_local_process_zero`` is truthy.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print ``logs`` without its ``total_flos`` entry.

        Args:
            args: Passed in by the trainer; not used here.
            state: Trainer state; only ``is_local_process_zero`` is read.
            control: Passed in by the trainer; not used here.
            logs: Mapping of logged values, or ``None`` when nothing was passed.
            **kwargs: Extra keyword arguments; ignored.
        """
        # `logs` defaults to None; calling .pop on it would raise AttributeError.
        if logs is None:
            return
        # Build a filtered copy rather than popping from the caller's dict, so
        # the mapping handed to this callback is left untouched.
        printable = {key: value for key, value in logs.items() if key != "total_flos"}
        if state.is_local_process_zero:
            print(printable)

In [1]:
from unsloth import FastLanguageModel
import torch
# Loading options passed to FastLanguageModel.from_pretrained later in this cell.
# max_seq_length is also passed to SFTTrainer when the trainer is built.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE: this list is informational only — none of the visible cells read it.
# The model that is actually loaded is the model_name passed to from_pretrained.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    "unsloth/gemma-2-2b-bnb-4bit",             # New small Gemma model!
] # More models at https://huggingface.co/unsloth

# Load the model and its tokenizer. model_name is a relative path (presumably a
# local checkpoint directory), so it resolves against the notebook's working
# directory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "../gemma-2-9b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 2060. Max memory: 6.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [2]:
# Attach LoRA adapters (rank r, scaling lora_alpha) to the listed projection modules.
# NOTE(review): `model` is rebound to the wrapped result, so re-running this
# cell alone would presumably pass an already-wrapped model back in — re-run
# the loading cell first.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [3]:
import pandas as pd
from datasets import Dataset
# The file is read as tab-separated despite its .csv extension; the first row
# supplies the column names. Later cells read the "question", "output" and
# "answer" columns.
df = pd.read_csv("data/vquanda_A_train.csv", header=0, delimiter='\t')
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,question,output,answer
0,Which universities are alma mater to Charles P...,"University of Chicago, Vanderbilt University",The alma maters of Charles Plosser are Univers...
1,Name the basketball player who played for Phoe...,Tyson Chandler,Tyson Chandler is the basketball player whose ...
2,Count the key people of the Clinton Foundation?,8,There are 8 key people in the Clinton Foundation.
3,What is the purpose of Maharashtra Chess Assoc...,Chess,The purpose of Maharashtra Chess Association i...
4,Tandem Computers is the subsidiary of which co...,Compaq,Tandem Computers is the subsidiary of Compaq.
...,...,...,...
3995,How many shows are aired on Comedy Central?,73,There are 73 television shows broadcasted by C...
3996,What are the software whose programming langua...,"GIMP, Leafpad, ROX Desktop, Sugar (software), ...",The software whose programming language is Mul...
3997,How many owners are there of lines starting at...,14,There are 14 owners of lines starting at the S...
3998,What is the draft team of Dale Mitchell (ice h...,Toronto Maple Leafs,Toronto Maple Leafs is the team that drafted D...


In [4]:
# Convert the pandas frame into a `datasets.Dataset`; later cells call .map on
# it and hand it to the trainer.
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['question', 'output', 'answer'],
    num_rows: 4000
})

In [5]:

# Prompt template with three positional `{}` slots, filled in order with the
# user question, the query result and the natural-language answer. The same
# template is reused at inference time with the last slot left empty.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Based on the context of the question, convert the query result for the question into a natural sounding contextual answer.
### User Question:
{}
### Query Result:
{}
### Natural Answer:
{}"""
# End-of-sequence token taken from the loaded tokenizer; format_prompts appends
# it to every training text.
EOS_TOKEN = tokenizer.eos_token # must add!

# Format dataset
def format_prompts(examples):
    """Build the training text for every row of a batched example mapping.

    Each text is the module-level ``prompt`` template filled with the row's
    question, query result and answer (in that order), followed by
    ``EOS_TOKEN``.

    Args:
        examples: Mapping holding parallel sequences under the keys
            "question", "output" and "answer".

    Returns:
        A dict with the single key "text" mapped to the list of formatted
        strings, one per row.
    """
    rows = zip(examples["question"], examples["output"], examples["answer"])
    # EOS_TOKEN must terminate every sample, otherwise generation never stops.
    return {
        "text": [
            prompt.format(question, query_result, answer) + EOS_TOKEN
            for question, query_result, answer in rows
        ],
    }

In [6]:
# Run format_prompts over the dataset in batches; its returned "text" key
# becomes a new column alongside the existing ones.
dataset = dataset.map(format_prompts, batched=True)
dataset

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'output', 'answer', 'text'],
    num_rows: 4000
})

In [7]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the formatted "text" column.
# Per-device batch 2 with 4 gradient-accumulation steps gives an effective
# batch of 8 per device. Exactly one of fp16/bf16 is enabled, chosen by
# is_bfloat16_supported().
# NOTE(review): `collator` in the commented-out data_collator line is not
# defined in any visible cell; define it before enabling that option.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    dataset_text_field="text",
    # data_collator=collator, # Response only gen
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 300,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/4000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
#@title Show current memory stats
# Baseline snapshot before training: properties of CUDA device 0, its total
# memory, and the peak memory reserved so far, both converted to units of
# 1024**3 bytes and rounded to three decimals. The later memory cell reads
# start_gpu_memory and max_memory.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 2060. Max memory = 6.0 GB.
2.57 GB of memory reserved.


In [None]:
# Register an instance of the PrinterCallback class defined in an earlier cell
# on the trainer.
# NOTE(review): re-running this cell presumably registers a second instance on
# the same trainer — confirm before re-executing.
log_callback = PrinterCallback()
trainer.add_callback(log_callback)

In [9]:
# Run fine-tuning. The result is kept because the final-stats cell reads
# trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 300
 "-____-"     Number of trainable parameters = 20,766,720


Step,Training Loss
1,3.0576
2,3.1463
3,2.9569
4,2.6696
5,2.3677
6,2.2037
7,1.5117
8,1.4438
9,1.1416
10,1.0237


In [10]:
#@title Show final memory and time stats
# Peak reserved memory after training, in units of 1024**3 bytes, and the part
# of it above the pre-training baseline (start_gpu_memory). Each is also
# expressed as a percentage of max_memory. Both baselines come from the
# earlier memory-stats cell.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
# Runtime is read from the metrics of the object returned by trainer.train().
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

869.2144 seconds used for training.
14.49 minutes used for training.
Peak reserved memory = 4.234 GB.
Peak reserved memory for training = 1.664 GB.
Peak reserved memory % of max memory = 70.567 %.
Peak reserved memory for training % of max memory = 27.733 %.


In [12]:
# Save model
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably writes only the LoRA adapter weights rather than merged full
# weights — confirm before relying on the "full" in the directory name.
model.save_pretrained("answer_9b_full") # Local saving
# Tokenizer files go to the same directory. As the cell's last expression, the
# return value is shown as the cell output.
tokenizer.save_pretrained("answer_9b_full")



('answer_2b_300step/tokenizer_config.json',
 'answer_2b_300step/special_tokens_map.json',
 'answer_2b_300step/tokenizer.model',
 'answer_2b_300step/added_tokens.json',
 'answer_2b_300step/tokenizer.json')

In [11]:
# Reuses the `prompt` template defined in an earlier cell.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Tokenize a single prompt whose answer slot is left empty so the model
# generates it, then move the tensors to the GPU.
inputs = tokenizer(
[
    prompt.format(
        "What did apple announce yesterday?",
        "iPhone_15, iPad", # query result
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
# The streamer is passed to generate() to emit text during generation; the
# returned token ids are discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 500)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Based on the context of the question, convert the query result for the question into a natural sounding contextual answer.
### User Question:
What did apple announce yesterday?
### Query Result:
iPhone_15, iPad
### Natural Answer:
The products announced by Apple yesterday are iPhone_15, iPad.<eos>
