# Unsloth

In [1]:
print("Jupyter running!")

Jupyter running!


Logging

In [2]:
import sys
import logging

nblog = open("nb.log", "a+")
sys.stdout.echo = nblog
sys.stderr.echo = nblog

get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [3]:
from transformers import TrainerCallback

class PrinterCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            print(logs)

In [4]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    "unsloth/gemma-2-2b-bnb-4bit",             # New small Gemma model!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "../gemma-2-9b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.11.5: Fast Gemma2 patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 30.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.5 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


# Load datasets

In [6]:
import pandas as pd
from datasets import Dataset
df = pd.read_csv("data/webnlg_train.csv", header=0, delimiter='\t')

In [7]:
texts = []
for index, row in df.iterrows():
    output = row['Triples']
    p_input = row["NL"]
    triples = output.split('<T>')
    triples = ["<T>" + triple for triple in filter(None, triples)]

    if len(triples) > 1:
        # 1 context
        if len(triples) >= 2:
            context = "".join(triples[0])
            sliced_output = "".join(triples[1:])
            texts.append([context, p_input, sliced_output])

        # 2 context
        if len(triples) >= 3:
            context = "".join(triples[0:2])
            sliced_output = "".join(triples[2:])
            texts.append([context, p_input, sliced_output])

        # 3 context
        if len(triples) >= 4:
            context = "".join(triples[0:3])
            sliced_output = "".join(triples[3:])
            texts.append([context, p_input, sliced_output])

    # Original
    texts.append(["None", p_input, output])

# df
df = pd.DataFrame(texts, columns = ["context", "input", "output"])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,context,input,output
0,<T>United_States<R>ethnicGroup<S>Native_Americ...,The United States includes the Native American...,"<T>Auburn,_Alabama<R>country<S>United_States<T..."
1,<T>Agnes_Kant<R>nationality<S>Netherlands,The leader of the Netherlands is Mark Rutte wh...,<T>Netherlands<R>leader<S>Mark_Rutte<T>Agnes_K...
2,<T>Alan_Martin_(footballer)<R>club<S>Hamilton_...,The footballer Alan Martin has played for Hami...,<T>Alan_Martin_(footballer)<R>club<S>Aldershot...
3,,Abilene Regional Airport serves the city of Ab...,<T>Abilene_Regional_Airport<R>cityServed<S>Abi...
4,<T>14th_New_Jersey_Volunteer_Infantry_Monument...,The 14th New Jersey Volunteer Infantry Monumen...,<T>14th_New_Jersey_Volunteer_Infantry_Monument...
...,...,...,...
97131,,"The birth place of Allan Shivers is Lufkin, Te...","<T>Allan_Shivers<R>birthPlace<S>Lufkin,_Texas"
97132,<T>103_Hera<R>discoverer<S>James_Craig_Watson<...,"James Craig Watson, who discovered 103 Hera, o...",<T>James_Craig_Watson<R>deathCause<S>Peritonitis
97133,<T>Andrew_Rayel<R>associatedBand/associatedMus...,Trance musician Andrew Rayel is associated wit...,<T>Andrew_Rayel<R>associatedBand/associatedMus...
97134,,Ahmet Ertegun's genre is rhythm and blues.,<T>Ahmet_Ertegun<R>genre<S>Rhythm_and_blues


In [8]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['context', 'input', 'output'],
    num_rows: 97136
})

In [9]:
from trl import DataCollatorForCompletionOnlyLM
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Extract the most confident information in the sentence below as much as possible, and express the relationships in RDF Triples that complement the existing RDF triples. Do not use information from common sense.
### Existing RDF triples:
{}
### Input:
{}
### Response:
<unused0>{}"""
EOS_TOKEN = tokenizer.eos_token # must add!

# Format dataset
def format_prompts(examples):
    context = examples["context"]
    inputs  = examples["input"]
    outputs = examples["output"]
    texts = []
    for _context, _input, _output in zip(context, inputs, outputs):
        _context = _context.replace("<T>", "<unused0>") # change to single token
        _context = _context.replace("<R>", "<unused1>") # change to single token
        _context = _context.replace("<S>", "<unused2>") # change to single token
        
        _output = _output[3:] # remove the first <T> because the prompt contains that
        _output = _output.replace("<T>", "<unused0>") # change to single token
        _output = _output.replace("<R>", "<unused1>") # change to single token
        _output = _output.replace("<S>", "<unused2>") # change to single token
        
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(_context, _input, _output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }


# response_template = "### Response:"
# collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [10]:
dataset = dataset.map(format_prompts, batched=True)
dataset

Map:   0%|          | 0/97136 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'input', 'output', 'text'],
    num_rows: 97136
})

In [11]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    dataset_text_field="text",
    # data_collator=collator, # Response only gen
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 1000,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/97136 [00:00<?, ? examples/s]

In [12]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100 80GB PCIe. Max memory = 30.0 GB.
6.576 GB of memory reserved.


In [13]:
log_callback = PrinterCallback()
trainer.add_callback(log_callback)

In [None]:
trainer_stats = trainer.train()

In [20]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

3528.5272 seconds used for training.
58.81 minutes used for training.
Peak reserved memory = 4.053 GB.
Peak reserved memory for training = 0.088 GB.
Peak reserved memory % of max memory = 67.55 %.
Peak reserved memory for training % of max memory = 1.467 %.


In [21]:
# # alpaca_prompt = Copied from above
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     prompt.format(
#         "", # input
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")
# 
# outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# tokenizer.batch_decode(outputs)

In [22]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    prompt.format(
        "<unused0>Turn_Me_On_(album)<unused1>runtime<unused1>35.1",
        """Turn Me On is a 35.1 minute long album produced by Wharton Tiers that was followed by the album entitled Take it Off.""", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 500)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Extract the most confident information in the sentence below as much as possible, and express the relationships in RDF Triples that complement the existing RDF triples. Do not use information from common sense.
### Existing RDF triples:
<unused0>Turn_Me_On_(album)<unused1>runtime<unused1>35.1
### Input:
Turn Me On is a 35.1 minute long album produced by Wharton Tiers that was followed by the album entitled Take it Off.
### Response:
<unused0>Turn_Me_On_(album)<unused1>followedBy<unused2>Take_It_Off_(album)<unused0>Take_It_Off_(album)<unused1>producer<unused2>Wharton_Tier<eos>


## Saving

In [23]:
model.save_pretrained("graph_9b_full") # Local saving
tokenizer.save_pretrained("graph_9b_full")

('graph_2b_1000step/tokenizer_config.json',
 'graph_2b_1000step/special_tokens_map.json',
 'graph_2b_1000step/tokenizer.model',
 'graph_2b_1000step/added_tokens.json',
 'graph_2b_1000step/tokenizer.json')

## Inference

In [21]:
# if True:
#     from unsloth import FastLanguageModel
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name = "graph_2b_test", # YOUR MODEL YOU USED FOR TRAINING
#         max_seq_length = 2048,
#         dtype = None,
#         load_in_4bit = True,
#     )
#     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# 
# # alpaca_prompt = You MUST copy from above!
# 
# prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
# ### Instruction:
# Using only the information provided in the sentence, extract the relationship between subjects in the sentence below, and express the relationships in RDF Triples.
# ### Input:
# {}
# ### Response:
# {}"""
# 
# inputs = tokenizer(
# [
#     prompt.format(
#         "The AP CSA course is a college-level course developed by Collegeboard, it teaches high-school students the basic of programming in Java.", # input
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")
# 
# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100)

==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 2060. Max memory: 6.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 