## Completion finetuning using unsloth

This notebook uses Unsloth to fine-tune a model for a completion task.
In this example we fine-tune the Qwen3-0.6B model to generate ASCII art. I recommend Unsloth over the plain Hugging Face libraries because it requires less memory and is faster.

Adapted from unsloth notebooks, if something is broken check on:
https://unsloth.ai/

In [15]:
%%capture
# `%%capture` must stay on the first line of the cell; it silences the pip output below.
# Most packages are installed with `--no-deps` so pip does not pull in its own choice of
# dependency versions on top of the ones listed explicitly here.
# NOTE(review): xformers is pinned to 0.0.29.post3, but the Unsloth banner saved later in
# this notebook reports Xformers 0.0.32.post2 — the pin may not match the environment that
# produced the saved outputs; confirm which version is intended.
%pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3  peft trl triton
%pip install --no-deps cut_cross_entropy unsloth_zoo
%pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
%pip install --no-deps unsloth

# Enable faster downloads (uses the hf_transfer package installed above).
# Set before any Hugging Face download happens in the following cells.
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

### Load base model

In [16]:
from unsloth import FastLanguageModel
import torch
import os

# Load the model and its tokenizer through Unsloth's patched loader.
# Requires the HF_ACCESS_TOKEN environment variable; a missing variable raises KeyError here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen3-0.6B",
    max_seq_length = 2048,   # same limit is used again for tokenization and the trainer
    dtype = None,            # NOTE(review): presumably lets Unsloth choose the dtype — confirm
    load_in_4bit = False,    # do not load 4-bit quantized weights
    token=os.environ["HF_ACCESS_TOKEN"]
)

==((====))==  Unsloth 2025.9.9: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [17]:
# Disable the tokenizer's clean-up of tokenization spaces.
# NOTE(review): presumably so decoded ASCII art keeps its exact spacing — confirm.
tokenizer.clean_up_tokenization_spaces = False

### Add lora to base model and patch with Unsloth

In [18]:
# LoRA setup. Parameter reference:
# https://huggingface.co/docs/peft/v0.11.0/en/package_reference/lora#peft.LoraConfig

# Set to True when special tokens were added; the output head is then adapted too.
train_embeddings = False

# Modules of the LLM that receive LoRA weights.
target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
    *(["lm_head"] if train_embeddings else []),
]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules=target_modules,
    r=16,                # rank of the LoRA matrices; the paper reports little loss at low rank
    lora_alpha=16,       # scales the adapter's contribution relative to the base model
    lora_dropout=0,      # tutorials default to 0.05, Unsloth recommends 0
    bias="none",         # "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # reduces VRAM use, meant for very long context
    random_state=3407,
    use_rslora=False,    # rank-stabilized LoRA: scale by lora_alpha / sqrt(r)
    loftq_config=None,   # no LoftQ initialization
)

In [19]:
# Completion-style template: there is no instruction text, only the drawing itself,
# wrapped in one leading and one trailing newline.
empty_prompt = """
{ascii_art}
"""

# End-of-sequence marker appended to every training sample so the model learns where a
# drawing ends (it shows up as <|im_end|> in the sample printout below).
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func_no_prompt(examples):
  """Build the training text for a batch of dataset rows.

  Args:
    examples: batched mapping whose "ascii" entry is a sequence of ASCII-art strings.

  Returns:
    A dict with a single "text" key holding one string per input drawing: the drawing
    rendered through the module-level `empty_prompt` template, followed by `EOS_TOKEN`.
  """
  return {
      "text": [
          empty_prompt.format(ascii_art=art) + EOS_TOKEN
          for art in examples["ascii"]
      ],
  }


from datasets import load_dataset
# Download the training split and add a "text" column holding the formatted samples.
# `batched = True` matches formatting_prompts_func_no_prompt, which expects a batch of rows.
dataset = load_dataset("pookie3000/ascii-cats", split = "train")
dataset = dataset.map(formatting_prompts_func_no_prompt, batched = True)

In [20]:
# Inspect the dataset; after the map step it should list the new "text" feature.
dataset

Dataset({
    features: ['ascii', 'creature', 'text'],
    num_rows: 201
})

### Visualize dataset

In [21]:
# Print the first four formatted samples: the break fires after index 3 has been printed,
# because the check `i > 2` runs after the print statements.
for i, sample in enumerate(dataset):
    print(f"\n------ Sample {i + 1} ----")
    print(sample["text"])
    if i > 2:
      break


------ Sample 1 ----

    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/
<|im_end|>

------ Sample 2 ----

|\---/|
| o_o |
 \_^_/
<|im_end|>

------ Sample 3 ----

 |\__/,|   (`\
 |_ _  |.--.) )
 ( T   )     /
(((^_(((/(((_/
<|im_end|>

------ Sample 4 ----

   |\---/|
   | ,_, |
    \_`_/-..----.
 ___/ `   ' ,""+ \  
(__...'   __\    |`.___.';
  (_,...'(_,.`__)/'.....+
<|im_end|>


In [23]:
# Alternative approach: Pre-tokenize dataset to avoid multiprocessing issues
import torch
from datasets import Dataset

def preprocess_dataset_manually(dataset, tokenizer, max_length=2048):
    """Tokenize every example's "text" field one at a time, in the main process.

    Pre-tokenizing here avoids the multiprocessing that dataset preparation inside the
    trainer could otherwise trigger.

    Args:
        dataset: sized iterable of mappings, each with a "text" string.
        tokenizer: callable tokenizer; called with a single string and no
            `return_tensors`, so it is expected to return flat lists of ints.
        max_length: examples longer than this many tokens are truncated.

    Returns:
        A `Dataset` with "input_ids", "attention_mask" and "labels" columns, where
        "labels" is a copy of "input_ids" (causal-LM objective).
    """
    print("Preprocessing dataset manually...")

    total = len(dataset)
    processed_texts = []
    for i, example in enumerate(dataset):
        if i % 50 == 0:
            print(f"Processing example {i}/{total}")

        # Deliberately no `return_tensors="pt"`: converting to a tensor and calling
        # `.squeeze().tolist()` turns a one-token example into a bare int instead of a
        # one-element list. Plain lists keep every row a list and skip the round-trip.
        encoded = tokenizer(
            example["text"],
            truncation=True,
            max_length=max_length,
            padding=False,
        )

        input_ids = list(encoded["input_ids"])
        processed_texts.append({
            "input_ids": input_ids,
            "attention_mask": list(encoded["attention_mask"]),
            "labels": list(input_ids),  # For causal LM, labels = input_ids (independent copy)
        })

    return Dataset.from_list(processed_texts)

# Preprocess the dataset (uses the function's default max_length of 2048)
processed_dataset = preprocess_dataset_manually(dataset, tokenizer)
print(f"Processed dataset size: {len(processed_dataset)}")
# Show one tokenized row as a sanity check: input_ids, attention_mask and labels.
print("Sample processed example:")
print(processed_dataset[0])


Preprocessing dataset manually...
Processing example 0/201
Processing example 50/201
Processing example 100/201
Processing example 150/201
Processing example 200/201
Processed dataset size: 201
Sample processed example:
{'input_ids': [198, 262, 23536, 62, 34319, 1843, 7436, 198, 256, 284, 297, 14179, 284, 2130, 5973, 262, 1124, 1124, 715, 262, 1304, 61, 414, 15617, 220, 1124, 4847, 8, 1727, 5957, 26432, 80517, 29, 3804, 2130, 16324, 2130, 5894, 151645], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [198, 262, 23536, 62, 34319, 1843, 7436, 198, 256, 284, 297, 14179, 284, 2130, 5973, 262, 1124, 1124, 715, 262, 1304, 61, 414, 15617, 220, 1124, 4847, 8, 1727, 5957, 26432, 80517, 29, 3804, 2130, 16324, 2130, 5894, 151645]}


### Inference

In [None]:
from transformers import TextStreamer

def generate_ascii_art(model):
    """Stream one generated drawing (at most 100 new tokens) to stdout.

    Args:
        model: the (fine-tuned) model; it is switched to Unsloth inference mode first.

    Returns:
        None. The text is printed by the streamer while it is generated.
    """
    FastLanguageModel.for_inference(model)
    # Start from the "\n" that opens every training sample. An empty string would give
    # the model zero input tokens, because this tokenizer does not prepend a BOS token.
    inputs = tokenizer("\n", return_tensors = "pt").to("cuda")
    text_streamer = TextStreamer(tokenizer)
    # https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/text_generation#transformers.GenerationMixin
    # https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/text_generation#transformers.GenerationConfig
    # The streamer already prints the decoded text, so the returned token ids are not
    # printed again.
    model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100)

## Saving

In [None]:
# Fixed generate_ascii_art function
def generate_ascii_art_fixed(model):
    """Sample one drawing from the model and stream it to stdout.

    Args:
        model: the (fine-tuned) model; it is switched to Unsloth inference mode first.

    Returns:
        None. A header line, the streamed drawing and a separator line are printed.
    """
    FastLanguageModel.for_inference(model)

    # Every training sample starts with a newline (template "\n{ascii_art}\n"),
    # so the same newline is used as the whole prompt.
    prompt = "\n"  # This matches the training format
    inputs = tokenizer(prompt, return_tensors = "pt").to("cuda")
    text_streamer = TextStreamer(tokenizer)

    # Add generation parameters for better control
    generation_config = {
        "max_new_tokens": 100,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "pad_token_id": tokenizer.eos_token_id,  # reuse EOS as the padding id
        "eos_token_id": tokenizer.eos_token_id,
    }

    print("Generating ASCII art...")
    # The streamer prints tokens as they arrive; the returned ids are not needed.
    model.generate(**inputs, streamer = text_streamer, **generation_config)
    print("\n" + "="*50)
    print("\n" + "="*50)


In [None]:
# Test the fixed function: generate three drawings, each under its own numbered header.
# generate_ascii_art_fixed samples with do_sample=True, so the three results can differ.
for i in range(3):
    print(f"\n--- ASCII Art #{i+1} ---")
    generate_ascii_art_fixed(model)



--- ASCII Art #1 ---
Generating ASCII art...

  |\__/,|   (`\
  |o o  |__ _) )/
  |  ^  |   / )
  |  ||  /  (/
 (_(((_(((_/
<|im_end|>


--- ASCII Art #2 ---
Generating ASCII art...

  ((      /\_/\  
   \\.._.' - ^ \  
   /\ | '.__ ^ /  
  (_ .   /     )   
   \  |  /     /
    \ '--'    /
    (     )   (
    (______)  |
    (__))))_))
<|im_end|>


--- ASCII Art #3 ---
Generating ASCII art...

 .       .        
 |\\___//|   .-``\
 |\ ;_i_/ | /   \ \
U   ;  ^  ;  >     <   
  \  ||  /   |   \   |
   \_))_//   |_)  /   `
     '--------|--------'
<|im_end|>



### Save lora adapter

This is useful both for inference and for loading the model again later.

In [12]:
# Alternative approach: Pre-tokenize dataset to avoid multiprocessing issues
import torch
from datasets import Dataset

def preprocess_dataset_manually(dataset, tokenizer, max_length=2048):
    """Tokenize every example's "text" field one at a time, in the main process.

    Pre-tokenizing here avoids the multiprocessing that dataset preparation inside the
    trainer could otherwise trigger.

    NOTE(review): this function is also defined in an earlier cell of this notebook;
    keep the two definitions in sync or delete one of them.

    Args:
        dataset: sized iterable of mappings, each with a "text" string.
        tokenizer: callable tokenizer; called with a single string and no
            `return_tensors`, so it is expected to return flat lists of ints.
        max_length: examples longer than this many tokens are truncated.

    Returns:
        A `Dataset` with "input_ids", "attention_mask" and "labels" columns, where
        "labels" is a copy of "input_ids" (causal-LM objective).
    """
    print("Preprocessing dataset manually...")

    total = len(dataset)
    processed_texts = []
    for i, example in enumerate(dataset):
        if i % 50 == 0:
            print(f"Processing example {i}/{total}")

        # Deliberately no `return_tensors="pt"`: converting to a tensor and calling
        # `.squeeze().tolist()` turns a one-token example into a bare int instead of a
        # one-element list. Plain lists keep every row a list and skip the round-trip.
        encoded = tokenizer(
            example["text"],
            truncation=True,
            max_length=max_length,
            padding=False,
        )

        input_ids = list(encoded["input_ids"])
        processed_texts.append({
            "input_ids": input_ids,
            "attention_mask": list(encoded["attention_mask"]),
            "labels": list(input_ids),  # For causal LM, labels = input_ids (independent copy)
        })

    return Dataset.from_list(processed_texts)

# Preprocess the dataset (uses the function's default max_length of 2048)
processed_dataset = preprocess_dataset_manually(dataset, tokenizer)
print(f"Processed dataset size: {len(processed_dataset)}")
# Show one tokenized row as a sanity check: input_ids, attention_mask and labels.
print("Sample processed example:")
print(processed_dataset[0])


Preprocessing dataset manually...
Processing example 0/201
Processing example 50/201
Processing example 100/201
Processing example 150/201
Processing example 200/201
Processed dataset size: 201
Sample processed example:
{'input_ids': [198, 262, 23536, 62, 34319, 1843, 7436, 198, 256, 284, 297, 14179, 284, 2130, 5973, 262, 1124, 1124, 715, 262, 1304, 61, 414, 15617, 220, 1124, 4847, 8, 1727, 5957, 26432, 80517, 29, 3804, 2130, 16324, 2130, 5894, 151645], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [198, 262, 23536, 62, 34319, 1843, 7436, 198, 256, 284, 297, 14179, 284, 2130, 5973, 262, 1124, 1124, 715, 262, 1304, 61, 414, 15617, 220, 1124, 4847, 8, 1727, 5957, 26432, 80517, 29, 3804, 2130, 16324, 2130, 5894, 151645]}


In [13]:
# Create trainer with pre-processed dataset (no multiprocessing needed)
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Set environment variables to disable multiprocessing
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"

print("Creating trainer with pre-processed dataset...")

# The dataset already contains input_ids / attention_mask / labels, so the trainer is
# given no text field to tokenize.
# NOTE(review): `tokenizer`, `dataset_text_field`, `max_seq_length` and `dataset_num_proc`
# are passed straight to SFTTrainer; whether that is accepted depends on the installed
# TRL / Unsloth versions — confirm when upgrading.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=processed_dataset,  # Use pre-processed dataset
    dataset_text_field=None,          # No text field needed since it's pre-processed
    max_seq_length=2048,
    dataset_num_proc=1,               # Not used since dataset is pre-processed
    args=TrainingArguments(
        # 2 samples per step x 4 accumulation steps = 8 samples per optimizer update
        # on a single GPU (matches the "Total batch size" line in the training banner).
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=5,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled: bf16 when supported, fp16 otherwise.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,              # log the loss at every optimizer step
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",         # checkpoints land here (e.g. outputs/checkpoint-130)
        report_to="none"              # no external experiment tracker
    ),
)

print("Trainer created successfully!")


Creating trainer with pre-processed dataset...
Trainer created successfully!


In [14]:
# Start training
print("Starting training...")
# train() returns a TrainOutput (global_step, training_loss, metrics), printed below.
trainer_stats = trainer.train()
print("Training completed!")
print(f"Training stats: {trainer_stats}")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 201 | Num Epochs = 5 | Total steps = 130
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 10,092,544 of 606,142,464 (1.67% trained)


Step,Training Loss
1,4.6643
2,4.5427
3,5.3899
4,4.7391
5,4.2399
6,4.7414
7,4.6145
8,4.4987
9,4.0755
10,4.1665


Training completed!
Training stats: TrainOutput(global_step=130, training_loss=2.910807440831111, metrics={'train_runtime': 198.7045, 'train_samples_per_second': 5.058, 'train_steps_per_second': 0.654, 'total_flos': 183906877440000.0, 'train_loss': 2.910807440831111, 'epoch': 5.0})


In [24]:
from huggingface_hub import HfApi
# Check which Hugging Face account the access token belongs to before pushing anything.
# Requires the HF_ACCESS_TOKEN environment variable and `os` imported in an earlier cell.
api = HfApi()
user_info = api.whoami(token=os.environ["HF_ACCESS_TOKEN"])
print(f"Username: {user_info['name']}")

Username: Sri1999


In [9]:
# Upload the LoRA adapter and the tokenizer to the same Hub repository.
# `push_to_hub` has no tokenizer parameter: its second positional parameter is
# `use_temp_dir`, so a tokenizer passed there is not uploaded. The tokenizer is therefore
# pushed with its own call.
lora_repo_id = "Sri1999/Qwen3-0.6B-ascii-cats-lora"

model.push_to_hub(
    lora_repo_id,
    token = os.environ["HF_ACCESS_TOKEN"]
)
tokenizer.push_to_hub(
    lora_repo_id,
    token = os.environ["HF_ACCESS_TOKEN"]
)

adapter_model.safetensors: 100%|██████████| 40.4M/40.4M [00:10<00:00, 3.73MB/s]


Saved model to https://huggingface.co/Sri1999/Qwen3-0.6B-ascii-cats-lora


In [1]:
# Record the installed Unsloth version next to the saved outputs.
import unsloth
print(unsloth.__version__)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


W0927 16:16:09.654000 20872 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


🦥 Unsloth Zoo will now patch everything to make training faster!
2025.9.9


In [1]:
from unsloth import FastLanguageModel
import os

# 1️⃣  Load the fine-tuned checkpoint
# checkpoint-130 is the final training step (the training output reports global_step=130)
# written under the trainer's output_dir "outputs".
# NOTE(review): the path is an absolute, machine-specific Windows path — adjust it when
# running elsewhere.
model, tokenizer = FastLanguageModel.from_pretrained(
    r"D:\ascii_art_completion_finetuning\outputs\checkpoint-130",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=False
)


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


W0927 18:22:30.547000 41204 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.9.9: Fast Qwen3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Merge the LoRA adapter into the base weights and save a standalone model directory,
# which is the input of the GGUF conversion step that follows.
# NOTE(review): the two D:\ paths are absolute and machine-specific.
base_id = "Qwen/Qwen3-0.6B"
lora_ckpt = r"D:\ascii_art_completion_finetuning\outputs\checkpoint-130"
out_dir  = r"D:\ascii_art_completion_finetuning\merged-qwen3-0.6b"

# The tokenizer is taken from the base model, not from the checkpoint directory.
tok = AutoTokenizer.from_pretrained(base_id, trust_remote_code=True)
# NOTE(review): the saved output of this cell warns that `torch_dtype` is deprecated in
# favour of `dtype`; switch once the minimum supported transformers version allows it.
model = AutoModelForCausalLM.from_pretrained(
    base_id, trust_remote_code=True,
    torch_dtype=torch.float16, device_map="auto"
)

# Attach the adapter, then fold its weights into the base model and drop the PEFT wrapper.
model = PeftModel.from_pretrained(model, lora_ckpt)
model = model.merge_and_unload()

model.save_pretrained(out_dir, safe_serialization=True)
tok.save_pretrained(out_dir)

print("Merged model saved to:", out_dir)


  from .autonotebook import tqdm as notebook_tqdm
W0928 02:22:05.414000 52100 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
`torch_dtype` is deprecated! Use `dtype` instead!
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Merged model saved to: D:\ascii_art_completion_finetuning\merged-qwen3-0.6b


In [3]:
# Run this in a terminal (cmd), from the directory that contains convert_hf_to_gguf.py:
# python .\convert_hf_to_gguf.py ..\merged-qwen3-0.6b --outfile ..\qwen3-ascii-f16.gguf --outtype f16

In [None]:
from huggingface_hub import create_repo, upload_file

# Target repository for the converted GGUF file.
repo_id = "Sri1999/Qwen3-0.6B-ascii-cats-lora-GGUF"

# NOTE(review): no `token` is passed to create_repo / upload_file here, unlike the other
# Hub calls in this notebook — presumably a cached login is used; confirm.

# 1️⃣ Create the repo if it doesn't exist
create_repo(repo_id, repo_type="model", private=False, exist_ok=True)

# 2️⃣ Upload the GGUF file
# The local path is absolute and machine-specific; the file keeps its name in the repo.
upload_file(
    path_or_fileobj=r"D:\ascii_art_completion_finetuning\qwen3-ascii-f16.gguf",
    path_in_repo="qwen3-ascii-f16.gguf",
    repo_id=repo_id,
    repo_type="model"
)

print(f"✅ Uploaded to https://huggingface.co/{repo_id}")


qwen3-ascii-f16.gguf: 100%|██████████| 1.20G/1.20G [04:32<00:00, 4.40MB/s]  


✅ Uploaded to https://huggingface.co/Sri1999/Qwen3-0.6B-ascii-cats-lora-GGUF


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Smoke-test the merged model saved by the merge cell.
model_path = r"D:\ascii_art_completion_finetuning\merged-qwen3-0.6b"

tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).cuda()

# NOTE(review): the training samples contain only the drawing with a leading newline and
# no instruction text, so this instruction-style prompt differs from the training format.
prompt = "Generate a cute ASCII cat:\n"
inputs = tok(prompt, return_tensors="pt").to("cuda")
# NOTE(review): temperature / top_p only take effect when sampling is enabled; no
# `do_sample` is passed here, so this relies on the model's saved generation config —
# confirm.
out = model.generate(**inputs, max_new_tokens=200, temperature=0.7, top_p=0.9)
# out[0] holds the prompt tokens followed by the generated tokens, so the prompt is echoed.
print(tok.decode(out[0], skip_special_tokens=True))


Generate a cute ASCII cat:
```
  \\
 /,\ )     __
(  )-(  )  \ \
 \ \  `-'''  / \
  \__/  `.  /  )
   ||   /   || 
  (||  (||  )



In [11]:
# `Llama` comes from llama-cpp-python. It was not imported anywhere in this notebook, so
# the import lives in this cell to keep it runnable on a fresh kernel.
from llama_cpp import Llama

# Load the converted f16 GGUF file (absolute, machine-specific path).
llm = Llama(
    model_path=r"D:\ascii_art_completion_finetuning\qwen3-ascii-f16.gguf",
    n_ctx=8192,       # or even 16384 if you have RAM/VRAM
    verbose=False
)


llama_context: n_ctx_per_seq (8192) < n_ctx_train (40960) -- the full capacity of the model will not be utilized


In [12]:
def generate_ascii_art(max_tokens: int, generation_config) -> None:
    """Stream one completion from the module-level `llm` and print it as it arrives.

    Args:
        max_tokens: upper bound on the number of generated tokens.
        generation_config: mapping that must contain "temperature", "top_p", "min_p",
            "frequency_penalty", "presence_penalty", "repeat_penalty" and "top_k";
            a missing key raises KeyError.

    Returns:
        None. The generated text is written to stdout without a trailing newline.
    """
    # Sampling settings forwarded unchanged to create_completion.
    sampler_keys = (
        "temperature",
        "top_p",
        "min_p",
        "frequency_penalty",
        "presence_penalty",
        "repeat_penalty",
        "top_k",
    )
    stream = llm.create_completion(
        "\n",  # not empty
        max_tokens=max_tokens,
        stream=True,
        **{key: generation_config[key] for key in sampler_keys},
    )
    for piece in stream:
        print(piece["choices"][0]["text"], end="", flush=True)


In [24]:
# Plain question/answer smoke test of the GGUF model.
# Depends on `llm` from the `Llama(...)` cell; the saved output of this cell is a
# NameError, apparently because it was run in a kernel where `llm` did not exist yet.
response = llm(
    "Q: What is the capital of France?\nA:",
    max_tokens=200,
    temperature=0.7
)

NameError: name 'llm' is not defined