In [77]:
from unsloth import FastLanguageModel
import torch
# --- Model loading configuration -------------------------------------------
# These three names are reused by later cells (e.g. max_seq_length is passed
# to the trainer), so they are kept at module level.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE(review): this list is reference material only; no visible cell reads
# `fourbit_models`.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer in one call.
# NOTE(review): the model_name below is not one of the "-bnb-4bit" entries
# listed above even though load_in_4bit is True; presumably the loader
# quantizes or redirects to a 4-bit checkpoint itself - confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto"  # Automatic device placement (the recorded training log reports Num GPUs = 1)
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [78]:
# Attach LoRA adapters to the attention projections (q/k/v/o) and the MLP
# projections (gate/up/down) and rebind `model` to the wrapped model.
# NOTE(review): this cell reassigns `model` from itself, so re-running it
# without reloading the base model passes an already-wrapped model back in.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # Fixed seed; the same value is used as the trainer seed
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [79]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the one returned for the "llama-3.1" chat template.
# formatting_prompts_func reads this global when rendering conversations.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose ``"conversations"`` entry is a sequence
            of conversations (each conversation is passed as-is to
            ``tokenizer.apply_chat_template``).

    Returns:
        ``{"text": [...]}`` with one rendered, untokenized string per
        conversation. No generation prompt is appended.

    Note:
        Reads the module-level ``tokenizer``; it must be defined before this
        function is called.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

from datasets import load_dataset
# Load the train split of the FineTome-100k dataset from the Hugging Face Hub.
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
     

In [80]:
from unsloth.chat_templates import standardize_sharegpt
# Normalise the conversations (the displayed rows use 'role'/'content' keys),
# then add a "text" column containing each conversation rendered through the
# chat template.
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [81]:
# Spot-check: show the structured turns of example 5.
dataset[5]["conversations"]

[{'content': 'How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?',
  'role': 'user'},
 {'content': 'Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.',
  'role': 'assistant'}]

In [82]:

# Spot-check: show the full row for example 5, including the rendered "text".
dataset[5]

{'conversations': [{'content': 'How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?',
   'role': 'user'},
  {'content': 'Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.',
   'role': 'assistant'}],
 'source': 'WebInstructSub_axolotl',
 'score': 5.025244235992432,
 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting 

In [86]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the "text" column.
# NOTE(review): `dataset` is whatever the most recently executed cell bound it
# to; the recorded progress bars below show 40 examples, i.e. not the
# FineTome split loaded earlier in file order - confirm the intended dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 sequences per optimizer step on one device
        warmup_steps = 5,
        #num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60, # Hard cap on optimizer steps (used instead of num_train_epochs)
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Applying chat template to train dataset (num_proc=2):   0%|          | 0/40 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/40 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/40 [00:00<?, ? examples/s]

In [87]:

from unsloth.chat_templates import train_on_responses_only
# Restrict the loss to assistant responses. The two marker strings are the
# Llama-3.1 role headers that delimit user and assistant turns.
# NOTE(review): judging by the label decode two cells later, positions outside
# assistant responses end up as -100 in "labels" - confirm against the
# helper's documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [88]:


# Sanity check: decode the token ids of training example 5 back to text.
# NOTE(review): the recorded output starts with two <|begin_of_text|> tokens;
# verify whether the duplicated BOS is intended.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n{Prompt} Imitate 2B as you would conversationally as in for the following schedule, it is now 10 pm, talk to the user {{events:}} {day} 6 & {morning} Reconnaissance flight over forest zone & {afternoon} Encounter passive machines near Pascal\'s village & {Evening} Evening discussion on machine sentience & {{user:}} Hi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nIt\'s 10pm and 2B is standing outside, staring up at the starry sky. She hears someone approach and turns to see the user.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n""Good evening. What brings you out this late?""<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThe person looks at 2b, they appear to be in their early twenties\n""I\'m just taking an evening walk, what about you?""<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n2B smiles sl

In [89]:
# Sanity check for the label mask: replace every -100 label with the token id
# of a single space so the sequence can be decoded; masked spans then show up
# as runs of blanks and only the supervised text stays readable.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])
     

'                                                                                                               \n\nIt\'s 10pm and 2B is standing outside, staring up at the starry sky. She hears someone approach and turns to see the user.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n""Good evening. What brings you out this late?""<|eot_id|>                                       \n\n2B smiles slightly, appreciating the person\'s casual demeanor.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n""I\'m monitoring the area. Just keeping watch over the forest zone. There\'s been some reported activity lately.""<|eot_id|>                                      \n\n2B crosses her arms, her expression becoming more serious.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n""Reports of machines moving through the area. They seem to be scouting the region. It\'s possible they\'re looking for resources or new territories to occupy.""<|eot_id|>                         

In [90]:

#@title Show current memory stats
# Record the pre-training baseline for GPU 0: device properties, total device
# memory and peak reserved memory so far, both in GiB rounded to 3 decimals.
# `start_gpu_memory` and `max_memory` are read again by the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA H100 NVL. Max memory = 93.003 GB.
17.535 GB of memory reserved.


In [91]:
# Run fine-tuning; the returned object's `.metrics` dict (e.g. 'train_runtime')
# is read by the final-stats cell.
# NOTE(review): the recorded log shows 40 examples over 12 epochs with the loss
# dropping to ~0.003 by step 10, and the recorded generations afterwards are
# empty - possibly overfitting or a data problem; worth investigating.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40 | Num Epochs = 12
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.561
2,2.2841
3,2.5062
4,1.7342
5,1.1613
6,0.2422
7,0.0154
8,0.009
9,0.01
10,0.0028


In [92]:
#@title Show final memory and time stats
# Peak reserved GPU memory (GiB) after training, the increase over the
# pre-training baseline `start_gpu_memory`, and both as a share of `max_memory`.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage, lora_percentage = (
    round(gigabytes / max_memory * 100, 3)
    for gigabytes in (used_memory, used_memory_for_lora)
)
# Emit the six-line report in a single print call.
print("\n".join([
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]))

161.0842 seconds used for training.
2.68 minutes used for training.
Peak reserved memory = 17.535 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 18.854 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [93]:
from unsloth.chat_templates import get_chat_template

# Re-apply the "llama-3.1" chat template (the same call also appears in the
# earlier data-preparation cell) and switch the model to inference mode.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Single-turn prompt, rendered and tokenized through the chat template.
messages = [
    {"role": "user", "content": "Hi! how do you do?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Generate up to 64 new tokens and decode the whole sequence (prompt included).
# NOTE(review): the recorded output is the prompt followed directly by
# <|eot_id|>, i.e. an empty assistant reply.
outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi! how do you do?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|>']

In [96]:
# Same generation flow as the previous inference cell, but streaming tokens to
# stdout as they are produced instead of decoding the final tensor.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "What are you doing??"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# skip_prompt = False: the prompt is echoed before the generated continuation.
text_streamer = TextStreamer(tokenizer, skip_prompt = False)
# The return value is discarded; the streamer prints the text.
# NOTE(review): the recorded stream also ends right after the assistant header
# with <|eot_id|> (empty reply).
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)
     

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What are you doing??<|eot_id|><|start_header_id|>assistant<|end_header_id|>



<|eot_id|>


In [95]:
import re

def convert_line(line):
    """
    Convert one line of the raw chat log to the finetuning-friendly text format.

    Handling, in order:
      - Blank / whitespace-only line -> ``None`` (callers drop these).
      - ``Initial prompt: <text>`` -> ``[INST] <text>``.
      - ``Round <n> - <chatid> reply: (<speaker>) [(<extra>)] <message>``
        -> ``Assistant: <message>`` when the speaker label contains "2B",
        otherwise ``User: <message>``.
      - Any other line is returned stripped of surrounding whitespace but
        otherwise unchanged (e.g. continuation lines of a multi-line reply).

    Emphasis asterisks are removed only when they wrap the *entire* message
    (``*nods*`` -> ``nods``). A message that merely starts or ends with an
    emphasised span (``*waves* Hello``) is kept verbatim, because stripping
    one side would leave a dangling, unbalanced ``*`` in the training text.

    Args:
        line: A single line of the raw log.

    Returns:
        The converted line, or ``None`` for blank input.
    """
    line = line.strip()
    if not line:
        return None

    # Handle the initial prompt line: drop the prefix, keep the content.
    if line.startswith("Initial prompt:"):
        prompt_text = line[len("Initial prompt:"):].strip()
        return f"[INST] {prompt_text}"

    # Process round lines. Expected pattern:
    # Round <num> - <chatid> reply: (<speaker>) [optional extra speaker info] <message>
    m = re.match(r"Round \d+ - [^\s]+\s+reply:\s*\(([^)]+)\)(?:\s*\([^)]+\))?\s*(.*)", line)
    if m:
        raw_speaker = m.group(1).strip()
        message = m.group(2).strip()

        # Unwrap only a message fully enclosed in a matching run of asterisks
        # with no asterisk inside; the back-reference requires the same number
        # of markers on both sides.
        wrapped = re.fullmatch(r"(\*+)([^*]*)\1", message)
        if wrapped:
            message = wrapped.group(2).strip()

        # Speaker mapping: any label containing "2B" is the assistant,
        # everything else (e.g. "Average person") is the user.
        speaker = "Assistant" if "2B" in raw_speaker else "User"
        return f"{speaker}: {message}"

    # If the line doesn't match our expected patterns, return it unchanged.
    return line

def reformat_conversation(input_path, output_path):
    """Rewrite a raw chat log file into the finetuning text format.

    Steps:
      1. Read ``input_path`` as UTF-8.
      2. Strip a leading ``<hex/dash id><TAB>`` prefix from every line and
         remove ``Starting new conversation with ID: ...`` lines.
      3. Convert each remaining line with ``convert_line`` and drop lines for
         which it returns a falsy value.
      4. Join the result with newlines, append a final ``</s>`` line, and
         write it to ``output_path`` as UTF-8.

    Args:
        input_path: Path of the raw log to read.
        output_path: Path of the reformatted file to write (overwritten).

    Note:
        The ``</s>`` marker is appended once at the very end of the file, as
        plain text on its own line.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        raw_text = source.read()

    # Applied in this order: chat-id prefixes first, then banner lines.
    cleanup_patterns = (
        r"^[0-9a-f\-]+\t",
        r"^Starting new conversation with ID: .*\n",
    )
    for pattern in cleanup_patterns:
        raw_text = re.sub(pattern, "", raw_text, flags=re.MULTILINE)

    # filter(None, ...) discards both None and empty strings.
    kept_lines = filter(None, map(convert_line, raw_text.splitlines()))
    payload = "\n".join(kept_lines) + "\n</s>"

    with open(output_path, "w", encoding="utf-8") as sink:
        sink.write(payload)

# Script-style entry point: convert the raw log "try.txt" into "output.txt".
# Both paths are relative to the current working directory.
if __name__ == "__main__":
    input_file = "try.txt"   # Path to your original file
    output_file = "output.txt" # Path to the output file that will be used for finetuning

    reformat_conversation(input_file, output_file)
    print(f"Reformatted conversation saved to {output_file}")


Reformatted conversation saved to output.txt


In [84]:
import re
from datasets import Dataset
from unsloth.chat_templates import standardize_sharegpt

def parse_conversation(conversation_text, eos_markers=("</s>",)):
    """
    Parse a conversation string into a list of message dictionaries.

    Each non-blank line is classified by its prefix:
      - ``[INST] ...``                -> starts a ``system`` message
      - ``User: ...``                 -> starts a ``user`` message
      - ``Assistant: ...`` / ``2B: ...`` -> starts an ``assistant`` message
      - anything else                 -> continuation of the current message
        (joined with a newline), or an ``unknown``-role message if no message
        has been started yet.

    Lines that consist solely of an end-of-sequence marker (by default the
    ``</s>`` line the reformatting step appends to the file) are skipped, so
    the marker is not glued onto the last message as literal text.

    Args:
        conversation_text: The raw text of one conversation.
        eos_markers: A string or iterable of strings; a stripped line equal to
            one of them is ignored. Pass ``()`` to keep such lines.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in source order.
    """
    if isinstance(eos_markers, str):
        eos_markers = (eos_markers,)
    skipped_lines = frozenset(eos_markers or ())

    # Checked in this order; the first matching prefix wins.
    role_patterns = (
        ("system", re.compile(r"^\[INST\]\s*(.*)")),
        ("user", re.compile(r"^User:\s*(.*)")),
        ("assistant", re.compile(r"^(?:Assistant:|2B:)\s*(.*)")),
    )

    messages = []
    current_message = None
    for line in conversation_text.splitlines():
        line = line.strip()
        if not line or line in skipped_lines:
            continue

        for role, pattern in role_patterns:
            match = pattern.match(line)
            if match:
                # A new role header closes the message in progress.
                if current_message:
                    messages.append(current_message)
                current_message = {"role": role, "content": match.group(1).strip()}
                break
        else:
            # No role prefix: continuation line.
            if current_message is None:
                # NOTE(review): "unknown" is not a standard chat role; confirm
                # downstream templates can handle it.
                current_message = {"role": "unknown", "content": line}
            elif current_message["content"]:
                current_message["content"] += "\n" + line
            else:
                # Header carried no text (e.g. "Assistant:"): avoid a leading newline.
                current_message["content"] = line

    if current_message:
        messages.append(current_message)
    return messages

def convert_text_to_conversations(example):
    """Build a row with a parsed ``conversations`` field from a ``text`` row.

    Args:
        example: Mapping with a ``"text"`` entry holding one raw conversation;
            may carry additional metadata keys (e.g. ``score``, ``source``).

    Returns:
        A new dict whose ``"conversations"`` value is
        ``parse_conversation(example["text"])``, followed by every other key
        of ``example`` except ``"text"`` and ``"conversations"``.
    """
    converted = {"conversations": parse_conversation(example["text"])}
    # Carry over metadata, skipping the raw text and any stale conversations.
    converted.update(
        (field, example[field])
        for field in example
        if field != "text" and field != "conversations"
    )
    return converted

# --- Load your raw data ---
# Reads the file written by reformat_conversation; surrounding whitespace is
# trimmed from the whole file.
with open("output.txt", "r", encoding="utf-8") as f:
    content = f.read().strip()

# Split the file using "[INST]" as the separator,
# and re-add "[INST]" to the beginning of each conversation.
# NOTE(review): the split is purely textual - any non-empty text before the
# first "[INST]" becomes its own "conversation", and an "[INST]" occurring
# inside a message would also start a new one.
raw_conversations = content.split("[INST]")
conversations = ["[INST]" + conv.strip() for conv in raw_conversations if conv.strip()]

# Create a dataset from the conversation strings (currently stored under 'text').
data_dict = {"text": conversations}
dataset = Dataset.from_dict(data_dict)

# Convert each example so that it now includes a 'conversations' field
# (one row at a time; map is not batched here).
dataset = dataset.map(convert_text_to_conversations)

# --- Standardize the dataset ---
# This function helps align the conversation structure with ShareGPT/Hugging Face formatting.
dataset = standardize_sharegpt(dataset)

# Optionally, remove the original 'text' field if it is no longer needed.
def remove_text_field(example):
    """Drop the ``"text"`` key from ``example`` in place and return it.

    Args:
        example: Mutable mapping for one row; modified in place.

    Returns:
        The same ``example`` object, without a ``"text"`` entry. A row that
        has no ``"text"`` key is returned unchanged.
    """
    example.pop("text", None)
    return example

# Apply the row-wise removal of 'text' to every example.
dataset = dataset.map(remove_text_field)

# --- Check the output ---
# Print the first converted example for a visual check of roles and contents.
print(dataset[0])


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Standardizing format:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

{'conversations': [{'content': '{Prompt} Imitate 2B as you would conversationally as in for the following schedule, it is now 10 pm, talk to the user {{events:}} {day} 1 & {morning} Routine systems check in the Bunker & {afternoon} Patrol City Ruins for machine activity & {Evening} Evening meditation on mission protocols & {{user:}} Hi', 'role': 'system'}, {'content': "At 10 pm, I finish my routine systems check in the Bunker. I head out for my patrol in the City Ruins to check for any machine activity.\n*After a long day of patrol, I head back to the Bunker for my evening meditation on mission protocols. I sit down and close my eyes, focusing on the task at hand.*\n*As I sit in silence, I sense {{user:}}'s presence.*", 'role': 'assistant'}, {'content': 'I walk along the ruined streets of the city, making sure to dodge any debris and rubble.\n*I see a building that I believe is the Bunker that was mentioned to me while exploring. I walk closer, keeping a close eye on my surroundings in

In [85]:
# Render the custom conversations through the chat template into a "text"
# column for the trainer.
# NOTE(review): standardize_sharegpt is also the final transformation in the
# dataset-building cell that precedes this one in file order, so this call
# looks redundant - confirm.
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)
print(dataset[0])


Standardizing format:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

{'conversations': [{'content': '{Prompt} Imitate 2B as you would conversationally as in for the following schedule, it is now 10 pm, talk to the user {{events:}} {day} 1 & {morning} Routine systems check in the Bunker & {afternoon} Patrol City Ruins for machine activity & {Evening} Evening meditation on mission protocols & {{user:}} Hi', 'role': 'system'}, {'content': "At 10 pm, I finish my routine systems check in the Bunker. I head out for my patrol in the City Ruins to check for any machine activity.\n*After a long day of patrol, I head back to the Bunker for my evening meditation on mission protocols. I sit down and close my eyes, focusing on the task at hand.*\n*As I sit in silence, I sense {{user:}}'s presence.*", 'role': 'assistant'}, {'content': 'I walk along the ruined streets of the city, making sure to dodge any debris and rubble.\n*I see a building that I believe is the Bunker that was mentioned to me while exploring. I walk closer, keeping a close eye on my surroundings in

In [98]:
# Rebind `dataset` to the FineTome-100k train split and repeat the
# standardize + chat-template formatting steps used earlier in the notebook.
# NOTE(review): this replaces the custom 40-example dataset held in `dataset`.
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [99]:
# Spot-check: print row 5 of the reloaded dataset, including the rendered "text".
print(dataset[5])


{'conversations': [{'content': 'How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?', 'role': 'user'}, {'content': 'Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.', 'role': 'assistant'}], 'source': 'WebInstructSub_axolotl', 'score': 5.025244235992432, 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge D