In [3]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install unsloth
# Get the latest Unsloth
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [4]:
from unsloth import FastLanguageModel
import torch
# Requested context window. The load log printed by this cell reports that the
# checkpoint natively handles 2048 tokens and is extended to 4096 with a RoPE
# scaling factor of 2.0.
max_seq_length = 4096
# No explicit dtype is requested; presumably the library picks one suited to
# the GPU - TODO confirm against the Unsloth documentation.
dtype = None
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the 4-bit TinyLlama checkpoint together with its tokenizer. Both names
# are reused by the LoRA, training and inference cells further down.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/tinyllama-bnb-4bit", # "unsloth/tinyllama" for 16bit loading
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/tinyllama-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [5]:
from unsloth import FastLanguageModel  # Import FastLanguageModel here
import torch

# Wrap the loaded model with rank-32 LoRA adapters. Besides the attention and
# MLP projections, embed_tokens and lm_head are targeted as well; the training
# log further down reports 156,303,360 trainable parameters for this setup.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "embed_tokens",  # Include this for handling embeddings
        "lm_head",       # Include this for handling output tokens
    ],
    lora_alpha = 32,
    lora_dropout = 0,  # Currently only supports dropout = 0
    bias = "none",     # Currently only supports bias = "none"
    # Gradient checkpointing is switched off for this run.
    use_gradient_checkpointing = False,
    # Same seed value that is passed to TrainingArguments in the trainer cell.
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)


Unsloth 2024.9.post4 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


In [6]:
# Alpaca-style prompt template with three positional slots, filled in order
# with the instruction, the input context and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token text taken from the tokenizer; it is appended to every
# formatted training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into full training strings.

    Args:
        examples: A batch indexable by "instruction", "input" and "output",
            each yielding a sequence of values that are walked in lockstep.

    Returns:
        A dict with the single key "text" holding one rendered prompt per
        record: the record substituted into `alpaca_prompt`, followed by
        `EOS_TOKEN`.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [alpaca_prompt.format(*record) + EOS_TOKEN for record in records]
    return { "text" : rendered, }

from datasets import load_dataset
# Load the train split of the cleaned Alpaca dataset, then run
# formatting_prompts_func over it in batches; the function returns a "text"
# entry per record, which is the field the trainer is later pointed at.
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True, # Packs short sequences together to save time!
    args = TrainingArguments(
        # 2 samples per step x 4 accumulation steps; the training log reports
        # a total batch size of 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 2e-5,
        # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Snapshot of GPU 0 taken before training: device properties plus the peak
# reserved memory so far, both converted from bytes to GB (3 decimals).
# The later memory report subtracts start_gpu_memory as its baseline.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep = "\n",
)

GPU = Tesla T4. Max memory = 14.748 GB.
1.451 GB of memory reserved.


In [None]:
# Run the fine-tuning loop. The returned object is kept because the memory
# report cell reads trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()

Counting untrained tokens:   0%|          | 0/3013 [00:00<?, ? examples/s]

Unsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,013 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 376
 "-____-"     Number of trainable parameters = 156,303,360


Step,Training Loss
1,2.8724
2,2.8485
3,2.8665
4,2.8002
5,2.7205
6,2.8306
7,2.7766
8,2.8174
9,2.6963
10,2.7489


In [None]:
# Post-training report: peak reserved GPU memory overall and relative to the
# pre-training baseline (start_gpu_memory), in GB and as a share of the
# device total (max_memory), followed by the wall-clock training time.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep = "\n",
)

In [None]:
# alpaca_prompt = Copied from above
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Fill the template; the response slot stays empty so the model writes it.
prime_prompt = alpaca_prompt.format(
    "Continue the prime sequence.", # instruction
    "2, 3, 5", # input
    "", # output - leave this blank for generation!
)
inputs = tokenizer([prime_prompt], return_tensors = "pt").to("cuda")

# Generate up to 64 new tokens, then decode the whole sequence
# (prompt + continuation) as the cell's displayed value.
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

In [None]:
# alpaca_prompt = Copied from above
# Same prompt as the previous inference cell, but the generated tokens are
# streamed through a TextStreamer instead of being decoded afterwards.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the prime sequence.", # instruction
        "2, 3, 5", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
# The return value is discarded; the streamer handles the output.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
# Write the model and the tokenizer to the local "lora_model" directory, the
# same path the optional reload cell below passes as model_name.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

In [None]:
# Disabled by default: flip the condition to True to reload the saved
# "lora_model" directory instead of reusing the in-memory model.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

# Instruction-only prompt: both the input and the response slot are empty.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous monument in Agra, India?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
# Stream up to 64 generated tokens; the return value is discarded.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)

In [None]:
from unsloth import FastLanguageModel
import torch
import torch.nn.functional as F

# Load the 4-bit TinyLlama base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/tinyllama-bnb-4bit',
    max_seq_length=2048,
)

# Example sentences together with their ground-truth sentiment.
classification_inputs = [
    {'input_text': 'It was a job well done!', 'label': 'positive'},
    {'input_text': 'Ride was horrific.', 'label': 'negative'},
]

# Label name -> class index, plus the reverse view used to name a prediction.
label_mapping = {'positive': 1, 'negative': 0}
index_to_label = {index: name for name, index in label_mapping.items()}
# Label names ordered by class index: position i of the score vector built
# below belongs to ordered_labels[i].
ordered_labels = [index_to_label[index] for index in sorted(index_to_label)]

# Vocabulary id of the first token of each label word. Scoring these ids ties
# the prediction to the label words themselves instead of to vocabulary ids 0
# and 1, which have no relation to sentiment.
# NOTE(review): assumes each label word starts with a distinct token for this
# tokenizer - confirm if the label set is changed.
label_token_ids = [
    tokenizer.encode(name, add_special_tokens=False)[0] for name in ordered_labels
]

# Prompt that makes the label word the natural next token.
classification_prompt = (
    "Classify the sentiment of the text as positive or negative.\n"
    "Text: {}\n"
    "Sentiment:"
)

# Perform classification
results = []
for item in classification_inputs:
    prompt = classification_prompt.format(item['input_text'])

    # Tokenize (special tokens are kept) and move the tensors to the model's
    # device; a forward pass with CPU inputs fails on a GPU-resident model.
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True).to(model.device)

    # Forward pass only - no gradients are needed for scoring.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Next-token scores at the last prompt position, restricted to the label
    # tokens. Cast to float32 so the softmax does not run in half precision.
    next_token_logits = logits[0, -1].float()
    label_logits = next_token_logits[label_token_ids]

    # Normalise the label scores into probabilities.
    probs = F.softmax(label_logits, dim=-1)

    # Position of the best label, mapped back to its name.
    predicted_label = torch.argmax(probs).item()
    pred_label_name = ordered_labels[predicted_label]

    results.append({
        'input': item['input_text'],
        'predicted_label': pred_label_name,
        'true_label': item['label'],
    })

In [None]:
# Show the per-sentence classification results collected in the previous cell.
print(results)

In [None]:
from unsloth import FastLanguageModel
import torch

# Load model
# This rebinds `model` and `tokenizer`, replacing whatever earlier cells left
# in those names, using a 2048-token context window.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/tinyllama-bnb-4bit',
    max_seq_length=2048,
)


In [None]:
# Prepare the model for inference as required by Unsloth.
# The call is made for its in-place effect only, matching the other inference
# cells in this notebook. Rebinding `model` to the return value is avoided:
# NOTE(review): some Unsloth releases may not return the model from
# for_inference, which would leave `model` set to None - confirm for the
# installed version.
FastLanguageModel.for_inference(model)

In [None]:
# Define conversation history
# Each turn is a dict with a 'role' ('system', 'user' or 'assistant') and its
# 'content'; later cells append to this list and flatten it into one prompt.
conversation = [
    {'role': 'system', 'content': 'You are a technical assistant specializing in solving data science and machine learning problems.'},
    {'role': 'user', 'content': 'What method would you recommend for text classification in small datasets?'},
    {'role': 'assistant', 'content': 'For small datasets, using classical algorithms like Naive Bayes or Support Vector Machines (SVM) is often effective. If you can leverage pre-trained embeddings, even better.'},
]


In [None]:
# Queue the new user question as the latest turn of the conversation.
user_input = 'Which classification is better to use for spam detection in emails?'
user_turn = {'role': 'user', 'content': user_input}
# Guard against re-running the cell: without it every execution would append
# another copy of the same question to the history.
if not conversation or conversation[-1] != user_turn:
    conversation.append(user_turn)

In [None]:
# Flatten the conversation into a single plain-text prompt, one line per turn.
def _render_turn(role, content):
    """Return the prompt line for one turn, or "" for an unrecognised role."""
    if role == 'system':
        return f"System: {content}\n"
    if role == 'user':
        return f"User: {content}\n"
    if role == 'assistant':
        return f"Assistant: {content}\n"
    return ""

conversation_history = "".join(
    _render_turn(turn['role'], turn['content']) for turn in conversation
)

In [None]:
# Prepare the input for the model.
# The token ids are moved to the GPU, as every other inference cell in this
# notebook does with .to("cuda"); left on the CPU they would not match the
# device of the loaded model.
input_ids = tokenizer.encode(conversation_history, return_tensors='pt').to("cuda")

In [None]:
# Generate the reply, then decode and print it.
# No visible cell assigns `generated_ids`, so the generation step is performed
# here; decoding alone would stop with a NameError.
generated_ids = model.generate(
    input_ids.to("cuda"),  # no-op if the ids are already on the GPU
    max_new_tokens=128,
    use_cache=True,
)

# Drop the prompt tokens so that only the newly generated continuation is decoded.
response = tokenizer.decode(generated_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
print(f'Assistant: {response}')

In [None]:
from unsloth import FastLanguageModel

# Extend max context size using RoPE Scaling
# Reloading rebinds `model`, `tokenizer` and `max_seq_length` for the cells
# that follow.
max_seq_length = 4096  # Default is 2048, but RoPE allows us to extend to 4096 or more
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/tinyllama-bnb-4bit',
    max_seq_length=max_seq_length,
)

In [None]:
# Confirm context size extension
# NOTE(review): this prints the config's max_position_embeddings field; whether
# the loader rewrites that field to the extended length is not visible in this
# notebook - verify the printed value against max_seq_length.
print(f'Maximum context size extended to: {model.config.max_position_embeddings}')