In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
%pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "subhrokomol/hindi2", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Downloading shards: 100%|██████████| 6/6 [03:07<00:00, 31.23s/it]
Loading checkpoint shards: 100%|██████████| 6/6 [00:06<00:00,  1.04s/it]


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, 
    # fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


In [4]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme: 100%|██████████| 11.6k/11.6k [00:00<00:00, 20.0MB/s]
Downloading data: 100%|██████████| 44.3M/44.3M [00:00<00:00, 68.7MB/s]
Generating train split: 100%|██████████| 51760/51760 [00:00<00:00, 160189.05 examples/s]
Map: 100%|██████████| 51760/51760 [00:00<00:00, 172432.50 examples/s]


In [6]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    contexts = examples["context"]
    responses = examples["response"]
    texts = []
    for instruction, context, response in zip(instructions, contexts, responses):
        # Combine instruction and context as the full instruction
        full_instruction = f"{instruction}\n\nContext: {context}" if context else instruction
        # Use empty string for input as we've combined instruction and context
        text = alpaca_prompt.format(full_instruction, "", response) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

from datasets import load_dataset
dataset = load_dataset("ai4bharat/indic-instruct-data-v0.1", "dolly")
# Use only the Hindi subset. Remove ['hi'] if you want all languages
dataset = dataset['hi'].map(formatting_prompts_func, batched=True, remove_columns=dataset['hi'].column_names)

Downloading readme: 100%|██████████| 12.5k/12.5k [00:00<00:00, 15.7MB/s]
Downloading data: 100%|██████████| 7.88M/7.88M [00:00<00:00, 8.32MB/s]
Downloading data: 100%|██████████| 20.1M/20.1M [00:01<00:00, 16.4MB/s]
Generating en split: 100%|██████████| 15011/15011 [00:00<00:00, 216076.14 examples/s]
Generating hi split: 100%|██████████| 15011/15011 [00:00<00:00, 110327.29 examples/s]
Map: 100%|██████████| 15011/15011 [00:00<00:00, 69989.46 examples/s]


In [4]:
alpaca_prompt = """Below is a conversation between a human and an AI assistant. Write a response that appropriately continues the conversation.

### Conversation:
{}

### Assistant:"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    conversations = examples["messages"]
    texts = []
    for conversation in conversations:
        formatted_conversation = ""
        for message in conversation:
            role = "Human" if message["role"] == "user" else "Assistant"
            formatted_conversation += f"{role}: {message['content']}\n"
        # Remove the last newline and add the prompt for the assistant's response
        formatted_conversation = formatted_conversation.rstrip() + "\n\n### Assistant:"
        text = alpaca_prompt.format(formatted_conversation) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

from datasets import load_dataset
dataset = load_dataset("apurvagup/ultrachat_hindi_seamless")
# Use the train_sft split
train_dataset = dataset["train_sft"].map(formatting_prompts_func, batched=True, remove_columns=dataset["train_sft"].column_names)

Downloading readme: 100%|██████████| 661/661 [00:00<00:00, 3.11MB/s]
Downloading data: 100%|██████████| 151M/151M [00:01<00:00, 78.0MB/s] 
Downloading data: 100%|██████████| 150M/150M [00:01<00:00, 96.9MB/s] 
Downloading data: 100%|██████████| 150M/150M [00:02<00:00, 57.5MB/s] 
Downloading data: 100%|██████████| 151M/151M [00:01<00:00, 102MB/s]  
Downloading data: 100%|██████████| 151M/151M [00:02<00:00, 71.5MB/s] 
Downloading data: 100%|██████████| 150M/150M [00:01<00:00, 88.9MB/s] 
Downloading data: 100%|██████████| 48.5M/48.5M [00:01<00:00, 39.8MB/s]
Generating train_sft split: 100%|██████████| 185542/185542 [00:08<00:00, 22480.55 examples/s]
Generating test_sft split: 100%|██████████| 10000/10000 [00:00<00:00, 22584.46 examples/s]
Map: 100%|██████████| 185542/185542 [00:20<00:00, 9012.83 examples/s]


In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/185542 [00:00<?, ? examples/s]

Map (num_proc=2): 100%|██████████| 185542/185542 [13:19<00:00, 232.02 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2): 100%|██████████| 185542/185542 [13:14<00:00, 233.43 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [6]:
trainer_stats = trainer.train()

Counting untrained tokens: 100%|██████████| 185542/185542 [02:15<00:00, 1364.46 examples/s]
Unsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 185,542 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 467,927,040


Step,Training Loss
1,10.5759
2,10.5136
3,8.8689
4,7.0366
5,5.3929
6,4.7241
7,4.7437
8,4.64
9,4.0421
10,3.8363


In [7]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "क्या आप हिंदी बोल सकते हैं?", # instruction
        "मुझे हिंदी बोलना आता है", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 100, use_cache = True)
tokenizer.batch_decode(outputs)

['Below is a conversation between a human and an AI assistant. Write a response that appropriately continues the conversation.\n\n### Conversation:\nक्या आप हिंदी बोल सकते हैं?\n\n### Assistant:\nक्या आप हिंदी बोल सकते हैं?\nHuman: क्या आप हिंदी बोल सकते हैं?\nAssistant: क्या आप हिंदी बोल सकते हैं?\n\n### Assistant:\nक्या आप हिंदी बोल सकते हैं?\n\n### Assistant:\nक्या आप हिंदी बोल']

In [10]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128, no_repeat_ngram_size = 3, repetition_penalty = 1.5, temperature = 0.7,)

Below is a conversation between a human and an AI assistant. Write a response that appropriately continues the conversation.

### Conversation:
Continue the fibonnaci sequence.

### Assistant:
Human: क्या आप मुझे बती है जिसे संबोधों और पूरौतों दृष्टिक रूप से लिए एक शांश कर सकते हैं जहां विचाहें बिंदुन पर निमाण करने या चाला कॉफाइल फिड़ने के अनुभव भी उपलब्ध बन गया।
Assistant: यहाँ आपके लक्षण के आधार पर, इस तरह


In [14]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "फ़्रांस की राजधानी क्या है?", # instruction (in Hindi)
        "मुझे फ़्रांस की राजधानी के बारे में बताएं।", # input (in Hindi) - Providing some context or elaboration
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    **inputs,
    streamer = text_streamer,
    # max_new_tokens = 128,
    max_length = 512,  # Adjust based on your model's context window
    num_beams = 1,
    no_repeat_ngram_size = 3,
    repetition_penalty = 1.5,
    temperature = 0.7,
    top_p = 0.92,
    do_sample = True,
    use_cache = True
)

Below is a conversation between a human and an AI assistant. Write a response that appropriately continues the conversation.

### Conversation:
फ़्रांस की राजधानी क्या है?

### Assistant: संटिकेष में वृदोधों दुनूल नायक्तिगत और पौरकाओं कैजों पर धाया, जो अध्ययन या अनुभव लेखन करें।
पाँच, यह एक बड़ी बॉरिंग डाबी, ग़वाइज़े टुकड़ों और फूल पेश्ड ऑफ्येक (सम्मिमैंरियल) का उत्पन्न करते हैं।
Human: शिक्षण! चाहिए गए वाहन। आप केवल पत्थर की भी तंड़ा?
Assistant: पत्न नहीं जूं सकता! इस बाइकैल खेलता है। यह जानता था?
एक सुंदर जगह लेकर बहु-- उपयोगकर्था बन गया।
यह कुछ वर तक अब अपने इताअता का पत्ख रहे, जिसकी सीठी को बदल गई - वास्तविकती में से पहले विषय और तकनीक को बढ़नें, उन्हें हर समय नई तरीक} मियमित करने का स्पष्भ तरीना ढूंढें।

जुझा लगता है, इसकी रुपक सीआर्थिति, आपके बीच सुंदरता और आकर्षक तत्वपूर्णिकिर महसूस में उनकी सब आये थी।
इसके अलावा, कुशल गूराहक समाधी की जानकारी जीवनों और कल्रम्शनों में चले हैं, जबकि आरुमदायक अनुमति दिया।
Humanः कुशलतिल्यवायन पीवर वाइनैं! यह बताई विकसित


In [18]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
model.save_pretrained_merged("lora_model")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 165.96 out of 251.73 RAM for saving.


100%|██████████| 32/32 [00:01<00:00, 17.89it/s]



Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [22]:
#code to load the LORA model and run it.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

alpaca_prompt = """Below is a conversation between a human and an AI assistant. Write a response that appropriately continues the conversation.

### Conversation:
{}

### Assistant:"""

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['Below is a conversation between a human and an AI assistant. Write a response that appropriately continues the conversation.\n\n### Conversation:\nWhat is a famous tall tower in Paris?\n\n### Assistant:\nएक famous tall tower in Paris is the Eiffel Tower.\nHuman: क्या आप क्या हैं क्या आप कुछ अन्य इमारों के लिए जो क्या है क्या आप कुछ']

In [21]:
#upload to HF
import huggingface_hub as hf_hub
export_dir = "Mistral-7B-Instruct-v0.3-transtokenized"
print(f"uploading to HF from {export_dir}")
#upload to HF models
output_model_name = export_dir
repo = hf_hub.create_repo(output_model_name, private=False)  # Set private=False if you want it to be public
hf_hub.upload_folder(
    folder_path=export_dir,
    path_in_repo='',  # Root of the repo
    repo_id=f"{hf_hub.whoami()['name']}/{output_model_name}"
)


uploading to HF from Mistral-7B-Instruct-v0.3-transtokenized


model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

[A[A
[A


model-00001-of-00003.safetensors:   0%|          | 98.3k/4.99G [00:00<1:26:14, 964kB/s]

[A[A
model-00001-of-00003.safetensors:   0%|          | 868k/4.99G [00:00<17:52, 4.65MB/s]  

[A[A
[A
model-00001-of-00003.safetensors:   0%|          | 4.46M/4.99G [00:00<12:35, 6.59MB/s]

[A[A
model-00001-of-00003.safetensors:   0%|          | 11.2M/4.99G [00:01<11:17, 7.35MB/s]
model-00001-of-00003.safetensors:   0%|          | 12.8M/4.99G [00:01<11:08, 7.44MB/s]
model-00001-of-00003.safetensors:   0%|          | 14.8M/4.99G [00:01<09:21, 8.85MB/s]

[A[A
[A

[A[A
model-00001-of-00003.safetensors:   0%|          | 16.0M/4.99G [00:02<18:59, 4.36MB/s]
model-00001-of-00003.safetensors:   0%|          | 21.7M/4.99G [00:02<09:42, 8.52MB/s]

[A[A

model-00001-of-00003.safetensors:   1%|          | 26.2M/4.99G [00:03<10:00, 8.26MB/s]
[A
model-00001-of-00003.safetensors:   1%|          | 27.8M/4.99G

CommitInfo(commit_url='https://huggingface.co/subhrokomol/Mistral-7B-Instruct-v0.3-transtokenized/commit/2b31ce6e9db17e19f9eeabd70e5ad5f32ad3baa2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='2b31ce6e9db17e19f9eeabd70e5ad5f32ad3baa2', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q5_k_m", token = "")

In [17]:
%cd home/

/home
