In [3]:
import os
import mlflow
from datasets import load_dataset
from loguru import logger

from ml.config import config
from ml.data_module.lakefs import get_dataset
from ml.experiment.mlflow.llamacpp import LlamaCppModel
from ml.config import enable_multipart_upload


from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template 
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

def get_trainer_model(
    chat_template,
    dataset,
    from_pretrained,
    sft_configs,
    peft_configs,
    peft_adapters,
    mapping,
    column_to_be_used="conversations",
):
    model, tokenizer = FastLanguageModel.from_pretrained(
        dtype=None,
        **from_pretrained,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    model = FastLanguageModel.get_peft_model(
        model,
        **peft_adapters,
    )

    if mapping:
        tokenizer = get_chat_template(
            tokenizer,
            chat_template=chat_template,
            mapping=mapping,
        )
    else:
        tokenizer = get_chat_template(
            tokenizer,
            chat_template=chat_template,
        )

    def formatting_prompts_func(examples):
        convos = examples[column_to_be_used]
        texts = [
            tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
            for convo in convos
        ]
        return {
            "text": texts,
        }
        
    print(dataset.select(range(1)).to_pandas().iloc[0]["conversations"]) 

    new_dataset = dataset.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=dataset.column_names  # Remove original columns
    )

    print(new_dataset.select(range(1)).to_pandas().iloc[0]["text"])
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=new_dataset,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        **sft_configs,
        args=TrainingArguments(
            **peft_configs,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
        ),
    )
    
    # trainer = train_on_responses_only(
    #     trainer,
    #     instruction_part=
    #     response_part
    # )
    return trainer, tokenizer ,new_dataset

ModuleNotFoundError: No module named 'ml'

In [19]:
# from ml.finetuning.unsloth import get_trainer_model

In [20]:
import os
import mlflow
from datasets import load_dataset
from loguru import logger

from ml.config import config
from ml.data_module.lakefs import get_dataset
from ml.experiment.mlflow.llamacpp import LlamaCppModel
from ml.config import enable_multipart_upload


from unsloth import FastLanguageModel
# from unsloth.chat_templates import get_chat_template, standardize_sharegpt
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only
model_config, mlflow_config = config()
mlflow.set_tracking_uri(mlflow_config["mlflow"]["URL"])
enable_multipart_upload()
num_proc = model_config.dataset.get("num_proc", 5)
if model_config.dataset.get("lakefs"):
  dataset = get_dataset(
      directory=model_config.dataset.lakefs.directory,
      project_name=model_config.dataset.lakefs.project_name,
      dataset_type=model_config.dataset.lakefs.dataset_type,
      branch_name=model_config.dataset.lakefs.branch_name,
      split=model_config.dataset.lakefs.split,
      num_proc=num_proc,
  )
elif model_config.dataset.get("hf"):
  dataset = load_dataset(
      model_config.dataset.hf.name,
      split=model_config.dataset.hf.split,
      num_proc=num_proc,
  )


column_to_be_used = model_config.dataset.get("column_to_be_used", None)
chat_template = model_config.chat_template
chat_mapping = model_config.dataset.get("chat_mapping", None)
sample_examples = dataset.select(range(5)).to_pandas().to_html()
sample_file = "sample_dataset.html"

trainer_model, tokenizer ,new_dataset = get_trainer_model(
    chat_template="llama-3",
    dataset=dataset,
    from_pretrained=model_config.from_pretrained,
    sft_configs=model_config.sft_configs,
    peft_configs=model_config.peft_configs,
    peft_adapters=model_config.peft_adapters,
    mapping=chat_mapping,
    column_to_be_used=column_to_be_used,
)
# trainer_model.train()
# model = trainer_model.model

# if (
#     trainer_model.state.log_history
#     and "loss" in trainer_model.state.log_history[-1]
# ):
#     final_loss = trainer_model.state.log_history[-1]["loss"]
#     mlflow.log_metric("final_train_loss", final_loss)

# logger.info(f"starting to save model at {model_save_path}")
# model.save_pretrained_gguf(
#     model_save_path,
#     tokenizer,
#     quantization_method=model_config.quantization_method,
# )
# logger.info(f"Model saved at {model_save_path}")
# old_model_path = os.path.join(
#     model_save_path, f"unsloth.{model_config.quantization_method.upper()}.gguf"
# )
# new_model_path = os.path.join(model_save_path, "model.gguf")  # New name
# os.rename(old_model_path, new_model_path)

# mlflow.pyfunc.log_model(
#     artifact_path="model_path",
#     python_model=LlamaCppModel(),
#     artifacts={"model_path": f"{model_save_path}/model.gguf"},
#     pip_requirements=["mlflow==2.4.0", "llama-cpp-python", "pandas"],
# )
# run_id = run.info.run_id
# model_uri = f"runs:/{run_id}/model"
# logger.info(f"Model logged at URI: {model_uri}")
# registered_model_name = "qa_model"
# model_details = mlflow.register_model(
#     model_uri=model_uri, name=registered_model_name
# )
# logger.info(
#     f"Registered model '{model_details.name}' with version {model_details.version}"
# )


==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.964 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.0.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
[{'from': 'human', 'value': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation nat

Tokenizing train dataset (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

In [8]:
chat_mapping

{'role': 'from', 'content': 'value', 'user': 'human', 'assistant': 'gpt'}

In [21]:
print(dataset.select(range(1)).to_pandas().iloc[0]["conversations"])

[{'from': 'human', 'value': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.'}
 {'from': 'gpt', 'value': 'Boolean op

In [22]:
print(new_dataset.select(range(1)).to_pandas().iloc[0]["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>human<|end_header_id|>

Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. 

Furthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.

Finally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles

In [64]:
begin_token = tokenizer.bos_token
end_token = tokenizer.eos_token
print(f"Begin token: {begin_token}, End token: {end_token}")

Begin token: <|begin_of_text|>, End token: <|eot_id|>


In [70]:
 
begin_token = tokenizer.bos_token
end_token = tokenizer.eos_token

template_text = tokenizer.apply_chat_template([{"role":"user"}, ], tokenize=False)
template_text[len(begin_token):-len(end_token)].strip()

'<|start_header_id|>user<|end_header_id|>'

In [25]:
sample_conversation = [{"role": "user", "content": "test"}, {"role": "assistant", "content": "test"}]
template_text = tokenizer.apply_chat_template(sample_conversation, tokenize=False)
parts = template_text.split("test")  # Split on our test content
instruction_part = parts[0]  # Everything before first "test"
response_part = parts[1]  # Everything between "test" and "test"
print(f"instruction_part: {instruction_part}")
print(f"response_part: {response_part}")

instruction_part: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>


response_part: <|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [30]:
print(new_dataset.select(range(1)).to_pandas().iloc[0]["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>human<|end_header_id|>

Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. 

Furthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.

Finally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles

In [15]:
chat_mapping = model_config.dataset.get("chat_mapping", None)
chat_mapping

sample_conversation = [{"role": "user", "content": "test"}, {"role": "assistant", "content": "test"}]

{'role': 'from', 'content': 'value', 'user': 'human', 'assistant': 'gpt'}

In [16]:
new_dataset.select(range(3)).to_pandas()

Unnamed: 0,text
0,<|begin_of_text|><|start_header_id|>system<|en...
1,<|begin_of_text|><|start_header_id|>system<|en...
2,<|begin_of_text|><|start_header_id|>system<|en...


In [12]:
model_config, mlflow_config = config()
model_config.chat_template

'llama-3.1'

In [8]:
sample_conversation = [{"role": "user", "content": "test"}, {"role": "assistant", "content": "test"}]
template_text = tokenizer.apply_chat_template(sample_conversation, tokenize=False)
parts = template_text.split("test")  # Split on our test content
instruction_part = parts[0]  # Everything before first "test"
response_part = parts[1]  # Everything between "test" and "test"


In [11]:
print(template_text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

test<|eot_id|><|start_header_id|>assistant<|end_header_id|>

test<|eot_id|>


In [10]:
instruction_part

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n'

In [9]:
response_part

'<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [5]:
from ml.config import config
import mlflow
model_config, mlflow_config = config()

mlflow.set_tracking_uri(mlflow_config["mlflow"]["URL"])


In [7]:
serving_payload

'{\n  "inputs": "say hi"\n}'

In [40]:
from mlflow.models import validate_serving_input

model_uri = 'runs:/7e81f2493c8e41d080b44923d1214f9e/model'

import mlflow
from unsloth import FastLanguageModel
# Load the model components without passing any device argument
loaded_components = mlflow.transformers.load_model(model_uri=model_uri, return_type="components")
model = loaded_components["model"]
tokenizer = loaded_components["tokenizer"]

# Optionally, wrap the model if required by Unsloth (this step may be a no-op if your model is already compatible)

    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf



Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2025/02/24 16:59:41 INFO mlflow.transformers: 'runs:/7e81f2493c8e41d080b44923d1214f9e/model' resolved as 'mlflow-artifacts:/2/7e81f2493c8e41d080b44923d1214f9e/artifacts/model'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [43]:
import torch

prompt_text = "say hi "
inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

# Generate outputs with scores
outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    use_cache=True,
    output_scores=True,
    return_dict_in_generate=True
)

# Decode the generated sequences
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

# Access the logits (scores) of the generated tokens
logits = outputs.scores

print("Generated Text:", generated_text)

Generated Text: say hi 2 the man
hey man, how's it goin? say hi 2 the man
say hi 2 the man, say hi 2 the man
say hi 2 the man, say hi 2 the man
say hi 2 the man, say hi 2 the man
say hi


In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [34]:
fast_model = FastLanguageModel.get_peft_model(model)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [35]:
fast_model.generate("say hi")

AttributeError: 'str' object has no attribute 'shape'

In [37]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(fast_model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonnaci sequence.",  # instruction
            "1, 1, 2, 3, 5, 8",  # input
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True, num_logits_to_keep=3,)
tokenizer.batch_decode(outputs)

ValueError: The following `model_kwargs` are not used by the model: ['num_logits_to_keep'] (note: typos in the generate arguments will also show up in this list)

In [33]:
dir(FastLanguageModel)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'for_inference',
 'for_training',
 'from_pretrained',
 'get_peft_model',
 'patch_peft_model',
 'post_patch',
 'pre_patch']

In [28]:
type(model)

peft.peft_model.PeftModelForCausalLM

In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [1]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",  # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",  # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",  # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",  # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",  # Gemma 2x faster!
]  # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
from transformers import TextStreamer

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonnaci sequence.",  # instruction
            "1, 1, 2, 3, 5, 8",  # input
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.8: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.964 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.0.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [19]:
# model = FastLanguageModel.get_peft_model(
#     model,
#     r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
#                       "gate_proj", "up_proj", "down_proj",],
#     lora_alpha = 16,
#     lora_dropout = 0, # Supports any, but = 0 is optimized
#     bias = "none",    # Supports any, but = "none" is optimized
#     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
#     use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
#     random_state = 3407,
#     use_rslora = False,  # We support rank stabilized LoRA
#     loftq_config = None, # And LoftQ
# )

Unsloth 2025.1.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
from transformers import TextStreamer

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonnaci sequence.",  # instruction
            "1, 1, 2, 3, 5, 8",  # input
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\nTo continue the Fibonacci sequence, we will add the last two numbers in the sequence to get the next number.\n\nSo, the next number in the sequence would be 8 + 5 = 13.\n\nTherefore, the updated sequence is: 1, 1, 2, 3, 5, ']

In [6]:
from unsloth import FastLanguageModel
import torch


def main():
    max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

    # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
    fourbit_models = [
        "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # Llama-3.1 15 trillion tokens model 2x faster!
        "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        "unsloth/Meta-Llama-3.1-405B-bnb-4bit",  # We also uploaded 4bit for 405b!
        "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",  # New Mistral 12b 2x faster!
        "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
        "unsloth/mistral-7b-v0.3-bnb-4bit",  # Mistral v3 2x faster!
        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "unsloth/Phi-3.5-mini-instruct",  # Phi-3.5 2x faster!
        "unsloth/Phi-3-medium-4k-instruct",
        "unsloth/gemma-2-9b-bnb-4bit",
        "unsloth/gemma-2-27b-bnb-4bit",  # Gemma 2x faster!
    ]  # More models at https://huggingface.co/unsloth

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Meta-Llama-3.1-8B",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    from transformers import TextStreamer

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

  ### Instruction:
  {}

  ### Input:
  {}

  ### Response:
  {}"""

    # alpaca_prompt = Copied from above
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                "Continue the fibonnaci sequence.",  # instruction
                "1, 1, 2, 3, 5, 8",  # input
                "",  # output - leave this blank for generation!
            )
        ],
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    return tokenizer.batch_decode(outputs)


respone = main()
respone

==((====))==  Unsloth 2025.1.8: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.964 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.0.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n  ### Instruction:\n  Continue the fibonnaci sequence.\n\n  ### Input:\n  1, 1, 2, 3, 5, 8\n\n  ### Response:\n  <|end_of_text|>']