### Install & Google Drive mount

In [None]:
# Mount Google Drive at /content/drive; later cells write checkpoints and saved models under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes datasets
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting xformers
  Downloading xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl (222.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.7/222.7 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.4.0-py2.py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.2/289.2 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [None]:
import wandb
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Show which GPU the runtime was assigned before loading the model.
!nvidia-smi #A100 GPU

Wed Jun  5 21:04:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              44W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### HF & wandb login

In [None]:
# Hugging Face login -- needed to download the model/data and to push the trained model.
# huggingface_hub is already installed (it is imported in the imports cell above),
# so no extra pip install is required here.
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Authenticate with Weights & Biases
wandb.login()
# Start the run that the training cell reports to
run = wandb.init(
    project="CodeMind-Llama3",
    entity="your-entity",
    name="Llama3-unsloth_py_solutions-v4-one",
    job_type="training",
)

### Model : Llama-3-8B / *unsloth*

In [None]:
# Loading options shared with the trainer cell further down.
max_seq_length = 8192  # maximum sequence length used for loading and training
dtype = None           # None = auto detection (float16 on Tesla T4/V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False

# Pre-quantized 4-bit Llama-3-8B-Instruct checkpoint published by unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Wrap the base model with LoRA adapters on the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=64,                  # LoRA rank
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing=True,
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Dataset : LimYeri/LeetCode_Python_Solutions_v2

In [None]:
import pandas as pd

# Pull the train split from the Hub and convert it to a DataFrame for inspection.
df = load_dataset(
    "LimYeri/LeetCode_Python_Solutions_v2",
    split='train',
).to_pandas()
df.columns

Downloading readme:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15734 [00:00<?, ? examples/s]

Index(['id', 'content', 'title', 'title_slug', 'question_content',
       'question_hints', 'tag', 'level', 'similar_question_ids'],
      dtype='object')

In [None]:
# Check the number of non-null rows in each column
df.count()

id                      15734
content                 15734
title                   15734
title_slug              15734
question_content        15734
question_hints           6662
tag                     14489
level                   15734
similar_question_ids     9280
dtype: int64

In [None]:
def formatting_prompts_func(data_point):
    """Render every row of ``data_point`` as a Llama-3 chat-formatted training text.

    Args:
        data_point: A column-oriented container (pandas DataFrame or dict of
            equal-length sequences) providing the columns ``id``, ``title``,
            ``question_content``, ``tag``, ``question_hints`` and ``content``.

    Returns:
        list[str]: One text per row, consisting of a system turn, a user turn
        (problem statement plus optional tag and hint sections) and an
        assistant turn holding ``content``; every turn ends with ``<|eot_id|>``.
    """
    def _has_value(value):
        # Missing cells can show up as None or as NaN; NaN is truthy, so it has
        # to be rejected explicitly or the literal text "nan" ends up in the prompt.
        if value is None:
            return False
        if isinstance(value, float) and value != value:
            return False
        return bool(value)

    system_prompt = 'You are a coding instructor. Write a Python code to solve the given problem or provide a detailed explanation of the approach to solving it.'

    # Materialise each column as a list so rows are addressed by position for
    # both DataFrames (whatever their index) and dict-of-lists batches.
    ids = list(data_point['id'])
    titles = list(data_point['title'])
    questions = list(data_point['question_content'])
    tags = list(data_point['tag'])
    all_hints = list(data_point['question_hints'])
    contents = list(data_point['content'])

    output_texts = []
    # Iterate over rows; len(data_point) would be the number of keys for a dict batch.
    for i in range(len(ids)):
        problem_description = f"I don't know how to solve LeetCode Problem {ids[i]}: {titles[i]}. Could you provide a python solution or explain the approach? \n Here is the problem description: \n{questions[i]}\n\n"
        tag = f"Suggested Data Structures and Techniques: {tags[i]}\n" if _has_value(tags[i]) else ""
        hints = f"This is problem's hints \nHints: {all_hints[i]}\n\n" if _has_value(all_hints[i]) else ""

        # Assemble the turns by concatenation: a triple-quoted literal indented
        # to match this function body would leak that indentation into the text.
        prompt = (
            "<|start_header_id|>system<|end_header_id|>\n\n"
            f"{system_prompt}<|eot_id|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{problem_description}{tag}{hints}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{contents[i]}<|eot_id|>"
        )

        output_texts.append(prompt)

    return output_texts

In [None]:
# Format every dataset row into a training text and report how many were produced.
output_texts = formatting_prompts_func(df)
print(len(output_texts))

15734


In [None]:
# Inspect the first formatted training text.
output_texts[0]

"<|start_header_id|>system<|end_header_id|>\n\n        You are a coding instructor. Write a Python code to solve the given problem or provide a detailed explanation of the approach to solving it.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n        I don't know how to solve LeetCode Problem 1: Two Sum. Could you provide a python solution or explain the approach? \n Here is the problem description: \nGiven an array of integers `nums` and an integer `target`, return _indices of the two numbers such that they add up to `target`_.\n\nYou may assume that each input would have **_exactly_ one solution**, and you may not use the _same_ element twice.\n\nYou can return the answer in any order.\n\n**Example 1:**\n\n**Input:** nums = \\[2,7,11,15\\], target = 9\n**Output:** \\[0,1\\]\n**Explanation:** Because nums\\[0\\] + nums\\[1\\] == 9, we return \\[0, 1\\].\n\n**Example 2:**\n\n**Input:** nums = \\[3,2,4\\], target = 6\n**Output:** \\[1,2\\]\n\n**Example 3:**\n\n**Input:** nums = \\

In [None]:
from datasets import Dataset
# Wrap the formatted texts in a Dataset with a single "text" column,
# the field name the trainer is configured to read.
data_dict = {"text": output_texts}
dataset = Dataset.from_dict(data_dict)

In [None]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['text'],
    num_rows: 15734
})

In [None]:
# Inspect the first record of the training dataset.
dataset[0]

{'text': "<|start_header_id|>system<|end_header_id|>\n\n        You are a coding instructor. Write a Python code to solve the given problem or provide a detailed explanation of the approach to solving it.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n        I don't know how to solve LeetCode Problem 1: Two Sum. Could you provide a python solution or explain the approach? \n Here is the problem description: \nGiven an array of integers `nums` and an integer `target`, return _indices of the two numbers such that they add up to `target`_.\n\nYou may assume that each input would have **_exactly_ one solution**, and you may not use the _same_ element twice.\n\nYou can return the answer in any order.\n\n**Example 1:**\n\n**Input:** nums = \\[2,7,11,15\\], target = 9\n**Output:** \\[0,1\\]\n**Explanation:** Because nums\\[0\\] + nums\\[1\\] == 9, we return \\[0, 1\\].\n\n**Example 2:**\n\n**Input:** nums = \\[3,2,4\\], target = 6\n**Output:** \\[1,2\\]\n\n**Example 3:**\n\n**Input:** 

### Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Query bf16 support once; fp16 is the fallback when bf16 is unavailable.
use_bf16 = is_bfloat16_supported()

# Supervised fine-tuning on the "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 2,  # effective batch size = 8 * 2 = 16
        warmup_steps = 200,
        # -1 is the documented "disabled" value, so num_train_epochs controls
        # the run length (0 only worked because non-positive values are ignored).
        max_steps = -1,
        num_train_epochs = 5,
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 20,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "wandb",
        output_dir = "/content/drive/MyDrive/Colab Notebooks/CodeMind/Llama3/outputs",
        save_strategy = "epoch",  # write a checkpoint at the end of every epoch
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/15734 [00:00<?, ? examples/s]

In [None]:
# wandb setting
# Update the active run's config in place. Assigning a dict to `wandb.config`
# only rebinds the module attribute to a plain dict instead of recording the
# values on the run.
wandb.config.update(
    {
        "learning_rate": 2e-4,
        "epochs": 5,
        "per_device_train_batch_size": 8,
        "gradient_accumulation_steps": 2,
        "optim": "adamw_8bit",
        "logging_steps": 20,
    },
    allow_val_change=True,  # keeps re-running this cell from failing on already-set keys
)
wandb.watch(model)

[]

In [None]:
# Run the fine-tuning loop; the returned training statistics are kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15,734 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 4,915
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss
20,1.057
40,0.7555
60,0.5455
80,0.4814
100,0.4671
120,0.4472
140,0.4183
160,0.4041
180,0.4104
200,0.3961


In [None]:
# Close the W&B run now that training is done.
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,█▅▄▃▄▃▄▄▃▃▂▃▄▄▁▂▁▃▂▃▃▃▂▂▃▃▃▂▃▂▁▂▂▂▂▃▃▂▅▅
train/learning_rate,▂▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,5.000648855652925e+18
train/epoch,4.99746
train/global_step,4915.0
train/grad_norm,0.12749
train/learning_rate,0.0
train/loss,0.0938
train_loss,0.19862
train_runtime,28094.4255
train_samples_per_second,2.8
train_steps_per_second,0.175


In [None]:
#@title Show current memory stats
# Report the device name, its total memory and the peak memory reserved by PyTorch, in GB.
BYTES_PER_GB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
33.047 GB of memory reserved.


### Save the model

In [None]:
# Save the trained model to Google Drive.
# NOTE(review): the folder name says "v3" while this run and the Hub repos are "v4-one" --
# confirm this is not overwriting an earlier version's files.
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/CodeMind/Llama3/CodeMind-Llama3-8B-unsloth_v3(save)")

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

In [None]:
# Push the model (only the LoRA adapters) and the tokenizer to the Hugging Face Hub
model.push_to_hub("LimYeri/CodeMind-Llama3-8B-unsloth_v4-one", tokenizer = tokenizer)
tokenizer.push_to_hub("LimYeri/CodeMind-Llama3-8B-unsloth_v4-one")

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

Saved model to https://huggingface.co/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one


In [None]:
# Install the required build packages
!apt-get update
!apt-get install -y git make g++

# Clone the llama.cpp repository
!git clone --recursive https://github.com/ggerganov/llama.cpp

# Change into the directory and compile
# (the working directory stays at llama.cpp for the following cells)
%cd llama.cpp
!make clean
!make all -j

In [None]:
# Export the model as an f16 GGUF file and push it to the Hub.
model.push_to_hub_gguf("LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-GGUF", tokenizer, quantization_method = "f16")

Unsloth: Will remove a cached repo with size 1.1K


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 61.95 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 36.44it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.
Unsloth: We must use f16 for non Llama and Mistral models.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to f16 will take 20 minutes.
 "-____-"     In total, you will have to wait around 26 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-GGUF into f16 GGUF format.
The output location will be ./LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-GGUF-unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: CodeMind-Llama3-8B-unsloth_v4-one-GGUF
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 8192
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-valu

CodeMind-Llama3-8B-unsloth_v4-one-GGUF-unsloth.F16.gguf:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-GGUF


In [None]:
# Save an f16 GGUF export to Google Drive.
# NOTE(review): the folder name says "v3" while the pushed repos are "v4-one" -- confirm the intended path.
model.save_pretrained_gguf("/content/drive/MyDrive/Colab Notebooks/CodeMind/Llama3/CodeMind-Llama3-8B-unsloth_v3-GGUF(save)", tokenizer, quantization_method = "f16")

In [None]:
# Return to /content (the llama.cpp build cell changed the working directory).
%cd /content

/content


### Merge and Share the model

In [None]:
# NOTE: this rebinds max_seq_length to None (it was 8192 for training), so the
# trainer cell above must not be re-run after this point without resetting it.
max_seq_length = None
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the fine-tuned adapter repo and its tokenizer from the Hub so it can be merged and pushed
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

adapter_config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import os

# Merge the LoRA weights into a 16-bit model and push it to the Hub.
# The token is read from the HF_TOKEN environment variable so that no credential
# is stored in the notebook. When the variable is unset, None is passed --
# presumably the login cached by notebook_login() is then used; TODO confirm.
model.push_to_hub_merged(
    "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged",
    tokenizer,
    save_method = "merged_16bit",
    token = os.environ.get("HF_TOKEN"),
)

Unsloth: You are pushing to hub, but you passed your HF username = LimYeri.
We shall truncate LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged to CodeMind-Llama3-8B-unsloth_v4-one-merged
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 55.31 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 53.05it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged


In [None]:
# Load the merged checkpoint straight from the Hub with plain transformers classes.
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

merged_repo_id = "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged"
config = AutoConfig.from_pretrained(merged_repo_id)
tokenizer = AutoTokenizer.from_pretrained(merged_repo_id)
model = AutoModelForCausalLM.from_pretrained(merged_repo_id)

### Inference

In [None]:
# Loading options for the inference model (consumed by the next cell).
max_seq_length = None
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [None]:
# Load the fine-tuned adapter repo and its tokenizer from the Hub for inference.
# This rebinds `model` and `tokenizer`, replacing whatever earlier cells assigned to them.
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

adapter_config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

In [None]:
from IPython.display import display, Markdown

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# The prompt ends with the assistant header so generation starts directly with
# the answer instead of the model first having to emit the header itself.
prompt = (
    "<|start_header_id|>system<|end_header_id|> You are a coding instructor. Below is a coding test problem. Write a Python code to solve the given problem or provide a detailed explanation of the approach to solving it.<|eot_id|><|start_header_id|>user<|end_header_id|> I don't know how to solve LeetCode problem 3: Longest Substring Without Repeating Characters. Could you provide a python solution or explain the approach?<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 3000, use_cache = True)

# Decode only the newly generated tokens. Splitting the full decoded text on the
# word "assistant" cuts off any answer that contains that word and raises
# IndexError when the word never appears.
prompt_length = inputs["input_ids"].shape[1]
answer = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens = True)
text = answer[0].strip()
display(Markdown(text))