# Install Unsloth

In [11]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install unsloth
# Get latest Unsloth
# !pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Load the dataset from HuggingFace

In [16]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
from datasets import load_dataset

dataset = load_dataset("espejelomar/code_search_net_python_10000_examples")

# Print available splits
print("Available splits:", list(dataset.keys()))



Available splits: ['train']


In [21]:
# Access the train split
train_dataset = dataset['train']

# Print column names
column_names = train_dataset.column_names
print("Column names:", column_names)


Column names: ['Unnamed: 0', 'repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']


In [22]:
# Access the first example
first_example = train_dataset[0]
print(first_example)

{'Unnamed: 0': 0, 'repository_name': 'getsentry/libsourcemap', 'func_path_in_repository': 'libsourcemap/highlevel.py', 'func_name': 'View.get_original_function_name', 'whole_func_string': 'def get_original_function_name(self, line, col, minified_name,\n                                   minified_source):\n        """Given a token location and a minified function name and the\n        minified source file this returns the original function name if it\n        can be found of the minified function in scope.\n        """\n        # Silently ignore underflows\n        if line < 0 or col < 0:\n            return None\n        minified_name = minified_name.encode(\'utf-8\')\n        sout = _ffi.new(\'const char **\')\n        try:\n            slen = rustcall(_lib.lsm_view_get_original_function_name,\n                            self._get_ptr(), line, col, minified_name,\n                            minified_source, sout)\n            if slen > 0:\n                return _ffi.unpack(sout[0],

In [23]:
# Remove 'Unnamed: 0' column if present
if 'Unnamed: 0' in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns(['Unnamed: 0'])

In [24]:
# Verify the columns and size of the train dataset

print("Columns in train dataset:", train_dataset.column_names)
print(f"Number of examples in train dataset: {len(train_dataset)}")

Columns in train dataset: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']
Number of examples in train dataset: 10000


In [25]:
# Create a filtered dataset with only the first 1000 examples
subset_size = 1000
dataset = train_dataset.select(range(subset_size))

In [26]:
# Verify the columns and size of the filtered dataset

print("Columns in filtered dataset:", dataset.column_names)
print(f"Number of examples in filtered dataset: {len(dataset)}")

Columns in filtered dataset: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']
Number of examples in filtered dataset: 1000


# Convert dataset to ShareGPT format with proper variable substitution

In [27]:
def to_sharegpt(dataset, merged_prompt, output_column_name, conversation_extension=1):
    """
    Convert dataset to ShareGPT format with proper variable substitution

    Args:
        dataset: The source dataset
        merged_prompt: Template string with {column_name} placeholders
        output_column_name: Column to use as the output/completion
        conversation_extension: Number of examples to combine into a single conversation
    """
    formatted_data = []

    for i in range(0, len(dataset), conversation_extension):
        conversation = []

        # Process each example in the current conversation window
        for j in range(i, min(i + conversation_extension, len(dataset))):
            example = dataset[j]

            # Format the prompt by substituting variables
            prompt = merged_prompt
            for column in dataset.column_names:
                if column in merged_prompt and column in example:
                    placeholder = "{" + column + "}"
                    prompt = prompt.replace(placeholder, str(example[column]))

            # Add the human message
            conversation.append({
                "from": "human",
                "value": prompt
            })

            # Add the assistant message
            conversation.append({
                "from": "assistant",
                "value": example[output_column_name]
            })

        # Add the conversation to the formatted data
        formatted_data.append({"conversations": conversation})

    return formatted_data

In [29]:
# For code explanation
code_explain_dataset = to_sharegpt(
    dataset,
    merged_prompt = "Explain what this Python code does: {func_code_string}",
    output_column_name = "func_documentation_string"
)


In [30]:
code_explain_dataset[0]

{'conversations': [{'from': 'human',
   'value': 'Explain what this Python code does: def get_original_function_name(self, line, col, minified_name,\n                                   minified_source):\n        """Given a token location and a minified function name and the\n        minified source file this returns the original function name if it\n        can be found of the minified function in scope.\n        """\n        # Silently ignore underflows\n        if line < 0 or col < 0:\n            return None\n        minified_name = minified_name.encode(\'utf-8\')\n        sout = _ffi.new(\'const char **\')\n        try:\n            slen = rustcall(_lib.lsm_view_get_original_function_name,\n                            self._get_ptr(), line, col, minified_name,\n                            minified_source, sout)\n            if slen > 0:\n                return _ffi.unpack(sout[0], slen).decode(\'utf-8\', \'replace\')\n        except SourceMapError:\n            # In some rare cases

# Initialize the model and tokenizer

In [31]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama3.2-1b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothBCOTrainer: No module named 'UnslothBCOTrainer'. Using tempfile instead!
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA RTX A5000. Num GPUs = 1. Max memory: 23.573 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [32]:
from datasets import Dataset

# First, convert your list to a Hugging Face Dataset
code_explain_dataset_hf = Dataset.from_list(code_explain_dataset)

In [33]:
from unsloth import standardize_sharegpt
dataset = standardize_sharegpt(code_explain_dataset_hf)

Unsloth: Standardizing formats (num_proc=128):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [34]:
from unsloth import apply_chat_template
chat_template = """
{SYSTEM}
USER: {INPUT}
ASSISTANT: {OUTPUT}"""

default_system_message = """You are an expert Python programmer. Write clean, efficient, and well-documented code
that follows PEP 8 style guidelines."""

# Use this system message with the apply_chat_template function
dataset = apply_chat_template(
    dataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
    default_system_message = default_system_message
)


Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [35]:
dataset[2]

{'conversations': [{'content': 'Explain what this Python code does: def update_views(self):\n        """Update stats views."""\n        # Call the father\'s method\n        super(Plugin, self).update_views()\n\n        # Add specifics informations\n        # Alert and log\n        self.views[\'used\'][\'decoration\'] = self.get_alert_log(self.stats[\'used\'], maximum=self.stats[\'total\'])',
   'role': 'user'},
  {'content': 'Update stats views.', 'role': 'assistant'}],
 'text': 'You are an expert Python programmer. Write clean, efficient, and well-documented code\nthat follows PEP 8 style guidelines.\nUSER: Explain what this Python code does: def update_views(self):\n        """Update stats views."""\n        # Call the father\'s method\n        super(Plugin, self).update_views()\n\n        # Add specifics informations\n        # Alert and log\n        self.views[\'used\'][\'decoration\'] = self.get_alert_log(self.stats[\'used\'], maximum=self.stats[\'total\'])\nASSISTANT: Update stat