In [1]:
!pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118  # Original line installing PyTorch with CUDA 11.8
!pip install transformers datasets pandas peft bitsandbytes accelerate scipy evaluate # This line installs other packages
# Uninstall the current torchvision
# !pip uninstall torchvision -y # -y flag is used to automatically confirm the uninstallation
# Reinstall torchvision for CUDA 11.8
!pip install torchvision --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.5.1%2Bcu118-cp311-cp311-linux_x86_64.whl (838.4 MB)
Collecting triton==3.1.0 (from torch)
  Using cached https://download.pytorch.org/whl/triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
Installing collected packages: triton, torch
  Attempting uninstall: triton
    Found existing installation: triton 3.2.0
    Uninstalling triton-3.2.0:
      Successfully uninstalled triton-3.2.0
  Attempting uninstall: torch
    Found existing installation: torch 2.6.0+cu118
    Uninstalling torch-2.6.0+cu118:
      Successfully uninstalled torch-2.6.0+cu118
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.21.0+cu118 requires torch==2.6.0, but you have torch 2.5.1+cu118 which is incompatib

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import pandas as pd
import json

In [3]:

# Load dataset
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

train_data = load_data("/content/drive/MyDrive/train_data.json")
val_data = load_data("/content/drive/MyDrive/val_data.json")

In [4]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
tokenizer.pad_token = tokenizer.eos_token

# Format data for fine-tuning
def format_prompt(item):
    return f"""<|im_start|>user
{item['question']}<|im_end|>
<|im_start|>assistant
{item['answer']}<|im_end|>"""

def tokenize_function(examples):
    prompts = [format_prompt({"question": q, "answer": a})
               for q, a in zip(examples["question"], examples["answer"])]

    tokenized_output = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=1024,
        return_tensors="pt"
    )
    # Add labels - the input_ids shifted to the right by 1
    tokenized_output["labels"] = tokenized_output["input_ids"].clone()
    # in causal language modeling, the labels are the same as the input_ids shifted to the right by 1
    return tokenized_output

# Load datasets
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_data))

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

In [5]:
# Load model in 4-bit
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    load_in_4bit=True,
    device_map="auto"
)

# Prepare model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



# Train the model


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 29,933,568 || all params: 3,115,872,256 || trainable%: 0.9607


In [6]:
# Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    fp16=True,
    gradient_checkpointing=True,
    report_to="none"
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)



In [7]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [8]:
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,5.2114,0.263624


Could not locate the best model at /content/drive/MyDrive/checkpoint-50/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=60, training_loss=4.381861968835195, metrics={'train_runtime': 3947.5646, 'train_samples_per_second': 0.486, 'train_steps_per_second': 0.015, 'total_flos': 3.308567390060544e+16, 'train_loss': 4.381861968835195, 'epoch': 3.0})

In [9]:

# Save the model
model.save_pretrained("/content/drive/MyDrive/fine_tuned_model")
tokenizer.save_pretrained("/content/drive/MyDrive/fine_tuned_model")

('/content/drive/MyDrive/fine_tuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_model/vocab.json',
 '/content/drive/MyDrive/fine_tuned_model/merges.txt',
 '/content/drive/MyDrive/fine_tuned_model/added_tokens.json',
 '/content/drive/MyDrive/fine_tuned_model/tokenizer.json')

In [10]:
# Merge LoRA weights with base model
merged_model = model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("/content/drive/MyDrive/fine_tuned_merged")
tokenizer.save_pretrained("/content/drive/MyDrive/fine_tuned_merged")



('/content/drive/MyDrive/fine_tuned_merged/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_merged/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_merged/vocab.json',
 '/content/drive/MyDrive/fine_tuned_merged/merges.txt',
 '/content/drive/MyDrive/fine_tuned_merged/added_tokens.json',
 '/content/drive/MyDrive/fine_tuned_merged/tokenizer.json')

In [None]:
!CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python


Collecting llama-cpp-python
  Using cached llama_cpp_python-0.3.7.tar.gz (66.7 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python


In [14]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Using cached llama_cpp_python-0.3.7.tar.gz (66.7 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hcanceled
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^

In [13]:
!python -m llama_cpp.convert_hf_to_gguf /content/drive/MyDrive/fine_tuned_merged --outfile /content/drive/MyDrive/model_quantized.gguf --outtype q4_0

/usr/bin/python3: Error while finding module specification for 'llama_cpp.convert_hf_to_gguf' (ModuleNotFoundError: No module named 'llama_cpp')
