In [1]:
import wandb
from dotenv import load_dotenv
import os

load_dotenv("../../.env")

wandb.login(key=os.getenv("WANDB_API_KEY"))

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmekhyw[0m ([33mmekhyw-insper[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\felip\_netrc


True

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = "float16",
    bnb_4bit_use_double_quant=True
)

model_name = "IlyaGusev/gemma-2-2b-it-abliterated"
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from peft import LoraConfig

lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM"
)

In [4]:
import datasets

dataset_sfw = datasets.load_dataset("parquet", data_files="../data/SFW_qa.parquet")
dataset_nsfw = datasets.load_dataset("parquet", data_files="../data/NSFW_qa.parquet")
dataset_sfw = dataset_sfw.shuffle(seed=42)
dataset_nsfw = dataset_nsfw.shuffle(seed=42)

def prepare_dataset(dataset):
    def format_chat(example):
        messages = [
            {"role": "user", "content": example['query']},
            {"role": "assistant", "content": example['response']}
        ]
        formatted_chat = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        return {"text": formatted_chat}
    formatted_dataset = dataset.map(format_chat)
    formatted_dataset = formatted_dataset['train'].remove_columns(
        [col for col in formatted_dataset['train'].column_names if col != "text"]
    )
    return formatted_dataset

dataset_sfw = prepare_dataset(dataset_sfw)
dataset_nsfw = prepare_dataset(dataset_nsfw)

In [5]:
dataset_sfw

Dataset({
    features: ['text'],
    num_rows: 2090473
})

In [6]:
dataset_nsfw

Dataset({
    features: ['text'],
    num_rows: 899457
})

In [7]:
from trl import SFTTrainer

trainer_sfw = SFTTrainer(
    model=model,
    train_dataset=dataset_sfw,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        warmup_steps=50,
        max_steps=1000,
        learning_rate=1e-5,
        fp16=True,
        bf16=False,
        logging_steps=1,
        optim="paged_adamw_8bit",
        output_dir="../models/SFW",
        gradient_checkpointing=True,
        save_strategy="steps",
        save_steps=100
    )
)

max_steps is given, it will override any value given in num_train_epochs


In [8]:
import gc
import torch

torch.cuda.init()

gc.collect()
torch.cuda.empty_cache()

In [9]:
run = wandb.init(
    project='Fine-tune Gemma-2-2b-it-abliterated on CookieBaker SFW Dataset', 
    job_type="training", 
    anonymous="allow"
)

trainer_sfw.train()

wandb.finish()



  0%|          | 0/1000 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


{'loss': 8.538, 'grad_norm': 5.287437438964844, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.0}
{'loss': 8.5407, 'grad_norm': 4.474451065063477, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.0}
{'loss': 8.8183, 'grad_norm': 4.965625762939453, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.0}
{'loss': 8.5632, 'grad_norm': 4.798336029052734, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.0}
{'loss': 8.7347, 'grad_norm': 5.15839147567749, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 9.4355, 'grad_norm': 5.283998012542725, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.0}
{'loss': 8.5549, 'grad_norm': 4.657886505126953, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.0}
{'loss': 10.8133, 'grad_norm': 7.048219203948975, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.0}
{'loss': 9.1017, 'grad_norm': 5.3900556564331055, 'learning_rate': 1.8000000000000001e-06, 'epoch': 0.0}
{'loss': 9.8996, 'grad_norm': 5.7568278312683105, 'learning_rate':

RuntimeError: CUDA error: an illegal instruction was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
del trainer_sfw

trainer_nsfw = SFTTrainer(
    model=model,
    train_dataset=dataset_nsfw,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        warmup_steps=50,
        max_steps=1000,
        learning_rate=1e-5,
        fp16=True,
        bf16=False,
        logging_steps=1,
        optim="paged_adamw_8bit",
        output_dir="../models/NSFW",
        gradient_checkpointing=True,
        save_strategy="steps",
        save_steps=100
    )
)

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
run = wandb.init(
    project='Fine-tune Gemma-2-2b-it-abliterated on CookieBaker NSFW Dataset', 
    job_type="training", 
    anonymous="allow"
)

trainer_nsfw.train()

wandb.finish()