In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

In [2]:
# Hugging Face Hub identifiers: the fine-tuning corpus and the base checkpoint.
dataset_id = "Amod/mental_health_counseling_conversations"
model_id = "ethzanalytics/distilgpt2-tiny-conversational"

# QLoRA-style quantization settings, later handed to `from_pretrained`:
# 4-bit NF4 weights, float16 compute dtype, double quantization switched off.
# NOTE(review): the training arguments further down force CPU execution;
# confirm the installed bitsandbytes build supports 4-bit weights on CPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

In [4]:
# Load the counseling-conversations corpus named by `dataset_id`.
# Its "train" split is what the trainer consumes further down.
dataset = load_dataset(path=dataset_id)

In [3]:
# Tokenizer for the base checkpoint. GPT-2 ships without a padding token,
# so the end-of-sequence token is reused as padding.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Base causal LM, quantized at load time according to `bnb_config`;
# device placement is delegated to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# Turn on gradient checkpointing to reduce memory use, then apply PEFT's
# k-bit training preparation. `model` is rebound to the prepared model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
# ==========================================
# 3. LoRA Configuration
# ==========================================
# Adapters are attached to the 'c_attn' modules of the GPT-2 architecture
# (GPT-2 implements them as Conv1D layers).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["c_attn"],
    r=16,               # adapter rank
    lora_alpha=32,      # scaling factor
    lora_dropout=0.05,  # dropout applied inside the adapters
    bias="none",
)

# Wrap the prepared base model with the adapters; `model` is rebound to the
# wrapped model. Then report how many parameters remain trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 82,207,488 || trainable%: 0.3587


In [6]:
# Turn one dataset row into a single conversational training string.
def format_instruction(sample):
    """Render a dataset row as a two-turn transcript.

    Args:
        sample: Mapping that provides a "Context" entry (the user's message)
            and a "Response" entry (the reply).

    Returns:
        A string made of "User: " followed by the context, a single newline,
        then "Assistant: " followed by the response. No extra spaces are
        inserted around the newline.

    Raises:
        KeyError: If "Context" or "Response" is missing from ``sample``.
    """
    context, response = sample["Context"], sample["Response"]
    return "User: {}\nAssistant: {}".format(context, response)

In [8]:
# ==========================================
# 5. Training Arguments
# ==========================================
sft_config = SFTConfig(
    # Checkpoints are written under this directory.
    output_dir="./distilgpt2-mental-health",

    # --- SFT-specific parameters ---
    packing=False,              # one formatted example per sequence
    dataset_text_field="text",  # placeholder; the trainer is given a formatting_func instead
    # NOTE(review): no explicit sequence-length cap is set here. The tokenizer
    # log reports examples longer than the model's 1024-token limit, so the
    # trainer's default truncation applies. Consider setting the length option
    # supported by the installed TRL release - confirm its name first.

    # --- Optimization ---
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="adamw_torch",
    learning_rate=2e-4,
    max_grad_norm=0.3,
    # NOTE(review): with a plain "constant" schedule the warmup ratio below
    # presumably has no effect; "constant_with_warmup" would honour it.
    # Confirm which behaviour is intended before changing it.
    warmup_ratio=0.03,
    lr_scheduler_type="constant",

    # --- Logging / checkpointing ---
    logging_steps=25,
    save_strategy="epoch",

    # --- Device ---
    # `use_cpu=True` alone forces CPU training; the deprecated `no_cuda`
    # alias was redundant with it and has been dropped.
    use_cpu=True,
    fp16=False,                 # fp16 mixed precision is not used on CPU
)



In [9]:
# ==========================================
# 6. Trainer Initialization
# ==========================================
# `model` has already been wrapped with the LoRA adapters by `get_peft_model`,
# so `peft_config` is deliberately NOT passed again: depending on the TRL
# release, supplying both is either silently ignored or rejected outright.
#
# The notebook's own tokenizer is passed so that the pad-token choice made
# when it was loaded is the one used for tokenization and collation, rather
# than a tokenizer the trainer re-loads by itself.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset["train"],
    formatting_func=format_instruction,  # builds the "User: ...\nAssistant: ..." text per row
    processing_class=tokenizer,
)



Applying formatting function to train dataset:   0%|          | 0/3512 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/3512 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3512 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1147 > 1024). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/3512 [00:00<?, ? examples/s]

In [10]:
# Run fine-tuning, then write the final adapter weights to the configured
# output directory so later cells can load them from a known location
# (not only from per-epoch checkpoint sub-folders).
train_output = trainer.train()
trainer.save_model(sft_config.output_dir)

# Keep the training summary as the cell's displayed result.
train_output

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


KeyboardInterrupt: 

## INFERENCE

In [None]:
import os

from peft import PeftModel
from transformers.trainer_utils import get_last_checkpoint

# Locate the trained adapter. Prefer adapter files saved directly in the
# training output directory; otherwise fall back to the most recent
# checkpoint sub-folder written during training.
adapter_root = sft_config.output_dir
if os.path.isfile(os.path.join(adapter_root, "adapter_config.json")):
    adapter_dir = adapter_root
elif os.path.isdir(adapter_root):
    adapter_dir = get_last_checkpoint(adapter_root)  # None when no checkpoint exists
else:
    adapter_dir = None
if adapter_dir is None:
    raise FileNotFoundError(
        f"No trained adapter found under {adapter_root!r}; run training first."
    )

# Half precision only when a GPU is available; fall back to float32 on CPU.
inference_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Reload the base model unquantized so the adapter can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict=True,
    torch_dtype=inference_dtype,
    device_map="auto",
)

# Merge the adapter with the base model
model_to_merge = PeftModel.from_pretrained(base_model, adapter_dir)
merged_model = model_to_merge.merge_and_unload()

# Run a test. The prompt follows the same "User: ...\nAssistant:" layout
# that the training examples use.
test_input = "User: I feel very anxious about my job lately.\nAssistant:"
# Put the inputs on whatever device the model ended up on instead of
# assuming CUDA is present.
inputs = tokenizer(test_input, return_tensors="pt").to(merged_model.device)

outputs = merged_model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no dedicated pad token
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))