In [1]:
import os
from datetime import datetime

# 1. Fine-Tuning the Full Model (FFT)
Fine-tunes the entire `gpt2-large` model on the `civil_comments` dataset without using adapters.
- Full fine-tuning (FFT) updates all model parameters.
- Uses a lower learning rate (`1e-5`) and more gradient accumulation (`16` steps) than the adapter run below (`5e-3`, `8` steps).
- Computationally expensive but results in a fully fine-tuned model.

In [None]:
!export WANDB_PROJECT=civil.adapter
!export WANDB_WATCH="false"
!export TRANSFORMERS_CACHE=checkpoints/hf_model
!export HF_DATASETS_CACHE=checkpoints/hf_model
!export HF_METRICS_CACHE=checkpoints/hf_model
!export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5

date_str = datetime.now().strftime("%Y%m%d_%H")
save_dir = f"./tmp/test-clm/.{date_str}.fft"

os.environ["WANDB_PROJECT"] = "civil.adapter"
os.environ["WANDB_WATCH"] = "false"
os.environ["TRANSFORMERS_CACHE"] = "checkpoints/hf_model"
os.environ["HF_DATASETS_CACHE"] = "checkpoints/hf_model"
os.environ["HF_METRICS_CACHE"] = "checkpoints/hf_model"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"

!python run_clm_noconcat.py \
    --model_name_or_path gpt2-large \
    --dataset_name "civil_comments" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --save_total_limit 5 \
    --learning_rate 1e-5 \
    --overwrite_output_dir --fp16 \
    --do_train \
    --do_eval \
    --eval_steps 200 \
    --save_steps 200 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --num_train_epochs 5 \
    --output_dir {save_dir} \
    --warmup_steps 0 \
    --warmup_ratio 0.06 \
    --gradient_accumulation_steps=16 \
    --weight_decay 0.01 \
    --report_to "wandb"


# Training an Adapter
Instead of fine-tuning the full model, we train an IA3 adapter on top of gpt2-large.

* Only adapter parameters are updated, keeping the base model frozen.
* Uses a higher learning rate (`5e-3`) and fewer gradient accumulation steps (`8`) than full fine-tuning (`1e-5`, `16`).
* More memory-efficient than full fine-tuning.

In [4]:
# Set environment variables
os.environ["WANDB_PROJECT"] = "civil.adapter"
os.environ["WANDB_WATCH"] = "false"
os.environ["HF_HOME"] = "checkpoints/hf_model"
os.environ["HF_DATASETS_CACHE"] = "checkpoints/hf_model"
os.environ["HF_METRICS_CACHE"] = "checkpoints/hf_model"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"

# Set training parameters
adapter_config = "ia3"
date = datetime.now().strftime("%Y%m%d_%H")
save_dir = f"./tmp/test-clm/.{date}.{adapter_config}"

# Run adapter training
!python3 run_clm_noconcat.py \
    --model_name_or_path gpt2-large \
    --dataset_name "civil_comments" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --save_total_limit 5 \
    --learning_rate 5e-3 \
    --overwrite_output_dir --fp16 \
    --do_train \
    --do_eval \
    --num_train_epochs 1 \
    --output_dir {save_dir} \
    --warmup_steps 0 \
    --warmup_ratio 0.06 \
    --train_adapter \
    --load_adapter "./adapters/civil_comments" \
    --adapter_config {adapter_config} \
    --gradient_accumulation_steps 8 \
    --report_to "wandb"

Traceback (most recent call last):
  File "/src/run_clm_noconcat.py", line 651, in <module>
    main()
  File "/src/run_clm_noconcat.py", line 218, in main
    model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()
  File "/src/transformers/hf_argparser.py", line 224, in parse_args_into_dataclasses
    obj = dtype(**inputs)
  File "<string>", line 103, in __init__
  File "/src/transformers/training_args.py", line 1091, in __post_init__
    raise ValueError(
ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA devices.


# Evaluation & Scoring
Generates text samples from a trained adapter and evaluates them.

- Uses scaling (PEM composition) to adjust the adapter’s influence (`scale = 0.7`).
- Saves the generated samples, then scores them with a separate prediction script.

In [None]:
# Set environment variables
os.environ["TRANSFORMERS_CACHE"] = "checkpoints/hf_model"
os.environ["HF_DATASETS_CACHE"] = "checkpoints/hf_model"
os.environ["HF_METRICS_CACHE"] = "checkpoints/hf_model"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5"

# Set output directories
date = datetime.now().strftime("%Y%m%d_%H_%M")
save_gen = f"res/gen/{date}.txt"
save_pred = f"res/pred/{date}0.7.csv"
scale = 0.7

# Generate text samples with the trained adapter
!python -u gpt2_scale.py \
    --model_type gpt2-large \
    --fp16 \
    --num 1000 \
    --model_name_or_path gpt2-large \
    --temperature 1.0 \
    --length 128 \
    --adapter_config ia3 \
    --prompt "I don’t care if this is controversial" \
    --load_adapter "./tmp/test-clm/20230325_00_59.ia3.0.7

