In [None]:
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer


# LoRA hyperparameters used to build the adapter configuration for GPT-2.
lora_r = 8  # dimension (rank) of the LoRA attention update
lora_alpha = 16  # alpha parameter used for LoRA scaling
lora_dropout = 0.05  # dropout probability applied in the LoRA layers
# GPT-2 specific names of the sub-modules that receive LoRA adapters.
target_modules = [
    "c_attn",
    "c_proj",
    "c_fc",
]

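# Linear (Conv1D) sub-modules inside each GPT-2 transformer block:
# attn.c_attn  (fused query/key/value projection; GPT-2 has no separate q_proj/k_proj/v_proj)
# attn.c_proj
# mlp.c_fc
# mlp.c_proj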

# LoraConfig object is created with the following parameters:
# 'r' (rank of the low-rank approximation) is set to 8,
# 'lora_alpha' (scaling factor) is set to 16,
# 'lora_dropout' (dropout probability for LoRA layers) is set to 0.05,
# 'task_type' is set to the causal language modeling task (CAUSAL_LM),
# 'target_modules' (the modules to which LoRA is applied) selects the linear layers inside the transformer blocks, not the output layer.

# Build the LoRA adapter configuration from the hyperparameters defined above.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    # Passed as a string so this cell does not depend on `TaskType` having been
    # imported elsewhere; PEFT accepts the string form of the task type.
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    # GPT-2 implements c_attn / c_proj / c_fc as Conv1D layers, which store
    # their weights as (fan_in, fan_out); PEFT documents that this flag should
    # be True for such layers.
    fan_in_fan_out=True,
    bias="none",  # do not train any bias parameters alongside the adapters
)

# 'TrainingArguments' is a class that holds the arguments for training a model.
# 'output_dir' is the directory where the model and its checkpoints will be saved.
# 'evaluation_strategy' is set to "steps", meaning that evaluation will be performed after a certain number of training steps.
# 'do_eval' is set to True, meaning that evaluation will be performed.
# 'optim' is set to "adamw_torch", meaning that the AdamW optimizer from PyTorch will be used.
# 'per_device_train_batch_size' and 'per_device_eval_batch_size' are set to 1, meaning that the batch size for training and evaluation will be 1 per device.
# 'gradient_accumulation_steps' is set to 8, meaning that gradients will be accumulated over 8 steps before each optimizer update (an effective training batch size of 8 per device).
# 'log_level' is set to "info", meaning that log messages at INFO level and above will be printed.
# 'save_strategy' is set to "epoch", meaning that the model will be saved after each epoch.
# 'logging_steps' is set to 100, meaning that log messages will be printed every 100 steps.
# 'learning_rate' is set to 5e-5, which is the learning rate for the optimizer.
# 'fp16' is set to True, meaning that 16-bit (fp16) mixed-precision training is always enabled.
# 'bf16' is not set here, so it keeps its default value; bfloat16 support is not checked.
# 'eval_steps' is set to 100, meaning that evaluation will be performed every 100 steps.
# 'num_train_epochs' is set to 200, meaning that the model will be trained for 200 epochs.
# 'warmup_ratio' is set to 0.1, meaning that 10% of the total training steps will be used for the warmup phase.
# 'lr_scheduler_type' is set to "cosine", meaning that a cosine learning rate scheduler will be used.
# 'seed' is set to 42, which is the seed for the random number generator.

# Training arguments
args = TrainingArguments(
    # --- Output location and reproducibility ---
    output_dir="./gpt2-LoRA-nl-to-fol-2",
    seed=42,
    # --- Batching ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    # --- Optimizer, schedule and precision ---
    optim="adamw_torch",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=200,
    fp16=True,
    # --- Evaluation, logging and checkpointing ---
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    log_level="info",
    save_strategy="epoch",
)

# Initialize Trainer
# 'model' is the model that will be trained.
# 'train_dataset' and 'eval_dataset' are the datasets that will be used for training and evaluation, respectively.
# 'peft_config' is the LoRA configuration for PEFT (parameter-efficient fine-tuning), so that only the adapter weights are trained.
# 'dataset_text_field' is set to "text", meaning that the 'text' field of the dataset will be used as the input for the model.
# 'max_seq_length' is set to 256, meaning that the maximum length of the sequences that will be fed to the model is 256 tokens.
# 'tokenizer' is the tokenizer that will be used to tokenize the input text.
# 'args' are the training arguments that were defined earlier.

trainer = SFTTrainer(
    # Model, tokenizer and the LoRA adapter configuration
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    # Data: train/validation splits, reading the "text" column
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=256,
    # Training hyperparameters
    args=args,
)