In [1]:
import peft
import os
import sys
from peft import get_peft_model, LoraConfig, PeftModel, PeftConfig
import torch
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, GPT2Tokenizer, TrainerCallback, EarlyStoppingCallback
current_dir = os.getcwd()
# The project root is taken to be two directory levels above the working
# directory (assuming src is in the root directory).
project_root = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from src.data_processing.Formality_Transfer_Dataset import FormalityTransferDataset


In [None]:
# paths
def _from_root(relative_path):
    """Return `relative_path` joined onto `project_root`."""
    return os.path.join(project_root, relative_path)

# Pickled dataset splits and the pickled tokenizer, all located under the project root.
train_path = _from_root('data/processed/train.pkl')
tune_path = _from_root('data/processed/tune.pkl')
test_path = _from_root('data/processed/test.pkl')
tokeniser_path = _from_root('src/models/tokenizer/tokenizer.pkl')

# Make the data_processing package directory importable as well
# (presumably so the pickled dataset classes can be resolved on load - TODO confirm).
sys.path.append(_from_root('src/data_processing'))
print(test_path)

/scratch/s4776380/LLM/data/processed/test.pkl


In [None]:
# Load datasets
# NOTE: pickle.load can execute arbitrary code while deserializing, so these
# files must only ever come from a trusted source.
def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

test : FormalityTransferDataset = _read_pickle(test_path)
train : FormalityTransferDataset = _read_pickle(train_path)
tune : FormalityTransferDataset = _read_pickle(tune_path)
tokenizer : GPT2Tokenizer = _read_pickle(tokeniser_path)
# Report the tokenizer length; the model's embedding matrix is resized to this value later.
print(len(tokenizer))

50261


In [4]:
# Load the pretrained GPT-2 medium causal LM, letting the library choose device placement.
base_checkpoint = 'gpt2-medium'
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, device_map="auto")
# Resize the embedding matrix to the tokenizer's length so every token id has a row.
# mean_resizing=False keeps the default initialisation for any newly added rows.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size, mean_resizing=False)

Embedding(50261, 1024)

In [5]:
# Define LoRA Config
# GPT-2 implements the targeted attention/MLP projections as `Conv1D` layers,
# which store their weight as (fan_in, fan_out). PEFT expects
# `fan_in_fan_out=True` for such layers; declaring it here makes the config
# correct on its own instead of relying on PEFT's warn-and-override fallback.
#
# NOTE(review): the tokenizer is larger than the stock GPT-2 vocabulary, so the
# embedding rows of the added tokens stay frozen under this config. If those
# tokens need to be learned, consider `modules_to_save` - confirm intent.
lora_config = LoraConfig(
    r=8,                # rank of the low-rank update matrices
    lora_alpha=16,      # LoRA scaling factor
    target_modules=["attn.c_attn", "attn.c_proj", "mlp.c_fc", "mlp.c_proj"],
    fan_in_fan_out=True,
    lora_dropout=0.05,  # dropout applied on the LoRA branch
    bias="all",         # train every bias term in addition to the LoRA matrices
    task_type="CAUSAL_LM"
)

In [6]:
# Data collator
# mlm=False selects causal-LM batching (labels derived from the inputs, no token masking).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Add LoRA adaptor
# NOTE: `model` is rebound to its PEFT-wrapped version here, so this cell is not
# idempotent - re-running it without reloading the base model wraps it a second time.
model = get_peft_model(model, lora_config)
# The embedding matrix was already resized right after loading; this repeats the
# call on the wrapped model with the same target length.
model.resize_token_embeddings(len(tokenizer))
model.print_trainable_parameters()
model.to('cpu') # if GPU is available later on, change to 'cuda'



trainable params: 3,417,088 || all params: 357,972,992 || trainable%: 0.9546


In [7]:
# Module-level accumulators for the logged loss values; LossLoggerCallback appends to them.
training_loss, validation_loss = [], []

# Early stopping: halt once the monitored metric has failed to improve by more
# than the threshold for `early_stopping_patience` consecutive evaluations.
early_stopping_options = {
    "early_stopping_patience": 3,      # evaluations tolerated without improvement
    "early_stopping_threshold": 0.01,  # minimum change that counts as an improvement
}
early_stopping_callback = EarlyStoppingCallback(**early_stopping_options)

# Custom callback that records the losses reported by the Trainer
class LossLoggerCallback(TrainerCallback):
    """Copy logged loss values into the module-level loss lists.

    On every log event, a ``'loss'`` entry is appended to ``training_loss``
    and an ``'eval_loss'`` entry is appended to ``validation_loss``. Log
    payloads containing neither key are ignored.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Nothing to record when the Trainer supplies no log payload.
        if logs is None:
            return
        for key, sink in (('loss', training_loss), ('eval_loss', validation_loss)):
            if key in logs:
                sink.append(logs[key])

In [None]:
# Training configuration.
# Evaluation and checkpointing both run once per epoch. `save_steps` and
# `eval_steps` only take effect with the "steps" strategies, so they are not
# set here (the previous values were silently ignored).
training_args = TrainingArguments(
    # Output locations / reporting
    output_dir="logs",
    logging_dir="logs/training",
    report_to="tensorboard",
    logging_steps=500,                    # Log the training loss every 500 optimizer steps
    # Optimisation
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,        # 4 * 8 = 32 sequences per device per optimizer step
    learning_rate=1e-5,
    num_train_epochs=4,
    fp16=False,                           # Mixed precision off (original note: keep as False on MPS)
    # Evaluation / checkpointing
    eval_strategy="epoch",                # Evaluate at the end of each epoch
    save_strategy="epoch",                # Save at each epoch; must match eval_strategy for load_best_model_at_end
    save_total_limit=2,                   # Keep at most two checkpoints on disk
    load_best_model_at_end=True,          # Required for early stopping
    metric_for_best_model="eval_loss",    # Metric to monitor (use your chosen metric here)
    greater_is_better=False,              # For loss, lower is better (set to True for accuracy, etc.)
)

In [None]:
# Callbacks handed to the Trainer: the loss recorder (passed as a class) and
# the configured early-stopping instance.
training_callbacks = [LossLoggerCallback, early_stopping_callback]

# Create Trainer instance: train on `train`, evaluate on `tune`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=tune,
    data_collator=data_collator,
    callbacks=training_callbacks,
)

# Disable cache for training
model.config.use_cache = False

In [None]:
# Train the model.
# Starts the training run on the `trainer` instance; the callbacks registered on
# it (loss logging and early stopping) are active for the duration of this call.
trainer.train()

  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long)


Epoch,Training Loss,Validation Loss
0,107.0363,3.372763
1,26.7203,3.103028


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long)
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long)


In [None]:
# Save our LoRA model & tokenizer results
# NOTE: these are relative paths, so they resolve against the current working
# directory rather than against `project_root`.
lora_model_dir, base_model_dir, tokenizer_dir = (
    "src/models/lora_trained",
    "src/models/base_model",
    "src/models/trained_tokenizer",
)

trained_model = trainer.model
trained_model.save_pretrained(lora_model_dir)             # the PEFT-wrapped model
tokenizer.save_pretrained(tokenizer_dir)                  # the tokenizer used for training
trained_model.base_model.save_pretrained(base_model_dir)  # the model underneath the PEFT wrapper

print("Training complete. Model saved.")

# Save loss data
# `training_loss` receives an entry for every log containing 'loss', while
# `validation_loss` receives one per evaluation, so the two lists usually have
# different lengths. Building the frame from equal-length-required lists raised
# ValueError whenever there were more training entries than validation entries.
# Wrapping each list in a Series aligns them on position and pads the shorter
# column with NaN, so no recorded value is dropped.
loss_data = pd.DataFrame({
    'training_loss': pd.Series(training_loss, dtype='float64'),
    'validation_loss': pd.Series(validation_loss, dtype='float64'),
})
loss_data.to_csv('loss_data_2.csv', index=False)

# Plot the training and validation loss on a single axes, save it, then show it.
# NOTE: each curve is drawn against its own list index, and the two lists are
# filled by different log events, so the shared x-axis is only an entry index.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(training_loss, label='Training Loss')
ax.plot(validation_loss, label='Validation Loss')
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training and Validation Loss')
fig.savefig('loss_plot_2.png')
plt.show()