<a href="https://colab.research.google.com/github/Hajar-Laktaoui/RecSysExplainability/blob/main/DPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import json
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from transformers import Trainer, TrainingArguments

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Paths
train_data_path = "/content/drive/MyDrive/SFTmodel/generated_dataTripAdvisor.json"
val_data_path = "/content/drive/MyDrive/SFTmodel/generated_Val_dataTripAdvisor.json"
huggingface_model_path = "/content/drive/MyDrive/SFTmodel/huggingface_model"

# Step 1: Load Pre-trained GPT-2 Model and Tokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Save the model and tokenizer to Hugging Face format for consistency
model.save_pretrained(huggingface_model_path)
tokenizer.save_pretrained(huggingface_model_path)

# Step 2: Load Training and Validation Datasets
train_dataset = load_dataset("json", data_files={"train": train_data_path})["train"]
val_dataset = load_dataset("json", data_files={"validation": val_data_path})["validation"]

# Step 3: Tokenize the Dataset
def preprocess_function(examples):
    if "chosen" in examples:
        chosen = tokenizer(
            examples["chosen"], truncation=True, padding="max_length", max_length=512
        )
    else:
        raise ValueError("Dataset examples must include a 'chosen' field.")

    if "rejected" in examples and examples["rejected"]:
        rejected = tokenizer(
            examples["rejected"], truncation=True, padding="max_length", max_length=512
        )
        return {
            "input_ids": chosen["input_ids"],
            "attention_mask": chosen["attention_mask"],
            "rejected_input_ids": rejected["input_ids"],
            "rejected_attention_mask": rejected["attention_mask"],
        }
    else:
        return {
            "input_ids": chosen["input_ids"],
            "attention_mask": chosen["attention_mask"],
        }

# Apply preprocessing to both train and validation datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Step 4: Define Custom DPO Loss Function
def dpo_loss(model, inputs):
    # Move inputs to the GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}

    chosen_outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        labels=inputs["input_ids"]
    )
    if "rejected_input_ids" in inputs:
        rejected_outputs = model(
            input_ids=inputs["rejected_input_ids"],
            attention_mask=inputs["rejected_attention_mask"],
            labels=inputs["rejected_input_ids"]
        )
        return chosen_outputs.loss - rejected_outputs.loss
    else:
        return chosen_outputs.loss

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/SFTmodel/dpo-finetuned-model",  # Save to Google Drive
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=200,
    learning_rate=5e-5,
    weight_decay=0.01,
    report_to="none",
    push_to_hub=False,
    fp16=True,  # Enable mixed precision training for better GPU performance
)

# Step 6: Custom Trainer for DPO
class DPOTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        loss = dpo_loss(model, inputs)
        return (loss, None) if return_outputs else loss

trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Step 7: Train the Model
trainer.train()

# Step 8: Save the Fine-Tuned Model to Google Drive
output_dir = "/content/drive/MyDrive/SFTmodel/dpo-finetuned-model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/256018 [00:00<?, ? examples/s]

Map:   0%|          | 0/32002 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
200,0.1077,No log
400,0.1012,No log
600,0.096,No log
800,0.0981,No log
1000,0.0949,No log
1200,0.093,No log
1400,0.092,No log
1600,0.0947,No log
1800,0.0922,No log
2000,0.0936,No log


Step,Training Loss,Validation Loss
200,0.1077,No log
400,0.1012,No log
600,0.096,No log
800,0.0981,No log
1000,0.0949,No log
1200,0.093,No log
1400,0.092,No log
1600,0.0947,No log
1800,0.0922,No log
2000,0.0936,No log
