## Relevant packages

In [None]:
%%capture
# Install the Unsloth fine-tuning stack; %%capture silences the pip output of this cell.
# Normally using pip install unsloth is enough

# Temporary workaround: as of Jan 31st 2025, Colab has some issues with PyTorch.
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:
# NOTE(review): only xformers is version-pinned; consider pinning the other packages for reproducibility.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
# Modules for fine-tuning
from unsloth import FastLanguageModel
import torch # Import PyTorch
from trl import SFTTrainer # Trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported # Checks if the hardware supports bfloat16 precision
# Hugging Face modules
from huggingface_hub import login # Lets you login to API
from transformers import TrainingArguments # Defines training hyperparameters
from datasets import load_dataset # Lets you load fine-tuning datasets
# Import weights and biases
import wandb


## API keys

In [None]:
# Initialize Hugging Face & WnB tokens.
# Credentials are read at run time (environment variable first, interactive
# prompt as a fallback) so that no secret is ever stored in the notebook.
# SECURITY: any token that was previously hardcoded in this cell must be
# considered leaked and should be revoked in the Hugging Face / W&B settings.
import os
from getpass import getpass

hugging_face_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
wnb_token = os.environ.get("WANDB_API_KEY") or getpass("Weights & Biases API key: ")

# Login to Hugging Face
login(hugging_face_token) # from huggingface_hub import login

# Login to WnB
wandb.login(key=wnb_token) # import wandb
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset_YouTube Walkthrough',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Load the base model (DeepSeek-R1)


In [None]:
import torch
from transformers import AutoTokenizer
from peft import PeftModel
from unsloth import FastLanguageModel  # Unsloth for fast inference

# Define model paths
base_model_path = "drive/MyDrive/deepseek-r1"  # Change to your base model
lora_model_path = "drive/MyDrive/fine-tuned-deepseek-r1-with-reasoning-0.01"
tokenizer_path = "drive/MyDrive/tokenizer-deepseek-r1-with-reasoning-0.01"  # still used by a later cell

# Load the base model and its tokenizer with Unsloth.
# A separate AutoTokenizer.from_pretrained(tokenizer_path) call used to precede
# this statement, but its result was immediately overwritten by the tokenizer
# returned here, so that redundant load has been removed. To use the saved
# tokenizer instead, load it AFTER this call.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=4096,  # Adjust based on model capability
    dtype=torch.float16,
    load_in_4bit=True,  # Enable quantization for efficiency
)

# Attach the fine-tuned LoRA adapter to the base model
model = PeftModel.from_pretrained(model, lora_model_path)

# Optimize LoRA model for inference (2x faster with Unsloth)
FastLanguageModel.for_inference(model)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Model loaded successfully!")

==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!




In [None]:
# Persist the model loaded above (base model wrapped with the LoRA adapter) to Drive.
# NOTE(review): for a PeftModel this presumably writes only the adapter files — confirm.
model.save_pretrained("drive/MyDrive/fine-tuned-deepseek-r1")

In [None]:
# Directory the model was saved to in the previous cell; the next cell reloads it from here.
model_name = "drive/MyDrive/fine-tuned-deepseek-r1"

In [None]:
# Reload the saved fine-tuned model together with its tokenizer via Unsloth.
# The separate AutoTokenizer.from_pretrained(tokenizer_path) call that used to
# precede this statement was dead code: its result was overwritten by the
# tokenizer returned here, so it has been removed.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name= model_name,
    max_seq_length=4096,  # Adjust based on model capability
    dtype=torch.float16,
    load_in_4bit=True,  # Enable quantization for efficiency
)

# Optimize LoRA model for inference (2x faster with Unsloth)
FastLanguageModel.for_inference(model)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Model loaded successfully!")

==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




Model loaded successfully!


In [None]:
# Define a system prompt under prompt_style.
# The template has two positional `{}` slots for str.format(): the first sits under
# "### Question:" and the second directly follows the opening "<think>" tag under
# "### Response:". The inference cells pass the question and an empty string.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a customer service representative with advanced knowledge of water filtration systems, troubleshooting, and warranty replacements. Please answer the following customer inquiry professionally and helpfully.

### Question:
{}

### Response:
<think>{}"""

In [None]:
# Define the input question and prompt format.
# PRIVACY: the customer's name, postal address, phone number and e-mail address
# are fictitious placeholders. Never paste real customer PII into a notebook —
# both the cell source and its recorded output end up in version control.
question = '''Hi there,

I just purchased the iSpring RO System RCC7 and the leak stop valve is faulty - the reservoir will not stay on. Can you please send me a replacement? My address is as follows:

123 Example Ave
Anytown, MB
R0A 0A0
Canada

Thank you,

Jane Doe

Ph: (204) 555-0100
jane.doe@example.com'''


# Tokenize input (the second template slot, after "<think>", is left empty)
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to(device)

# Generate response
outputs = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=2048)

# Decode output and keep only the text after the "### Response:" marker
response = tokenizer.batch_decode(outputs)[0].split("### Response:")[1].strip()

print("Model's Response:", response)

Model's Response: <think>
Alright, so I need to help Ashley with her iSpring RO System issue. Let me start by understanding the problem. She mentioned that the leak stop valve is faulty, causing the reservoir not to stay on. Hmm, that sounds like a common issue, but I need to figure out the best way to assist her.

First, I should acknowledge her purchase and the problem she's facing. It's important to be empathetic, so I'll make sure my response is kind and supportive. I should thank her for reaching out and assure her that I can help.

Next, I need to address the faulty leak stop valve. I recall that iSpring has replacement parts available, so I should check if the leak stop valve is something that can be easily replaced. Maybe I should guide her through a quick fix or suggest ordering the replacement part.

Wait, she's in Canada, so I should confirm if the warranty covers this part. iSpring's warranty typically covers defects, but I'm not sure if the leak stop valve falls under that

## DPO training

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [None]:
# Build the DPO preference dataset (prompt / chosen / rejected) from the RLHF CSV.
file_path = "drive/MyDrive/RLHF.csv"

# Load the CSV, keep the relevant columns, build the prompt by combining the
# model name with the customer question, then drop the two source columns.
# Written as one chain (no inplace mutation) so re-running the cell is idempotent.
df = (
    pd.read_csv(file_path)[['Model', 'CUSTOMER_QUESTION', 'AI_ANSWER', 'LAURENCE_ANSWER']]
    .assign(
        prompt=lambda frame: frame.apply(
            lambda row: f"Product Name: {row['Model']} | {row['CUSTOMER_QUESTION']}",
            axis=1,
        )
    )
    .drop(columns=['Model', 'CUSTOMER_QUESTION'])
)

# Rename columns to match the DPO dataset format and discard incomplete pairs
train_df = (
    df.rename(columns={"AI_ANSWER": "rejected", "LAURENCE_ANSWER": "chosen"})
    .dropna()
)

# Convert to Hugging Face Dataset format.
# preserve_index=False keeps the pandas index out of the dataset. The previous
# version removed "__index_level_0__" afterwards, which only exists when dropna()
# actually dropped rows and therefore failed on a CSV without missing values.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)

# Create a dataset dictionary (for Hugging Face format)
dataset = DatasetDict({
    "train": train_dataset
})

# Display dataset info
print(dataset)


Map:   0%|          | 0/113 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['rejected', 'chosen', 'prompt'],
        num_rows: 113
    })
})


In [None]:
# Hold out 20% of the preference pairs for evaluation; the fixed seed makes the split repeatable.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Re-bind `dataset` to a DatasetDict holding both partitions.
# NOTE: because `dataset` is overwritten, re-running this cell splits the
# already-reduced train partition again.
dataset = DatasetDict(
    {part: split_dataset[part] for part in ("train", "test")}
)

In [None]:
# Add special tokens if necessary
# Reuse the end-of-sequence token as the padding token, on the tokenizer and in the model config.
# NOTE(review): `model` and `tokenizer` are re-assigned by a later from_pretrained call
# before training, so these settings only affect the objects loaded so far.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Define the maximum length
# NOTE(review): this variable is not referenced by any later cell visible here (the DPO
# trainer is given its own literal limits) — confirm whether it is still needed.
max_length = 1024

In [None]:
def get_res(dataset, split, res):
    """Format every example of one split as a Human/Assistant transcript.

    Args:
        dataset: Mapping from split name to a column mapping.
        split: Name of the split to read (e.g. "train" or "test").
        res: Name of the response column to pair with "prompt"
            (e.g. "chosen" or "rejected").

    Returns:
        A list with one string per example, laid out as
        "\\n\\nHuman: <prompt>\\n\\nAssistant: <response>".
    """
    transcripts = []
    for prompt, resp in zip(dataset[split]["prompt"], dataset[split][res]):
        # str.join (like +) rejects non-string parts, matching the original concatenation.
        transcripts.append("".join(("\n\nHuman: ", prompt, "\n\nAssistant: ", resp)))
    return transcripts

# Format the chosen and rejected answers of the training split
chosen_samples_train, rejected_samples_train = (
    get_res(dataset, "train", answer_kind) for answer_kind in ("chosen", "rejected")
)

# Same for the held-out test split
chosen_samples_test, rejected_samples_test = (
    get_res(dataset, "test", answer_kind) for answer_kind in ("chosen", "rejected")
)

# Show the first formatted example of each list as a sanity check
for label, samples in (
    ('Chosen (Train):', chosen_samples_train),
    ('Rejected (Train):', rejected_samples_train),
    ('Chosen (Test):', chosen_samples_test),
    ('Rejected (Test):', rejected_samples_test),
):
    print(label, samples[0])


Chosen (Train): 

Human: Product Name: WSP50ARB | I am looking for a sediment filter to catch the sediment leaving my tankless water heater. Will this filter withstand the 120 degree setting? 
Sent from my iPhone

Assistant: Hi Tom,

The WSP50ARB model is not designed for hot water applications. The maximum water temperature that the filter can withstand is 100 F (37.8 C). We recommend installing the spin down sediment filter prior your water heater to act as a pre-filter that would catch sediments and protect your water heater. 

If you have other questions or concerns, please don't hesitate to contact us.

Best,

iSpring Customer Support
Rejected (Train): 

Human: Product Name: WSP50ARB | I am looking for a sediment filter to catch the sediment leaving my tankless water heater. Will this filter withstand the 120 degree setting? 
Sent from my iPhone

Assistant: Dear customer, Thank you for reaching out to iSpring Water Systems. We appreciate your interest in our products and

In [None]:
# Set parameters
max_seq_length = 2048 # Maximum sequence length (in tokens) passed to the loader below
dtype = None # No explicit dtype. NOTE(review): presumably lets Unsloth pick FP16/BF16 for the hardware — confirm
load_in_4bit = True # Enables 4 bit quantization — a memory saving optimization

# Load the DeepSeek R1 distilled model and its tokenizer using unsloth — imported using: from unsloth import FastLanguageModel
# This re-binds `model` and `tokenizer`, replacing the objects loaded in earlier cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",  # Pre-trained DeepSeek R1 distilled model (8B parameter version)
    max_seq_length=max_seq_length, # The 2048-token limit defined above
    dtype=dtype, # None, as set above
    load_in_4bit=load_in_4bit, # Load the model in 4-bit quantization to save memory
    token=hugging_face_token, # Hugging Face token defined in the API keys cell
)

==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Wrap the freshly loaded model with LoRA adapters through Unsloth's PEFT helper.
# The adapters target the attention projections (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64, # Same value as the rank r
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # Fixed seed for reproducibility
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# One must patch the DPO Trainer first!
# The patch is applied here, before DPOTrainer is imported from trl in the next cell.
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [None]:
# Build the DPO trainer on the LoRA-wrapped model using the prompt/chosen/rejected dataset.
# NOTE(review): TrainingArguments is imported but not used in this cell.
from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported

# NOTE(review): newer TRL releases expect `beta`, `max_length` and `max_prompt_length`
# inside DPOConfig and `processing_class` instead of `tokenizer` — confirm against the
# installed TRL version before upgrading.
dpo_trainer = DPOTrainer(
    model = model,
    # NOTE(review): no explicit reference model; presumably the trainer derives the
    # reference policy from the PEFT model itself — confirm.
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = effective batch size of 8
        warmup_ratio = 0.1,
        num_train_epochs = 3,
        learning_rate = 5e-6,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1, # Log every optimizer step
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
        # NOTE(review): a wandb run is initialised earlier in the notebook, but "none"
        # presumably disables reporting to it — set to "wandb" if logging is intended.
        report_to = "none", # Use this for WandB etc
    ),
    beta = 0.1,
    train_dataset = dataset["train"],
    eval_dataset = dataset["test"],
    tokenizer = tokenizer,
    max_length = 2048,
    max_prompt_length = 1024,
)

Extracting prompt in train dataset:   0%|          | 0/90 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/90 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/90 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/23 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/23 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/23 [00:00<?, ? examples/s]

In [None]:
# Run DPO training with the trainer configured in the previous cell.
dpo_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 90 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 33
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.6931,0.0,0.0,0.0,0.0,-360.929047,-428.916229,-2.026259,-2.234543,0,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-372.48999,-436.939636,-2.118045,-2.256978,No Log,No Log,No Log,No Log
3,0.6877,0.008786,-0.006173,0.5,0.014959,-390.776917,-460.376801,-2.075708,-2.2003,No Log,No Log,No Log,No Log
4,0.6721,0.002597,-0.043691,0.625,0.046288,-365.703827,-468.328247,-2.073992,-2.159647,No Log,No Log,No Log,No Log
5,0.6518,0.077795,-0.007141,1.0,0.084936,-407.119995,-430.767639,-1.946649,-2.219787,No Log,No Log,No Log,No Log
6,0.6033,0.055814,-0.134859,0.875,0.190672,-338.84552,-466.505249,-2.11635,-2.247275,No Log,No Log,No Log,No Log
7,0.5649,0.102445,-0.175393,1.0,0.277839,-384.916199,-415.720123,-2.148593,-2.245431,No Log,No Log,No Log,No Log
8,0.5038,0.145856,-0.284649,1.0,0.430505,-339.108429,-472.803772,-1.995109,-2.215102,No Log,No Log,No Log,No Log
9,0.4696,0.181653,-0.334493,1.0,0.516146,-367.608917,-443.39209,-2.109426,-2.199303,No Log,No Log,No Log,No Log
10,0.407,0.220494,-0.4771,1.0,0.697595,-439.130432,-458.797363,-2.070659,-2.197767,No Log,No Log,No Log,No Log


TrainOutput(global_step=33, training_loss=0.24401054689378449, metrics={'train_runtime': 119.7015, 'train_samples_per_second': 2.256, 'train_steps_per_second': 0.276, 'total_flos': 0.0, 'train_loss': 0.24401054689378449, 'epoch': 2.8})

In [None]:
# Evaluate on the eval_dataset given to the trainer (the held-out test split) and show the metrics.
metrics = dpo_trainer.evaluate()
print(metrics)

{'eval_loss': 0.03484562784433365, 'eval_runtime': 4.26, 'eval_samples_per_second': 5.399, 'eval_steps_per_second': 0.704, 'eval_rewards/chosen': 1.0909239053726196, 'eval_rewards/rejected': -2.4536514282226562, 'eval_rewards/accuracies': 1.0, 'eval_rewards/margins': 3.5445749759674072, 'eval_logps/chosen': -281.712646484375, 'eval_logps/rejected': -502.7544860839844, 'eval_logits/chosen': -2.0394606590270996, 'eval_logits/rejected': -2.183208703994751, 'epoch': 2.8}


In [None]:
# Save the DPO-tuned model to Google Drive. The "Model use" section loads it from
# "drive/MyDrive/deepseek-r1-reasoning-dpo", and the other artifacts of this notebook
# live under drive/MyDrive as well; saving to the bare relative directory
# 'deepseek-r1-reasoning-dpo' left the later load pointing at a path that was never written.
dpo_trainer.save_model('drive/MyDrive/deepseek-r1-reasoning-dpo')

In [None]:
# Close the Weights & Biases run that was started with wandb.init in the API keys cell.
wandb.finish()

## Model use

In [None]:
from transformers import AutoTokenizer

# Path to your fine-tuned model
model_path = "drive/MyDrive/deepseek-r1-reasoning-dpo"  # Replace
tokenizer_path = "drive/MyDrive/tokenizer-deepseek-r1-with-reasoning-0.01"

# Load the DPO-tuned model together with its tokenizer via Unsloth.
# The separate AutoTokenizer.from_pretrained(tokenizer_path) call that used to
# precede this statement was dead code: its result was overwritten by the
# tokenizer returned here, so it has been removed. To use the tokenizer stored
# at tokenizer_path instead, load it AFTER this call.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=4096,  # Adjust based on model capability
    dtype=torch.float16,
    load_in_4bit=True,  # Enable quantization for efficiency
)


==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Optimize LoRA model for inference (2x faster with Unsloth)
# Applied to the DPO-tuned model loaded in the previous cell.
FastLanguageModel.for_inference(model)

# Move model to GPU if available
# `device` is also used below to place the tokenized inputs.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Model loaded successfully!")

Model loaded successfully!


In [None]:
# Define a system prompt under prompt_style.
# The template has two positional `{}` slots for str.format(): the first sits under
# "### Question:" and the second directly follows the opening "<think>" tag under
# "### Response:". This is the same template text as defined earlier in the notebook.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a customer service representative with advanced knowledge of water filtration systems, troubleshooting, and warranty replacements. Please answer the following customer inquiry professionally and helpfully.

### Question:
{}

### Response:
<think>{}"""

In [None]:
# Question taken from the preference dataset, used to spot-check the DPO-tuned model
question = '''Product Name: WSP50ARB | I am looking for a sediment filter to catch the sediment leaving my tankless water heater. Will this filter withstand the 120 degree setting?
Sent from my iPhone
'''


# Fill the template (the slot after "<think>" stays empty) and tokenize it as a batch of one
filled_prompt = prompt_style.format(question, "")
inputs = tokenizer([filled_prompt], return_tensors="pt").to(device)

# Let the model continue the prompt for at most 2048 new tokens
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=2048,
)

# Decode the single sequence and keep the part that follows the "### Response:" marker
decoded_text = tokenizer.batch_decode(outputs)[0]
response = decoded_text.split("### Response:")[1].strip()

print("Model's Response:", response)

Model's Response: <think>
Okay, so I need to help this customer who's asking about the WSP50ARB sediment filter for their tankless water heater. They want to know if it can handle the 120-degree setting. First, I should recall what I know about tankless water heaters and their filtration systems. 

I remember that tankless systems often have built-in filters, usually sediment filters, to protect the heat exchanger from particles that could clog it or cause damage. The WSP50ARB is a specific model, so I should check the specs for that exact filter. 

I think the sediment filter's main job is to catch particles as small as 5 microns, which prevents debris from getting into the heater. But the question is about whether it can withstand the high temperature, specifically the 120-degree setting. I should consider the material the filter is made of. If it's made of materials that can handle high temps without degrading, like maybe something heat-resistant, then it should be fine. 

I also ne

In [None]:
def use_model(model, tokenizer, prompt_style, question):
  """Generate, print and return the model's answer to a single question.

  Args:
    model: An already loaded model exposing ``to`` and ``generate``.
    tokenizer: The tokenizer that belongs to ``model``.
    prompt_style: Template with two positional ``{}`` slots; the first receives
      ``question`` and the second is left empty.
    question: The customer inquiry to answer.

  Returns:
    The stripped text that follows the "### Response:" marker in the decoded
    output, or the whole stripped output when the marker is absent.
  """
  # Use the model and tokenizer passed by the caller. The previous version ignored
  # both arguments and reloaded the model from the global `model_path` on every call.
  # Optimize LoRA model for inference (2x faster with Unsloth)
  FastLanguageModel.for_inference(model)

  # Move model to GPU if available
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model.to(device)

  # Tokenize input
  inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to(device)

  # Generate response
  outputs = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=2048)

  # Decode output; fall back to the full text instead of raising IndexError
  # when the marker is missing from the decoded sequence.
  decoded = tokenizer.batch_decode(outputs)[0]
  parts = decoded.split("### Response:")
  response = parts[1].strip() if len(parts) > 1 else decoded.strip()

  print("Model's Response:", response)

  # Return the answer so callers can use it programmatically
  return response