In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding,EarlyStoppingCallback
from peft import get_peft_model,get_peft_config, LoraConfig, TaskType, prepare_model_for_kbit_training
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
import evaluate
import numpy as np
from trl import SFTTrainer
import mlflow

In [2]:
TOKEN = "{Enter token here}"

In [3]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-instruct", padding_side="right", token=TOKEN,)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    lm_int8_enable_fp32_cpu_offload=True,
    llm_int8_skip_modules=None
)

Unused kwargs: ['lm_int8_enable_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [5]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-instruct",
    quantization_config=bnb_config,
    token=TOKEN,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
model.config.pad_token_id = tokenizer.pad_token_id
model.gradient_checkpointing_enable()

In [7]:
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
# PEFT Configuration
peft_config = LoraConfig(
    r=10,
    target_modules = target_modules,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())

trainable params: 16,509,440 || all params: 3,229,259,264 || trainable%: 0.5112
None




In [8]:
# Load and prepare datasets
df1 = pd.read_csv("dataset/dataset-checkpoint-1-processed.csv")
df2 = pd.read_csv("dataset/dataset-checkpoint-2-processed.csv")

In [9]:
# Merge and randomize set

rd_df = pd.concat([df1, df2])
rd_df_sample = rd_df.sample(frac=1, random_state=42) 

rd_df_sample

Unnamed: 0,Id,comment,quality,sentiment,positive-attribute,negative-attribute,processed_comments
544,103621.0,"Avoid this guy, he's a fool. All these people ...",awful,0,,unhelpful,"Avoid this guy, he's a fool. All these people ..."
410,277557.0,This is the best class at UVSC. Don't listen t...,awesome,1,clear,,This is the best class at UVSC. Don't listen t...
351,,Professor Lee breaks down complex topics into ...,,1,clear,,Professor Elliot breaks down complex topics in...
1281,304408.0,DO NOT TAKE!!! She is a very knowledgeable pro...,average,1,knowledgeable,,DO NOT TAKE!!! She is a very knowledgeable pro...
598,1307460.0,Dr. Parker seems as though he wants you to lea...,poor,0,,unfair,Dr. Harper seems as though he wants you to lea...
...,...,...,...,...,...,...,...
1168,,This professor truly cares about the subject m...,,1,passionate,,This professor truly cares about the subject m...
466,103621.0,This guy is a terrible professor. He is consta...,awful,0,,confusing,This guy is a terrible professor. He is consta...
816,,Professor Smith really loves what they teach! ...,,1,passionate,,Professor Sidney really loves what they teach!...
1496,,This professor seemed completely disinterested...,,0,,unhelpful,This professor seemed completely disinterested...


In [10]:
rd_df_sample['instruction'] = 'Provide an attribute for the following comment. The attribute should one of the following: confusing, unfair, boring, unhelpful, disorganized. Output just the attribute and nothing more. Comment: '+ rd_df_sample['processed_comments']

print(rd_df_sample['instruction'].iloc[1])

Provide an attribute for the following comment. The attribute should one of the following: confusing, unfair, boring, unhelpful, disorganized. Output just the attribute and nothing more. Comment: This is the best class at UVSC. Don't listen to anyone that tells you it's hard -- as long as you can memorize, you'll be fine. There's a ton of material, but nothing is difficult to understand. Emerson & Homan are AWESOME. You DO have to live at school to get an A though, so be ready for 12hr study-sessions!


In [11]:
template = """

### Instruction:

{}

### Response:\n"""

In [12]:
# Filter out rows where 'negative-attribute' is NaN
rd_df_sample = rd_df_sample[rd_df_sample['negative-attribute'].notna()]

# Create the prompt format using special tokens
rd_df_sample['prompt'] = rd_df_sample["instruction"].apply(
    lambda x: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
              f"You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
              f"{x}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)

# Rename 'negative-attribute' to 'response'
rd_df_sample.rename(columns={'negative-attribute': 'response'}, inplace=True)
rd_df_sample['response'] = rd_df_sample['response'].astype(str) + "\n<|eot_id|>"

# Select only the 'prompt' and 'response' columns
rd_df_sample = rd_df_sample[['prompt', 'response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rd_df_sample['prompt'] = rd_df_sample["instruction"].apply(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rd_df_sample.rename(columns={'negative-attribute': 'response'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rd_df_sample['response'] = rd_df_sample['response'].astype(str) + "\n<|eot_id|>"


In [13]:
print(rd_df_sample['prompt'].iloc[2])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
Provide an attribute for the following comment. The attribute should one of the following: confusing, unfair, boring, unhelpful, disorganized. Output just the attribute and nothing more. Comment: This class felt like it dragged on forever. Lectures were monotone and rarely strayed from the textbook, making it hard to stay focused. I found myself daydreaming constantly and struggling to absorb the material, even though I sat in the front row.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [14]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(rd_df_sample, test_size=0.07, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict for easy handling of both sets
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [15]:
prompt= """### Instruction:
Provide an attribute for the following comment. The attribute should one of the following: confusing, unfair, boring, unhelpful, disorganized. Output just the attribute and nothing more. Comment: Professor Smith really hates what they teach!

### Response:"""
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

generation_output = model.generate(
input_ids=input_ids, max_new_tokens=5
)

print(tokenizer.decode(generation_output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|>### Instruction:
Provide an attribute for the following comment. The attribute should one of the following: confusing, unfair, boring, unhelpful, disorganized. Output just the attribute and nothing more. Comment: Professor Smith really hates what they teach!

### Response: 
unfair 

Please


In [16]:
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"

In [17]:
import os
temp_output_dir = "/tmp/training_checkpoints"
os.makedirs(temp_output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=temp_output_dir,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=5,
    save_steps=15000000,
    logging_steps=5,
    num_train_epochs = 3.0,
    load_best_model_at_end=True,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [18]:
# Set up the early stopping callback with patience
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of evaluations to wait for improvement
    early_stopping_threshold=0.0  # Minimum change to qualify as an improvement
)

In [19]:
trainer = SFTTrainer(
    model,
    train_dataset=dataset["train"],
    eval_dataset = dataset["test"],
    dataset_text_field="prompt",
    max_seq_length=150,
    args=training_args,
    callbacks=[early_stopping_callback],
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1721 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

In [20]:
for name, module in trainer.model.named_modules():
  if "norm" in name:
    module = module.to(torch.float32)

In [21]:
with mlflow.start_run(run_name='run'):
    trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
5,4.2653,4.32739
10,4.0999,4.28837
15,4.0124,4.199097
20,4.0911,4.127709
25,4.3493,4.030751
30,4.1224,3.965452
35,3.8055,3.888191
40,3.6398,3.819569
45,3.6816,3.742836
50,3.8691,3.667592


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [22]:
from peft import LoraModel

lora_model = LoraModel(model,peft_config,"negativeattr-llama3-full")

# Define the directory and naming for saving
save_directory = "loras/negativeattr-llama3-full"

# Save only the LoRA weights
lora_model.save_pretrained(save_directory)

print(f"LoRA weights saved to {save_directory}")



LoRA weights saved to loras/negativeattr-llama3-full


In [23]:
import shutil
shutil.rmtree(temp_output_dir)