In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding,EarlyStoppingCallback
from peft import get_peft_model,get_peft_config, LoraConfig, TaskType, prepare_model_for_kbit_training
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
import evaluate
import numpy as np
from trl import SFTTrainer
import mlflow

In [2]:
TOKEN = "{Enter token here}"

In [3]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-instruct", padding_side="right", token=TOKEN,)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    lm_int8_enable_fp32_cpu_offload=True,
    llm_int8_skip_modules=None
)

Unused kwargs: ['lm_int8_enable_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [5]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-instruct",
    quantization_config=bnb_config,
    token=TOKEN,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
model.config.pad_token_id = tokenizer.pad_token_id
model.gradient_checkpointing_enable()

In [7]:
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
# PEFT Configuration
peft_config = LoraConfig(
    r=10,
    target_modules = target_modules,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())

trainable params: 16,509,440 || all params: 3,229,259,264 || trainable%: 0.5112
None




In [8]:
# Load and prepare datasets
df = pd.read_csv("dataset/pdf-parse-dataset.csv")

rd_df_sample = df.sample(frac=1, random_state=42) 
rd_df_sample

Unnamed: 0,raw-comments,processed-comments
1447,Professor Smith's lectures were incredibly den...,|<startofcomment>| Professor Smith's lectures ...
1114,He might not be the clearest lecturer in the w...,|<startofcomment>| He might not be the cleares...
1064,I would not recommend this professor. I really...,|<startofcomment>| I would not recommend this ...
2287,This professor truly brought the material to l...,|<startofcomment>| This professor truly brough...
1537,"His lectures jump around a lot, and it's hard ...",|<startofcomment>| His lectures jump around a ...
...,...,...
1638,Lectures and are really long and boring. Test ...,|<startofcomment>| Lectures and are really lon...
1095,Sidney is incredible smart and knows statistic...,|<startofcomment>| Sidney is incredible smart ...
1130,This class felt incredibly chaotic. The syllab...,|<startofcomment>| This class felt incredibly ...
1294,Professor Elliot breaks down complex topics in...,|<startofcomment>| Professor Elliot breaks dow...


In [9]:
rd_df_sample['instruction'] = 'Please parse the following comments. Do so by placing the token |<startofcomment>| at the start of a comment and the token |<endofcomment>| at the end of one. Output just this and nothing more. Comments to process: '+ rd_df_sample['raw-comments']

print(rd_df_sample['instruction'].iloc[1])

Please parse the following comments. Do so by placing the token |<startofcomment>| at the start of a comment and the token |<endofcomment>| at the end of one. Output just this and nothing more. Comments to process: He might not be the clearest lecturer in the world but he is the absolute BEST one-on-one teacher I had in my four years at UND. He's so encouraging and genuinely wants his students to do well. If you're not willing to spend time on the homework & go to his office hours then you probably won't get a good grade, but that's your own fault. Difficult but here are my suggestions: Attend ALL lectures&fill out his course packet. Be polite to him. Ask for help after class and in office hours. Write down all your work for online HW even if you aren't doing the notebook. Do the ENTIRE study guide and practice test to study for exams. If can do those comfortably you're set. Orgo is intrinsically a difficult course, but Taylor did a good job explaining the harder concepts. Of course it

In [10]:
template = """

### Instruction:

{}

### Response:\n"""

In [11]:
# Create the prompt format using special tokens
rd_df_sample['prompt'] = rd_df_sample["instruction"].apply(
    lambda x: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
              f"You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
              f"{x}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)

# Rename 'positive-attribute' to 'response'
rd_df_sample.rename(columns={'processed-comments': 'response'}, inplace=True)
rd_df_sample['response'] = rd_df_sample['response'].astype(str) + "\n<|eot_id|>"

# Select only the 'prompt' and 'response' columns
rd_df_sample = rd_df_sample[['prompt', 'response']]

In [12]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(rd_df_sample, test_size=0.07, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict for easy handling of both sets
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [13]:
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"

In [14]:
import os
temp_output_dir = "/tmp/training_checkpoints"
os.makedirs(temp_output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=temp_output_dir,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=5,
    save_steps=15000000,
    logging_steps=5,
    num_train_epochs = 3.0,
    load_best_model_at_end=True,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [15]:
# Set up the early stopping callback with patience
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of evaluations to wait for improvement
    early_stopping_threshold=0.0  # Minimum change to qualify as an improvement
)

In [16]:
trainer = SFTTrainer(
    model,
    train_dataset=dataset["train"],
    eval_dataset = dataset["test"],
    dataset_text_field="prompt",
    max_seq_length=350,
    args=training_args,
    callbacks=[early_stopping_callback],
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/2324 [00:00<?, ? examples/s]

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

In [17]:
for name, module in trainer.model.named_modules():
  if "norm" in name:
    module = module.to(torch.float32)

In [18]:
with mlflow.start_run(run_name='run'):
    trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
5,3.561,3.617306
10,3.4744,3.603386
15,3.4974,3.566506
20,3.4383,3.521774
25,3.3954,3.473594
30,3.3207,3.429173
35,3.6202,3.386432
40,3.8494,3.336605
45,3.2098,3.292722
50,3.174,3.244813


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [19]:
from peft import LoraModel

lora_model = LoraModel(model,peft_config,"pdfparser-llama3-full")

# Define the directory and naming for saving
save_directory = "loras/pdfparser-llama3-full"

# Save only the LoRA weights
lora_model.save_pretrained(save_directory)

print(f"LoRA weights saved to {save_directory}")



LoRA weights saved to loras/pdfparser-llama3-full


In [20]:
import shutil
shutil.rmtree(temp_output_dir)