Make sure to be using `unsloth` for environment

In [1]:
TRAINING = True
EPOCHS = 1
BATCH_SIZE = 32
LR = 1e-5
LIMIT = 10

In [11]:
# install dependencies
from vllm import SamplingParams
import json
from unsloth import is_bfloat16_supported, FastLanguageModel
from datasets import Dataset, load_from_disk
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
import torch
from datetime import datetime
from tqdm import tqdm
import gc

In [17]:
# GPU check
!nvidia-smi
print("PyTorch Version:", torch.__version__)
print("CUDA Version:", torch.version.cuda)
print("cuDNN Version:", torch.backends.cudnn.version())
print("CUDA Available:", torch.cuda.is_available())
print("GPU Count:", torch.cuda.device_count())

Sun May  4 11:09:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15              Driver Version: 570.86.15      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla M40 24GB                 Off |   00000000:02:00.0 Off |                    0 |
| N/A   51C    P0             60W /  250W |    6710MiB /  23040MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# model loading
max_seq_length = 1256 # prompts are ~1000, so leaving 256 for response
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.1-8B-Instruct", # "unsloth/Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)


==((====))==  Unsloth 2025.4.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla M40 24GB. Num GPUs = 1. Max memory: 22.395 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 5.2. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
# layer freezing
total_layers = len(model.model.layers)
freeze_percentage = 80
num_freeze_layers = int(total_layers * (freeze_percentage / 100))
for i, layer in enumerate(model.model.layers):
    if i < num_freeze_layers:
        for param in layer.parameters():
            if param.dtype in [torch.float16, torch.bfloat16, torch.float32]:
                param.requires_grad = False
    else:
        for param in layer.parameters():
            if param.dtype in [torch.float16, torch.bfloat16, torch.float32]:
                param.requires_grad = True
print(f"First {num_freeze_layers} layers frozen. Rest are trainable.")

First 25 layers frozen. Rest are trainable.


In [6]:
# qlora
lora_rank = 32
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth",  # Enable long context finetuning
    random_state = 3407,
)

Unsloth 2025.4.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
# Data prep
dataset = load_from_disk("../data/connections_ds")
# Note that you will need to run make_ds.py ahead of time to generate this dataset

dataset = dataset.map(lambda example: {"text": example["input"]}, remove_columns=["input"])
dataset = dataset.map(lambda example: {"label": example["target"]}, remove_columns=["target"])

dataset = dataset.select(range(0, 10))
train_test_split = dataset.train_test_split(test_size=0.1) # splits 10% off to test
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']
print(f"Loaded dataset of {len(dataset)} samples")

Loaded dataset of 10 samples


In [8]:
# Note prompt lengths
prompt_lengths = [tokenizer(example['text'], return_tensors="pt")["input_ids"].shape[1] for example in test_dataset]
print(max(prompt_lengths)) # maxing out at most at 1000 tokens in prompt

972


In [13]:
# configuration
date = datetime.now().strftime('%Y%m%d-%H%M%S')
class ClearCacheCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        """ Clear GPU cache after each epoch """
        torch.cuda.empty_cache()
        return control
training_args = TrainingArguments(
    per_device_train_batch_size = BATCH_SIZE,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = EPOCHS,
    learning_rate = LR,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = f"/home/jamesbarrett_umass_edu/cs685proj-new/conventional/outputs/{date}",
    logging_dir = f"/home/jamesbarrett_umass_edu/cs685proj-new/conventional/model_logs/{date}",
    logging_first_step = True,
    logging_strategy = "epoch",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    report_to = "none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
    callbacks=[ClearCacheCallback]
)

Map (num_proc=2): 100%|██████████| 9/9 [00:01<00:00,  5.83 examples/s]
num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
Map: 100%|██████████| 1/1 [00:00<00:00, 39.53 examples/s]


In [14]:
# fine tune (if TRAINING)
if TRAINING:
    trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 4 x 1) = 128
 "-____-"     Trainable parameters = 83,886,080/8,000,000,000 (1.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,1.7935,1.79532


In [16]:
# clear cache after training
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [None]:
# evaluate on test
eval_result = trainer.evaluate()
print("Test Loss:", eval_result["eval_loss"])

Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Test Loss: 1.79531991481781


In [None]:
# clear cache after evaluation
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [None]:
# inference
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

predictions = []
for input in tqdm(test_dataset['text']):
    tok_input = tokenizer([input], truncation=True, padding=True, return_tensors="pt").to('cuda')
    output = model.generate(input_ids=tok_input['input_ids'][0].unsqueeze(0), attention_mask=tok_input['attention_mask'][0].unsqueeze(0), max_new_tokens = 128, use_cache = True)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    predictions.append({
        'input_sentence': input,
        'whole prediction': generated_text,
        'new token prediction': generated_text.split('Groupings:')[-1]
    })

with open(f'inferences_training={TRAINING}.json', 'w') as json_file:
    json.dump(predictions, json_file, indent=4)

100%|██████████| 1/1 [00:18<00:00, 18.81s/it]
