<a href="https://colab.research.google.com/github/RiverGumSecurity/AILabs/blob/main/notebooks/bhisblogs-alpaca.ipynb" target="new"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import warnings
import os
import sys
import pathlib
try:
    import unsloth
except:
    if 'google.colab' in sys.modules:
        !pip install unsloth

# Suppress all Python warnings; they are noisy and distract from the output.
warnings.filterwarnings('ignore')
# Sequence length passed to FastLanguageModel.from_pretrained() below.
max_seq_length = 4096

# Setup Hugging Face Credentials.
# Lookup order:
#   - on Google Colab: the 'HF_APIKEY' entry in Colab user secrets
#   - elsewhere: the ~/.hfkey file, falling back to an already-exported
#     HF_TOKEN environment variable when that file does not exist
HF_APIKEY = ''
if 'google.colab' in sys.modules:
    from google.colab import userdata
    HF_APIKEY = userdata.get('HF_APIKEY')
else:
    hfkey_path = pathlib.Path.home() / '.hfkey'
    try:
        HF_APIKEY = hfkey_path.read_text(encoding='utf-8').strip()
    except FileNotFoundError:
        # A missing key file should reach the friendly error below rather
        # than abort with a traceback; reuse a pre-set token if there is one.
        HF_APIKEY = os.environ.get('HF_TOKEN', '').strip()
if not HF_APIKEY:
    print('[-] ERROR: Cannot continue without Hugging Face API Key')
    # Non-zero status: this is a failure, not a clean exit.
    sys.exit(1)
os.environ['HF_TOKEN'] = HF_APIKEY

# Load the checkpoint and its tokenizer through Unsloth.
load_options = dict(
    model_name = "joffthyer/alpaca-llama3",
    max_seq_length = max_seq_length,
    # NOTE(review): dtype=None presumably lets Unsloth pick the dtype - confirm.
    dtype = None,
    load_in_4bit = True,
)
model, tokenizer = unsloth.FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.2: Fast Llama patching. Transformers = 4.47.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
# Prompt template with three positional `{}` slots, filled in this order:
# instruction, input, response. format_prompts() fills all three for training;
# the inference cell fills only the instruction and leaves the others empty.
# The newline right after the opening quotes is part of the template text.
prompt = """
Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def format_prompts(p):
    """Render a batch of dataset rows into training text.

    Args:
        p: batch mapping whose "instruction", "input" and "output" entries
           are parallel lists (the function is applied with
           ``Dataset.map(..., batched=True)``).

    Returns:
        ``{"text": [...]}`` with one entry per row: the module-level
        ``prompt`` template filled with that row's instruction, input and
        output, followed by the tokenizer's end-of-sequence token.
    """
    rows = zip(p["instruction"], p["input"], p["output"])
    return {
        "text": [prompt.format(*row) + tokenizer.eos_token for row in rows]
    }


In [3]:
import re
import json
import datasets
import requests
from bs4 import BeautifulSoup

# get dataset
DATASET_URL = 'https://raw.githubusercontent.com/RiverGumSecurity/Datasets/refs/heads/main/BHIS/bhis-blogs-cleaned.json'
# Bound the request so a stalled connection cannot hang the notebook forever.
r = requests.get(DATASET_URL, timeout=60)
# Fail loudly on an HTTP error instead of feeding an error page to json.loads.
r.raise_for_status()
data = json.loads(r.text)
ds = datasets.Dataset.from_list(data)
# Add the "text" column produced by format_prompts (applied batch-wise).
ds = ds.map(format_prompts, batched=True)

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

In [4]:
# Written against trl 0.12.0 (the latest release at the time of writing):
# - the `tokenizer` argument becomes `processing_class`
# - `dataset_text_field` is no longer required
#

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# NOTE(review): `newmodel` is just another name for `model`; nothing in the
# visible cells reads it.
newmodel = model

# Probe bf16 support once and derive both precision flags from it, so exactly
# one of fp16 / bf16 is enabled.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 42,
    report_to = "none", # Use this for WandB etc
    # Batching: 2 per device with 4 accumulation steps.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # One full pass over the dataset (a fixed `max_steps = 50` run is the
    # alternative that was left disabled).
    num_train_epochs = 1,
    # Optimizer and schedule.
    optim = "adamw_8bit",
    learning_rate = 2e-4,
    weight_decay = 0.05,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    # 16-bit precision: Brain Float 16 when available, Floating Point 16 otherwise.
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,
)

# `max_seq_length`, `dataset_num_proc` and `packing` are left unset here.
trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = ds,
    args = training_args,
)

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

In [5]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 484 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.784
2,2.9054
3,2.4949
4,2.7738
5,2.5524
6,2.4714
7,2.3104
8,2.4702
9,2.451
10,2.1602


In [6]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory/max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.988 GB.
7.812 GB of memory reserved.
273.3011 seconds used for training.
4.56 minutes used for training.
Peak reserved memory = 7.812 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 32.566 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [7]:
# Publish the model first, then the tokenizer, to the same Hub repository.
HUB_REPO_ID = 'bhisblogs-alpaca-llama3'
for artifact in (model, tokenizer):
    artifact.push_to_hub(HUB_REPO_ID)

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/bhisblogs-alpaca-llama3


No files have been modified since last commit. Skipping to prevent empty commit.


In [10]:
from transformers import TextStreamer

# Put the Unsloth-patched model into its inference configuration.
unsloth.FastLanguageModel.for_inference(model)

# Fill only the instruction slot of the template; the input and response
# slots stay empty so the model writes the response itself.
# The previous instruction was the dangling fragment 'What is a '; this is the
# complete question that matches the recorded output of this cell.
question = 'Can you explain what a red teamer does?'
inputs = tokenizer([prompt.format(question, "", "")
], return_tensors = "pt").to("cuda")

# Keep prompt + generated tokens within the max_seq_length the model was
# loaded with: a flat 4096 new tokens on top of the prompt overshoots it.
prompt_tokens = inputs["input_ids"].shape[1]
new_token_budget = max(1, max_seq_length - prompt_tokens)

# Stream decoded tokens to the cell output as they are generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = new_token_budget)

<|begin_of_text|>
Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
Can you explain what a red teamer does?

### Input:


### Response:
Red teaming is a type of penetration testing where a skilled penetration tester or team of penetration testers simulates a real world attack in order to identify vulnerabilities and weaknesses in an organization s network security The goal of red teaming is to identify security issues that could be exploited by real attackers and to help the organization improve its security posture Red teaming can also be used to test an organization s incident response capabilities and to identify potential weaknesses in its security controls The goal of a red team is to identify vulnerabilities and weaknesses in an organization s network security and to help the organization improve its security posture The red team will typically start by pe

KeyboardInterrupt: 