In [None]:
%%capture
# (%%capture on the first line hides all output produced by this cell.)
# Install Unsloth from the main branch of its GitHub repository, using the "colab-new" extra.
# NOTE(review): the install is not pinned to a tag or commit, so a later re-run may
# resolve to a different Unsloth version than the one recorded in the outputs below.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Pick the xformers requirement from the installed torch version:
# torch older than 2.4.0 gets the pinned xformers==0.0.27, anything else gets unpinned xformers.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# {xformers} is interpolated from the Python variable above; --no-deps tells pip not to
# install the dependencies of the listed packages.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel
import torch
# Settings forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Sequence length passed to the loader. NOTE(review): the trainer cell later passes max_seq_length=512 instead.
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in this cell reads `fourbit_models`, and the model_name
# used below ("unsloth/Meta-Llama-3.1-8B") is not one of its entries.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model together with its tokenizer; both names are reused by later cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = ""  # Empty token string. NOTE(review): if an access token is ever required, read it from an environment variable or secret store rather than hardcoding it here.
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
# Rebind `model` to the result of FastLanguageModel.get_peft_model, configured with the
# LoRA settings below. Re-running this cell without reloading the base model would pass
# the already-wrapped model in again.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Module names that receive adapters: the attention projections (q/k/v/o)
    # and the MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # Fixed seed value passed to get_peft_model
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd
import json

# Read the tab-separated MCQ sheet and trim stray whitespace from the header names.
df = pd.read_csv('/content/cti-mcq.tsv', sep='\t')
df.columns = df.columns.str.strip()

# Build one record per row: the question, its four lettered options, and the answer key.
data = [
    {
        'question': row['Question'],
        'options': {letter: row[f'Option {letter}'] for letter in 'ABCD'},
        'correct_answer': row['GT'],
    }
    for _, row in df.iterrows()
]

# Write the records out as JSON Lines (one JSON object per line).
with open('/content/dataJson.jsonl', 'w') as f:
    f.writelines(json.dumps(item) + '\n' for item in data)


In [None]:
import json

# input_file is the JSONL written by the TSV-conversion cell; output_file receives the sanitized copy.
input_file = "/content/dataJson.jsonl"
output_file = "/content/cleaned_dataJson.jsonl"

def sanitize_options(options):
    """Coerce every option value to ``str``, in place.

    The mapping passed in is itself updated (its keys are left as they are)
    and that same object is returned, so callers may use either the return
    value or the original reference.
    """
    stringified = {label: str(content) for label, content in options.items()}
    options.update(stringified)
    return options

# Stream the raw JSONL through the sanitizer and write a cleaned copy line by line.
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for raw_line in infile:
        # Only json.loads can raise JSONDecodeError, so it is the only call guarded here.
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue  # Drop lines that are not valid JSON

        # Stringify the option values when the record carries an 'options' field.
        if 'options' in record:
            record['options'] = sanitize_options(record['options'])

        outfile.write(json.dumps(record) + "\n")

print("Dataset has been cleaned and saved to", output_file)


Dataset has been cleaned and saved to /content/cleaned_dataJson.jsonl


In [None]:
import re
from typing import Dict, Any
from datasets import load_dataset

# Matches an option label such as "A)"; the single group captures the letter.
_OPTION_LABEL = re.compile(r"([A-D])\)")


def convert_options_to_dict(options_str: str) -> Dict[str, str]:
    """Parse lettered options written as ``"A) text"`` into a ``{letter: text}`` dict.

    Options may be laid out one per line or several on one line
    (``"A) foo B) bar C) baz D) qux"``). Parsing works line by line:

    * the leftmost ``X)`` label on a line always starts an option;
    * a later label on the same line starts a new option only when it is
      preceded by whitespace and is the next letter after the previous label,
      so text such as ``"A) see option C) below"`` is kept as one value.

    Values are stripped of surrounding whitespace (including a trailing
    ``\\r`` from Windows line endings). A label with nothing after it yields
    an empty string rather than absorbing the following line.

    Args:
        options_str: The raw options text.

    Returns:
        Mapping from option letter to option text. Empty when the input is not
        a string or contains no labels. If a letter occurs more than once, the
        last occurrence wins.
    """
    options_dict: Dict[str, str] = {}
    if not isinstance(options_str, str):
        return options_dict

    # Split on "\n" only: that is the one character "." never matched in the
    # previous single-regex implementation, so line boundaries stay the same.
    for line in options_str.split("\n"):
        accepted = []
        for candidate in _OPTION_LABEL.finditer(line):
            if not accepted:
                accepted.append(candidate)
                continue
            previous_letter = accepted[-1].group(1)
            # candidate.start() >= 2 here because an earlier label precedes it.
            follows_whitespace = line[candidate.start() - 1].isspace()
            is_next_letter = ord(candidate.group(1)) == ord(previous_letter) + 1
            if follows_whitespace and is_next_letter:
                accepted.append(candidate)

        # Each value runs from the end of its label to the start of the next
        # accepted label, or to the end of the line for the last one.
        for index, label in enumerate(accepted):
            if index + 1 < len(accepted):
                value_end = accepted[index + 1].start()
            else:
                value_end = len(line)
            options_dict[label.group(1)] = line[label.end():value_end].strip()

    return options_dict

def formatting_prompts_func(examples: Dict[str, Any], verbose: bool = False) -> Dict[str, Any]:
    """Build one training prompt per multiple-choice example in a batch.

    Args:
        examples: Column-oriented batch with parallel ``"question"``,
            ``"options"`` and ``"correct_answer"`` sequences. Each options
            entry is either a dict with keys ``"A"``..``"D"`` or a string,
            which is parsed with ``convert_options_to_dict``.
        verbose: When True, print the per-example debugging trace (raw inputs,
            parsed options, generated text). Off by default because printing
            several lines for every example floods the notebook output.

    Returns:
        ``{"text": [...]}`` with one prompt for every example whose options
        provide all of A, B, C and D. Examples that do not are skipped and
        reported with a printed message.

    NOTE(review): a skipped example makes the returned list shorter than the
    input batch. Confirm how the calling ``Dataset.map(batched=True)`` handles
    a length mismatch when the original columns are kept. Also, no
    end-of-sequence token is appended to the text here - confirm that is
    intended for training.
    """
    # Loop-invariant prompt fragments, built once per batch.
    instruction = "You are a cybersecurity expert specializing in cyber threat intelligence. You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D."
    reminder = "Important: The last line of your answer should contain only the single letter corresponding to the best option, with no additional text."
    required_keys = ['A', 'B', 'C', 'D']

    texts = []
    rows = zip(examples["question"], examples["options"], examples["correct_answer"])
    for question, option, correct_answer in rows:
        if verbose:
            print(f"Processing question: {question}")
            print(f"Options: {option}")
            print(f"Correct answer: {correct_answer}")

        # Options may arrive as raw text; parse that into a letter -> text dict.
        if isinstance(option, str):
            if verbose:
                print(f"Converting options from string: {option}")
            option = convert_options_to_dict(option)

        if verbose:
            print(f"Processed options type: {type(option)}")
            print(f"Processed options content: {option}")

        # Only examples with all four options can be rendered into the template.
        if not (isinstance(option, dict) and all(k in option for k in required_keys)):
            print(f"Skipping example with missing or invalid options: {option}")
            continue

        input_text = f"Question:\n{question}\nOptions:\nA) {option['A']}\nB) {option['B']}\nC) {option['C']}\nD) {option['D']}"
        output = f"\nThe correct answer is {correct_answer}."
        text = f"{instruction}\n{input_text}\n{reminder}\n{output}"
        texts.append(text)
        if verbose:
            print(f"Generated text: {text}")

    return {"text": texts}

# Load the cleaned JSONL file through the "json" loader, taking its "train" split.
dataset = load_dataset("json", data_files="/content/cleaned_dataJson.jsonl", split="train")

# Run formatting_prompts_func over the dataset in batches; it returns the "text" field
# that holds the full training prompt for each example.
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

# Print the first formatted example to verify
print(f"Formatted text: {formatted_dataset[0]['text']}")


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed options type: <class 'dict'>
Processed options content: {'A': 'Architecture and Design', 'B': 'Implementation', 'C': 'Operation', 'D': 'All of the above'}
Generated text: You are a cybersecurity expert specializing in cyber threat intelligence. You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.
Question:
Which phase can CWE-440 be introduced in?
Options:
A) Architecture and Design
B) Implementation
C) Operation
D) All of the above
Important: The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.

The correct answer is D.
Processing question: Which prerequisite is essential for leveraging a race condition according to CAPEC-26?
Options: {'A': 'Adversary h

In [None]:
# Show the first formatted prompt again (same statement as the last line of the previous cell).
print(f"Formatted text: {formatted_dataset[0]['text']}")


Formatted text: You are a cybersecurity expert specializing in cyber threat intelligence. You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.
Question:
Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?
Options:
A) Audit
B) Execution Prevention
C) Operating System Configuration
D) User Account Control
Important: The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.

The correct answer is B.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the formatted examples for evaluation, with a fixed seed for the split.
# This calls the dataset object's own train_test_split method; the sklearn
# train_test_split imported at the top of this cell is not used.
dataset_split = formatted_dataset.train_test_split(test_size=0.2, seed=42)

# The split result is indexed by "train" and "test"; bind each part to its own name.
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation ``(predictions, labels)`` pair.

    Two layouts are supported:

    * Classification: predictions ``(n, classes)`` with labels ``(n,)``.
    * Token-level language modelling: predictions ``(batch, seq, vocab)`` with
      labels ``(batch, seq)``. This path assumes the Hugging Face causal-LM
      convention, where the logits at position ``t`` score the token at
      ``t + 1`` and ignored label positions hold ``-100`` - TODO confirm
      against the trainer in use. Predictions and labels are shifted by one
      position before comparison.

    Label positions equal to ``-100`` are excluded from the score. If the
    model output is a tuple, its first element is taken as the logits.

    Args:
        eval_pred: Object that unpacks into ``(predictions, labels)``.

    Returns:
        ``{"accuracy": float}``; the accuracy is 0.0 when no label position is
        scored.
    """
    # Local import: numpy is not imported at module level in this notebook.
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predicted_ids = np.asarray(predictions).argmax(axis=-1)
    labels = np.asarray(labels)

    # Sequence labels: align each prediction with the token it is predicting.
    if labels.ndim > 1:
        predicted_ids = predicted_ids[..., :-1]
        labels = labels[..., 1:]

    scored = labels != -100
    if not scored.any():
        return {"accuracy": 0.0}

    accuracy = float((predicted_ids[scored] == labels[scored]).mean())
    return {"accuracy": accuracy}

# Define training arguments
# NOTE(review): the saved output below this cell reports "Num Epochs = 5" and
# "Batch size per device = 2", which does not match the values set here - presumably it
# came from an earlier run with different settings. Re-run the cell to refresh the output.
training_args = TrainingArguments(
    per_device_train_batch_size=4,  # Samples per device per forward/backward pass
    gradient_accumulation_steps=4,  # With the batch size above: 4 * 4 = 16 samples per optimizer step per device
    warmup_steps=200,  # Number of warmup steps for the learning-rate schedule
    num_train_epochs=3,  # NOTE(review): no early-stopping callback is configured in this cell
    learning_rate=3e-5,  # More conservative learning rate for fine-tuning
    fp16=not is_bfloat16_supported(),  # Exactly one of fp16 / bf16 is enabled,
    bf16=is_bfloat16_supported(),      # chosen by is_bfloat16_supported()
    logging_steps=10,  # Logging interval, in steps
    optim="adamw_8bit",
    weight_decay=0.02,  # Slightly increased to regularize better
    lr_scheduler_type="cosine_with_restarts",  # Experimenting with a different scheduler
    seed=3407,
    output_dir="outputs",
)

# Initialize SFTTrainer with compute_metrics
# NOTE(review): training_args sets no evaluation strategy, so eval_dataset and
# compute_metrics may never be used during train() - confirm against the installed
# transformers/trl defaults before relying on evaluation metrics.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # Column holding the prompt built by formatting_prompts_func
    max_seq_length=512,  # NOTE(review): differs from the max_seq_length = 2048 used when the model was loaded
    dataset_num_proc=2,
    packing=False,
    args=training_args,
    compute_metrics=compute_metrics,  # Accuracy function defined earlier in this cell
)

# Train the model
trainer.train()


Map (num_proc=2):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,250
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.1807
2,2.2191
3,2.277
4,2.3581
5,2.1036
6,2.1201
7,1.8718
8,1.7075
9,1.4851
10,1.49


TrainOutput(global_step=1250, training_loss=0.504863601744175, metrics={'train_runtime': 8052.4805, 'train_samples_per_second': 1.242, 'train_steps_per_second': 0.155, 'total_flos': 7.57130748839854e+16, 'train_loss': 0.504863601744175, 'epoch': 5.0})

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the save cell below writes under this path.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Save the model and tokenizer into the same folder on the mounted Google Drive.
# NOTE(review): `model` is the object returned by get_peft_model, so this presumably
# stores the adapter weights rather than a merged full model - confirm before reloading.
model.save_pretrained("/content/drive/MyDrive/modelCTI") # Saved under the Drive mount
tokenizer.save_pretrained("/content/drive/MyDrive/modelCTI")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('/content/drive/MyDrive/modelCTI/tokenizer_config.json',
 '/content/drive/MyDrive/modelCTI/special_tokens_map.json',
 '/content/drive/MyDrive/modelCTI/tokenizer.json')