In [12]:
!pip install -U transformers accelerate bitsandbytes peft datasets evaluate rouge_score trl peft huggingface_hub

Collecting transformers
  Using cached transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading huggingface_hub-0.26.5-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.8/447.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.26.3
    Uninstalling huggingface-hub-0.26.3:
      Successfully uninstalled huggingface-hub-0.26.3
Successfully installed huggingface_hub-0.26.5


In [19]:
from datasets import load_dataset, Dataset
import pandas as pd

df = pd.read_csv("./Cyber-Threat-Intelligence-Custom-Data_new_processed.csv")

dataset = Dataset.from_pandas(df)
dataset[0]

{'id': 249,
 'text': 'A cybersquatting domain save-russia[.]today is launching DoS attacks on Ukrainian news sites.',
 'relations': "[{'from_id': 44658, 'id': 9, 'to_id': 44659, 'type': 'targets'}, {'from_id': 44656, 'id': 114, 'to_id': 44657, 'type': 'uses'}, {'from_id': 44658, 'id': 115, 'to_id': 44657, 'type': 'uses'}]",
 'diagnosis': 'The diagnosis is a cyber attack that involves the use of a cybersquatting domain save-russia[.]today to launch DoS attacks on Ukrainian news sites. The attacker targets the Ukrainian news sites as the victim, using the cybersquatting',
 'solutions': '1. Implementing DNS filtering to block access to known cybersquatting domains 2. Conducting regular vulnerability assessments and penetration testing to identify and address potential security weaknesses 3. Implementing firewalls and intrusion detection systems to detect and prevent Do',
 'id_1': 44656,
 'label_1': 'attack-pattern',
 'start_offset_1': 2,
 'end_offset_1': 16,
 'id_2': 44657,
 'label_2': 'u

In [20]:
train_dataset = dataset.select(range(300))
validation_dataset = dataset.select(range(300, 350))

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-0.5B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)


In [5]:
def create_prompt(text):
  prompt = '''Based on this text about potential cyber-attack
  {TEXT}

  Give me a diagnosis on the attack and its possible solution
  '''
  prompt = prompt.replace("{TEXT}", text[:512])

  messages = [
      {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
      {"role": "user", "content": prompt}
  ]

  return tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True
  )

In [6]:
def generate_output(prompt, model):
  model.eval()
  inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
  with torch.no_grad():
      outputs = model.generate(
          **inputs,
          max_length=512,
          temperature=1.0,
          top_k=50,
          top_p=1.0,
          repetition_penalty=1.1
          # eos_token_id=terminators
      )

  generated_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, outputs)
  ]

  return tokenizer.batch_decode(generated_ids, skip_special_tokens=False)

In [7]:
import evaluate

rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Zero-Shot

In [None]:
from tqdm import tqdm

rouge1_total = 0.0
rouge2_total = 0.0
rougeL_total = 0.0

iterable_dataset = validation_dataset.to_iterable_dataset()
for row in tqdm(iterable_dataset, total=len(validation_dataset)):
  prompt = create_prompt(row["text"])
  response = generate_output(prompt, model)

  scores = rouge_score.compute(
      predictions=[response[0]],
      references=[row["diagnosis"] + "\n" + row["solutions"]]
  )

  rouge1_total += scores["rouge1"]
  rouge2_total += scores["rouge2"]
  rougeL_total += scores["rougeL"]

print("\nAverage ROUGE Scores")
print("ROUGE-1:", rouge1_total / len(validation_dataset))
print("ROUGE-2:", rouge2_total / len(validation_dataset))
print("ROUGE-L:", rougeL_total / len(validation_dataset))

100%|██████████| 50/50 [12:02<00:00, 14.44s/it]


Average ROUGE Scores
ROUGE-1: 0.24205454137076013
ROUGE-2: 0.02441641542112784
ROUGE-L: 0.11437688840361104





# Fine-Tuning

In [8]:
template = '''### Text:
{TEXT}

### Diagnosis:
{DIAGNOSIS}

### Solutions:
{SOLUTIONS}
'''

In [9]:
def formatting_prompts_func(example):
  output_texts = []
  for i in range(len(example["text"])):
    text = template.replace("{TEXT}", example["text"][i])
    text = text.replace("{DIAGNOSIS}", example["diagnosis"][i])
    text = text.replace("{SOLUTIONS}", example["solutions"][i])

    output_texts.append(text)

  return output_texts

In [10]:
import os
os.environ['WANDB_MODE'] = 'disabled'

In [22]:
import time
from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig

output_dir = f"./qwen-training-{str(int(time.time()))}"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=50,
    eval_accumulation_steps=5,
    per_device_train_batch_size=4,
    report_to=None
)

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    formatting_func=formatting_prompts_func,
    peft_config=peft_config
)

trainer.train()



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Step,Training Loss
50,2.2904
100,2.2792
150,2.14
200,2.1095
250,2.0597
300,2.0467
350,1.9878
400,1.9791
450,1.9462
500,1.9108


TrainOutput(global_step=750, training_loss=2.0207281392415366, metrics={'train_runtime': 586.9627, 'train_samples_per_second': 5.111, 'train_steps_per_second': 1.278, 'total_flos': 1744593249484800.0, 'train_loss': 2.0207281392415366, 'epoch': 10.0})

In [24]:
trainer.model.save_pretrained("cyber-qwen-summarizaton")

In [25]:
from peft import PeftModel

new_model_id = "cyber-qwen-summarizaton"

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
new_model = PeftModel.from_pretrained(base_model, new_model_id)
new_model = new_model.merge_and_unload()



In [13]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
new_model.push_to_hub("RMA1403/cyber-qwen-summarizaton")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]



Saving checkpoint shards:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/RMA1403/cyber-qwen-summarizaton/commit/c4d06a4dc58807306ffd4ad9f0248b323b265d54', commit_message='Upload Qwen2ForCausalLM', commit_description='', oid='c4d06a4dc58807306ffd4ad9f0248b323b265d54', pr_url=None, repo_url=RepoUrl('https://huggingface.co/RMA1403/cyber-qwen-summarizaton', endpoint='https://huggingface.co', repo_type='model', repo_id='RMA1403/cyber-qwen-summarizaton'), pr_revision=None, pr_num=None)

In [27]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()
gc.collect()

0

In [29]:
model_id = "RMA1403/cyber-qwen-summarizaton"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [30]:
from tqdm import tqdm

rouge1_total = 0.0
rouge2_total = 0.0
rougeL_total = 0.0

iterable_dataset = validation_dataset.to_iterable_dataset()
for row in tqdm(iterable_dataset, total=len(validation_dataset)):
  prompt = create_prompt(row["text"])
  response = generate_output(prompt, model)

  scores = rouge_score.compute(
      predictions=[response[0]],
      references=[row["diagnosis"] + "\n" + row["solutions"]]
  )

  rouge1_total += scores["rouge1"]
  rouge2_total += scores["rouge2"]
  rougeL_total += scores["rougeL"]

print("\nAverage ROUGE Scores")
print("ROUGE-1:", rouge1_total / len(validation_dataset))
print("ROUGE-2:", rouge2_total / len(validation_dataset))
print("ROUGE-L:", rougeL_total / len(validation_dataset))

100%|██████████| 50/50 [11:23<00:00, 13.67s/it]


Average ROUGE Scores
ROUGE-1: 0.2733460216010307
ROUGE-2: 0.04009295009765816
ROUGE-L: 0.1324242408111903



