In [1]:
import torch
print("GPU available:", torch.cuda.is_available())

GPU available: True


In [2]:
#1. install required packages
!pip install -q bitsandbytes transformers accelerate peft datasets sentencepiece safetensors

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.

In [3]:
import os
DRIVE_BASE = '/kaggle/working/ti_llm'
os.makedirs(DRIVE_BASE, exist_ok=True)
print('Saving outputs to', DRIVE_BASE)

Saving outputs to /kaggle/working/ti_llm


In [None]:
#3. Hugging Face login
HF_TOKEN = 'REPLACE_WITH_YOUR_HF_TOKEN'  # Hugging Face token
if HF_TOKEN:
    from huggingface_hub import login
    login(token=HF_TOKEN)

In [5]:
#4. choose base model
MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.2'
MAX_LENGTH = 512

In [6]:
#5. download a small threat intelligence dataset
import requests, json

misp_url = 'https://raw.githubusercontent.com/MISP/misp-galaxy/main/clusters/threat-actor.json'
resp = requests.get(misp_url)
misp = resp.json()
values = misp.get('values', [])
print('MISP actors fetched:', len(values))

attack_url = 'https://raw.githubusercontent.com/mitre-attack/attack-stix-data/master/enterprise-attack/enterprise-attack-17.0.json'
resp2 = requests.get(attack_url)
attack = resp2.json()
techniques = []
for obj in attack.get('objects', [])[:200]:
  if obj.get('type') == 'attack-pattern':
    name = obj.get('name')
    desc = obj.get('description', '')
    techniques.append({'name': name, 'description': desc})
print('MITRE ATT&CK techniques fetched:', len(techniques))

MISP actors fetched: 857
MITRE ATT&CK techniques fetched: 199


In [7]:
#6. build small instruction-response pairs training dataset
train_pairs = []
for actor in values[:80]:   # limit MISP actors to 80 for test
  val = actor.get('value')
  desc = actor.get('description') or actor.get('meta', {}).get('description', 'No description available')
  instr = f"Tell me abou the threat actor {val}."
  resp = desc
  train_pairs.append({'instruction': instr, 'output': resp})

for tech in techniques[:120]:   # limit MITRE ATT&CK techniques to 120 for test
  instr = f"Explain the ATT&CK technique {tech['name']}."
  resp = tech['description'] or 'Description not available.'
  train_pairs.append({'instruction': instr, 'output': resp})

print('Total training pairs:', len(train_pairs))

with open(os.path.join(DRIVE_BASE, 'ti_train_pairs.json'), 'w') as file:
  json.dump(train_pairs, file, indent=2)

Total training pairs: 200


In [8]:
#7. prepare dataset with 'datasets' library
from datasets import Dataset

def to_prompt(x):
  # Use .get() to safely access keys and provide a default value
  instruction = x.get('instruction', '')
  output = x.get('output', '')

  prompt = (
      "### Instruction:\n" + instruction + "\n\n"
      + "### Response:\n" + output
  )
  return {'text': prompt}

raw_ds = Dataset.from_list(train_pairs)
ds = raw_ds.map(to_prompt)
print(ds[0])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'instruction': 'Tell me abou the threat actor APT1.', 'output': "PLA Unit 61398 (Chinese: 61398部队, Pinyin: 61398 bùduì) is the Military Unit Cover Designator (MUCD)[1] of a People's Liberation Army advanced persistent threat unit that has been alleged to be a source of Chinese computer hacking attacks", 'text': "### Instruction:\nTell me abou the threat actor APT1.\n\n### Response:\nPLA Unit 61398 (Chinese: 61398部队, Pinyin: 61398 bùduì) is the Military Unit Cover Designator (MUCD)[1] of a People's Liberation Army advanced persistent threat unit that has been alleged to be a source of Chinese computer hacking attacks"}


In [9]:
#8. tokenize dataset and create datacollator
from transformers import AutoTokenizer

print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(examples):
  return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH)

tokenized_ds = ds.map(tokenize_fn, batched=True, remove_columns=['text'])
print(tokenized_ds[0])


Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'instruction': 'Tell me abou the threat actor APT1.', 'output': "PLA Unit 61398 (Chinese: 61398部队, Pinyin: 61398 bùduì) is the Military Unit Cover Designator (MUCD)[1] of a People's Liberation Army advanced persistent threat unit that has been alleged to be a source of Chinese computer hacking attacks", 'input_ids': [1, 27332, 3133, 3112, 28747, 13, 22467, 528, 534, 280, 272, 5483, 10964, 330, 6316, 28740, 28723, 13, 13, 27332, 12107, 28747, 13, 3898, 28741, 13332, 28705, 28784, 28740, 28770, 28774, 28783, 325, 1209, 5965, 28747, 28705, 28784, 28740, 28770, 28774, 28783, 29237, 30031, 28725, 367, 4279, 262, 28747, 28705, 28784, 28740, 28770, 28774, 28783, 287, 28912, 670, 28943, 28731, 349, 272, 19844, 13332, 16881, 8648, 1028, 325, 28755, 28779, 5072, 10908, 28740, 28793, 302, 264, 5619, 28742, 28713, 8592, 352, 8094, 10023, 24777, 5483, 5028, 369, 659, 750, 22362, 298, 347, 264, 2832, 302, 6707, 6074, 14413, 288, 10813, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [10]:
#9. QLoRA + LoRA setup
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

print('Loading 4-bit quantized model (this may take a few minutes)...')
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.to('cuda:0')

# config LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
print('Model wrapped with LoRA.')

2025-08-10 03:43:34.004409: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754797414.274224      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754797414.346227      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading 4-bit quantized model (this may take a few minutes)...


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Model wrapped with LoRA.


In [12]:
# test base model before training
import torch

# inference helper for base model
def generate_answer_base(prompt, max_new_tokens=200):
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# test
q = 'Explain the ATT&CK technique Rc.common.'
prompt = f"### Instruction:\n{q}\n\n### Response:\n"

print("=== Base model ===")
print(generate_answer_base(prompt, max_new_tokens=150))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


=== Base model ===
### Instruction:
Explain the ATT&CK technique Rc.common.

### Response:

The Rc.common technique, as defined in the MITRE ATT&CK framework, refers to the use of Remote Command and Shell (Rc) tools to execute commands on a target system. These tools can be used for various malicious purposes, such as data exfiltration, privilege escalation, or installing additional malware.

Rc.common is a tactic within the Remote Access (R) category of the ATT&CK framework. It specifically describes the use of remote command and shell tools to gain access to a target system and execute commands. This can be achieved through various means, such as exploiting vulnerabilities in software, using stolen credentials, or exploiting misconfigured services.



In [11]:
#10. training with 'Trainer'
from transformers import Trainer, TrainingArguments, default_data_collator

training_args = TrainingArguments(
    output_dir=os.path.join(DRIVE_BASE, 'ti_llm_output'),
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=150,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    optim='paged_adamw_8bit',
    disable_tqdm=False,
    report_to="none",
    #logging_strategy="steps"
)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,1.03
20,0.8705
30,0.8159
40,0.7211
50,0.6534
60,0.5984
70,0.6168
80,0.5515
90,0.5415
100,0.4293


TrainOutput(global_step=150, training_loss=0.5735097932815552, metrics={'train_runtime': 1499.0347, 'train_samples_per_second': 1.601, 'train_steps_per_second': 0.1, 'total_flos': 5.060002450086298e+16, 'train_loss': 0.5735097932815552, 'epoch': 11.56})

In [12]:
#11. save LoRA adapter and tokenizer

peft_dir = os.path.join(DRIVE_BASE, 'peft_lora_ti_adapter')
model.save_pretrained(peft_dir)
tokenizer.save_pretrained(peft_dir)
print('Saved peft adapter files to: ', peft_dir)

Saved peft adapter files to:  /kaggle/working/ti_llm/peft_lora_ti_adapter


In [14]:
#12. inference: load base model + LoRA adapter and run queries
from peft import PeftModel
import torch

model_base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map={'': 0},
    trust_remote_code=True
)
model_base.eval()

model_with_lora = PeftModel.from_pretrained(
    model_base,
    peft_dir,
    device_map={'': 0}
)
model_with_lora.eval()

# inference helper for LoRA fine-tuned model
def generate_answer_lora(prompt, max_new_tokens=200):
  inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
  with torch.no_grad():
    out = model_with_lora.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
  return tokenizer.decode(out[0], skip_special_tokens=True)

# test fine-tuned model
q = 'Explain the ATT&CK technique Rc.common.'
prompt = f"### Instruction:\n{q}\n\n### Response:\n"

print("\n=== Fine-tuned (LoRA) model ===")
print(generate_answer_lora(prompt, max_new_tokens=150))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Fine-tuned (LoRA) model ===
### Instruction:
Explain the ATT&CK technique Rc.common.

### Response:
During the boot process, macOS executes <code>source /etc/rc.common</code>, which is a shell script containing various utility functions. This file also defines routines for processing command-line arguments and for setting environment variables, such as <code>OSXShell>

Adversaries can infect the <code>rc.common</code> file with shell scripts that execute during bootup as the root account. This technique can be used to persistively infect the system.

### Mach-Ones

There is a variant of this technique that targets the Mach-ones file <code>source /etc/mach-ones</code>. This file is similar to <code>rc
