In [1]:
!pip install -q transformers datasets peft accelerate bitsandbytes trl

In [2]:
!pip install -U transformers



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/formatted.jsonl",  # üîÅ change path if needed
    split="train"
)

# ‚úÖ Use only 15k samples (random)
dataset = dataset.shuffle(seed=42).select(range(min(15000, len(dataset))))

print("Dataset size:", len(dataset))
print(dataset[0])

Dataset size: 15000
{'text': "<s>[INST] You are a highly specialized AI assistant for advanced cyber-defense whose mission is to deliver accurate, in-depth, actionable guidance on information-security principles‚Äîconfidentiality, integrity, availability, authenticity, non-repudiation, and privacy‚Äîby offering concise executive summaries that drill down into technical detail, industry standards, and threat models while referencing frameworks such as NIST CSF and MITRE ATT&CK; you may share defensive scripts, detection rules, lab-safe PoC payloads, exploit snippets, and hardening checklists clearly marked for educational/testing use only, redacting or stubbing any data that could cause real harm in production. You must never generate or improve ransomware, wipers, botnets, RATs, phishing kits, social-engineering lures, or any instructions that facilitate fraud, data theft, unauthorized intrusion, or the defeat of security controls‚Äîin such cases you must briefly refuse with an apology

In [5]:
print(dataset[0])

{'text': "<s>[INST] You are a highly specialized AI assistant for advanced cyber-defense whose mission is to deliver accurate, in-depth, actionable guidance on information-security principles‚Äîconfidentiality, integrity, availability, authenticity, non-repudiation, and privacy‚Äîby offering concise executive summaries that drill down into technical detail, industry standards, and threat models while referencing frameworks such as NIST CSF and MITRE ATT&CK; you may share defensive scripts, detection rules, lab-safe PoC payloads, exploit snippets, and hardening checklists clearly marked for educational/testing use only, redacting or stubbing any data that could cause real harm in production. You must never generate or improve ransomware, wipers, botnets, RATs, phishing kits, social-engineering lures, or any instructions that facilitate fraud, data theft, unauthorized intrusion, or the defeat of security controls‚Äîin such cases you must briefly refuse with an apology and a one-sentence 

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [7]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [8]:
def tokenize_function(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

    # ‚úÖ ADD THIS LINE (IMPORTANT)
    tokens["labels"] = tokens["input_ids"].copy()

    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/results",

    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,

    logging_steps=20,

    # ‚úÖ Save checkpoints
    save_steps=200,
    save_total_limit=3,

    # ‚ùå removed evaluation_strategy (causing error)
    report_to="none"
)

In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [12]:
trainer.train()

Step,Training Loss
20,2.502136
40,1.617574
60,1.074876
80,1.021276
100,0.971715
120,0.981772
140,0.941804
160,0.922039
180,0.910259
200,0.89392


TrainOutput(global_step=1875, training_loss=0.8573296081542969, metrics={'train_runtime': 4964.7511, 'train_samples_per_second': 3.021, 'train_steps_per_second': 0.378, 'total_flos': 4.772223516672e+16, 'train_loss': 0.8573296081542969, 'epoch': 1.0})

In [13]:
model.save_pretrained("/content/drive/MyDrive/fine_tuned_model")
tokenizer.save_pretrained("/content/drive/MyDrive/fine_tuned_model")

('/content/drive/MyDrive/fine_tuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_model/chat_template.jinja',
 '/content/drive/MyDrive/fine_tuned_model/tokenizer.json')

In [14]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load base model (normal, not 4bit)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype="auto"
)

# Load LoRA
model = PeftModel.from_pretrained(
    model,
    "/content/drive/MyDrive/fine_tuned_model"
)

# Merge LoRA ‚Üí base model
model = model.merge_and_unload()

# Save merged model
model.save_pretrained("/content/drive/MyDrive/merged_model")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.save_pretrained("/content/drive/MyDrive/merged_model")

('/content/drive/MyDrive/merged_model/tokenizer_config.json',
 '/content/drive/MyDrive/merged_model/chat_template.jinja',
 '/content/drive/MyDrive/merged_model/tokenizer.json')

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = "/content/drive/MyDrive/merged_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [20]:
prompt = """<s>[INST] You are a cybersecurity expert.
Analyze encrypted C2 channels using TLS and discuss detection techniques like JA3 fingerprinting, beaconing analysis, and certificate inspection. [/INST]"""

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

output = model.generate(
    **inputs,
    max_new_tokens=400,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    repetition_penalty=1.2,
)

response = tokenizer.decode(output[0], skip_special_tokens=True)

# remove weird tokens
response = response.split("</s>")[0]

print(response)

[INST] You are a cybersecurity expert. 
Analyze encrypted C2 channels using TLS and discuss detection techniques like JA3 fingerprinting, beaconing analysis, and certificate inspection. [/INST] Encrypted Communication Channel Analysis: Implement SSL/TLS to encrypt communication between attackers and remote controllers (C2). This is done through the use of Transport Layer Security protocols such as Transport Layer Security over Established Network (TLSv1) or Secure Socket Layer (SSLv3) for HTTPS. The key exchange process should involve cryptographic algorithms like RSA encryption with ECB mode padding followed by DH-based key agreement. To analyze this channel, implement tools that can inspect traffic patterns, decode ciphertexts, and identify signature variations from legitimate communication streams. These include tools like Wireshark for packet capture, OpenSSL's `ssl_verify` function, and PostgreSQL's `pgcrypto` extension for decrypting data. Certificate validation becomes critical 