# classifier-mistral

- includes
  - determining the computing device
  - model name
  - data path
  - config

In [4]:
import json
import evaluate
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from dotenv import load_dotenv
import os
from pathlib import Path
import random
 
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
PRETRAINED_MODEL_PATH = "models-pretrained/"
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3" 
DATA_PATH = "data"
OMM_PATH = "omm_v1"
TEST_DATA = "test.jsonl"
TRAIN_DATA = "train.jsonl"
MODEL_NAME_PATH = "mistral"
MODEL_DIR = "model"
TOKENIZER_DIR = "tokenizer"
RANDOM_SEED = 42

load_dotenv()
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
torch.cuda.reset_peak_memory_stats()

# DEFINE QUANTIZATION HERE. Choose from ("none" | "8bit" | "4bit"), None for LoRa, others for QLoRa
QUANTIZATION = "None"

base_prompt = """You are a security classifier. You receive a text of json output from elastic security siem which contains the event of a system. Classify if this event Normal or Suspicious. Only Answer 'Normal' or 'Suspicious'"""

print("This model is training on",DEVICE)

This model is training on cuda:0


## data processing

- create label maps
- process json files and split into test data and train data(only need to run once)
  - 7:3 ratio

In [5]:
# label maps
id2label = {0: "Normal", 1: "Suspicious"}
label2id = {v:k for k,v in id2label.items()}

In [6]:
# data paths
normal_temp_file = os.path.join(DATA_PATH, "temp_normal.json")
sus_temp_file = os.path.join(DATA_PATH, "temp_sus.json")
test_data_file = os.path.join(DATA_PATH, TEST_DATA)
train_data_file = os.path.join(DATA_PATH, TRAIN_DATA)

In [4]:
# omm data
normal_data = {"1pass.json","2pass.json","3pass.json","4pass.json","5pass.json","6pass.json","7pass.json","8pass.json"}
sus_data = {"1output.json","2output.json","3output.json","4output.json","5output.json","6output.json","7output.json","8output.json"}

def read_json_files(file_set, path):
    data={}
    for file in file_set:
        file_path = os.path.join(path, file)
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                try:
                    file_content = json.load(f)
                    data.update(file_content)
                except json.JSONDecodeError:
                    print(f"Error reading {file}: Invalid JSON format")
        else:
            print(f"Warning: {file} not found")
    return data

normal_data = read_json_files(normal_data, os.path.join(DATA_PATH, OMM_PATH))
sus_data = read_json_files(sus_data, os.path.join(DATA_PATH, OMM_PATH))

# Balance suspicious data
num_normal = len(normal_data)
num_sus = len(sus_data)

if num_sus < num_normal:
    sus_items = list(sus_data.items())
    needed = num_normal - num_sus
    duplicated = random.choices(sus_items, k=needed)
    for i, (k, v) in enumerate(duplicated):
        sus_data[f"{k}_dup{i}"] = v

with open(normal_temp_file, "w", encoding="utf-8") as f:
    json.dump(normal_data, f, indent=4)
with open(sus_temp_file, "w", encoding="utf-8") as f:
    json.dump(sus_data, f, indent=4)

open(train_data_file, "w", encoding="utf-8")

def json_to_plain_text(example):
    """Convert a JSON object into a plain text representation."""
    text_parts = []
    for key, value in example.items():
        if value not in [None, "", [], {}]:  # Ignore empty values
            if isinstance(value, (list, dict)):
                value = json.dumps(value, separators=(",", ":"))  # Convert lists/dicts to string
            text_parts.append(f"{key}: {value}")  # Format as "key: value"
    return " | ".join(text_parts)  # Join all key-value pairs

def save_events_to_jsonl(input_file, output_file, label):

    with open(input_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    with open(output_file, "a", encoding="utf-8") as f:
        for _, event_data in data.items():
            event_source = event_data["_source"]
            text_representation = json_to_plain_text(event_source)
            f.write(json.dumps({"text": text_representation, "label": label}) + "\n")

save_events_to_jsonl(normal_temp_file, train_data_file, "Normal") 
save_events_to_jsonl(sus_temp_file, train_data_file, "Suspicious")

temp_norm_path = Path(normal_temp_file)
temp_sus_path = Path(sus_temp_file)

if temp_norm_path.exists():
    temp_norm_path.unlink()
if temp_sus_path.exists():
    temp_sus_path.unlink()

In [None]:
def detect_schema_issues(input_file):
    """Detects inconsistent data types in a JSONL file"""
    field_types = {}

    with open(input_file, "r", encoding="utf-8") as infile:
        for line_number, line in enumerate(infile, start=1):
            try:
                data = json.loads(line)

                for key, value in data.items():
                    value_type = type(value).__name__

                    if key not in field_types:
                        field_types[key] = set()
                    field_types[key].add(value_type)

            except json.JSONDecodeError:
                print(f"Skipping invalid JSON at line {line_number}")

    print("Detected Field Types:")
    for field, types in field_types.items():
        print(f"{field}: {types}")

# Run schema detection
detect_schema_issues(train_data_file)

## load dataset

In [7]:
label_encoder = LabelEncoder()

dataset = load_dataset("json", data_files={train_data_file}, split='train')

all_labels = [example["label"] for example in dataset]
label_encoder.fit(all_labels)

def format_prompt(example):
    prompt = f"<s>[INST] Classify this log as 'Normal' or 'Suspicious': {example['text']} [/INST]"
    return {"prompt": prompt, "labels": int(label_encoder.transform([example["label"]])[0])}

dataset = dataset.map(format_prompt)
dataset = dataset.train_test_split(test_size=0.2)
val_test_dataset = dataset['test'].train_test_split(test_size=0.5)

train_dataset = dataset["train"]
eval_dataset = val_test_dataset["train"]
test_dataset = val_test_dataset["test"]

Map:   0%|          | 0/115516 [00:00<?, ? examples/s]

In [12]:
print(train_dataset)

Dataset({
    features: ['text', 'label', 'prompt', 'labels'],
    num_rows: 92412
})


## fetching pretrained model

- fetch it only if it does not exist on models-pretrained directory (only need to run once)
- load the fetched model (run this if the model has already been fetched)

In [13]:
# create the paths needed
path = os.path.join(PRETRAINED_MODEL_PATH, MODEL_NAME_PATH)
pretrained_model_path = os.path.join(path, MODEL_DIR)
pretrained_tokenizer_path = os.path.join(path, TOKENIZER_DIR)

# Pre-define quantization configs

################## 4bit ##################
bb_config_4b = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
##########################################

################## 8bit ##################
bb_config_8b = BitsAndBytesConfig(
    load_in_8bit=True,
)
##########################################

def quantization_config(quantization):
    if quantization == "8bit":
        return bb_config_8b
    else:
        return bb_config_4b

In [None]:
# fetch the llama model from hugging face
login(token=os.getenv("hugging_face_PAG"))

if QUANTIZATION == "none":
    pretrained_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto", 
    torch_dtype="auto",
    trust_remote_code=True,
    ).to(DEVICE)
else:
    pretrained_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto", 
    torch_dtype="auto",
    quantization_config=quantization_config(QUANTIZATION),
    trust_remote_code=True,
    ).to(DEVICE)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    add_bos_token=True,
    trust_remote_code=True,
    )

os.makedirs(path, exist_ok=True)
os.makedirs(pretrained_model_path, exist_ok=True)
os.makedirs(pretrained_tokenizer_path, exist_ok=True)

pretrained_model.save_pretrained(pretrained_model_path)
tokenizer.save_pretrained(pretrained_tokenizer_path)

NameError: name 'quantization_config' is not defined

In [36]:
# load the fetched model from models-pretrained
if QUANTIZATION == "none":
    pretrained_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_path,
    device_map="auto", 
    torch_dtype="auto",
    trust_remote_code=True,
    ).to(DEVICE)
else:
    pretrained_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_path,
    device_map="auto", 
    torch_dtype="auto",
    quantization_config=quantization_config(QUANTIZATION),
    trust_remote_code=True,
    ).to(DEVICE)

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_tokenizer_path, 
    add_bos_token=True,
    trust_remote_code=True,
    )



## tokenize function

In [37]:
# tokenize the dataset
def tokenize(example):
    out = tokenizer(example["prompt"], truncation=True, padding="max_length", max_length=4096)
    out["labels"] = example["labels"]
    return out

In [38]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


## tokenize the datasets

In [None]:
tokenizer.pad_token = tokenizer.eos_token


In [21]:
train_dataset = train_dataset.map(tokenize)
eval_dataset = eval_dataset.map(tokenize)

Map:   0%|          | 0/92412 [00:00<?, ? examples/s]

Map:   0%|          | 0/11552 [00:00<?, ? examples/s]

## evaluate function

In [22]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

## testing untrained model

In [None]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [16]:
print("Untrained model predictions:")
print("--------------------------")
model_input = tokenizer(base_prompt, return_tensors="pt").to(DEVICE)
pretrained_model.eval()
with torch.no_grad():
    print(tokenizer.decode(pretrained_model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Untrained model predictions:
--------------------------
You are a security classifier. You receive a text of json output from elastic security siem which contains the event of a system. Classify if this event Normal or Suspicious. Only Answer 'Normal' or 'Suspicious'

```json
{
  "event": {
    "event_id": "1234567890",
    "event_type": "File Integrity Monitoring",
    "event_category": "File Integrity",
    "event_subcategory": "File Modified",
    "event_source": "Windows File System",
    "event_target": "C:\\Users\\User1\\Desktop\\file1.txt",
    "event_action": "Modified",
    "event_time": "2022-01-01T12:00:00Z",
    "event_user": "User1",
    "event_process": "Notepad.exe",
    "event_process_id": "1234",
    "event_process_parent_id": "5678",
    "event_process_command_line": "C:\\Windows\\System32\\notepad.exe C:\\Users\\User1\\Desktop\\file1.txt"
  }
}
```



## Train model

In [39]:
from peft import LoraConfig, get_peft_model, TaskType

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
peft_config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'o_proj', 'k_proj', 'v_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [40]:
model = get_peft_model(pretrained_model, peft_config)
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 7,254,839,296 || trainable%: 0.0939


In [25]:
def format(example):
    prompt = f"[CLS] {example["text"]}\nClassification:"
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)

In [41]:
# hyperparameters
lr = 1e-4
batch_size = 4
num_epochs = 2

# Explicitly set padding token in the model config
model.config.pad_token_id = tokenizer.pad_token_id

In [42]:
# define training arguments
training_args = TrainingArguments(
    output_dir= MODEL_NAME + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy = "epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    fp16=False,
    bf16=True,
    seed=RANDOM_SEED,
    label_names=["labels"]
)

In [43]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [44]:
model.gradient_checkpointing_enable()

In [45]:
tokenizer.pad_token = tokenizer.eos_token

In [46]:
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# save model

# create the paths needed
path = os.path.join("models", MODEL_NAME_PATH)
trained_model_path = os.path.join(path, MODEL_DIR)
trained_tokenizer_path = os.path.join(path, TOKENIZER_DIR)

os.makedirs(path, exist_ok=True)
os.makedirs(trained_model_path, exist_ok=True)
os.makedirs(trained_tokenizer_path, exist_ok=True)

full_model = model.merge_and_unload()
full_model.save_pretrained(trained_model_path)
tokenizer.save_pretrained(trained_tokenizer_path)

## testing trained model

In [None]:
print("trained model predictions:")
print("--------------------------")
isCorrect_trained = 0
total_trained = 0
accuracy_trained = 0
not_zero = 0
log_interval = 100
for i, entry in enumerate(dataset["test"]):
    text = entry["text"]
    total_trained += 1
    try:
        inputs = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            logits = model(inputs).logits
        predictions = torch.argmax(logits)
        if predictions == entry["label"]:
            isCorrect_trained += 1
        if predictions != 0:
            not_zero += 1
    
        if (i) % log_interval == 0:
            print(f"Processed: {total_trained}, Correct: {isCorrect_trained}, not0: {not_zero}", end="\r")
    except:
        print("Skipped one row")
        total_trained -= 1

accuracy_trained = isCorrect_trained / total_trained
print(f"Accuracy: {accuracy_trained}")