# classifier-llama

- includes
  - determining the computing device
  - model name
  - data path
  - config

In [1]:
import json
import re
from pprint import pprint
import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model, PeftModel
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    LlamaModel,
    AutoConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
import numpy as np
from dotenv import load_dotenv
import os
from pathlib import Path
 
# --- configuration -----------------------------------------------------------
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
PRETRAINED_MODEL_PATH = "models-pretrained/"
MODEL_NAME = "meta-llama/Llama-3.2-1B"
DATA_PATH = "data"
OMM_PATH = "omm_v1"
TEST_DATA = "test.jsonl"
TRAIN_DATA = "train.jsonl"
MODEL_NAME_PATH = "llama"
MODEL_DIR = "model"
TOKENIZER_DIR = "tokenizer"
RANDOM_SEED = 42

# Read variables from a local .env file into the process environment.
load_dotenv()

# Seed torch so randomly initialised weights are reproducible between runs.
torch.manual_seed(RANDOM_SEED)

torch.backends.cudnn.benchmark = True
# The CUDA memory helpers are only meaningful (and only safe to call) when a
# GPU is present; without this guard the CPU fallback above could not be used.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

print("This model is training on",DEVICE)

This model is training on cuda:0


## data processing

- create label maps
- process the JSON files and split them into train data and test data (only needs to run once)
  - 7:3 train/test ratio

In [2]:
# label maps: class id -> class name, and the inverse lookup
id2label = dict(enumerate(("Normal", "Suspicious")))
label2id = {name: index for index, name in id2label.items()}

In [67]:
# data paths
# Scratch files that hold the merged raw events of each class.
normal_temp_file, sus_temp_file = (
    os.path.join(DATA_PATH, scratch_name)
    for scratch_name in ("temp_normal.json", "temp_sus.json")
)
# Final JSONL splits.
test_data_file, train_data_file = (
    os.path.join(DATA_PATH, split_name)
    for split_name in (TEST_DATA, TRAIN_DATA)
)

In [101]:
# omm data
# Export file names, numbered 1..8: "<n>pass.json" is later labelled Normal and
# "<n>output.json" is later labelled Suspicious.
normal_data = {f"{index}pass.json" for index in range(1, 9)}
sus_data = {f"{index}output.json" for index in range(1, 9)}

def read_json_files(file_set, path):
    """Merge the top-level objects of several JSON files into one dict.

    Args:
        file_set: Iterable of file names located under ``path``. A ``set`` is
            processed in sorted order; any other iterable keeps its own order.
        path: Directory that contains the files.

    Returns:
        A dict holding the key/value pairs of every readable file. When two
        files share a key, the file processed later wins.

    Missing files, files with invalid JSON and files whose top level is not a
    JSON object are reported with ``print`` and skipped.
    """
    # Sets of strings have no stable iteration order between interpreter runs,
    # which would make both the "later file wins" rule and the row order of the
    # merged data irreproducible. Sorting pins the order.
    if isinstance(file_set, (set, frozenset)):
        file_names = sorted(file_set)
    else:
        file_names = list(file_set)

    data = {}
    for file in file_names:
        file_path = os.path.join(path, file)
        if not os.path.exists(file_path):
            print(f"Warning: {file} not found")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            try:
                file_content = json.load(f)
            except json.JSONDecodeError:
                print(f"Error reading {file}: Invalid JSON format")
                continue

        # dict.update() on a list/scalar would raise; skip such files instead.
        if not isinstance(file_content, dict):
            print(f"Error reading {file}: expected a JSON object at the top level")
            continue

        data.update(file_content)
    return data

# Replace the file-name collections with the merged event dicts of each class.
normal_data = read_json_files(normal_data, os.path.join(DATA_PATH, OMM_PATH))
sus_data = read_json_files(sus_data, os.path.join(DATA_PATH, OMM_PATH))

# Persist the merged events to the scratch files.
with open(normal_temp_file, "w", encoding="utf-8") as f:
    json.dump(normal_data, f, indent=4)
with open(sus_temp_file, "w", encoding="utf-8") as f:
    json.dump(sus_data, f, indent=4)

# Create/truncate the combined JSONL file, because the code below appends to it.
# The context manager closes the handle immediately instead of leaking it.
with open(train_data_file, "w", encoding="utf-8"):
    pass

def save_events_to_jsonl(input_file, output_file, label):
    """Append the ``_source`` payload of every event to a JSONL file.

    Args:
        input_file: Path of a JSON file whose top level maps event keys to
            objects that carry a ``"_source"`` entry.
        output_file: Path of the JSONL file; it is opened in append mode.
        label: Value stored under ``"label"`` in each written record
            (replaces an existing ``"label"`` entry).
    """
    with open(input_file, "r", encoding="utf-8") as source:
        events = json.load(source)

    with open(output_file, "a", encoding="utf-8") as sink:
        for record in events.values():
            payload = {**record["_source"], "label": label}
            sink.write(json.dumps(payload) + "\n")

# Append both classes to the combined file, tagging each event with its class id.
save_events_to_jsonl(normal_temp_file, train_data_file, label2id["Normal"])
save_events_to_jsonl(sus_temp_file, train_data_file, label2id["Suspicious"])

# Read the combined file back, one JSON document per line.
with open(train_data_file, "r", encoding="utf-8") as file:
    full_data = list(map(json.loads, file))

# Shuffled 70/30 split with a fixed seed.
train_data, test_data = train_test_split(
    full_data,
    test_size=0.3,
    random_state=RANDOM_SEED,
    shuffle=True,
)

def flatten_data(y):
    """Flatten nested dicts/lists into a single-level dict of strings.

    Keys of nested dicts and indices of list items are joined with ``_`` to
    form the flat key; every leaf value is converted with ``str``. Empty dicts
    and empty lists contribute no key.
    """
    out = {}
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped (and written to ``out``) in their original order.
    pending = [(y, '')]
    while pending:
        node, prefix = pending.pop()
        if isinstance(node, dict):
            children = [(value, prefix + key + '_') for key, value in node.items()]
        elif isinstance(node, list):
            children = [(value, prefix + str(i) + '_') for i, value in enumerate(node)]
        else:
            # Drop the trailing "_" separator and store the leaf as a string.
            out[prefix[:-1]] = str(node)
            continue
        pending.extend(reversed(children))
    return out


# Flatten every event of both splits into a single-level dict of strings.
train_data = list(map(flatten_data, train_data))
test_data = list(map(flatten_data, test_data))

def get_all_columns(data_list):
    """Return the set of every key that occurs in any dict of ``data_list``."""
    return set().union(*(entry.keys() for entry in data_list))

# Gather the column names seen in each split, then merge them so both splits
# can be brought to one common schema.
train_columns = get_all_columns(train_data)
test_columns = get_all_columns(test_data)
all_columns = train_columns | test_columns

def standardize_entry(entry, all_columns):
    """Return a copy of ``entry`` that has exactly the keys in ``all_columns``.

    Columns missing from ``entry`` are filled with a single space; every value
    is passed through ``str``. Keys of ``entry`` that are not listed in
    ``all_columns`` are dropped.
    """
    row = {}
    for column in all_columns:
        row[column] = str(entry.get(column, " "))
    return row

# Apply standardization: give every row of both splits the same columns.
train_data = [standardize_entry(row, all_columns) for row in train_data]
test_data = [standardize_entry(row, all_columns) for row in test_data]

# Write each split as JSONL (one JSON document per line); the train file is
# overwritten here, replacing the combined file written earlier.
with open(test_data_file, "w", encoding="utf-8") as file:
    for entry in test_data:
        file.write(json.dumps(entry) + "\n")

with open(train_data_file, "w", encoding="utf-8") as file:
    for entry in train_data:
        file.write(json.dumps(entry) + "\n")

# Remove the scratch files if they are still on disk.
temp_norm_path = Path(normal_temp_file)
temp_sus_path = Path(sus_temp_file)

for scratch_path in (temp_norm_path, temp_sus_path):
    if scratch_path.exists():
        scratch_path.unlink()

In [86]:
def detect_schema_issues(input_file):
    """Print, for every top-level field of a JSONL file, the value types seen.

    Lines that are not valid JSON are reported with their 1-based line number
    and skipped. A field listed with more than one type is inconsistent across
    rows.
    """
    seen_types = {}

    with open(input_file, "r", encoding="utf-8") as infile:
        line_number = 0
        for line in infile:
            line_number += 1
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON at line {line_number}")
                continue

            for key, value in record.items():
                seen_types.setdefault(key, set()).add(type(value).__name__)

    print("Detected Field Types:")
    for field in seen_types:
        print(f"{field}: {seen_types[field]}")

# Run schema detection on the training split; the per-field type report is
# printed as this cell's output.
detect_schema_issues(train_data_file)

Detected Field Types:
@timestamp: {'str'}
agent: {'dict'}
data_stream: {'dict'}
ecs: {'dict'}
elastic: {'dict'}
event: {'dict'}
host: {'dict'}
message: {'str'}
process: {'dict'}
registry: {'dict'}
user: {'dict'}
label: {'str'}
file: {'dict'}
group: {'dict'}
destination: {'dict'}
elastic_agent: {'dict'}
flow: {'dict'}
network: {'dict'}
source: {'dict'}
type: {'str'}
dll: {'dict'}
dns: {'dict'}
cloud: {'dict'}
input: {'dict'}
log: {'dict'}
powershell: {'dict'}
winlog: {'dict'}
client: {'dict'}
method: {'str'}
query: {'str'}
related: {'dict'}
resource: {'str'}
server: {'dict'}
status: {'str'}
tls: {'dict'}
Effective_process: {'dict'}
icmp: {'dict'}
path: {'str'}
system: {'dict'}
tags: {'list'}
_temp: {'dict'}
error: {'dict'}


## load dataset

In [102]:
# Load both JSONL splits with the "json" dataset builder.
dataset = load_dataset(
    "json",
    data_files=dict(train=train_data_file, test=test_data_file),
)

dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['process_Ext_code_signature_2_subject_name', 'registry_data_strings_3', 'user_domain', 'winlog_event_data_Payload_0', 'source_ip', 'dns_answers_1_ttl', 'event_category', 'process_args_16', 'process_group_leader_entity_id', 'tls_detailed_client_hello_extensions_supported_groups_3', 'tls_client_supported_ciphers_14', 'process_entry_leader_working_directory', 'dns_answers_3_name', 'process_args_3', 'process_parent_supplemental_groups_2_id', 'tls_detailed_client_hello_extensions_signature_algorithms_4', 'powershell_file_script_block_id', 'process_Ext_effective_parent_entity_id', 'flow_final', 'dll_Ext_code_signature_1_subject_name', 'registry_data_strings_17', 'winlog_activity_id', 'tls_client_server_name', 'tls_detailed_client_hello_supported_compression_methods_0', 'winlog_user_identifier', 'dns_question_registered_domain', 'dll_Ext_code_signature_0_trusted', 'related_ip_6', 'file_Ext_windows_zone_identifier', 'process_code_signature_

## fetching pretrained model

- fetch it only if it does not already exist in the `models-pretrained` directory (only needs to run once)
- load the fetched model (run this if the model has already been fetched)

In [46]:
# create the paths needed: <PRETRAINED_MODEL_PATH>/<MODEL_NAME_PATH>/{model,tokenizer}
path = os.path.join(PRETRAINED_MODEL_PATH, MODEL_NAME_PATH)
pretrained_model_path, pretrained_tokenizer_path = (
    os.path.join(path, leaf_dir) for leaf_dir in (MODEL_DIR, TOKENIZER_DIR)
)

In [None]:
# fetch the llama model from hugging face
# The access token is read from the environment (populated by load_dotenv()).
login(token=os.getenv("hugging_face_PAG"))

pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    ).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    pretrained_model.resize_token_embeddings(len(tokenizer))

# The sequence-classification head uses config.pad_token_id to find the last
# real token of each padded sequence; without it, batches larger than 1 are
# rejected. Setting it before saving also stores it in the saved config.
pretrained_model.config.pad_token_id = tokenizer.pad_token_id

# os.makedirs creates the shared parent directory as well.
os.makedirs(pretrained_model_path, exist_ok=True)
os.makedirs(pretrained_tokenizer_path, exist_ok=True)

pretrained_model.save_pretrained(pretrained_model_path)
tokenizer.save_pretrained(pretrained_tokenizer_path)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('models-pretrained/llama/tokenizer/tokenizer_config.json',
 'models-pretrained/llama/tokenizer/special_tokens_map.json',
 'models-pretrained/llama/tokenizer/tokenizer.json')

In [33]:
# load the fetched model from models-pretrained
pretrained_model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_path).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_path)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    pretrained_model.resize_token_embeddings(len(tokenizer))

# Always sync the model config with the tokenizer: the classification head
# needs config.pad_token_id to handle padded batches, and a tokenizer that was
# saved with its pad token never enters the branch above.
pretrained_model.config.pad_token_id = tokenizer.pad_token_id

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


## tokenize function

In [47]:
# tokenize the dataset
def tokenize_function(examples, max_length=5000, text_tokenizer=None):
    """Serialise rows to compact JSON text and tokenize them.

    Works both for a single example (column -> value) and for the batched
    layout that ``dataset.map(..., batched=True)`` passes (column -> list of
    values). The ``"label"`` column is excluded from the text and returned as
    integer class ids under ``"labels"``.

    Args:
        examples: Mapping of column name to value (single row) or to a list of
            values (batch). It is not modified.
        max_length: Sequences longer than this many tokens are truncated.
        text_tokenizer: Tokenizer callable to use; defaults to the
            notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry (an ``int`` for a
        single row, a list of ``int`` for a batch). Rows without a label get 0.

    Raises:
        ValueError: If a label cannot be converted to ``int``.

    No padding is applied here and plain Python lists are returned; padding is
    left to the batch collator so short rows are not inflated to ``max_length``.
    """
    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    # Work on a shallow copy so the caller's mapping keeps its "label" column.
    columns = dict(examples)
    raw_labels = columns.pop("label", None)

    # A batch stores a list per column; use the label column (or, failing
    # that, the first column) to tell the two layouts apart.
    probe = raw_labels if raw_labels is not None else next(iter(columns.values()), None)
    is_batched = isinstance(probe, (list, tuple))

    if is_batched:
        row_count = len(probe)
        texts = [
            json.dumps(
                {name: values[row] for name, values in columns.items()},
                separators=(",", ":"),
            )
            for row in range(row_count)
        ]
        if raw_labels is None:
            labels = [0] * row_count
        else:
            # Labels may arrive as strings (e.g. "0"/"1"); normalise to int.
            labels = [int(value) for value in raw_labels]
    else:
        texts = json.dumps(columns, separators=(",", ":"))
        labels = 0 if raw_labels is None else int(raw_labels)

    encoding = active_tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
    )

    encoding["labels"] = labels

    return encoding

In [48]:
# Batch collator built around the shared tokenizer; as the class name says, it
# pads the examples of a batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## tokenize the dataset

In [50]:
# Tokenize every split in batches. The raw columns (all strings, including the
# string-valued "label") are dropped so that only the tokenizer outputs and the
# "labels" column reach the collator and the model.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
tokenized_dataset

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        num_shards: 1
    })
    test: IterableDataset({
        features: Unknown,
        num_shards: 1
    })
})

## evaluate function

In [39]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from the model's logits for use as a Trainer callback.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            logits with one row per example (or a tuple whose first item is
            that array); ``labels`` holds the reference class ids.

    Returns:
        A flat dict ``{"accuracy": <float>}`` as produced by the loaded
        ``accuracy`` metric.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # The inputs are already host arrays, so no tensor/GPU round-trip is needed.
    predicted_classes = np.argmax(predictions, axis=1)

    # accuracy.compute() already returns {"accuracy": value}; wrapping it in
    # another dict would hand a nested dict to the metric logging.
    return accuracy.compute(predictions=predicted_classes, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## test data processing

In [40]:
def dict_to_string(data):
    """Render one row as a ``"key: value, key: value"`` string without its label.

    Args:
        data: A row as a dict, or a JSON string that decodes to one.

    Returns:
        ``{"input": <text>}``. The ``"label"`` key is left out of the text. An
        empty row gives ``{"input": ""}``, so the return type is always a
        dict, which is what ``Dataset.map`` needs in order to merge the result.
    """
    if isinstance(data, str):
        data = json.loads(data)

    if not data:
        return {"input": ""}

    formatted_string = ", ".join(f"{k}: {v}" for k, v in data.items() if k != "label")
    return {"input": formatted_string}

# Run `dict_to_string` over the test split; it returns an "input" text entry
# for each row it is given.
test_inputs = dataset["test"].map(dict_to_string)
test_inputs

IterableDataset({
    features: Unknown,
    num_shards: 1
})

## testing untrained model

In [42]:
print("Untrained model predictions:")
print("--------------------------")
isCorret_untrained = 0
total_untrained = 0
accuracy_untrained = 0

# Inference only: no_grad() avoids building autograd graphs for every row.
with torch.no_grad():
    for text in test_inputs:
        # Every row counts towards the total, including rows that are skipped.
        total_untrained += 1
        try:
            inputs = tokenizer.encode(text["input"], return_tensors="pt").to(DEVICE)
            logits = pretrained_model(inputs).logits
            predictions = torch.argmax(logits, dim=1)
            # Compare plain ints: the stored label may be a string such as "1",
            # which never compares equal to a tensor.
            if int(predictions.item()) == int(text["label"]):
                isCorret_untrained += 1
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C still stops the
            # loop, and report the reason the row was skipped.
            print(f"Skipping malformed row ({type(exc).__name__}: {exc}): {text}")

# Guard against an empty test split.
accuracy_untrained = isCorret_untrained / total_untrained if total_untrained else 0.0
print(f"Accuracy: {accuracy_untrained}")

Failed to load JSON from file '/home/hugo/Desktop/mrnet/classifier/data/test.jsonl' with error <class 'pyarrow.lib.ArrowInvalid'>: JSON parse error: Column(/event/category) changed from array to string in row 4


Untrained model predictions:
--------------------------


ArrowInvalid: JSON parse error: Column(/event/category) changed from array to string in row 4