# classifier-llama

- includes
  - determining the computing device
  - model name
  - data path
  - config

In [1]:
import json
import re
from pprint import pprint
import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model, PeftModel
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    LlamaModel,
    AutoConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
import numpy as np
from dotenv import load_dotenv
import os
from pathlib import Path
from datetime import datetime
import random
from math import floor

 
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
PRETRAINED_MODEL_PATH = "models-pretrained/"
MODEL_NAME = "meta-llama/Llama-3.2-1B" 
DATA_PATH = "data"
OMM_PATH = "omm_v1"
TEST_DATA = "test.jsonl"
TRAIN_DATA = "train.jsonl"
MODEL_NAME_PATH = "llama"
MODEL_DIR = "model"
TOKENIZER_DIR = "tokenizer"
RANDOM_SEED = 42

load_dotenv()
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
torch.cuda.reset_peak_memory_stats()

print("This model is training on",DEVICE)

This model is training on cuda:0


## data processing

- create label maps
- process json files and split into test data and train data(only need to run once)
  - 7:3 ratio

In [3]:
# label maps
id2label = {0: "Normal", 1: "Suspicious"}
label2id = {v:k for k,v in id2label.items()}

In [8]:
# data paths
normal_temp_file = os.path.join(DATA_PATH, "temp_normal.json")
sus_temp_file = os.path.join(DATA_PATH, "temp_sus.json")
test_data_file = os.path.join(DATA_PATH, TEST_DATA)
train_data_file = os.path.join(DATA_PATH, TRAIN_DATA)

In [4]:
# omm data
normal_data = {"1pass.json","2pass.json","3pass.json","4pass.json","5pass.json","6pass.json","7pass.json","8pass.json"}
sus_data = {"1output.json","2output.json","3output.json","4output.json","5output.json","6output.json","7output.json","8output.json"}

def read_json_files(file_set, path):
    data={}
    for file in file_set:
        file_path = os.path.join(path, file)
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                try:
                    file_content = json.load(f)
                    data.update(file_content)
                except json.JSONDecodeError:
                    print(f"Error reading {file}: Invalid JSON format")
        else:
            print(f"Warning: {file} not found")
    return data

normal_data = read_json_files(normal_data, os.path.join(DATA_PATH, OMM_PATH))
sus_data = read_json_files(sus_data, os.path.join(DATA_PATH, OMM_PATH))

# Balance suspicious data
num_normal = len(normal_data)
num_sus = len(sus_data)

if num_sus < num_normal:
    sus_items = list(sus_data.items())
    needed = num_normal - num_sus
    duplicated = random.choices(sus_items, k=needed)
    for i, (k, v) in enumerate(duplicated):
        sus_data[f"{k}_dup{i}"] = v

with open(normal_temp_file, "w", encoding="utf-8") as f:
    json.dump(normal_data, f, indent=4)
with open(sus_temp_file, "w", encoding="utf-8") as f:
    json.dump(sus_data, f, indent=4)

open(train_data_file, "w", encoding="utf-8")

def json_to_plain_text(example):
    """Convert a JSON object into a plain text representation."""
    text_parts = []
    for key, value in example.items():
        if value not in [None, "", [], {}]:  # Ignore empty values
            if isinstance(value, (list, dict)):
                value = json.dumps(value, separators=(",", ":"))  # Convert lists/dicts to string
            text_parts.append(f"{key}: {value}")  # Format as "key: value"
    return " | ".join(text_parts)  # Join all key-value pairs

def save_events_to_jsonl(input_file, output_file, label):

    with open(input_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    with open(output_file, "a", encoding="utf-8") as f:
        for _, event_data in data.items():
            event_source = event_data["_source"]
            text_representation = json_to_plain_text(event_source)
            f.write(json.dumps({"text": text_representation, "label": label}) + "\n")

save_events_to_jsonl(normal_temp_file, train_data_file, label2id["Normal"]) 
save_events_to_jsonl(sus_temp_file, train_data_file, label2id["Suspicious"])

with open(train_data_file, "r", encoding="utf-8") as file:
    full_data = [json.loads(line) for line in file]

train_data, test_data = train_test_split(full_data, test_size=0.3, shuffle=True, random_state=RANDOM_SEED)

with open(test_data_file, "w", encoding="utf-8") as file:
    for entry in test_data:
        
        file.write(json.dumps(entry) + "\n")

with open(train_data_file, "w", encoding="utf-8") as file:
    for entry in train_data:
        
        file.write(json.dumps(entry) + "\n")

temp_norm_path = Path(normal_temp_file)
temp_sus_path = Path(sus_temp_file)

if temp_norm_path.exists():
    temp_norm_path.unlink()
if temp_sus_path.exists():
    temp_sus_path.unlink()

## load dataset

In [9]:
dataset = load_dataset("json", data_files={"train": train_data_file, "test": test_data_file})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 80861
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 34655
    })
})

## fetching pretrained model

- fetch it only if it does not exist on models-pretrained directory (only need to run once)
- load the fetched model (run this if the model has already been fetched)

In [5]:
# create the paths needed
path = os.path.join(PRETRAINED_MODEL_PATH, MODEL_NAME_PATH)
pretrained_model_path = os.path.join(path, MODEL_DIR)
pretrained_tokenizer_path = os.path.join(path, TOKENIZER_DIR)

In [7]:
# fetch the llama model from hugging face
login(token=os.getenv("hugging_face_PAG"))

pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    ).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    pretrained_model.resize_token_embeddings(len(tokenizer))

os.makedirs(path, exist_ok=True)
os.makedirs(pretrained_model_path, exist_ok=True)
os.makedirs(pretrained_tokenizer_path, exist_ok=True)

pretrained_model.save_pretrained(pretrained_model_path)
tokenizer.save_pretrained(pretrained_tokenizer_path)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


('models-pretrained/llama\\tokenizer\\tokenizer_config.json',
 'models-pretrained/llama\\tokenizer\\special_tokens_map.json',
 'models-pretrained/llama\\tokenizer\\tokenizer.json')

In [6]:
# load the fetched model from models-pretrained
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_path, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    pretrained_model.resize_token_embeddings(len(tokenizer))

## tokenize function

In [7]:
# tokenize the dataset
def tokenize_function(examples):
    text = examples["text"]

    # Tokenize texts in batch mode
    encoding = tokenizer(
        text,
        truncation=True, 
        padding="max_length", 
        max_length=5000,
        return_tensors="pt"
    )

    return encoding

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## tokenize the dataset

In [10]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/80861 [00:00<?, ? examples/s]

Map:   0%|          | 0/34655 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 80861
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 34655
    })
})

In [11]:
tokenized_dataset["train"].to_json(os.path.join(DATA_PATH,"train_tokenized.jsonl"))
tokenized_dataset["test"].to_json(os.path.join(DATA_PATH,"test_tokenized.jsonl"))

Creating json from Arrow format:   0%|          | 0/81 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

1568777606

In [5]:
tokenized_dataset = load_dataset("json", data_files={"train": os.path.join(DATA_PATH,"train_tokenized.jsonl"), "test": os.path.join(DATA_PATH,"test_tokenized.jsonl")})

## evaluate function

In [10]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

## testing untrained model

In [11]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [23]:
print("Untrained model predictions:")
print("--------------------------")
isCorret_untrained = 0
total_untrained = 0
accuracy_untrained = 0
not_zero = 0
log_interval = 100
for i, entry in enumerate(dataset["test"]):
    text = entry["text"]
    total_untrained += 1
    inputs = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = pretrained_model(inputs).logits
    predictions = torch.argmax(logits)
    if predictions == entry["label"]:
        isCorret_untrained += 1
    if predictions != 0:
        not_zero += 1
    
    if (i) % log_interval == 0:
        print(f"Processed: {total_untrained}, Correct: {isCorret_untrained}, not0: {not_zero}", end="\r")

accuracy_untrained = isCorret_untrained / total_untrained
print(f"Accuracy: {accuracy_untrained}")

Untrained model predictions:
--------------------------
Processed: 301, Correct: 265, not0: 36

KeyboardInterrupt: 

## Train model

In [12]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    r=4
)
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'o_proj', 'k_proj', 'q_proj', 'v_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [13]:
model = get_peft_model(pretrained_model, peft_config)
model.print_trainable_parameters()

trainable params: 856,064 || all params: 1,236,676,608 || trainable%: 0.0692


In [14]:
# hyperparameters
lr = 1e-4
batch_size = 4
num_epochs = 1

# Explicitly set padding token in the model config
model.config.pad_token_id = tokenizer.pad_token_id

In [15]:
# define training arguments
training_args = TrainingArguments(
    output_dir= MODEL_NAME + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy = "epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
    seed=RANDOM_SEED,
    label_names=["labels"]
)

In [16]:
# if the data is too long
# Take half of each dataset
half_train = floor(len(tokenized_dataset["train"]) * 0.4)
half_eval = floor(len(tokenized_dataset["test"]) * 0.4)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(half_train)),
    eval_dataset=tokenized_dataset["test"].select(range(half_eval)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [18]:
model.gradient_checkpointing_enable()

In [19]:
model = torch.compile(model)

In [20]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [21]:
# save model

# create the paths needed
path = os.path.join("models", MODEL_NAME_PATH)
trained_model_path = os.path.join(path, MODEL_DIR)
trained_tokenizer_path = os.path.join(path, TOKENIZER_DIR)

os.makedirs(path, exist_ok=True)
os.makedirs(trained_model_path, exist_ok=True)
os.makedirs(trained_tokenizer_path, exist_ok=True)

full_model = model.merge_and_unload()
full_model.save_pretrained(trained_model_path)
tokenizer.save_pretrained(trained_tokenizer_path)

('models\\llama\\tokenizer\\tokenizer_config.json',
 'models\\llama\\tokenizer\\special_tokens_map.json',
 'models\\llama\\tokenizer\\tokenizer.json')

## load the trained model

In [4]:
path = os.path.join("models", MODEL_NAME_PATH)
trained_model_path = os.path.join(path, MODEL_DIR)
trained_tokenizer_path = os.path.join(path, TOKENIZER_DIR)

# load the fetched trained model
model = AutoModelForSequenceClassification.from_pretrained(
    trained_model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(trained_tokenizer_path, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    pretrained_model.resize_token_embeddings(len(tokenizer))

## testing trained model

In [None]:
print("trained model predictions:")
print("--------------------------")
isCorrect_trained = 0
total_trained = 0
accuracy_trained = 0
not_zero = 0
log_interval = 100
for i, entry in enumerate(dataset["test"]):
    text = entry["text"]
    total_trained += 1
    try:
        inputs = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            logits = model(inputs).logits
        predictions = torch.argmax(logits)
        if predictions == entry["label"]:
            isCorrect_trained += 1
        if predictions != 0:
            not_zero += 1
    
        if (i) % log_interval == 0:
            print(f"Processed: {total_trained}, Correct: {isCorrect_trained}, not0: {not_zero}", end="\r")
    except:
        print("Skipped one row")
        total_trained -= 1

accuracy_trained = isCorrect_trained / total_trained
print(f"Accuracy: {accuracy_trained}")

trained model predictions:
--------------------------
Processed: 6301, Correct: 6157, not0: 3118