# classifier-llama

- includes
  - determining the computing device
  - model name
  - data path
  - config

In [None]:
import json
import re
from pprint import pprint
import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model, PeftModel
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    LlamaModel,
    AutoConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
import numpy as np
from dotenv import load_dotenv
import os
 
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
PRETRAINED_MODEL_PATH = "models-pretrained/"
MODEL_NAME = "meta-llama/Llama-3.2-1B" 
DATA_PATH = "data"
OMM_PATH = "omm_v1"
NORMAL_DATA = "normal_data_output.json"
SUSPICIOUS_DATA = "suspicious_data_output.json"
MODEL_NAME_PATH = "llama"
MODEL_DIR = "model"
TOKENIZER_DIR = "tokenizer"
RANDOM_SEED = 42

load_dotenv()
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
torch.cuda.reset_peak_memory_stats()

print("This machine is training on",DEVICE)

This machine is training on cuda:0


## data processing

- create label maps
- process json files (only need to run once)

In [2]:
# label maps
id2label = {0: "Normal", 1: "Suspicious"}
label2id = {v:k for k,v in id2label.items()}

In [4]:
# omm data
normal_data = {"1pass.json","2pass.json","3pass.json","4pass.json","5pass.json","6pass.json","7pass.json","8pass.json"}
sus_data = {"1output.json","2output.json","3output.json","4output.json","5output.json","6output.json","7output.json","8output.json"}

def read_json_files(file_set, path):
    data={}
    for file in file_set:
        file_path = os.path.join(path, file)
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                try:
                    file_content = json.load(f)
                    data.update(file_content)
                except json.JSONDecodeError:
                    print(f"Error reading {file}: Invalid JSON format")
        else:
            print(f"Warning: {file} not found")
    return data

normal_data = read_json_files(normal_data, os.path.join(DATA_PATH, OMM_PATH))
sus_data = read_json_files(sus_data, os.path.join(DATA_PATH, OMM_PATH))

normal_output_file = os.path.join(DATA_PATH, NORMAL_DATA)
sus_output_file = os.path.join(DATA_PATH, SUSPICIOUS_DATA)

with open(normal_output_file, "w", encoding="utf-8") as f:
    json.dump(normal_data, f, indent=4)
with open(sus_output_file, "w", encoding="utf-8") as f:
    json.dump(sus_data, f, indent=4)

## fetching pretrained model

- fetch it only if it does not exist on models-pretrained directory (only need to run once)
- 

In [None]:
login(token=os.getenv("hugging_face_PAG"))

pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    ).to(DEVICE)
pretrained_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

path = os.path.join(PRETRAINED_MODEL_PATH, MODEL_NAME_PATH)

os.makedirs(path, exist_ok=True)

pretrained_model.save_pretrained(os.path.join(path, MODEL_DIR))
pretrained_tokenizer.save_pretrained(os.path.join(path, TOKENIZER_DIR))

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('models-pretrained/llama/tokenizer/tokenizer_config.json',
 'models-pretrained/llama/tokenizer/special_tokens_map.json',
 'models-pretrained/llama/tokenizer/tokenizer.json')