# Dataset Prep

In [1]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('/home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/train_emails.csv')
val_df = pd.read_csv('/home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/val_emails.csv')
test_df = pd.read_csv('/home/kosmas/projects/llm-in-cybersecurity/final-project/datasets/test_emails.csv')

# Display basic information
print(f"Training set: {train_df.shape[0]} examples")
print(f"Validation set: {val_df.shape[0]} examples")
print(f"Test set: {test_df.shape[0]} examples")

# Check for class balance
print("\nClass distribution:")
print("Training set:")
print(train_df['label'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))
print("\nValidation set:")
print(val_df['label'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))
print("\nTest set:")
print(test_df['label'].value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))

# Display a few examples with both ham and spam
def display_examples(df, label_value, n=3):
    examples = df[df['label'] == label_value].sample(n, random_state=42)
    return examples[['subject', 'body', 'label']]

# Display ham examples (label=0)
print("\nSample HAM emails:")
display(display_examples(train_df, 0))

# Display spam examples (label=1)
print("\nSample SPAM emails:")
display(display_examples(train_df, 1))

# Check for missing values
print("\nMissing values in train set:")
print(train_df.isnull().sum())
print("\nMissing values in eval set:")
print(val_df.isnull().sum())
print("\nMissing values in test set:")
print(test_df.isnull().sum())

Training set: 91942 examples
Validation set: 19702 examples
Test set: 19702 examples

Class distribution:
Training set:
label
1    51.31%
0    48.69%
Name: proportion, dtype: object

Validation set:
label
1    51.31%
0    48.69%
Name: proportion, dtype: object

Test set:
label
1    51.31%
0    48.69%
Name: proportion, dtype: object

Sample HAM emails:


Unnamed: 0,subject,body,label
82912,Re: getting rid of mkproto.sh from Samba3,-----BEGIN PGP SIGNED MESSAGE-----\nHash: SHA1...,0
48117,Re: getting rid of mkproto.sh from Samba3,"On Sun, 2007-06-03 at 20:29 -0500, Gerald (Jer...",0
43559,re : original karamojong / jie language,leo connolly cross-posted a question about kar...,0



Sample SPAM emails:


Unnamed: 0,subject,body,label
89728,digital cameras - - toshiba / sony / hewlett p...,t\no d a y ' s\ns p e c\ni a l\nvisit : http :...,1
75960,As casselberry before concan,\n\nTHE ALERT IS ON!!!\n\n\nPromoting sym: CHV...,1
18122,Fun fun fun,\nDear 7c393e5e81dd091bc5c33e91a55e1458\n\nSum...,1



Missing values in train set:
sender      22722
receiver    23431
date        22739
subject       532
body            0
label           0
urls        22722
source          0
dtype: int64

Missing values in eval set:
sender      4920
receiver    5055
date        4925
subject      109
body           1
label          0
urls        4920
source         0
dtype: int64

Missing values in test set:
sender      4984
receiver    5137
date        4985
subject      112
body           0
label          0
urls        4984
source         0
dtype: int64


In [2]:
from datasets import Dataset
from datasets import DatasetDict

# User prompt that combines email subject and body
user_prompt = """Analyze the <EMAIL> and determine if it's PHISHING or LEGITIMATE.

<EMAIL>
Body: {body}
</EMAIL>"""


def create_conversation(sample):
    # Convert numerical label to text
    label_text = "PHISHING" if sample["label"] == 1 else "LEGITIMATE"

    return {
        "messages": [
            {
                "role": "user",
                "content": user_prompt.format(
                    subject=sample["subject"], body=sample["body"]
                ),
            },
            {"role": "assistant", "content": label_text},
        ]
    }


# Convert pandas dataframes to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Convert datasets to conversation format
train_dataset = train_dataset.map(
    create_conversation, remove_columns=train_dataset.column_names
)
val_dataset = val_dataset.map(
    create_conversation, remove_columns=val_dataset.column_names
)

# Create a DatasetDict for easier handling
dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset})

# Print a sample conversation
print("Sample conversation format:")
print(dataset_dict["train"][0]["messages"])

Map:   0%|          | 0/91942 [00:00<?, ? examples/s]

Map:   0%|          | 0/19702 [00:00<?, ? examples/s]

Sample conversation format:
[{'content': "Analyze the <EMAIL> and determine if it's PHISHING or LEGITIMATE.\n\n<EMAIL>\nBody: christmass s @ | e - w ! ndows xp home\nwe have everything !\nwindows x ' p professional 20 o 2 . . . . . . . . . . . 5 o\nadobe photoshop 7 . 0 . . . . . . . . . . . . . . . . . . . . . . . . 60\nmicrosoft office x ' p pro 2 oo 2 . . . . . . . . . . . . . . 60\ncorel draw graphics suite 11 . . . . . . . . . . . . . 60\nfull information : http : / / inequality . bestsoftshop . info /\nyour paypal account\narnold callahan\nstockbroker\nbiosupplynet , a sciquest company , rtp , 27709 , united states of america\nphone : 218 - 821 - 3963\nmobile : 747 - 674 - 4118\nemail : rxqlwxca @ gee - wiz . com\nthis message is beng sent to confirm your account . please do not reply directly to this message\nthis shareware is a 16 minute usage freeware\nnotes :\nthe contents of this reply is for attention and should not be depend marginalia\nhide midway hastings\ntime : sat , 1

# Finetuning

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, BitsAndBytesConfig

# Hugging Face model id
model_id = "google/gemma-3-1b-pt" # or `google/gemma-3-4b-pt`, `google/gemma-3-12b-pt`, `google/gemma-3-27b-pt`

# Select model class based on id
if model_id == "google/gemma-3-1b-pt":
    model_class = AutoModelForCausalLM
else:
    model_class = AutoModelForImageTextToText

# Check if GPU benefits from bfloat16
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16

# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch_dtype, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model
)

# BitsAndBytesConfig: Enables 4-bit quantization to reduce model size/memory usage
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=model_kwargs['torch_dtype'],
    bnb_4bit_quant_storage=model_kwargs['torch_dtype'],
)

# Load model and tokenizer
model = model_class.from_pretrained(model_id, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it") # Load the Instruction Tokenizer to use the official Gemma template

In [4]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head", "embed_tokens"] # make sure to save the lm_head and embed_tokens as you train the special tokens
)

In [5]:
from trl import SFTConfig

args = SFTConfig(
    output_dir="gemma-text-to-sql",         # directory to save and repository id
    max_seq_length=512,                     # max sequence length for model and packing of the dataset
    packing=True,                           # Groups multiple samples in the dataset into a single sequence
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    fp16=True if torch_dtype == torch.float16 else False,   # use float16 precision
    bf16=True if torch_dtype == torch.bfloat16 else False,   # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    dataset_kwargs={
        "add_special_tokens": False, # We template with special tokens
        "append_concat_token": True, # Add EOS token as separator token between examples
    }
)

In [6]:
from trl import SFTTrainer

# Create Trainer object
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    peft_config=peft_config,
    processing_class=tokenizer
)

Converting train dataset to ChatML:   0%|          | 0/91942 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/91942 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/91942 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/91942 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/19702 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/19702 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/19702 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/19702 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [7]:
# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

# Save the final model again to the Hugging Face Hub
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,3.2929
20,2.9457
30,2.5856
40,2.4789
50,2.4102
60,2.4831
70,2.3354
80,2.3959
90,2.5705
100,2.4721


NaN or Inf found in input tensor.


KeyboardInterrupt: 