## Authors:
- Jakub Janicki
- Kuba Kondracki
- Hubert Twardowski

In [1]:
import transformers
import evaluate
import numpy as np
import os
import pandas as pd
import pickle
from nlp import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Data setup

In [2]:
# Name of the pretrained checkpoint. The tokenizer is loaded from it here; the
# model-setup cell loads the weights from f"albert/{ALBERT_MODEL}".
ALBERT_MODEL = 'albert-base-v2'    # https://huggingface.co/albert/albert-base-v2
# Module-level tokenizer; encode_batch() reads it as a global.
tokenizer = transformers.AlbertTokenizer.from_pretrained(ALBERT_MODEL)
    
def encode_batch(batch):
    """Tokenize the ``text_combined`` column of one batch of rows.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping whose ``'text_combined'`` entry holds the texts of the
            rows in this batch.

    Returns:
        The tokenizer output for the batch, with every example truncated and
        padded to exactly 512 tokens.
    """
    # padding='max_length' rather than padding=True: padding=True pads only to the
    # longest example of the current map batch, so different map batches can end up
    # with different sequence lengths. The Trainer in this notebook is created without
    # a padding data collator, so every example must already have the same length.
    # NOTE: train_dataset.pkl / test_dataset.pkl cache the encoded output; delete them
    # after changing any tokenization setting here.
    return tokenizer(batch['text_combined'], padding='max_length', truncation=True, max_length=512)

# Build the encoded train/test splits once and cache them on disk so that
# tokenization does not have to be repeated on every run.
# NOTE: the cache is keyed by file name only. After changing the split ratio, the
# seed or encode_batch(), delete both .pkl files, otherwise stale data is loaded.
TRAIN_CACHE = 'train_dataset.pkl'
TEST_CACHE = 'test_dataset.pkl'
SPLIT_SEED = 42  # fixed so that a rebuilt cache contains the same train/test split

if os.path.exists(TRAIN_CACHE) and os.path.exists(TEST_CACHE):
    # SECURITY: pickle.load can execute arbitrary code. Only load cache files that
    # were written by this notebook, never files obtained from elsewhere.
    with open(TRAIN_CACHE, 'rb') as f:
        train_dataset = pickle.load(f)
    with open(TEST_CACHE, 'rb') as f:
        test_dataset = pickle.load(f)
else:
    # Keep only the text and label columns and drop rows missing either of them.
    df = pd.read_csv('phishing_email.csv')[['text_combined', 'label']].dropna()
    dataset = Dataset.from_pandas(df)
    data_dct = dataset.train_test_split(train_size=0.7, seed=SPLIT_SEED)
    train_dataset = data_dct['train'].map(encode_batch, batched=True)
    test_dataset = data_dct['test'].map(encode_batch, batched=True)
    with open(TRAIN_CACHE, 'wb') as f:
        pickle.dump(train_dataset, f)
    with open(TEST_CACHE, 'wb') as f:
        pickle.dump(test_dataset, f)

# Expose only the columns the model consumes, returned as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [3]:
# Accuracy metric object, created once at module level and used by compute_metrics().
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn a ``(predictions, labels)`` pair into an accuracy result.

    Args:
        eval_pred: Two-element pair that unpacks into the raw model outputs
            and the reference labels. The outputs are presumably per-class
            scores laid out as (n_examples, n_classes) -- TODO confirm.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class indices
        against the reference labels.
    """
    scores, references = eval_pred
    # The index of the highest score along axis 1 is taken as the predicted class.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


## Model setup

In [None]:
# Label mapping stored in the model config. label2id is derived from id2label so
# the two mappings cannot drift apart.
id2label = {0: "SAFE", 1: "PHISHING"}
label2id = {label: idx for idx, label in id2label.items()}
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    f"albert/{ALBERT_MODEL}",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    attn_implementation="sdpa",  # SDPA attention, taken from the ALBERT usage guide
)

training_args = transformers.TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    auto_find_batch_size=True,
    dataloader_num_workers=4,
    # This model is too large for us to tune hyperparameters.
    # A learning rate of 1e-5 has always worked fine with models we trained previously,
    # together with a small weight decay (1e-8 or 1e-10) to prevent explosions.
    # Instead of hyperparameter tuning, a learning-rate scheduler is used to diminish
    # the impact of a poorly chosen learning rate.
    learning_rate=1e-5,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr": 1e-8},
    weight_decay=1e-8,
    # Training length is controlled by max_steps alone. The former num_train_epochs=2
    # was removed because max_steps overrides any num_train_epochs value.
    max_steps=1000,  # limit the number of optimizer steps to 1000
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",
    # Explicit for readability: when unset, eval_steps falls back to logging_steps (100).
    # load_best_model_at_end needs save_steps to be a multiple of eval_steps.
    eval_steps=100,
    save_strategy="steps",
    save_steps=500,  # checkpoint every 500 steps
    load_best_model_at_end=True,
    # bfloat16: float16 produced {"grad_norm": nan}, and float32 was not feasible to run.
    bf16=True,
    report_to="tensorboard",  # send logs to TensorBoard
)

# No data collator or tokenizer is passed, so the datasets must already contain
# equally sized, fully padded examples.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


## Sanity check

In [11]:
# Sanity-check the prepared splits and the model before training.

# Number of examples in each split.
print(f"Training dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
# sum(labels) / len(labels) is the mean label value. With id2label = {0: SAFE, 1: PHISHING}
# this is presumably the share of PHISHING examples (assumes labels are only 0/1 -- verify).
print(f"Training label distribution: {sum(train_dataset['label']) / len(train_dataset['label'])}")
print(f"Test label distribution: {sum(test_dataset['label']) / len(test_dataset['label'])}")

# Print one encoded example and the model's printed representation for visual inspection.
print(f"First train sample: {train_dataset[0]}")
print(f"Model architecture: {model}")

Training dataset size: 65988
Test dataset size: 16498
Training label distribution: 0.521534218342729
Test label distribution: 0.5137592435446721
First train sample: {'label': tensor(0), 'input_ids': tensor([    2,  2247,  1266,   289,   530,  4696,   315,   169,   883, 21156,
          228,   286,   883,  7681, 21156,   315,   169,  1067,   296,   303,
         1100,   148, 11597,  7912,   315,   169,  2094,   883,  4247,   169,
          883,   289,   913,  2824,   193,  1015,  3062,   138,  1100,   148,
        11597,  7912,  1364,   265,   883,  1466,   868,    83,  1466,   651,
         1100,   148,   289,  3553,  1466, 12238, 21156,  2945,  7681, 21156,
         1100,   148,  7912,    43,  1148,  9021,   442,  6305,  1696, 21156,
          883,   169,    87,   910,   193,   868,  2670, 21156,  3081,   955,
          529,  4466,  2383,  1139,   341, 14403,  5457,   609,  1131,  2247,
         1266,   530,   134,  1832,    28,    18,   254,    34,    62,     3,
            0,     0,

In [6]:
# Show the logging directory configured in training_args (logging_dir), which is
# where the TensorBoard event files are expected to be written.
print(f"TensorBoard logs will be stored in: {training_args.logging_dir}")

TensorBoard logs will be stored in: ./logs


In [7]:
# Evaluate on the eval dataset before trainer.train() is called in the Training
# section below, to get a reference point for the metrics reported after training.
baseline_metrics = trainer.evaluate()
print(f"Baseline metrics: {baseline_metrics}")

100%|██████████| 2063/2063 [02:47<00:00, 12.32it/s]

Baseline metrics: {'eval_loss': 0.715407133102417, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.51709298096739, 'eval_runtime': 174.6065, 'eval_samples_per_second': 94.487, 'eval_steps_per_second': 11.815}





In [12]:
# Check that padding is applied on the right-hand side of the sequence.
#
# Usage tip from https://huggingface.co/docs/transformers/model_doc/albert:
# "ALBERT is a model with absolute position embeddings so it's usually advised to pad
# the inputs on the right rather than the left."
#
# Display the first encoded sample: the 0 ids (presumably the padding id -- verify
# against tokenizer.pad_token_id) should appear only after the real tokens.
train_dataset[0]

{'label': tensor(0),
 'input_ids': tensor([    2,  2247,  1266,   289,   530,  4696,   315,   169,   883, 21156,
           228,   286,   883,  7681, 21156,   315,   169,  1067,   296,   303,
          1100,   148, 11597,  7912,   315,   169,  2094,   883,  4247,   169,
           883,   289,   913,  2824,   193,  1015,  3062,   138,  1100,   148,
         11597,  7912,  1364,   265,   883,  1466,   868,    83,  1466,   651,
          1100,   148,   289,  3553,  1466, 12238, 21156,  2945,  7681, 21156,
          1100,   148,  7912,    43,  1148,  9021,   442,  6305,  1696, 21156,
           883,   169,    87,   910,   193,   868,  2670, 21156,  3081,   955,
           529,  4466,  2383,  1139,   341, 14403,  5457,   609,  1131,  2247,
          1266,   530,   134,  1832,    28,    18,   254,    34,    62,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

## Training

In [6]:
# Fine-tune the model with the Trainer configured in the model-setup cell.
trainer.train()
# The trained model is evaluated once, in the Results section below. The
# trainer.evaluate() call that used to sit here discarded its return value and only
# repeated that same full pass over the eval dataset.
# Save the model held by the trainer to a local directory.
trainer.save_model("albert-base-v2-phishing-emails")

 10%|█         | 100/1000 [00:32<03:38,  4.12it/s]

{'loss': 0.4846, 'grad_norm': 64.0195083618164, 'learning_rate': 9.755527298894292e-06, 'epoch': 0.01}


                                                  
 10%|█         | 100/1000 [03:28<03:38,  4.12it/s] 

{'eval_loss': 0.2660793662071228, 'eval_accuracy': 0.9004727845799491, 'eval_runtime': 176.0971, 'eval_samples_per_second': 93.687, 'eval_steps_per_second': 11.715, 'epoch': 0.01}


 20%|██        | 200/1000 [03:52<03:15,  4.10it/s]   

{'loss': 0.3478, 'grad_norm': 19.467729568481445, 'learning_rate': 9.046039886902864e-06, 'epoch': 0.02}


                                                  
 20%|██        | 200/1000 [06:53<03:15,  4.10it/s] 

{'eval_loss': 0.2388971745967865, 'eval_accuracy': 0.9253242817311189, 'eval_runtime': 180.2094, 'eval_samples_per_second': 91.549, 'eval_steps_per_second': 11.448, 'epoch': 0.02}


 30%|███       | 300/1000 [07:16<02:43,  4.28it/s]   

{'loss': 0.3681, 'grad_norm': 77.08357238769531, 'learning_rate': 7.940987335200904e-06, 'epoch': 0.04}


                                                  
 30%|███       | 300/1000 [10:11<02:43,  4.28it/s] 

{'eval_loss': 0.30863192677497864, 'eval_accuracy': 0.9216874772699721, 'eval_runtime': 174.8873, 'eval_samples_per_second': 94.335, 'eval_steps_per_second': 11.796, 'epoch': 0.04}


 40%|████      | 400/1000 [10:36<02:25,  4.12it/s]   

{'loss': 0.2629, 'grad_norm': 85.00262451171875, 'learning_rate': 6.548539886902863e-06, 'epoch': 0.05}


                                                  
 40%|████      | 400/1000 [13:31<02:25,  4.12it/s] 

{'eval_loss': 0.24285057187080383, 'eval_accuracy': 0.9478724693902291, 'eval_runtime': 175.3395, 'eval_samples_per_second': 94.092, 'eval_steps_per_second': 11.766, 'epoch': 0.05}


 50%|█████     | 500/1000 [13:55<02:02,  4.09it/s]  

{'loss': 0.1968, 'grad_norm': 0.056783415377140045, 'learning_rate': 5.0049999999999995e-06, 'epoch': 0.06}


                                                  
 50%|█████     | 500/1000 [16:51<02:02,  4.09it/s] 

{'eval_loss': 0.25608712434768677, 'eval_accuracy': 0.9495090313977452, 'eval_runtime': 176.0272, 'eval_samples_per_second': 93.724, 'eval_steps_per_second': 11.72, 'epoch': 0.06}


 60%|██████    | 600/1000 [17:16<01:35,  4.21it/s]  

{'loss': 0.2929, 'grad_norm': 158.9840850830078, 'learning_rate': 3.461460113097138e-06, 'epoch': 0.07}


                                                  
 60%|██████    | 600/1000 [20:09<01:35,  4.21it/s] 

{'eval_loss': 0.4986262023448944, 'eval_accuracy': 0.8962904594496303, 'eval_runtime': 173.4406, 'eval_samples_per_second': 95.122, 'eval_steps_per_second': 11.895, 'epoch': 0.07}


 70%|███████   | 700/1000 [20:33<01:10,  4.26it/s]  

{'loss': 0.2633, 'grad_norm': 0.0711698830127716, 'learning_rate': 2.0690126647990974e-06, 'epoch': 0.08}


                                                  
 70%|███████   | 700/1000 [23:26<01:10,  4.26it/s] 

{'eval_loss': 0.1919512003660202, 'eval_accuracy': 0.9565401866892956, 'eval_runtime': 173.382, 'eval_samples_per_second': 95.154, 'eval_steps_per_second': 11.899, 'epoch': 0.08}


 80%|████████  | 800/1000 [23:50<00:46,  4.30it/s]  

{'loss': 0.1846, 'grad_norm': 110.15643310546875, 'learning_rate': 9.63960113097138e-07, 'epoch': 0.1}


                                                  
 80%|████████  | 800/1000 [26:46<00:46,  4.30it/s] 

{'eval_loss': 0.1652226448059082, 'eval_accuracy': 0.9618741665656443, 'eval_runtime': 175.8193, 'eval_samples_per_second': 93.835, 'eval_steps_per_second': 11.734, 'epoch': 0.1}


 90%|█████████ | 900/1000 [27:10<00:24,  4.10it/s]  

{'loss': 0.1885, 'grad_norm': 0.7435556650161743, 'learning_rate': 2.544727011057081e-07, 'epoch': 0.11}


                                                  
 90%|█████████ | 900/1000 [30:05<00:24,  4.10it/s] 

{'eval_loss': 0.16509948670864105, 'eval_accuracy': 0.9628439810886168, 'eval_runtime': 174.4278, 'eval_samples_per_second': 94.584, 'eval_steps_per_second': 11.827, 'epoch': 0.11}


100%|██████████| 1000/1000 [30:29<00:00,  4.09it/s] 

{'loss': 0.1602, 'grad_norm': 0.3658551275730133, 'learning_rate': 1e-08, 'epoch': 0.12}


                                                   
100%|██████████| 1000/1000 [33:24<00:00,  2.00s/it]


{'eval_loss': 0.16470371186733246, 'eval_accuracy': 0.9627227542732453, 'eval_runtime': 174.9576, 'eval_samples_per_second': 94.297, 'eval_steps_per_second': 11.791, 'epoch': 0.12}
{'train_runtime': 2004.5988, 'train_samples_per_second': 3.991, 'train_steps_per_second': 0.499, 'train_loss': 0.27497378349304197, 'epoch': 0.12}


100%|██████████| 2063/2063 [02:47<00:00, 12.31it/s]


## Results

In [None]:
# Evaluate the model held by the trainer on the eval dataset and print the metrics dict
# (loss, accuracy from compute_metrics, and runtime statistics).
results = trainer.evaluate()
print(results)

100%|██████████| 2063/2063 [02:48<00:00, 12.27it/s]

{'eval_loss': 0.16470371186733246, 'eval_accuracy': 0.9627227542732453, 'eval_runtime': 175.2363, 'eval_samples_per_second': 94.147, 'eval_steps_per_second': 11.773, 'epoch': 0.12122681537156019}



