# **Fine Tuning MobileBERT**

In [25]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

from transformers import MobileBertForSequenceClassification, MobileBertTokenizer,\
    Trainer, TrainingArguments

from datasets import Dataset

from sklearn.metrics import f1_score, accuracy_score

In [3]:
# Load the preprocessed train/test CSV splits and wrap each one as a `Dataset`

dataset_dir = os.path.join('data', 'phishing_dataset')

train_df = pd.read_csv(os.path.join(dataset_dir, 'train.csv'))
train_dataset = Dataset.from_pandas(train_df)

test_df = pd.read_csv(os.path.join(dataset_dir, 'test.csv'))
test_dataset = Dataset.from_pandas(test_df)

In [4]:
# Load tokenizer

tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")


def tokenize_function(examples):
    """Tokenize one batch of examples for MobileBERT.

    Args:
        examples: Batch supplied by ``Dataset.map(..., batched=True)``. Its
            "text" entry is passed to the tokenizer (presumably a list of
            strings; verify against the CSV schema).

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 512 tokens.
    """
    # No `return_tensors` here: `Dataset.map` stores the result as ordinary
    # dataset columns, so materialising torch tensors per batch is wasted work.
    return tokenizer(examples["text"], max_length=512, truncation=True,
                     padding="max_length", add_special_tokens=True)


# Adds the tokenizer outputs as new columns on both splits.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)



Map: 100%|██████████| 16440/16440 [01:14<00:00, 219.45 examples/s]
Map: 100%|██████████| 7047/7047 [00:18<00:00, 371.04 examples/s]


In [30]:
# Load model & train

# Pretrained MobileBERT with a 2-label sequence-classification head
# (the head weights are freshly initialised and learned during fine-tuning).
# Evaluation metrics come from scikit-learn inside `compute_metrics`.
model = MobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=2)

def compute_metrics(preds):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        preds: Object exposing ``predictions`` (scores that are arg-maxed
            over axis 1 to get a predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    predicted_labels = np.argmax(preds.predictions, axis=1)
    true_labels = preds.label_ids

    scores = {}
    scores['accuracy'] = accuracy_score(true_labels, predicted_labels)
    # 'weighted' is one of several averaging modes; swap it if another fits better
    scores['f1'] = f1_score(true_labels, predicted_labels, average='weighted')
    return scores



# Fine-tuning configuration. Evaluation runs once per epoch, so `eval_steps`
# is intentionally not set: it only applies to step-based evaluation.
training_args = TrainingArguments(
    output_dir=os.path.join('models', 'checkpoints'),
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Bundle the model, its training configuration, both data splits and the
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


100%|██████████| 1410/1410 [03:02<00:00,  7.73it/s]
Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [6]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  5%|▌         | 500/9864 [01:12<23:17,  6.70it/s]Checkpoint destination directory models\checkpoints\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 122154.52, 'learning_rate': 1.8986212489862126e-05, 'epoch': 0.15}


 10%|█         | 1000/9864 [02:25<27:39,  5.34it/s]Checkpoint destination directory models\checkpoints\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 1.7589, 'learning_rate': 1.7972424979724253e-05, 'epoch': 0.3}


 15%|█▌        | 1500/9864 [03:46<23:42,  5.88it/s]Checkpoint destination directory models\checkpoints\checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.6154, 'learning_rate': 1.6958637469586377e-05, 'epoch': 0.46}


 20%|██        | 2000/9864 [05:06<20:59,  6.24it/s]Checkpoint destination directory models\checkpoints\checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2942, 'learning_rate': 1.59448499594485e-05, 'epoch': 0.61}


 25%|██▌       | 2500/9864 [06:22<17:50,  6.88it/s]Checkpoint destination directory models\checkpoints\checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 1.0888, 'learning_rate': 1.4931062449310625e-05, 'epoch': 0.76}


 30%|███       | 3000/9864 [07:38<16:31,  6.92it/s]Checkpoint destination directory models\checkpoints\checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1562, 'learning_rate': 1.3917274939172751e-05, 'epoch': 0.91}


                                                   
 33%|███▎      | 3289/9864 [09:23<33:32:28, 18.36s/it]

{'eval_loss': 0.20653901994228363, 'eval_accuracy': 0.9697743720732227, 'eval_runtime': 60.7086, 'eval_samples_per_second': 116.079, 'eval_steps_per_second': 23.226, 'epoch': 1.0}


 35%|███▌      | 3500/9864 [09:53<15:02,  7.05it/s]   Checkpoint destination directory models\checkpoints\checkpoint-3500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1664, 'learning_rate': 1.2903487429034875e-05, 'epoch': 1.06}


 41%|████      | 4000/9864 [11:04<13:43,  7.12it/s]Checkpoint destination directory models\checkpoints\checkpoint-4000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0946, 'learning_rate': 1.1889699918897e-05, 'epoch': 1.22}


 46%|████▌     | 4500/9864 [12:16<14:16,  6.26it/s]Checkpoint destination directory models\checkpoints\checkpoint-4500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1953, 'learning_rate': 1.0875912408759123e-05, 'epoch': 1.37}


 51%|█████     | 5000/9864 [13:26<11:26,  7.08it/s]Checkpoint destination directory models\checkpoints\checkpoint-5000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1127, 'learning_rate': 9.86212489862125e-06, 'epoch': 1.52}


 56%|█████▌    | 5500/9864 [14:37<10:17,  7.06it/s]Checkpoint destination directory models\checkpoints\checkpoint-5500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0893, 'learning_rate': 8.848337388483375e-06, 'epoch': 1.67}


 61%|██████    | 6000/9864 [15:48<09:24,  6.85it/s]

{'loss': 11.4375, 'learning_rate': 7.8345498783455e-06, 'epoch': 1.82}


 66%|██████▌   | 6500/9864 [16:59<07:52,  7.12it/s]

{'loss': 0.213, 'learning_rate': 6.820762368207624e-06, 'epoch': 1.98}


                                                   
 67%|██████▋   | 6577/9864 [18:09<16:08:28, 17.68s/it]

{'eval_loss': 0.15238140523433685, 'eval_accuracy': 0.9816943380161771, 'eval_runtime': 58.4432, 'eval_samples_per_second': 120.579, 'eval_steps_per_second': 24.126, 'epoch': 2.0}


 71%|███████   | 7000/9864 [19:09<06:40,  7.15it/s]   

{'loss': 0.0585, 'learning_rate': 5.8069748580697495e-06, 'epoch': 2.13}


 76%|███████▌  | 7500/9864 [20:20<05:35,  7.05it/s]

{'loss': 0.0503, 'learning_rate': 4.793187347931874e-06, 'epoch': 2.28}


 81%|████████  | 8000/9864 [21:31<04:18,  7.22it/s]

{'loss': 0.0551, 'learning_rate': 3.779399837793999e-06, 'epoch': 2.43}


 86%|████████▌ | 8500/9864 [22:43<03:20,  6.79it/s]

{'loss': 0.038, 'learning_rate': 2.7656123276561238e-06, 'epoch': 2.59}


 91%|█████████ | 9000/9864 [23:55<02:02,  7.04it/s]

{'loss': 0.3996, 'learning_rate': 1.7518248175182485e-06, 'epoch': 2.74}


 96%|█████████▋| 9500/9864 [25:06<00:51,  7.06it/s]

{'loss': 0.0487, 'learning_rate': 7.380373073803731e-07, 'epoch': 2.89}


                                                   
100%|██████████| 9864/9864 [27:01<00:00,  6.08it/s]

{'eval_loss': 0.1447024643421173, 'eval_accuracy': 0.9824038597984958, 'eval_runtime': 59.7336, 'eval_samples_per_second': 117.974, 'eval_steps_per_second': 23.605, 'epoch': 3.0}
{'train_runtime': 1621.7974, 'train_samples_per_second': 30.411, 'train_steps_per_second': 6.082, 'train_loss': 6193.007407180295, 'epoch': 3.0}





TrainOutput(global_step=9864, training_loss=6193.007407180295, metrics={'train_runtime': 1621.7974, 'train_samples_per_second': 30.411, 'train_steps_per_second': 6.082, 'train_loss': 6193.007407180295, 'epoch': 3.0})

In [43]:
# Re-load the weights saved at step 9500 and score them on the test split.
# The path is built with os.path.join (as for `output_dir`) rather than
# hard-coded backslashes, so it resolves on Linux/macOS as well as Windows.
checkpoint = os.path.join('models', 'checkpoints', 'checkpoint-9500')

model = MobileBertForSequenceClassification.from_pretrained(checkpoint)


# A fresh Trainer is needed so that evaluation uses the re-loaded model.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics = compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.evaluate()



100%|██████████| 1410/1410 [00:59<00:00, 23.70it/s]


{'eval_loss': 0.16192379593849182,
 'eval_accuracy': 0.9816943380161771,
 'eval_f1': 0.9816642980997267,
 'eval_runtime': 59.7833,
 'eval_samples_per_second': 117.876,
 'eval_steps_per_second': 23.585}

In [None]:
# Evaluation Details

# TODO


In [52]:
# Try custom sample

# Three hand-written messages for spot-checking the classifier.
txt1 = """

Dear Luke Ingram

Thank you for starting your application for our Data Science-Machine Learning Engineer position at Zynex Medical . We noticed that you have not fully completed your application.

You may continue the application process by navigating to the following page: Please Click Here to Complete Your Application

We hope you consider completing and submitting your application before the position is no longer available.

Thank you for your interest in our company, we look forward to seeing your completed application!
"""

txt2 = "Amazon here: click here to reset your passowrd www.samazon.com/passwordreset"

txt3 = "Check out this profile in linkedin: www.linkedin.com/person333"

# Same tokenizer settings as `tokenize_function`, applied to a single string.
encode_kwargs = dict(
    max_length=512,
    truncation=True,
    padding="max_length",
    add_special_tokens=True,
    return_tensors='pt',
)
tokens = tokenizer.encode_plus(txt3, **encode_kwargs)

tokens



{'input_ids': tensor([[  101,  4638,  2041,  2023,  6337,  1999,  5799,  2378,  1024,  7479,
          1012,  5799,  2378,  1012,  4012,  1013,  2711, 22394,  2509,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [53]:
# Classify the encoded sample and show the per-class probabilities.

# Inference mode: make sure dropout is disabled.
model.eval()

# Move the inputs to whatever device the model lives on instead of assuming
# CUDA, so the cell also works on CPU-only machines.
tokens = tokens.to(model.device)

# No gradients are needed for prediction; skipping them saves memory.
with torch.no_grad():
    output = model(**tokens)

# Softmax over the last dimension turns the logits into probabilities.
probs = F.softmax(output.logits, dim=-1)

probs

tensor([[1.0000e+00, 9.3530e-09]], device='cuda:0', grad_fn=<SoftmaxBackward0>)