In [13]:
!pip install torch transformers




In [8]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [1]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip install datasets
!pip install tqdm



In [3]:
from datasets import load_dataset
# Load the phishing-site classification dataset from the Hugging Face Hub.
dataset_dict = load_dataset(path="shawhin/phishing-site-classification")

In [4]:
# Checkpoint that both the tokenizer and the classifier are loaded from.
model_path = "google-bert/bert-base-uncased"

# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Label maps for the two-class head; label2id is the inverse of id2label.
id2label = {0: 'Safe', 1: 'Not Safe'}
label2id = {label: idx for idx, label in id2label.items()}

# Sequence classifier with one output per label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Freeze every base-model parameter except those whose name contains 'pooler'
# (the pooling layer), which stay trainable.
for name, param in model.base_model.named_parameters():
    param.requires_grad = 'pooler' in name

In [6]:
def preprocess_function(examples):
    """Tokenize the 'text' field of `examples` with truncation enabled."""
    texts = examples['text']
    return tokenizer(texts, truncation=True)


# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 450/450 [00:00<00:00, 9610.36 examples/s]


In [7]:
# Display the tokenized splits (columns and row counts) as the cell output.
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
})

## Create a data collator that dynamically pads the token sequences in each batch during training so they all have the same length

In [8]:
# Collator built from the tokenizer; passed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Metrics

## Hyperparameters

In [21]:

# Metric implementations loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")  # fed with positive-class probabilities in compute_metrics


def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC for a set of predictions.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is indexed
            as ``[:, 1]`` and arg-maxed over axis 1, so it is expected to be a
            2-D array of per-class scores with the positive class in column 1.

    Returns:
        dict: ``{"Accuracy": ..., "AUC": ...}``, each rounded to 3 decimals.
    """
    predictions, labels = eval_pred

    # Numerically stable softmax: subtracting the row-wise maximum does not
    # change the result mathematically, but keeps np.exp from overflowing to
    # inf (which would turn the probabilities into NaN) for large scores.
    shifted = predictions - np.max(predictions, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)

    # ROC AUC is computed from the probability of the positive class.
    positive_class_probs = probabilities[:, 1]
    auc = np.round(
        auc_score.compute(
            prediction_scores=positive_class_probs,
            references=labels,
        )['roc_auc'],
        3,
    )

    # Accuracy is computed from the highest-scoring class per row.
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(
        accuracy.compute(
            predictions=predicted_classes,
            references=labels,
        )['accuracy'],
        3,
    )

    return {"Accuracy": acc, "AUC": auc}

In [18]:
!pip install transformers[torch] accelerate>=0.26.0

In [22]:
# Hyperparameters
lr, batch_size, num_epochs = 2e-4, 8, 10

training_args = TrainingArguments(
    output_dir="bert-phising-classifier_teacher",
    # optimisation
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # log, evaluate and checkpoint once per epoch
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # reload the best checkpoint when training finishes
    load_best_model_at_end=True,
)

In [23]:
# Build the Trainer. Per-epoch evaluation runs on the 'test' split; the
# 'validation' split is not passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.2534,0.32512,0.864,0.948
2,0.3935,0.363376,0.851,0.949
3,0.3596,0.315647,0.869,0.947
4,0.3422,0.453723,0.822,0.95
5,0.3513,0.310607,0.862,0.951
6,0.3554,0.277857,0.876,0.955
7,0.3217,0.29433,0.873,0.952
8,0.3037,0.285416,0.88,0.953
9,0.3158,0.275615,0.876,0.954
10,0.3077,0.290084,0.869,0.955


TrainOutput(global_step=2630, training_loss=0.3304322710508629, metrics={'train_runtime': 1529.6084, 'train_samples_per_second': 13.729, 'train_steps_per_second': 1.719, 'total_flos': 706603239165360.0, 'train_loss': 0.3304322710508629, 'epoch': 10.0})

In [24]:
# Run the trained model over the 'validation' split.
predictions = trainer.predict(tokenized_data["validation"])

# Pull the raw model outputs and the reference labels out of the result.
logits, labels = predictions.predictions, predictions.label_ids

# Score them with the same metric function used during training.
metrics = compute_metrics((logits, labels))
print(metrics)


{'Accuracy': 0.896, 'AUC': 0.947}
