In [1]:
import pandas as pd

# Read the raw CSV into a DataFrame.
dataset = pd.read_csv("Copyphishing.csv")

# Build the modelling frame in one chain: keep the two columns of interest,
# drop rows with a missing value in either, rename them to `text` / `label`,
# and cast the label column to int.
cleaned_dataset = (
    dataset[['Feature', 'Boolean']]
    .dropna()
    .rename(columns={'Feature': 'text', 'Boolean': 'label'})
    .astype({'label': int})
)

print(cleaned_dataset.head())

                                                text  label
0  Dear eBay User ,\n\nAfter fraud complaints fro...      1
1  Dear valued customer, Our records indicate tha...      1
2  Dear Key Bank customer.  Please read this mess...      1
3  Dear Key Bank customer.  Please read this mess...      1
4  LEGAL NOTICE The following message is an email...      1


In [2]:
from transformers import AutoTokenizer
from datasets import Dataset

# loading BERT Tokenizer...
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Convert the cleaned DataFrame into a Hugging Face Dataset.
# preserve_index=False: dropping NaN rows can leave gaps in the pandas index,
# and from_pandas would otherwise carry such an index along as an extra
# `__index_level_0__` column next to `text` and `label`.
dataset = Dataset.from_pandas(cleaned_dataset, preserve_index=False)

def tokenize_function(examples):
    """Tokenize one batch of examples with the notebook-level `tokenizer`.

    Args:
        examples: Batch mapping supplied by `Dataset.map(..., batched=True)`;
            only its 'text' entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Deliberately no padding="max_length": padding every email to the model
    # maximum makes each training batch as large as possible regardless of the
    # actual text length. The Trainer is given the tokenizer, so it pads each
    # batch dynamically to that batch's longest sequence instead.
    return tokenizer(examples['text'], truncation=True)

# Run the tokenizer over the dataset batch by batch.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Preview the first tokenized record as a sanity check.
first_example = tokenized_dataset[0]
print("Tokenized Data : \n")
print(first_example)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 8194/8194 [00:01<00:00, 4356.54 examples/s]

Tokenized Data : 

{'text': "Dear eBay User ,\\n\\nAfter fraud complaints from the eBay members, the eBay Inc. had\\ndeveloped a security program against the fraudulend attempts of accounts\\nthefts. For that we have to securise all the members informations by\\nupdating and checking the registrated informations. Please update  your\\ninformation by completing the form from the forwarded link so we can check\\nyour account validity and your identity\\nand login to eBay in order to update your informations.\\nThis process will take 5 days, period when you will not be able to\\nacces your eBay account. After this period you will receive instructions to\\nenter and securise your eBay account.Please click the link below and sign in into your account: http://signin.ebay.com/aw-cgi/eBayISAPI.dll?SignIn&ssPageName=h:h:sin:US -- As outlined in our User Agreement, eBay will periodically send you information about site changes and enhancements. Visit our Privacy Policy and User Agreement if you 




In [3]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import torch

# training -> 80%, validation -> 20%
# A fixed seed keeps the shuffled split identical across kernel restarts, and
# the splits are picked by key rather than by relying on dict value order.
split_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

# Pretrained BERT encoder with a freshly initialised 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluation / logging / checkpoint cadence
    evaluation_strategy="epoch",
    logging_steps=10,
    save_steps=500,
    # Do not report to any external experiment tracker
    report_to="none",
)

# Wire the model, training arguments, both data splits, and the tokenizer into
# a Trainer, then start fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  0%|          | 5/2050 [00:52<5:55:53, 10.44s/it]

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Compute binary classification metrics from a Trainer evaluation result.

    Args:
        eval_pred: Pair of (logits, labels) as handed over by the Trainer.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1" for the
        positive class (average="binary").
    """
    logits, labels = eval_pred
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Hand scikit-learn a plain NumPy array instead of a torch tensor.
    predictions = torch.argmax(torch.as_tensor(logits), dim=-1).numpy()
    # zero_division=0 keeps the score at 0 without emitting a warning when a
    # class is never predicted (e.g. very early in training).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Attach the metric function to the existing trainer so that evaluate()
# reports accuracy / precision / recall / F1 in addition to the loss.
trainer.compute_metrics = compute_metrics

# Evaluate on the trainer's eval dataset and show the resulting metrics.
evaluation_results = trainer.evaluate()
print("Evaluation Metrics:", evaluation_results)

In [None]:
import optuna  # Optuna is an open-source hyperparameter optimization framework
from transformers import Trainer, TrainingArguments

def model_training(trial):
    """Optuna objective: fine-tune a fresh classifier and return its validation F1.

    Args:
        trial: Optuna trial used to sample the learning rate, the per-device
            training batch size, and the number of epochs.

    Returns:
        The "eval_f1" value reported by `trainer.evaluate()` on `val_dataset`.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_epochs", 3, 6)

    training_args = TrainingArguments(
        # One directory per trial so checkpoints of different trials do not mix.
        output_dir=f"./results/trial_{trial.number}",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        # Same as the main training run: no external experiment trackers.
        report_to="none",
    )

    # Start every trial from the pretrained checkpoint. Reusing the notebook's
    # global `model` would let each trial continue from the weights left by
    # the previous one, making the trials impossible to compare.
    trial_model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )

    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        # Without this, evaluate() reports no F1 and the lookup below
        # raises KeyError.
        compute_metrics=compute_metrics,
    )

    trainer.train()
    evaluation_results = trainer.evaluate()
    return evaluation_results["eval_f1"]

# Running Optuna study ...
# Maximise the objective's return value (validation F1). The sampler is seeded
# so the sequence of suggested hyperparameters is repeatable between runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(model_training, n_trials=10)

print("Best Hyperparameters:", study.best_params)