# Dataset Exploration

In [None]:
import pandas as pd

In [None]:
# Load the PhiUSIIL phishing-URL CSV into a DataFrame.
# NOTE(review): absolute Colab path — the file has to be present under /content
# before this cell runs on a fresh runtime.
df = pd.read_csv('/content/PhiUSIIL_Phishing_URL_Dataset.csv')

In [None]:
# Preview the first rows to sanity-check that the columns parsed as expected.
df.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0.0,0.0,1.0,34.0,20.0,28.0,119.0,0.0,124.0,1.0
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0.0,0.0,1.0,50.0,9.0,8.0,39.0,0.0,217.0,1.0
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0.0,0.0,1.0,10.0,2.0,7.0,42.0,2.0,5.0,1.0
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1.0,1.0,1.0,3.0,27.0,15.0,22.0,1.0,31.0,1.0
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1.0,0.0,1.0,244.0,15.0,34.0,72.0,1.0,85.0,1.0


In [None]:
# (rows, columns) of the raw frame.
df.shape

(122279, 56)

In [None]:
# Class balance of the target column. dropna=False keeps missing labels in the
# tally; with the default they are silently left out, so the counts would not
# add up to the number of rows in the frame.
df['label'].value_counts(dropna=False)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1.0,70376
0.0,51902


In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [None]:
# Missing-value count per column.
df.isna().sum()

Unnamed: 0,0
FILENAME,0
URL,0
URLLength,0
Domain,0
DomainLength,0
IsDomainIP,0
TLD,0
URLSimilarityIndex,0
CharContinuationRate,0
TLDLegitimateProb,0


In [None]:
# Column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122279 entries, 0 to 122278
Data columns (total 56 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   FILENAME                    122279 non-null  object 
 1   URL                         122279 non-null  object 
 2   URLLength                   122279 non-null  int64  
 3   Domain                      122279 non-null  object 
 4   DomainLength                122279 non-null  int64  
 5   IsDomainIP                  122279 non-null  int64  
 6   TLD                         122279 non-null  object 
 7   URLSimilarityIndex          122279 non-null  float64
 8   CharContinuationRate        122279 non-null  float64
 9   TLDLegitimateProb           122279 non-null  float64
 10  URLCharProb                 122279 non-null  float64
 11  TLDLength                   122279 non-null  int64  
 12  NoOfSubDomain               122279 non-null  int64  
 13  HasObfuscation

# Dataset Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep only the raw URL text and its target label; `df` is rebound to this
# two-column frame from here on.
df = df.loc[:, ['URL', 'label']]

In [None]:
# Discard rows whose label is missing so the stratified splits below see a
# fully labelled frame.
df = df[df['label'].notna()]

In [None]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])

In [None]:
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df['label'])

In [None]:
# Report the row count of each split.
print(f"Training set: {len(train_df)}, Validation set: {len(val_df)}, Test set: {len(test_df)}")

Training set: 85594, Validation set: 18342, Test set: 18342


In [None]:
# Per-split class counts, to check that stratification kept the label ratio
# comparable across train, validation and test.
print("Training label distribution:\n", train_df['label'].value_counts())
print("Validation label distribution:\n", val_df['label'].value_counts())
print("Test label distribution:\n", test_df['label'].value_counts())

Training label distribution:
 label
1.0    49263
0.0    36331
Name: count, dtype: int64
Validation label distribution:
 label
1.0    10556
0.0     7786
Name: count, dtype: int64
Test label distribution:
 label
1.0    10557
0.0     7785
Name: count, dtype: int64


# Dataset Preprocessing

In [None]:
from transformers import BertTokenizer

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint (downloaded on
# first use, as the progress bars below show).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
def tokenize_function(examples):
  """Tokenize the 'URL' entry of `examples` with the module-level tokenizer.

  Sequences are truncated and padded with padding="max_length".

  NOTE(review): nothing in the visible notebook calls this helper; the splits
  are tokenized by calling `tokenizer` directly — confirm whether it is still
  needed.
  """
  urls = examples['URL']
  encoded = tokenizer(urls, truncation=True, padding="max_length")
  return encoded

In [None]:
# Tokenize every training URL in one call, truncating at 512 tokens.
train_encodings = tokenizer(train_df['URL'].tolist(), padding=True, max_length=512, truncation=True)

In [None]:
# Tokenize every validation URL in one call, truncating at 512 tokens.
val_encodings = tokenizer(val_df['URL'].tolist(), padding=True, max_length=512, truncation=True)

In [None]:
# Tokenize every test URL in one call, truncating at 512 tokens.
test_encodings = tokenizer(test_df['URL'].tolist(), padding=True, max_length=512, truncation=True)

# PyTorch Datasets

In [None]:
import torch

In [None]:
class TextDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with per-sample labels.

  Args:
    encodings: Mapping from feature name to a sequence holding one entry per
      sample (anything exposing ``.items()``).
    labels: Sequence of labels, one per sample.

  Raises:
    ValueError: If any feature holds a different number of entries than
      ``labels``.
  """

  def __init__(self, encodings, labels):
    # Fail fast on misaligned inputs. __len__ is driven by the labels alone, so
    # a mismatch would otherwise surface later as an IndexError during
    # iteration, or as samples that are silently never served.
    for key, val in encodings.items():
      if len(val) != len(labels):
        raise ValueError(
            f"encodings[{key!r}] has {len(val)} entries but there are {len(labels)} labels"
        )
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    """Return sample `idx` as a dict of tensors, with its label under 'labels'."""
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    """Number of samples, taken from the label sequence."""
    return len(self.labels)

In [None]:
def _to_int_labels(frame):
  """Return frame['label'] as a list of Python ints restricted to {0, 1}.

  Series.map leaves NaN for any value missing from the mapping, which would
  quietly turn the whole list into floats. Reject that case up front instead
  of handing float labels to the dataset.

  Raises:
    ValueError: If the label column holds anything other than 0 or 1.
  """
  mapped = frame['label'].map({0: 0, 1: 1})
  if mapped.isna().any():
    raise ValueError("label column contains values other than 0 and 1")
  return mapped.astype(int).tolist()

train_labels = _to_int_labels(train_df)
val_labels = _to_int_labels(val_df)
test_labels = _to_int_labels(test_df)

In [None]:
# Wrap each split's encodings and labels in a TextDataset.
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Model Setup

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Load the "bert-base-uncased" checkpoint with a 2-class classification head.
# The head weights are newly initialised (see the warning printed below), so
# the model must be fine-tuned before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training the Model

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Run configuration for the Trainer.
# - Evaluation and logging both happen once per epoch. The former
#   `logging_steps=50` was dropped because a step interval is not used when
#   logging_strategy is "epoch".
# - fp16 is enabled only when a CUDA device is present; hard-coding True makes
#   this cell fail on a CPU-only runtime.
# - Checkpoints are written every 500 steps and only the most recent is kept.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to="none",
    fp16=torch.cuda.is_available(),
    logging_strategy="epoch",
    save_total_limit=1,
    save_steps=500,
)



In [None]:
def compute_metrics(pred):
    """Compute binary precision, recall and F1 for an evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores); the predicted class is the arg-max over the
            last axis of the scores.

    Returns:
        Dict with the keys 'precision', 'recall' and 'f1'.

    NOTE(review): average='binary' scores a single positive class — confirm
    that the class it picks is the one of interest for this dataset.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    # zip() stops after three names, so the trailing support entry is dropped.
    return dict(zip(('precision', 'recall', 'f1'), scores))

In [None]:
# Bind the model, the run configuration, the train/validation datasets and
# the metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Disable Weights & Biases through its environment switch.
# NOTE(review): TrainingArguments is already built with report_to="none" and
# this runs after the Trainer was constructed — confirm whether it is still
# needed.
import os
os.environ["WANDB_MODE"] = "disabled"

In [None]:
# Fine-tune for the configured number of epochs; val_dataset is evaluated at
# the end of each epoch (evaluation_strategy="epoch").
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0198,0.011971,0.996789,1.0,0.998392
2,0.0119,0.011055,0.997259,0.999716,0.998486


TrainOutput(global_step=5350, training_loss=0.015889821275372373, metrics={'train_runtime': 4368.447, 'train_samples_per_second': 39.187, 'train_steps_per_second': 1.225, 'total_flos': 4.504145534496768e+16, 'train_loss': 0.015889821275372373, 'epoch': 2.0})

# Evaluation

In [None]:
# Final evaluation on the held-out test split, which is not passed to the
# Trainer as either train_dataset or eval_dataset.
test_results = trainer.evaluate(test_dataset)
print(test_results)

{'eval_loss': 0.011141737923026085, 'eval_precision': 0.9971655328798186, 'eval_recall': 0.9997158283603297, 'eval_f1': 0.9984390520788988, 'eval_runtime': 128.5934, 'eval_samples_per_second': 142.636, 'eval_steps_per_second': 4.464, 'epoch': 2.0}


# Save Model

In [None]:
# Destination for the exported model and tokenizer.
# NOTE(review): absolute Colab path; presumably lost when the runtime is
# recycled — confirm the export is copied somewhere persistent.
save_directory = "/content/results_model"

In [None]:
# Persist the fine-tuned model (through the Trainer) and the tokenizer into
# save_directory.
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

('/content/results_model/tokenizer_config.json',
 '/content/results_model/special_tokens_map.json',
 '/content/results_model/vocab.txt',
 '/content/results_model/added_tokens.json')

In [None]:
# NOTE(review): writes into the same save_directory that the preceding cell
# already populated via trainer.save_model(...) and tokenizer.save_pretrained(...)
# — this looks redundant; confirm before removing.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/content/results_model/tokenizer_config.json',
 '/content/results_model/special_tokens_map.json',
 '/content/results_model/vocab.txt',
 '/content/results_model/added_tokens.json')

In [None]:
# Export the model and tokenizer a second time, into the Trainer's output_dir.
model.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json')

In [None]:
# Additionally dump the raw state_dict into the Trainer's output directory.
# The path is derived from training_args.output_dir instead of repeating the
# './results' literal, so it cannot drift if output_dir is changed.
# NOTE(review): model.save_pretrained(training_args.output_dir) already wrote
# the weights there — confirm this extra copy is wanted.
torch.save(model.state_dict(), f"{training_args.output_dir}/pytorch_model.bin")