In [None]:
# ! pip install xgboost

In [1]:
# Import basic libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the phishing URL dataset; the path is relative to the working directory.
df = pd.read_csv('Dataset/phishing_site_urls.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.Label.value_counts()

In [None]:
import re

In [None]:
def tokenize_url(url):
    """Split a URL into its word-like tokens.

    A token is a maximal run of word characters (letters, digits, underscore);
    every other character acts as a separator.  No empty-string tokens are
    produced when the URL starts or ends with a separator (for example a
    trailing ``/``), so they cannot end up in a vocabulary built from the
    result.

    Args:
        url: URL string to tokenize.

    Returns:
        list[str]: tokens in order of appearance; an empty list when ``url``
        contains no word characters.
    """
    # findall on the word-character class yields exactly the non-empty pieces
    # that splitting on the complementary class would produce.
    return re.findall(r'\w+', url)

In [None]:
# Add a 'tokens' column holding the tokenize_url() result for every URL.
df['tokens'] = df['URL'].apply(tokenize_url)

In [None]:
from gensim.models import Word2Vec

In [None]:
# Fit a Word2Vec model, treating each URL's token list as one sentence
w2v_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Create URL embedding by averaging token vectors
def get_embedding(tokens):
    """Embed a tokenized URL as the mean of its tokens' Word2Vec vectors.

    Relies on the module-level ``w2v_model``.

    Args:
        tokens: iterable of token strings belonging to one URL.

    Returns:
        numpy.ndarray: 1-D vector of length ``w2v_model.vector_size``; all
        zeros when none of the tokens is in the model vocabulary.
    """
    vectors = [w2v_model.wv[t] for t in tokens if t in w2v_model.wv]
    if not vectors:
        # Use the dtype of the trained vectors so that stacking the per-URL
        # embeddings is not upcast by a handful of float64 zero rows.
        return np.zeros(w2v_model.vector_size, dtype=w2v_model.wv.vectors.dtype)
    return np.mean(vectors, axis=0)

In [None]:
# Compute one averaged embedding vector per URL from its token list.
df['embedding'] = df['tokens'].apply(get_embedding)

In [None]:
# Feature matrix: one row per URL, one column per embedding dimension.
x = np.vstack(df['embedding'].values)
# Encode the string labels as integers (good -> 0, bad -> 1), the same mapping
# applied for the transformer model further down.  The XGBoost classifier
# configured below expects integer class ids, not raw strings.
y = df['Label'].map({'good': 0, 'bad': 1}).values

In [None]:
df.head()

### Train traditional machine learning algorithms

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Train/test split (80/20).  Stratifying on the label keeps the class ratio
# the same in both partitions; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
import xgboost as xgb

In [None]:
# Build the XGBoost classifier.
# `use_label_encoder` is deprecated in recent XGBoost releases, where passing
# it only produces an "unused parameter" warning, so it is no longer supplied.
model_xgb = xgb.XGBClassifier(eval_metric='logloss')

In [None]:
# model_xgb.fit(x_train, y_train)

### LLM Model Training

In [3]:
# Take an independent copy of the two needed columns so the label edits in the
# following cells neither touch `df` nor trigger pandas' SettingWithCopyWarning.
data = df[['URL', 'Label']].copy()

In [4]:
data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [5]:
# Encode the string labels as integers: good -> 0, bad -> 1.
encoded_labels = data['Label'].map({'good': 0, 'bad': 1})
# Values outside the mapping become NaN (this also happens if the cell is run
# a second time on already-encoded labels); stop here instead of silently
# carrying NaN labels into training.
if encoded_labels.isna().any():
    raise ValueError("Unmapped values in 'Label'; expected only 'good' or 'bad'")
# `assign` builds a new frame rather than writing into what may be a view of
# `df`, which avoids pandas' SettingWithCopyWarning.
data = data.assign(Label=encoded_labels)

In [6]:
# Rename the target column to lowercase 'label'.  Rebinding the result instead
# of using inplace=True avoids mutating a frame that may be a view of `df`.
data = data.rename(columns={'Label': 'label'})

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd

In [8]:
# Convert the pandas frame (columns 'URL' and 'label') to a HuggingFace Dataset
dataset = Dataset.from_pandas(data)

In [9]:
# Load the pretrained 'roberta-base' tokenizer and the matching model with a
# two-class sequence-classification head.  The head weights are newly
# initialized (see the warning printed below), so the model must be
# fine-tuned before its predictions mean anything.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def tokenize(example, max_length=None):
    """Tokenize the 'URL' field with the module-level ``tokenizer``.

    Every sequence is truncated and padded to a fixed length so that all
    examples come out the same size.

    Args:
        example: mapping with a 'URL' entry; a list of strings when called
            through ``Dataset.map(..., batched=True)``.
        max_length: target length for truncation and padding.  The default
            ``None`` preserves the previous behaviour, where the tokenizer
            falls back to the model's maximum input length.  A smaller value
            can be supplied (e.g. via ``Dataset.map(..., fn_kwargs=...)``) to
            cut the amount of padding stored per example.

    Returns:
        The tokenizer's output for the given URLs.
    """
    return tokenizer(
        example['URL'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

In [11]:
# Tokenize every URL, processing the dataset in batches.
dataset = dataset.map(tokenize, batched=True)
# Hold out 1% of the rows for evaluation.  The fixed seed makes the split
# identical across kernel restarts, so evaluation numbers are comparable.
dataset = dataset.train_test_split(test_size=0.01, seed=42)

Map:   0%|          | 0/549346 [00:00<?, ? examples/s]

In [12]:
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",  # directory the run writes its outputs to
    eval_strategy="epoch",  # evaluate once per epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
)

In [13]:
# Wire the model, the hyperparameters and the two dataset partitions together.
# No compute_metrics callback is supplied, so evaluation reports loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Save the model and its tokenizer side by side in the "phishing_model"
# directory so they can be reloaded together.
model.save_pretrained("phishing_model")
tokenizer.save_pretrained("phishing_model")

In [None]:
# Load model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload both artifacts from the "phishing_model" directory written by the
# save cell, replacing the in-memory `model` and `tokenizer`.
model = AutoModelForSequenceClassification.from_pretrained("phishing_model")
tokenizer = AutoTokenizer.from_pretrained("phishing_model")

In [None]:
# prediction
import torch

urls = ["http://phishy.site", "https://secure.bank.com"]
# NOTE(review): the training rows displayed by data.head() carry no
# "http(s)://" prefix while these samples do — consider stripping the scheme
# so inference inputs look like the training data.
inputs = tokenizer(urls, padding=True, truncation=True, return_tensors="pt")
# Keep the inputs on the model's device so the cell also works when the model
# is still on an accelerator after training.
inputs = inputs.to(model.device)

# Switch off dropout; a model used straight after training is still in
# training mode and would give non-deterministic predictions.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
# Highest-scoring class per URL.
predictions = torch.argmax(logits, dim=1)
print(predictions)  # one class id per URL: 0 = good, 1 = bad

In [None]:
# Translate the predicted class ids back into their string labels
id2label = {0: 'good', 1: 'bad'}
pred_labels = [id2label[class_id] for class_id in predictions.tolist()]
print(pred_labels)