In [3]:
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split


In [20]:
# Read the dataset directly from the zip archive and show the resulting frame.
df = pd.read_csv(filepath_or_buffer="Dataset.zip")
df

Unnamed: 0,Type,url_length,number_of_dots_in_url,having_repeated_digits_in_url,number_of_digits_in_url,number_of_special_char_in_url,number_of_hyphens_in_url,number_of_underline_in_url,number_of_slash_in_url,number_of_questionmark_in_url,...,having_digits_in_subdomain,number_of_digits_in_subdomain,having_repeated_digits_in_subdomain,having_path,path_length,having_query,having_fragment,having_anchor,entropy_of_url,entropy_of_domain
0,0,37,2,0,0,8,0,0,5,0,...,0,0,1,0,3,0,0,0,4.010412,2.751629
1,1,70,5,0,0,12,0,0,6,0,...,0,0,1,0,4,0,0,0,4.089470,3.532573
2,0,42,2,0,6,8,0,0,3,1,...,0,0,1,0,1,1,0,0,4.386016,3.344698
3,0,46,2,0,0,7,0,0,4,0,...,0,0,1,0,2,0,0,0,4.221947,3.189898
4,0,51,3,0,0,9,0,0,5,0,...,0,0,1,0,3,0,0,0,4.103538,2.952820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247945,0,42,1,0,0,6,0,0,4,0,...,0,0,1,0,2,0,0,0,3.758289,3.323231
247946,0,42,2,0,0,8,0,0,5,0,...,0,0,1,0,3,0,0,0,3.937093,3.026987
247947,1,33,2,0,0,8,0,0,5,0,...,0,0,1,0,3,0,0,0,3.813207,3.327820
247948,1,83,1,1,19,9,0,0,7,0,...,0,0,1,0,5,0,0,0,4.540173,3.375000


In [5]:
# Every column except the "Type" label is a feature; "Type" itself is the target.
# Both are converted to float32 tensors.
X = torch.tensor(df.drop(columns="Type").to_numpy(), dtype=torch.float32)
y = torch.tensor(df["Type"].to_numpy(), dtype=torch.float32)
X.shape, y.shape

(torch.Size([247950, 41]), torch.Size([247950]))

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(
    f"X_train shape: {X_train.shape}  |   X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}      |   y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: torch.Size([198360, 41])  |   X_test shape: torch.Size([49590, 41])
y_train shape: torch.Size([198360])      |   y_test shape: torch.Size([49590])


In [7]:
class PhishingClassifier(nn.Module):
    """Feed-forward network with one hidden layer that emits a single raw output per row.

    Args:
        input_size: Number of features in each input vector.
        hidden_layer: Width of the hidden linear layer.
    """

    def __init__(self, input_size, hidden_layer):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay as they are.
        self.layer_1 = nn.Linear(in_features=input_size, out_features=hidden_layer)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(in_features=hidden_layer, out_features=1)

    def forward(self, x):
        """Return the output of the final linear layer for ``x``; no sigmoid is applied."""
        hidden = self.layer_1(x)
        activated = self.relu(hidden)
        return self.layer_2(activated)

In [8]:
# Seed PyTorch's RNG before building the model: the layer weights are drawn
# randomly at construction time, so seeding only later (in the training cell)
# would leave the starting point different on every fresh run.
torch.manual_seed(42)
model_0 = PhishingClassifier(input_size=X.shape[1], hidden_layer=32)
model_0

PhishingClassifier(
  (layer_1): Linear(in_features=41, out_features=32, bias=True)
  (relu): ReLU()
  (layer_2): Linear(in_features=32, out_features=1, bias=True)
)

In [9]:
# Accuracy block
def accuracy_fn(y_true, y_pred):
    """Return the percentage of predictions that equal their labels.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels holding the same number of elements
            as ``y_true``; an ``(N, 1)`` column is accepted for ``(N,)`` labels.

    Returns:
        Accuracy as a float between 0 and 100.

    Raises:
        RuntimeError: If ``y_pred`` cannot be reshaped to the shape of ``y_true``.
        ZeroDivisionError: If the tensors contain no elements.
    """
    # Align shapes first: comparing (N,) against (N, 1) directly would broadcast
    # to an (N, N) grid and silently inflate the number of "matches".
    y_pred = y_pred.reshape(y_true.shape)
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / y_true.numel()) * 100
    return acc

In [10]:
# Forward pass over the test split before any training has happened.
# The values shown are the model's raw outputs (no sigmoid applied).
# eval() is the conventional switch before inference; the training loop calls
# train() again at the start of every epoch.
model_0.eval()
with torch.inference_mode():
    initial_pred = model_0(X_test)
initial_pred

tensor([[2.3754],
        [2.9292],
        [4.7695],
        ...,
        [4.7660],
        [6.5536],
        [2.4911]])

In [11]:
# Loss function and optimizer
# BCEWithLogitsLoss is fed the model's raw outputs; the model's forward pass
# applies no sigmoid of its own.
loss_fn = nn.BCEWithLogitsLoss()
# Plain SGD over all of model_0's parameters with a fixed learning rate.
optimizer = torch.optim.SGD(params=model_0.parameters(), lr=0.01)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [12]:
# Training loop: every epoch is one gradient step computed on the whole training split.
torch.manual_seed(42)
epochs = 2000

for epoch in range(epochs):
    model_0.train()

    # Clear the gradients left by the previous step before building a new graph.
    optimizer.zero_grad()

    # Forward pass: one raw output per training row, flattened to match y_train.
    y_logits = model_0(X_train).squeeze()

    # The loss works on the raw outputs; backward fills the gradients, step applies them.
    loss = loss_fn(y_logits, y_train)
    loss.backward()
    optimizer.step()

    # Training accuracy from this epoch's (pre-update) outputs, thresholded via round().
    y_pred = torch.round(torch.sigmoid(y_logits))
    acc = accuracy_fn(y_true=y_train, y_pred=y_pred)

    # Only report every tenth epoch.
    if epoch % 10 != 0:
        continue
    print(f"Epoch: {epoch} | Loss: {loss}")
    print(f"Accuracy: {acc}")

Epoch: 0 | Loss: 2.0590240955352783
Accuracy: 48.138233514821536
Epoch: 10 | Loss: 0.6385346055030823
Accuracy: 64.42226255293406
Epoch: 20 | Loss: 0.5984965562820435
Accuracy: 66.49879007864489
Epoch: 30 | Loss: 0.5807524919509888
Accuracy: 68.41802782819116
Epoch: 40 | Loss: 0.569290041923523
Accuracy: 70.32163742690058
Epoch: 50 | Loss: 0.5614228844642639
Accuracy: 71.56130268199233
Epoch: 60 | Loss: 0.5552595853805542
Accuracy: 72.08862673926195
Epoch: 70 | Loss: 0.5524211525917053
Accuracy: 72.12542851381328
Epoch: 80 | Loss: 0.5709681510925293
Accuracy: 68.82990522282718
Epoch: 90 | Loss: 0.5768841505050659
Accuracy: 68.15739060294415
Epoch: 100 | Loss: 0.5521175265312195
Accuracy: 71.25378100423472
Epoch: 110 | Loss: 0.5427201986312866
Accuracy: 72.35682597297843
Epoch: 120 | Loss: 0.54136061668396
Accuracy: 72.376487194999
Epoch: 130 | Loss: 0.5439746379852295
Accuracy: 71.79068360556565
Epoch: 140 | Loss: 0.5415635108947754
Accuracy: 72.04980842911878
Epoch: 150 | Loss: 0.5354

In [13]:
# Evaluate on the held-out split. The training loop leaves the model in train
# mode, so switch to eval mode explicitly before measuring.
model_0.eval()
with torch.inference_mode():
    # Raw outputs -> sigmoid -> rounded to hard 0/1 predictions.
    new_pred = torch.round(torch.sigmoid(model_0(X_test).squeeze()))
    ac = accuracy_fn(y_true=y_test, y_pred=new_pred)

print(f"Prediction: {new_pred}\nAccuracy: {ac}")

Prediction: tensor([0., 1., 0.,  ..., 0., 0., 0.])
Accuracy: 79.81044565436581


In [14]:
# Persist only the model's state_dict (its parameters), not the module object itself.
torch.save(model_0.state_dict(), f='Phisher Model.pth')

In [15]:
import re
import math
import numpy as np
from urllib.parse import urlparse
from collections import Counter

def shannon_entropy(text):
    """Return the Shannon entropy, in bits per character, of ``text``.

    Falsy input (empty string or ``None``) yields ``0.0``.
    """
    if not text:
        return 0.0
    total = len(text)
    frequencies = Counter(text)
    # Built-in sum() is kept on purpose so the floating-point result matches
    # a straightforward summation over the per-character probabilities.
    return -sum(
        (occurrences / total) * math.log2(occurrences / total)
        for occurrences in frequencies.values()
    )



def extract_features_ordered(url: str):
    """Build the 41-value feature vector for ``url``.

    Values are emitted in the order of the column names noted beside each
    entry: URL-level counts, domain-level counts, subdomain statistics,
    path/query/fragment flags, then two entropy values. The ``Type`` label is
    not part of the output.

    URLs given without a scheme (e.g. ``"example.com/login"``) are parsed as
    if they began with ``//`` so the host is still reported as the domain
    rather than being mistaken for the path. URL-level counts are always taken
    from the string exactly as passed in.

    Args:
        url: The URL to featurise.

    Returns:
        A 1-D ``numpy.float32`` array of length 41.
    """
    # NOTE(review): these feature definitions are re-derived from the column
    # names. The training-data preview in this notebook shows path_length
    # values of 1-5, which does not look like a character count as computed
    # below — verify each feature against the dataset's own extraction.

    # urlparse only fills in netloc when the string has a "scheme://" or "//"
    # prefix; without one, the host would end up inside parsed.path and every
    # domain/subdomain feature would be zero.
    has_authority_prefix = url.startswith("//") or bool(
        re.match(r"[a-zA-Z][a-zA-Z0-9+.\-]*://", url)
    )
    parsed = urlparse(url if has_authority_prefix else "//" + url)
    domain = parsed.netloc  # includes any port / userinfo present in the URL
    path = parsed.path
    query = parsed.query
    fragment = parsed.fragment

    # Anything that is not an ASCII letter or digit counts as "special".
    special_chars = r"[^a-zA-Z0-9]"
    # The same digit occurring at least twice in a row.
    repeated_digit = r'(\d)\1+'

    features = []

    # The "Type" label is the prediction target, so nothing is emitted for it.

    # ---------------- URL LEVEL ----------------
    features.append(len(url))                                    # url_length
    features.append(url.count('.'))                              # number_of_dots_in_url
    features.append(int(bool(re.search(repeated_digit, url))))   # having_repeated_digits_in_url
    features.append(sum(c.isdigit() for c in url))               # number_of_digits_in_url
    features.append(len(re.findall(special_chars, url)))         # number_of_special_char_in_url
    features.append(url.count('-'))                              # number_of_hyphens_in_url
    features.append(url.count('_'))                              # number_of_underline_in_url
    features.append(url.count('/'))                              # number_of_slash_in_url
    features.append(url.count('?'))                              # number_of_questionmark_in_url
    features.append(url.count('='))                              # number_of_equal_in_url
    features.append(url.count('@'))                              # number_of_at_in_url
    features.append(url.count('$'))                              # number_of_dollar_in_url
    features.append(url.count('!'))                              # number_of_exclamation_in_url
    features.append(url.count('#'))                              # number_of_hashtag_in_url
    features.append(url.count('%'))                              # number_of_percent_in_url

    # ---------------- DOMAIN LEVEL ----------------
    features.append(len(domain))                                     # domain_length
    features.append(domain.count('.'))                               # number_of_dots_in_domain
    features.append(domain.count('-'))                               # number_of_hyphens_in_domain
    features.append(int(bool(re.search(special_chars, domain))))     # having_special_characters_in_domain
    features.append(len(re.findall(special_chars, domain)))          # number_of_special_characters_in_domain
    features.append(int(any(c.isdigit() for c in domain)))           # having_digits_in_domain
    features.append(sum(c.isdigit() for c in domain))                # number_of_digits_in_domain
    features.append(int(bool(re.search(repeated_digit, domain))))    # having_repeated_digits_in_domain

    # ---------------- SUBDOMAIN LEVEL ----------------
    # Everything left of the last two dot-separated labels is treated as subdomains.
    domain_parts = domain.split('.')
    subdomains = domain_parts[:-2] if len(domain_parts) > 2 else []

    features.append(len(subdomains))                        # number_of_subdomains
    # The labels come from splitting on '.', so none of them can contain a dot:
    # this flag and the dot average below are always 0 as written.
    features.append(int(any('.' in s for s in subdomains))) # having_dot_in_subdomain
    features.append(int(any('-' in s for s in subdomains))) # having_hyphen_in_subdomain

    avg_sub_len = np.mean([len(s) for s in subdomains]) if subdomains else 0
    features.append(avg_sub_len)                            # average_subdomain_length

    avg_dot = np.mean([s.count('.') for s in subdomains]) if subdomains else 0
    features.append(avg_dot)                                # average_number_of_dots_in_subdomain

    avg_hyphen = np.mean([s.count('-') for s in subdomains]) if subdomains else 0
    features.append(avg_hyphen)                             # average_number_of_hyphens_in_subdomain

    features.append(int(any(re.search(special_chars, s) for s in subdomains)))   # having_special_characters_in_subdomain
    features.append(sum(len(re.findall(special_chars, s)) for s in subdomains))  # number_of_special_characters_in_subdomain
    features.append(int(any(c.isdigit() for s in subdomains for c in s)))        # having_digits_in_subdomain
    features.append(sum(c.isdigit() for s in subdomains for c in s))             # number_of_digits_in_subdomain
    features.append(int(any(re.search(repeated_digit, s) for s in subdomains)))  # having_repeated_digits_in_subdomain

    # ---------------- PATH / QUERY ----------------
    features.append(int(bool(path)))                      # having_path
    features.append(len(path))                            # path_length
    features.append(int(bool(query)))                     # having_query
    features.append(int(bool(fragment)))                  # having_fragment
    features.append(int('#' in url))                      # having_anchor

    # ---------------- ENTROPY ----------------
    features.append(shannon_entropy(url))                 # entropy_of_url
    features.append(shannon_entropy(domain))              # entropy_of_domain

    return np.array(features, dtype=np.float32)


In [16]:
# Sanity check: featurise one sample URL and print the shape of the resulting vector.
features = extract_features_ordered("http://secure-login-google.verify-user.info/login")
print(features.shape)


(41,)


In [17]:
# Classify a single URL with the trained model.
url = "http://secure-login-google.verify-user.info/login"
features = extract_features_ordered(url)
features = torch.tensor(features, dtype=torch.float32)

# The training loop leaves the model in train mode, so switch to eval mode
# first; inference_mode matches the other inference cells in this notebook.
model_0.eval()
with torch.inference_mode():
    logits = model_0(features)
    # Sigmoid turns the single raw output into a probability for the positive class.
    prob = torch.sigmoid(logits).item()
if prob >= 0.5:
    print(f"The URL: {url} is predicted as Phishing with Probability {prob:.4f}")
else:
    print(f"The URL: {url} is predicted as Legitimate with Probability {1 - prob:.4f}")

The URL: http://secure-login-google.verify-user.info/login is predicted as Phishing with Probability 0.9857
