# URL SAFETY CHECKER
### A machine learning model that predicts whether a given URL is *Safe* or *Suspicious* based on lexical features extracted from the URL itself.

Importing libraries

In [1]:
import pandas as pd
import numpy as np
import re
import tldextract
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf

Loading dataset

In [2]:
# Location of the labelled URL dataset, relative to the notebook.
DATA_PATH = "malicious_phish.csv"

# Read the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


Data Preprocessing

In [3]:
# Collapse the dataset's string labels into a binary target:
# 0 = safe, 1 = suspicious.
label_map = {
    "benign": 0,
    "good": 0,
    "legitimate": 0,
    "malware": 1,
    "phishing": 1,
    "defacement": 1,
    "suspicious": 1,
}

# Explicit element-wise mapping instead of Series.replace, which relies on
# implicit object-dtype downcasting. Values that are already 0/1 pass through
# unchanged, so re-running this cell is harmless.
encoded_type = df['type'].map(lambda label: label_map.get(label, label))

# Fail here, with a clear message, rather than later inside model.fit when an
# unexpected or missing label would leave the target column non-numeric.
unmapped = sorted({repr(value) for value in encoded_type.unique() if value not in (0, 1)})
if unmapped:
    raise ValueError(f"Unmapped labels in 'type' column: {unmapped}")

df['type'] = encoded_type.astype("int64")

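Extracting lexical features: URL length and character counts, HTTPS flag, IP-address flag, suspicious keywords, domain/subdomain length, and digit-to-letter ratio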

In [4]:
# Substrings that frequently show up in phishing / scam URLs.
_SUSPICIOUS_KEYWORDS = (
    "login", "secure", "account", "verify", "update",
    "free", "winner", "bank", "paypal", "ebay",
)

# IPv4 dotted quad used as the host: optional scheme, optional userinfo,
# then four 1-3 digit groups that must end at a host boundary
# (port, path, query, fragment, or end of string).
_IPV4_HOST_RE = re.compile(
    r"^(?:[a-z][a-z0-9+.\-]*://)?(?:[^/?#@]*@)?"
    r"\d{1,3}(?:\.\d{1,3}){3}(?=[:/?#]|$)",
    re.IGNORECASE,
)


def extract_features(url):
    """Turn a URL string into a fixed-length list of lexical features.

    Parameters
    ----------
    url : str
        The raw URL, with or without a scheme. ``None`` and NaN are treated
        as an empty string; any other non-string value is converted with
        ``str()``.

    Returns
    -------
    list
        Twelve numeric features, in this order:
        length, dot count, hyphen count, slash count, digit count,
        non-alphanumeric character count, HTTPS flag, IP-host flag,
        suspicious-keyword flag, registered-domain length, subdomain length,
        digit-to-letter ratio.
    """
    # Missing values (None / NaN) must not crash a whole-column apply.
    if not isinstance(url, str):
        url = "" if url is None or url != url else str(url)

    lowered = url.lower()

    # Simple character statistics over the raw string.
    length = len(url)
    dots = url.count('.')
    hyphens = url.count('-')
    slashes = url.count('/')
    digits = sum(c.isdigit() for c in url)
    letters = sum(c.isalpha() for c in url)
    special_chars = sum(not c.isalnum() for c in url)

    # Require the full "https://" prefix (case-insensitive) so that hosts such
    # as "httpserver.example.com" are not mistaken for HTTPS URLs.
    https_flag = 1 if lowered.startswith("https://") else 0

    # Flag IP-address hosts whether or not the URL carries a scheme.
    ip_flag = 1 if _IPV4_HOST_RE.match(url) else 0

    # Any suspicious keyword anywhere in the URL, ignoring case.
    keyword_flag = int(any(word in lowered for word in _SUSPICIOUS_KEYWORDS))

    # Registered domain and subdomain as split by tldextract.
    ext = tldextract.extract(url)
    domain = ext.domain
    subdomain = ext.subdomain

    # +1 in the denominator avoids division by zero for letter-free URLs.
    digit_letter_ratio = digits / (letters + 1)

    return [
        length, dots, hyphens, slashes, digits, special_chars,
        https_flag, ip_flag, keyword_flag,
        len(domain), len(subdomain), digit_letter_ratio
    ]

In [5]:
# One feature list per URL, stacked into a 2-D array (rows = URLs).
features = df['url'].apply(extract_features)
X = np.array(features.tolist())

# Target vector taken from the encoded label column.
y = df['type'].to_numpy()

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (651191, 12)
y shape: (651191,)


Train-Test Split and Feature Scaling

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Model Architecture

In [8]:
layers = tf.keras.layers

# Hidden blocks in order: (dense units, add batch normalization?, dropout rate).
hidden_blocks = [
    (256, True, 0.3),
    (128, True, 0.3),
    (64, True, 0.2),
    (32, False, 0.1),
]

# Input width follows the number of feature columns.
model = tf.keras.Sequential()
model.add(layers.Input(shape=(X_train.shape[1],)))

for units, use_batch_norm, dropout_rate in hidden_blocks:
    model.add(layers.Dense(units, activation='relu'))
    if use_batch_norm:
        model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))

# Single sigmoid unit: output is the predicted probability of class 1.
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    metrics=['accuracy'],
)

model.summary()

Training Model

In [9]:
# Train for 10 epochs in mini-batches of 256, using 10% of the training data
# for validation; the returned History object is kept for later inspection.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=256,
    validation_split=0.1,
)

Epoch 1/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.8214 - loss: 0.3907 - val_accuracy: 0.9129 - val_loss: 0.2095
Epoch 2/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9053 - loss: 0.2363 - val_accuracy: 0.9299 - val_loss: 0.1785
Epoch 3/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.9193 - loss: 0.2075 - val_accuracy: 0.9374 - val_loss: 0.1633
Epoch 4/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.9274 - loss: 0.1910 - val_accuracy: 0.9383 - val_loss: 0.1586
Epoch 5/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9318 - loss: 0.1817 - val_accuracy: 0.9433 - val_loss: 0.1512
Epoch 6/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9345 - loss: 0.1760 - val_accuracy: 0.9432 - val_loss: 0.1493
Epoch 7/10

Classification Report

In [10]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
probabilities = model.predict(X_test)
pred = (probabilities > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

[1m4070/4070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
Accuracy: 0.9485023687221186
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     85778
           1       0.93      0.92      0.92     44461

    accuracy                           0.95    130239
   macro avg       0.94      0.94      0.94    130239
weighted avg       0.95      0.95      0.95    130239



Saving Model

In [11]:
# Persist the trained network in both the HDF5 (.h5) and the .keras formats.
model.save("model.h5")
model.save("model.keras")

# The network was trained on standardized features, so scoring new URLs needs
# the same per-feature mean and scale; store them next to the model files.
np.savez("scaler.npz", mean=scaler.mean_, scale=scaler.scale_)

