## PHISHING-URL-DETECTION PROJECT
*Ibn Zohr University*

*IT Excellence Center*

*Cyber Security Module*

*ELQORACHI Hind - JAAFAR Wafa - MISBAH Asma - BELFAIK Chayma*

*Academic Year 2025-2026*

In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the full labelled URL dataset (columns: url, status) from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/url-phishing-detection/CyberSecurity/Data/new_data_urls.csv')

# Preview ten random rows. The seed matches the one used by the other sampling
# calls in this notebook, so the preview is identical on every run.
df.sample(n=10, random_state=42)


Unnamed: 0,url,status
167377,https://gd2qq8.duckdns.org/,0
136893,https://www.amzaon.co.ip.maesome.shop/a3IwMDY1P3,0
128254,https://www1.aupay.co.aumin-php.com,0
671400,43blrj6ry9.hohyzuketexppa.info/euuc7e03zb\nvel...,0
563260,linkedin.com/pub/hubert-lacroix/3/854/995,1
703686,https://www.amazon.co.uk/Technicolour-TG582n-P...,1
382056,hannalilja.se/tqvbsj/tsyq.php?djma=marker-pool...,1
566087,lrri.org/,1
445933,windmillsandeggnog.wordpress.com/,1
341359,ca.linkedin.com/pub/wayne-nelson/4/b70/792,1


In [None]:
# Draw 20,000 rows from each class with a fixed seed so the subset is reproducible.
legit_df = df.loc[df['status'].eq(1)].sample(n=20000, random_state=42)
phish_df = df.loc[df['status'].eq(0)].sample(n=20000, random_state=42)

# Stack both classes, shuffle every row, then renumber the index from zero.
sampled_df = (
    pd.concat([legit_df, phish_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Row count of the balanced sample
len(sampled_df)


40000

In [None]:
# Sanity check: shape of each per-class draw and the class balance of the combined sample.
print(legit_df.shape, phish_df.shape)
print(sampled_df['status'].value_counts())

# Persist the sample to CSV; index=False writes the header row but not the row index.
sampled_df.to_csv("sampled_urls.csv", index=False)

# Read the file back and confirm the round trip preserved the rows and the class balance.
data = pd.read_csv("sampled_urls.csv")
print(data.head())
print(data['status'].value_counts())


(20000, 2) (20000, 2)
status
0    20000
1    20000
Name: count, dtype: int64
                                                 url  status
0               https://www.ama-autos-co-7.ea-4.top/       0
1  amazon.ca/Space-Buddies-F-T-Anderson/dp/B001IL...       1
2                                        tsdsadg.com       0
3                        uk.ask.com/wiki/Ford_Taurus       1
4                 https://connec-au-login.ips-au.com       0
status
0    20000
1    20000
Name: count, dtype: int64


In [None]:
# Summarise the reloaded sample: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     40000 non-null  object
 1   status  40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [None]:
import pandas as pd
from urllib.parse import urlparse
import ipaddress
import re
import requests
from tqdm import tqdm

In [None]:
# Hosts treated as trusted. A URL qualifies only when its host IS one of these
# names or a subdomain of one; a plain substring hit is not enough.
_TRUSTED_DOMAINS = (
    "google.com", "amazon.com", "microsoftonline.com",
    "gmail.com", "outlook.com", "apple.com", "github.com",
    "facebook.com", "youtube.com",
)

# Shortening-service tokens searched for anywhere in the URL.
_SHORTENING_SERVICES = (
    "bit.ly", "goo.gl", "tinyurl", "t.co", "ow.ly",
    "is.gd", "buff.ly", "adf.ly", "bitly",
)

# Shortening-service tokens searched for in the host only.
_MANUAL_SHORTENERS = ("bit.ly", "tinyurl", "t.co", "goo.gl", "is.gd", "ow.ly")

_SENSITIVE_KEYWORDS = ("secure", "account", "update", "login", "verify", "bank", "confirm")

_FAKE_SECURE_KEYWORDS = (
    "secure-", "security-", "verify", "verification", "validate",
    "confirm", "confirmation", "update-info", "update-account",
    "unlock", "recovery", "support-center", "customer-support",
    "login-secure", "account-protect", "identity-check",
)

_SUSPICIOUS_TLDS = (
    ".xyz", ".top", ".click", ".info", ".monster", ".online",
    ".live", ".space", ".site", ".loan", ".stream",
)

_SUSPICIOUS_PATH_KEYWORDS = (
    "login", "verify", "update", "signin", "account", "secure",
    "wp-admin", "password", "billing",
)

_SUSPICIOUS_JS = ("eval(", "escape(", "unescape(")


def _host_token_pattern(tokens):
    """Compile a regex matching any token that is not glued to other hostname characters."""
    alternatives = "|".join(re.escape(token) for token in tokens)
    # The look-arounds stop "t.co" from matching inside "microsoft.com" or "t.com".
    return re.compile(
        r"(?<![a-z0-9-])(?:" + alternatives + r")(?![a-z0-9-])",
        re.IGNORECASE,
    )


_SHORTENER_URL_RE = _host_token_pattern(_SHORTENING_SERVICES)
_SHORTENER_HOST_RE = _host_token_pattern(_MANUAL_SHORTENERS)


def _is_trusted_domain(domain):
    """Return True when domain equals a trusted host or is a subdomain of one."""
    return any(
        domain == trusted or domain.endswith("." + trusted)
        for trusted in _TRUSTED_DOMAINS
    )


def featureExtraction(url, fetch_html=True, timeout=3):
    """Extract lexical, HTML and rule-based phishing features from one URL.

    Parameters
    ----------
    url : str
        The URL to analyse. Surrounding whitespace is stripped and ``http://``
        is prepended when no http/https scheme is present.
    fetch_html : bool, default True
        When True, the page is downloaded with ``requests.get`` to compute the
        HTML features. When False no network request is made and the HTML
        features take the same values as after a failed download.
    timeout : int or float, default 3
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    list
        26 values in this fixed order: the host string, then Have_IP, Have_At,
        URL_Length, URL_Depth, Redirection, HTTPS, Shortener, Prefix_Suffix,
        Subdomain_Count, Digit_Count, SpecialChar_Count, Sensitive_Keyword,
        Domain_Age, Domain_Extension, iFrame, MouseOver, RightClick,
        Forwarding, Form_Tag, Suspicious_JS, Trusted_Domain, Manual_Shortener,
        Fake_Secure_Keyword, Suspicious_TLD, Suspicious_Path.
    """
    features = []

    # Remove leading/trailing whitespace
    url = url.strip()

    # Ensure the URL has a scheme so urlparse fills in the host. The check is
    # case-insensitive so "HTTPS://..." is not given a second scheme.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url  # assume http if missing
    url_lower = url.lower()

    # -------------------------
    # Domain & URL
    # -------------------------
    try:
        parsed = urlparse(url)
        # hostname (unlike netloc) is lower-cased and excludes userinfo and
        # port, so "1.2.3.4:8080" is still recognised as an IP address below.
        domain = parsed.hostname or ""
        path = parsed.path
    except ValueError:
        # urlparse rejects malformed hosts such as an unbalanced "[". Keep the
        # batch running and emit a row with an empty host instead.
        domain, path = "", ""

    # Drop only a leading "www." label, never a "www." found mid-host.
    if domain.startswith("www."):
        domain = domain[4:]

    features.append(domain)   # keep domain string as first element

    # -------------------------
    # Lexical features
    # -------------------------
    try:
        ipaddress.ip_address(domain)
        features.append(1)
    except ValueError:
        features.append(0)

    features.append(1 if "@" in url else 0)
    features.append(1 if len(url) >= 54 else 0)
    features.append(len([segment for segment in path.split("/") if segment]))
    # A "//" beyond the scheme separator suggests an embedded redirect.
    features.append(1 if url.rfind("//") > 6 else 0)
    features.append(1 if url_lower.startswith("https://") else 0)
    features.append(1 if _SHORTENER_URL_RE.search(url) else 0)
    features.append(1 if "-" in domain else 0)
    features.append(len(domain.split(".")) - 1)
    features.append(sum(c.isdigit() for c in domain))
    features.append(len(re.findall(r"[^a-zA-Z0-9.]", domain)))

    features.append(1 if any(k in url_lower for k in _SENSITIVE_KEYWORDS) else 0)

    features.append(1 if len(domain) < 10 else 0)
    features.append(1 if domain.endswith((".com", ".org", ".net")) else 0)

    # -------------------------
    # HTML features
    # -------------------------
    # NOTE(security): this issues a live GET to a possibly malicious URL; run
    # it only from an isolated environment.
    html = ""
    if fetch_html:
        try:
            html = requests.get(url, timeout=timeout).text
        except Exception:
            # Best effort: an unreachable page yields empty HTML. Catching
            # Exception (not a bare except) lets KeyboardInterrupt stop a
            # long-running batch.
            html = ""

    features.append(0 if "<iframe" in html else 1)
    features.append(1 if "onmouseover" in html else 0)
    features.append(0 if "event.button" in html else 1)
    features.append(1 if "window.location" in html else 0)
    features.append(1 if "<form" in html else 0)
    features.append(1 if any(token in html for token in _SUSPICIOUS_JS) else 0)

    # -------------------------
    # Manual override features
    # -------------------------
    features.append(1 if _is_trusted_domain(domain) else 0)
    features.append(1 if _SHORTENER_HOST_RE.search(domain) else 0)
    features.append(1 if any(k in url_lower for k in _FAKE_SECURE_KEYWORDS) else 0)
    features.append(1 if domain.endswith(_SUSPICIOUS_TLDS) else 0)
    features.append(1 if any(k in path.lower() for k in _SUSPICIOUS_PATH_KEYWORDS) else 0)

    return features


In [None]:
# FEATURE EXTRACTION ON "sampled_urls.csv"

# Build one feature row per URL of the in-memory sample, showing a progress bar.
# An explicit loop is used so rows collected so far remain in `all_features`
# if the run is interrupted.
all_features = []
url_column = sampled_df['url']

for raw_url in tqdm(url_column, total=len(url_column)):
    all_features.append(featureExtraction(raw_url))


 18%|█▊        | 7317/40000 [1:17:09<8:49:56,  1.03it/s] 

In [None]:
# -------------------------
# 3. Column names (Domain + 25 features = 26 columns)
#    Order must match the list returned by featureExtraction.
# -------------------------
columns = [
    'Domain',                 # extracted domain string
    'Have_IP',                # 1 if domain is an IP
    'Have_At',                # 1 if '@' in URL
    'URL_Length',             # 1 if len(URL) >= 54
    'URL_Depth',              # number of non-empty path segments
    'Redirection',            # 1 if '//' occurs after protocol
    'HTTPS',                  # 1 if URL starts with https
    'Shortener',              # 1 if URL matches the shortening-service regex
    'Prefix_Suffix',          # 1 if '-' in domain
    'Subdomain_Count',        # number of dots in domain
    'Digit_Count',            # digits in domain
    'SpecialChar_Count',      # characters in domain other than letters, digits and '.'
    'Sensitive_Keyword',      # 1 if URL contains sensitive keywords
    'Domain_Age',             # heuristic: 1 if domain is shorter than 10 characters
    'Domain_Extension',       # 1 if domain ends with .com/.org/.net
    'iFrame',                 # 1 if no iframe tag, 0 if one is present
    'MouseOver',              # 1 if onmouseover present
    'RightClick',             # 0 if 'event.button' appears in the HTML, else 1
    'Forwarding',             # 1 if window.location forwarding
    'Form_Tag',               # 1 if <form> tag present
    'Suspicious_JS',          # 1 if eval/escape/unescape
    'Trusted_Domain',         # 1 if domain is known trusted site
    'Manual_Shortener',       # 1 if domain matches known shortener
    'Fake_Secure_Keyword',    # 1 if URL contains fake security words
    'Suspicious_TLD',         # 1 if domain ends with suspicious TLD
    'Suspicious_Path'         # 1 if path contains suspicious keywords
]

# -------------------------
# 4. Convert to DataFrame
# -------------------------
features_df = pd.DataFrame(all_features, columns=columns)

# -------------------------
# 5. Add label
#    .values assigns by position; rows were extracted in sampled_df order.
# -------------------------
features_df['Label'] = sampled_df['status'].values

# -------------------------
# 6. Optional: Save to CSV
# -------------------------
features_df.to_csv("features_dataset.csv", index=False)

print("Feature extraction complete. Shape:", features_df.shape)
print(features_df.head())


NameError: name 'all_features' is not defined

In [None]:
# Preview the extracted domains. Printed explicitly because a notebook cell
# only displays the value of its last expression.
print(features_df['Domain'].head(10))

# Count rows whose extracted domain is the empty string.
(features_df['Domain'] == '').sum()

np.int64(29198)