# Phishing/Legitimate URL EDA
Quick exploratory analysis of the provided URL datasets (phishing vs. legitimate).

## Verify Kernel Availability

In [1]:
import sys, os
from importlib.metadata import version

print("Python:", sys.version.split()[0])
print("Executable:", sys.executable)
print("Conda env:", os.getenv("CONDA_DEFAULT_ENV"))
print("pandas:", version("pandas"))
print("numpy:", version("numpy"))

Python: 3.12.12
Executable: c:\Users\Asus\miniconda3\envs\ml2\python.exe
Conda env: ml2
pandas: 2.2.3
numpy: 1.26.4


## Initial Code Cell

In [2]:
import os
print("Hello from the EDA notebook!")
print("Working directory:", os.getcwd())
print("Sample env vars:", {k: os.getenv(k) for k in ["USERNAME", "COMPUTERNAME", "CONDA_DEFAULT_ENV"]})

Hello from the EDA notebook!
Working directory: c:\Users\Asus\Downloads\INT423\anomly-detect
Sample env vars: {'USERNAME': 'Asus', 'COMPUTERNAME': 'LAPTOP-FSIPJ8MC', 'CONDA_DEFAULT_ENV': 'ml2'}


## Test Computation

In [3]:
import numpy as np
vec = np.linspace(0, 1, 5)
print("Vector:", vec)
print("Vector squared:", vec ** 2)

Vector: [0.   0.25 0.5  0.75 1.  ]
Vector squared: [0.     0.0625 0.25   0.5625 1.    ]


## Save Notebook File
The notebook is stored at `eda.ipynb` inside the workspace; no extra action needed.

In [4]:
import pathlib
print("Notebook location:", pathlib.Path("eda.ipynb").resolve())

Notebook location: C:\Users\Asus\Downloads\INT423\anomly-detect\eda.ipynb


## Load Data

In [5]:
import pandas as pd
from urllib.parse import urlparse

phishing_path = "Phishing URLs.csv"
legit_path = "URL dataset.csv"

# Read as strings to avoid parse quirks
phish_df = pd.read_csv(phishing_path, dtype=str)
legit_df = pd.read_csv(legit_path, dtype=str)

print("Phishing shape:", phish_df.shape)
print("Legit shape:", legit_df.shape)

# Normalize column names
phish_df.columns = [c.strip().lower() for c in phish_df.columns]
legit_df.columns = [c.strip().lower() for c in legit_df.columns]

phish_df = phish_df.rename(columns={"type": "label"})
legit_df = legit_df.rename(columns={"type": "label"})

phish_df["label"] = "phishing"
legit_df["label"] = legit_df["label"].str.lower().fillna("legitimate")

phish_df["source"] = "phish_file"
legit_df["source"] = "legit_file"

combined = pd.concat([phish_df, legit_df], ignore_index=True)
print("Combined shape:", combined.shape)
combined.head()

Phishing shape: (54807, 2)
Legit shape: (450176, 2)
Combined shape: (504983, 3)


Unnamed: 0,url,label,source
0,https://docs.google.com/presentation/d/e/2PACX...,phishing,phish_file
1,https://btttelecommunniccatiion.weeblysite.com/,phishing,phish_file
2,https://kq0hgp.webwave.dev/,phishing,phish_file
3,https://brittishtele1bt-69836.getresponsesite....,phishing,phish_file
4,https://bt-internet-105056.weeblysite.com/,phishing,phish_file


## Basic Quality Checks

In [6]:
# Missing values by column
missing = combined.isna().mean().sort_values(ascending=False)

# Duplicate URLs
duplicate_count = combined.duplicated(subset=["url"]).sum()

# Label balance
label_counts = combined["label"].value_counts(dropna=False)
label_pct = (label_counts / len(combined) * 100).round(2)

print("Missing fraction by column:\n", missing)
print("\nDuplicate URL rows:", duplicate_count)
print("\nLabel counts:\n", label_counts)
print("\nLabel %:\n", label_pct)

Missing fraction by column:
 url       0.0
label     0.0
source    0.0
dtype: float64

Duplicate URL rows: 50

Label counts:
 label
legitimate    345738
phishing      159245
Name: count, dtype: int64

Label %:
 label
legitimate    68.47
phishing      31.53
Name: count, dtype: float64


## URL Parsing Helpers

In [7]:
from collections import Counter

# Simple parser with defensiveness
def parse_url(row_url: str):
    try:
        parsed = urlparse(row_url)
    except Exception:
        return {
            "scheme": None,
            "host": None,
            "path": None,
            "query": None,
            "fragment": None,
        }
    return {
        "scheme": parsed.scheme or None,
        "host": parsed.netloc or None,
        "path": parsed.path or None,
        "query": parsed.query or None,
        "fragment": parsed.fragment or None,
    }

parsed_df = combined["url"].apply(parse_url).apply(pd.Series)
parsed_df.columns = [f"url_{c}" for c in parsed_df.columns]

combined_parsed = pd.concat([combined, parsed_df], axis=1)

# Derive simple host pieces (robust fallback without tldextract)
def get_tld(host: str):
    if not isinstance(host, str) or host.strip() == "":
        return None
    parts = host.lower().split(".")
    return parts[-1] if len(parts) >= 2 else None

def get_domain_core(host: str):
    if not isinstance(host, str) or host.strip() == "":
        return None
    parts = host.lower().split(".")
    if len(parts) >= 2:
        return ".".join(parts[-2:])
    return host.lower()

combined_parsed["tld"] = combined_parsed["url_host"].apply(get_tld)
combined_parsed["domain_core"] = combined_parsed["url_host"].apply(get_domain_core)
combined_parsed["url_len"] = combined_parsed["url"].str.len()
combined_parsed["path_len"] = combined_parsed["url_path"].str.len()
combined_parsed["query_len"] = combined_parsed["url_query"].str.len()
combined_parsed.head()

Unnamed: 0,url,label,source,url_scheme,url_host,url_path,url_query,url_fragment,tld,domain_core,url_len,path_len,query_len
0,https://docs.google.com/presentation/d/e/2PACX...,phishing,phish_file,https,docs.google.com,/presentation/d/e/2PACX-1vTVj7OXwAUKJDv57jBmVg...,start=false&loop=false&delayms=3000&slide=id.p,,com,google.com,178,108.0,46.0
1,https://btttelecommunniccatiion.weeblysite.com/,phishing,phish_file,https,btttelecommunniccatiion.weeblysite.com,/,,,com,weeblysite.com,47,1.0,
2,https://kq0hgp.webwave.dev/,phishing,phish_file,https,kq0hgp.webwave.dev,/,,,dev,webwave.dev,27,1.0,
3,https://brittishtele1bt-69836.getresponsesite....,phishing,phish_file,https,brittishtele1bt-69836.getresponsesite.com,/,,,com,getresponsesite.com,50,1.0,
4,https://bt-internet-105056.weeblysite.com/,phishing,phish_file,https,bt-internet-105056.weeblysite.com,/,,,com,weeblysite.com,42,1.0,


## Length Distributions (summary stats)

In [8]:
length_cols = ["url_len", "path_len", "query_len"]
stats = combined_parsed.groupby("label")[length_cols].describe(percentiles=[0.25, 0.5, 0.75]).round(2)
stats

Unnamed: 0_level_0,url_len,url_len,url_len,url_len,url_len,url_len,url_len,url_len,path_len,path_len,path_len,path_len,path_len,query_len,query_len,query_len,query_len,query_len,query_len,query_len,query_len
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
legitimate,345738.0,58.48,25.53,15.0,42.0,53.0,70.0,651.0,343238.0,27.78,...,38.0,412.0,45963.0,23.71,25.88,1.0,11.0,13.0,29.0,612.0
phishing,159245.0,66.23,111.74,8.0,34.0,49.0,76.0,25523.0,150909.0,26.93,...,38.0,3212.0,25999.0,71.59,107.89,1.0,17.0,40.0,79.0,3624.0


## Scheme Breakdown

In [9]:
scheme_counts = combined_parsed.groupby(["label", "url_scheme"]).size().unstack(fill_value=0)
scheme_counts

url_scheme,ftp,http,https,httpss
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
legitimate,1,0,345702,35
phishing,7,103694,55543,0


## Top TLDs and Domains

In [10]:
top_tlds = (combined_parsed.groupby(["label", "tld"]).size().sort_values(ascending=False).groupby(level=0).head(10))
top_domains = (combined_parsed.groupby(["label", "domain_core"]).size().sort_values(ascending=False).groupby(level=0).head(10))

print("Top TLDs by label (head 10 each):\n", top_tlds)
print("\nTop domain cores by label (head 10 each):\n", top_domains)

Top TLDs by label (head 10 each):
 label       tld    
legitimate  com        265247
phishing    com         79395
legitimate  org         34846
            net         11347
            ca           9411
phishing    dev          7363
legitimate  edu          6944
phishing    net          5648
legitimate  uk           4620
phishing    org          4153
            br           3820
            ru           3744
legitimate  gov          2602
phishing    app          2521
            top          2243
            network      2189
            au           2032
legitimate  au           1267
            us           1183
            info         1158
dtype: int64

Top domain cores by label (head 10 each):
 label       domain_core        
legitimate  wikipedia.org          12895
            youtube.com             8627
            facebook.com            8275
            blogspot.com            7036
phishing    google.com              5341
legitimate  linkedin.com            5016
          

## Host/Path Patterns

In [11]:
# Presence flags
combined_parsed["has_ip_host"] = combined_parsed["url_host"].str.contains(r"^\d+\.\d+\.\d+\.\d+$", regex=True, na=False)
combined_parsed["has_query"] = combined_parsed["url_query"].str.len().gt(0)
combined_parsed["has_fragment"] = combined_parsed["url_fragment"].str.len().gt(0)

flags = combined_parsed[["label", "has_ip_host", "has_query", "has_fragment"]]
flag_rates = flags.groupby("label").mean().round(3)
flag_counts = flags.groupby("label").sum().astype(int)

print("Flag rates (share of URLs):\n", flag_rates)
print("\nFlag counts:\n", flag_counts)

Flag rates (share of URLs):
             has_ip_host  has_query  has_fragment
label                                           
legitimate        0.000      0.133         0.000
phishing          0.021      0.163         0.006

Flag counts:
             has_ip_host  has_query  has_fragment
label                                           
legitimate            0      45963             4
phishing           3318      25999           973


## Duplicate URLs Across Sources

In [12]:
# URLs appearing in both labeled sets
phish_urls = set(phish_df["url"])
legit_urls = set(legit_df["url"])
overlap = phish_urls & legit_urls

print("Overlap count (phish vs legit):", len(overlap))

if len(overlap) > 0:
    print("Sample overlaps:")
    for url in list(overlap)[:5]:
        print(url)

Overlap count (phish vs legit): 48
Sample overlaps:
https://prosxsiuser.myfreesites.net/
http://www.imxprs.com/free/webmaiil/accounttportal
http://www.imxprs.com/free/outlookwebaccessupgrade/outlookwebaccessupgrade
http://unam.myfreesites.net/
http://vtxmail2018.myfreesites.net/


## Notes / Next Steps
- Consider richer URL features (entropy, token counts, special characters, shortening services).
- Evaluate class imbalance when training (phishing set is smaller than legitimate set).
- Visualize length distributions and host patterns if needed.