## Preprocessing

In [None]:
import pandas as pd

print("Loading dataset...")
df = pd.read_csv(
    "/kaggle/input/phishing-website-webcode-dataset/phishing_complete_dataset.csv",
    usecols=["url", "result"],
)
df.head()

Loading dataset...


Unnamed: 0,url,result
0,http://intego3.info/EXEL/index.php,1
1,https://www.mathopenref.com/segment.html,0
2,https://www.computerhope.com/issues/ch000254.htm,0
3,https://www.investopedia.com/terms/n/next-elev...,0
4,https://jobs.emss.org.uk/lcc.aspx,0


In [None]:
import pandas as pd
from urllib.parse import urlparse


def preprocess_urls_safe(df):
    """
    Minimal preprocessing that preserves all potentially useful information
    for phishing detection feature extraction.
    """
    # Make a copy to avoid modifying original
    df = df.copy()

    print("=" * 50)
    print("URL PREPROCESSING REPORT")
    print("=" * 50)
    print(f"Initial dataset size: {len(df)}")

    # Step 1: Handle missing values
    missing_urls = df["url"].isna().sum()
    print(f"\n1. Missing URLs found: {missing_urls}")
    if missing_urls > 0:
        df = df.dropna(subset=["url"])
        print(f"   Rows after removing missing URLs: {len(df)}")

    # Step 2: Handle missing labels
    missing_labels = df["result"].isna().sum()
    print(f"\n2. Missing labels found: {missing_labels}")
    if missing_labels > 0:
        df = df.dropna(subset=["result"])
        print(f"   Rows after removing missing labels: {len(df)}")

    # Step 3: Strip ONLY leading/trailing whitespace (preserve internal structure)
    df["url"] = df["url"].str.strip()
    print(f"\n3. Whitespace stripped from URL boundaries")

    # Step 4: Check for duplicate URLs with conflicting labels
    duplicates = df[df.duplicated(subset=["url"], keep=False)]
    if len(duplicates) > 0:
        print(f"\n4. Duplicate URLs found: {len(duplicates)}")

        # Check for conflicting labels
        conflicts = duplicates.groupby("url")["result"].nunique()
        conflicting_urls = conflicts[conflicts > 1]

        if len(conflicting_urls) > 0:
            print(
                f"   ⚠️ WARNING: {len(conflicting_urls)} URLs have conflicting labels!"
            )
            print(f"   Conflicting URLs:")
            for url in conflicting_urls.index[:5]:  # Show first 5
                labels = duplicates[duplicates["url"] == url]["result"].unique()
                print(f"      {url}: labels = {labels}")

            # Strategy: Keep the majority label, or drop if tie
            def resolve_conflict(group):
                label_counts = group["result"].value_counts()
                if (
                    len(label_counts) > 1
                    and label_counts.iloc[0] == label_counts.iloc[1]
                ):
                    # It's a tie, drop all instances
                    return None
                # Keep row with most common label
                return group[group["result"] == label_counts.index[0]].iloc[0]

            # Separate conflicting and non-conflicting
            conflict_urls = conflicting_urls.index.tolist()
            df_conflicts = df[df["url"].isin(conflict_urls)]
            df_no_conflicts = df[~df["url"].isin(conflict_urls)]

            # Resolve conflicts
            resolved = (
                df_conflicts.groupby("url")
                .apply(resolve_conflict)
                .reset_index(drop=True)
            )
            resolved = resolved.dropna()

            # Combine back
            df = pd.concat([df_no_conflicts, resolved], ignore_index=True)
            print(f"   Conflicts resolved. Rows after resolution: {len(df)}")

        # Remove remaining duplicates (same URL, same label)
        df = df.drop_duplicates(subset=["url"], keep="first")
        print(f"   Duplicates removed. Final rows: {len(df)}")
    else:
        print(f"\n4. No duplicate URLs found")

    # Step 5: Validate URL format (remove only completely unparseable URLs)
    def is_parseable_url(url):
        """Check if URL can be parsed and has basic structure"""
        try:
            if not isinstance(url, str):
                return False
            if len(url.strip()) == 0:
                return False
            result = urlparse(url)
            # Must have scheme (http/https/ftp etc) and netloc (domain)
            return bool(result.scheme and result.netloc)
        except:
            return False

    df["is_valid"] = df["url"].apply(is_parseable_url)
    invalid_count = (~df["is_valid"]).sum()
    print(f"\n5. Invalid/unparseable URLs found: {invalid_count}")

    if invalid_count > 0:
        print(f"   Examples of invalid URLs:")
        invalid_urls = df[~df["is_valid"]]["url"].head(5).tolist()
        for invalid_url in invalid_urls:
            print(f"      {invalid_url}")

    df = df[df["is_valid"]].drop("is_valid", axis=1)

    # Step 6: Ensure labels are in correct format (0 and 1)
    unique_labels = df["result"].unique()
    print(f"\n6. Label distribution:")
    print(df["result"].value_counts().to_dict())

    if not all(label in [0, 1] for label in unique_labels):
        print(
            f"   ⚠️ WARNING: Labels contain values other than 0 and 1: {unique_labels}"
        )
        print(f"   Converting labels to binary (0/1)...")
        df["result"] = df["result"].astype(int)

    # Step 7: Reset index
    df = df.reset_index(drop=True)

    print(f"\n{'='*50}")
    print(f"PREPROCESSING COMPLETE")
    print(f"{'='*50}")
    print(f"Final dataset size: {len(df)}")
    print(f"Legitimate URLs (0): {(df['result'] == 0).sum()}")
    print(f"Phishing URLs (1): {(df['result'] == 1).sum()}")
    print(f"{'='*50}\n")

    return df

In [None]:
# Run preprocessing
df_clean = preprocess_urls_safe(df)

# Save the cleaned dataset
output_filename = "/kaggle/working/cleaned_urls_dataset.csv"
df_clean.to_csv(output_filename, index=False)
print(f"✅ Cleaned dataset saved to: {output_filename}")

# Display first few rows
print("\nFirst 5 rows of cleaned dataset:")
print(df_clean.head())

URL PREPROCESSING REPORT
Initial dataset size: 80000

1. Missing URLs found: 0

2. Missing labels found: 0

3. Whitespace stripped from URL boundaries

4. Duplicate URLs found: 286
   Conflicting URLs:
      https://www.meresearch.org.uk/what-is-me/: labels = [0 1]
   Conflicts resolved. Rows after resolution: 79998
   Duplicates removed. Final rows: 79847


  resolved = df_conflicts.groupby('url').apply(resolve_conflict).reset_index(drop=True)



5. Invalid/unparseable URLs found: 0

6. Label distribution:
{0: 49853, 1: 29994}

PREPROCESSING COMPLETE
Final dataset size: 79847
Legitimate URLs (0): 49853
Phishing URLs (1): 29994

✅ Cleaned dataset saved to: /kaggle/working/cleaned_urls_dataset.csv

First 5 rows of cleaned dataset:
                                                 url  result
0                 http://intego3.info/EXEL/index.php       1
1           https://www.mathopenref.com/segment.html       0
2   https://www.computerhope.com/issues/ch000254.htm       0
3  https://www.investopedia.com/terms/n/next-elev...       0
4                  https://jobs.emss.org.uk/lcc.aspx       0


## Feature engineering

In [4]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse, parse_qs
import re
from math import log2
import tldextract


def calculate_entropy(text):
    """Calculate Shannon entropy of a string"""
    if not text:
        return 0

    # Count frequency of each character
    prob = [float(text.count(c)) / len(text) for c in dict.fromkeys(list(text))]

    # Calculate entropy
    entropy = -sum([p * log2(p) for p in prob])
    return entropy


def has_ip_address(url):
    """Check if URL contains an IP address instead of domain name"""
    # IPv4 pattern
    ipv4_pattern = re.compile(r"(\d{1,3}\.){3}\d{1,3}")
    # IPv6 pattern (simplified)
    ipv6_pattern = re.compile(
        r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4})"
    )

    return bool(ipv4_pattern.search(url) or ipv6_pattern.search(url))


def count_suspicious_keywords(text):
    """Count suspicious keywords commonly used in phishing"""
    suspicious_words = [
        "verify",
        "account",
        "update",
        "secure",
        "banking",
        "login",
        "signin",
        "ebayisapi",
        "webscr",
        "password",
        "confirm",
        "suspend",
        "alert",
        "authenticate",
        "wallet",
        "credential",
        "security",
        "urgent",
    ]

    text_lower = text.lower()
    count = sum(1 for word in suspicious_words if word in text_lower)
    return count


def has_shortening_service(url):
    """Check if URL uses URL shortening service"""
    shortening_services = [
        "bit.ly",
        "goo.gl",
        "tinyurl",
        "t.co",
        "ow.ly",
        "is.gd",
        "buff.ly",
        "adf.ly",
        "bit.do",
        "short.link",
        "tiny.cc",
    ]

    url_lower = url.lower()
    return int(any(service in url_lower for service in shortening_services))


def get_tld_type(domain):
    """Classify TLD as common, suspicious, or other"""
    common_tlds = [".com", ".org", ".net", ".edu", ".gov", ".co", ".uk", ".de", ".fr"]
    suspicious_tlds = [".tk", ".ml", ".ga", ".cf", ".gq", ".pw", ".cc", ".club", ".xyz"]

    domain_lower = domain.lower()

    if any(domain_lower.endswith(tld) for tld in common_tlds):
        return 1  # Common TLD
    elif any(domain_lower.endswith(tld) for tld in suspicious_tlds):
        return 2  # Suspicious TLD
    else:
        return 0  # Other TLD


def extract_url_features(url):
    """Extract all features from a single URL"""
    features = {}

    try:
        # Parse URL
        parsed = urlparse(url)

        # Extract domain components using tldextract
        ext = tldextract.extract(url)
        domain = ext.domain
        subdomain = ext.subdomain
        suffix = ext.suffix
        full_domain = f"{domain}.{suffix}" if suffix else domain

        # ======================
        # 1. URL LENGTH FEATURES
        # ======================
        features["url_length"] = len(url)
        features["domain_length"] = len(full_domain) if full_domain else 0
        features["path_length"] = len(parsed.path)
        features["query_length"] = len(parsed.query)
        features["fragment_length"] = len(parsed.fragment)

        # Count subdirectories in path
        path_parts = [p for p in parsed.path.split("/") if p]
        features["subdirectory_count"] = len(path_parts)

        # ======================
        # 2. PROTOCOL FEATURES
        # ======================
        features["has_https"] = int(parsed.scheme == "https")
        features["has_http"] = int(parsed.scheme == "http")

        # ======================
        # 3. DOMAIN FEATURES
        # ======================
        features["dot_count_in_domain"] = full_domain.count(".")
        features["subdomain_count"] = len(subdomain.split(".")) if subdomain else 0
        features["has_subdomain"] = int(bool(subdomain))
        features["has_ip_address"] = int(has_ip_address(url))
        features["domain_has_numbers"] = int(bool(re.search(r"\d", full_domain)))
        features["suspicious_keywords_in_domain"] = count_suspicious_keywords(
            full_domain
        )

        # ======================
        # 4. SPECIAL CHARACTER FEATURES
        # ======================
        features["at_symbol_count"] = url.count("@")
        features["hyphen_count"] = url.count("-")
        features["underscore_count"] = url.count("_")
        features["question_mark_count"] = url.count("?")
        features["equal_count"] = url.count("=")
        features["ampersand_count"] = url.count("&")
        features["percent_count"] = url.count("%")
        features["double_slash_count"] = (
            url.count("//") - 1
        )  # Subtract the one in http://
        features["digit_count"] = sum(c.isdigit() for c in url)
        features["letter_count"] = sum(c.isalpha() for c in url)
        features["dot_count"] = url.count(".")
        features["slash_count"] = url.count("/")

        # ======================
        # 5. PATH FEATURES
        # ======================
        features["path_depth"] = len(path_parts)

        # File extension
        if path_parts:
            last_part = path_parts[-1]
            if "." in last_part:
                extension = last_part.split(".")[-1].lower()
                features["has_file_extension"] = 1

                # Suspicious extensions
                suspicious_extensions = ["exe", "zip", "rar", "php", "js", "bin", "scr"]
                features["has_suspicious_extension"] = int(
                    extension in suspicious_extensions
                )
            else:
                features["has_file_extension"] = 0
                features["has_suspicious_extension"] = 0
        else:
            features["has_file_extension"] = 0
            features["has_suspicious_extension"] = 0

        # ======================
        # 6. SUSPICIOUS PATTERN FEATURES
        # ======================
        features["has_at_symbol"] = int("@" in url)
        features["has_port"] = int(bool(parsed.port))
        features["has_shortening_service"] = has_shortening_service(url)
        features["suspicious_keywords_in_url"] = count_suspicious_keywords(url)

        # Check for hexadecimal characters (common in encoded URLs)
        hex_pattern = re.compile(r"%[0-9a-fA-F]{2}")
        features["has_hex_encoding"] = int(bool(hex_pattern.search(url)))

        # Prefix/suffix hyphen in domain
        features["prefix_suffix_hyphen"] = int("-" in full_domain)

        # ======================
        # 7. ENTROPY FEATURES
        # ======================
        features["url_entropy"] = calculate_entropy(url)
        features["domain_entropy"] = calculate_entropy(full_domain)
        features["path_entropy"] = calculate_entropy(parsed.path) if parsed.path else 0

        # ======================
        # 8. CHARACTER TYPE RATIOS
        # ======================
        url_len = len(url) if len(url) > 0 else 1  # Avoid division by zero
        features["digit_ratio"] = features["digit_count"] / url_len
        features["letter_ratio"] = features["letter_count"] / url_len

        # Count special characters
        special_char_count = sum(not c.isalnum() for c in url)
        features["special_char_ratio"] = special_char_count / url_len

        # Uppercase to lowercase ratio
        uppercase_count = sum(c.isupper() for c in url)
        lowercase_count = sum(c.islower() for c in url)
        total_letters = uppercase_count + lowercase_count
        features["uppercase_ratio"] = (
            uppercase_count / total_letters if total_letters > 0 else 0
        )

        # ======================
        # 9. LEXICAL FEATURES
        # ======================
        # Split URL into words (by non-alphanumeric characters)
        words = re.findall(r"\b\w+\b", url)
        features["word_count"] = len(words)

        # Average word length
        if words:
            features["avg_word_length"] = np.mean([len(word) for word in words])
            features["longest_word_length"] = max([len(word) for word in words])
        else:
            features["avg_word_length"] = 0
            features["longest_word_length"] = 0

        # TLD classification
        features["tld_type"] = get_tld_type(full_domain)

        # ======================
        # 10. REDIRECTION FEATURES
        # ======================
        # Multiple // in URL (excluding the one in protocol)
        features["multiple_redirects"] = int(url.count("//") > 1)

        # Query parameters count
        if parsed.query:
            query_params = parse_qs(parsed.query)
            features["query_param_count"] = len(query_params)
        else:
            features["query_param_count"] = 0

    except Exception as e:
        print(f"Error processing URL: {url}")
        print(f"Error: {str(e)}")
        # Return features with default values (0 or NaN)
        return {key: 0 for key in features.keys()}

    return features


def engineer_features(df):
    """
    Apply feature engineering to the entire dataset

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with 'url' and 'result' columns

    Returns:
    --------
    pandas.DataFrame
        DataFrame with all engineered features
    """
    print("=" * 60)
    print("FEATURE ENGINEERING")
    print("=" * 60)
    print(f"Processing {len(df)} URLs...")

    # Extract features for all URLs
    features_list = []

    for idx, url in enumerate(df["url"]):
        if (idx + 1) % 1000 == 0:
            print(f"Processed {idx + 1}/{len(df)} URLs...")

        features = extract_url_features(url)
        features["url"] = url  # Keep original URL
        features["result"] = df.loc[idx, "result"]  # Keep label
        features_list.append(features)

    # Create DataFrame from features
    df_features = pd.DataFrame(features_list)

    # Reorder columns: url, result, then features
    feature_cols = [col for col in df_features.columns if col not in ["url", "result"]]
    df_features = df_features[["url", "result"] + feature_cols]

    print(f"\n✅ Feature engineering complete!")
    print(f"Total features extracted: {len(feature_cols)}")
    print(f"\nFeature columns:")
    for i, col in enumerate(feature_cols, 1):
        print(f"  {i}. {col}")

    print(f"\n{'='*60}")
    print(f"Dataset shape: {df_features.shape}")
    print(f"{'='*60}\n")

    return df_features

In [5]:
# Load cleaned dataset
print("Loading cleaned dataset...")
df_clean = pd.read_csv(
    "/home/maliha/Programming/dm/Phishing-Website-Classifier/cleaned_urls_dataset.csv"
)

# Perform feature engineering
df_features = engineer_features(df_clean)

# Save feature-engineered dataset
output_filename = (
    "/home/maliha/Programming/dm/Phishing-Website-Classifier/url_features_dataset.csv"
)
df_features.to_csv(output_filename, index=False)
print(f"✅ Feature-engineered dataset saved to: {output_filename}")


# Display statistics
print("\n" + "=" * 60)
print("FEATURE STATISTICS")
print("=" * 60)
print(df_features.describe())

# Display first few rows
print("\n" + "=" * 60)
print("SAMPLE DATA (first 3 rows)")
print("=" * 60)
print(df_features.head(3))

Loading cleaned dataset...
FEATURE ENGINEERING
Processing 79847 URLs...
Processed 1000/79847 URLs...
Processed 2000/79847 URLs...
Processed 3000/79847 URLs...
Processed 4000/79847 URLs...
Processed 5000/79847 URLs...
Processed 6000/79847 URLs...
Processed 7000/79847 URLs...
Processed 8000/79847 URLs...
Processed 9000/79847 URLs...
Processed 10000/79847 URLs...
Processed 11000/79847 URLs...
Processed 12000/79847 URLs...
Processed 13000/79847 URLs...
Processed 14000/79847 URLs...
Processed 15000/79847 URLs...
Processed 16000/79847 URLs...
Processed 17000/79847 URLs...
Processed 18000/79847 URLs...
Processed 19000/79847 URLs...
Processed 20000/79847 URLs...
Processed 21000/79847 URLs...
Processed 22000/79847 URLs...
Processed 23000/79847 URLs...
Processed 24000/79847 URLs...
Processed 25000/79847 URLs...
Processed 26000/79847 URLs...
Processed 27000/79847 URLs...
Processed 28000/79847 URLs...
Processed 29000/79847 URLs...
Processed 30000/79847 URLs...
Processed 31000/79847 URLs...
Process