In [1]:
# AI-Powered Phishing URL Detector
# Notebook 2: Feature Engineering and Preprocessing

# ## 2.1 Import Libraries
import pandas as pd
import numpy as np
import os
from urllib.parse import urlparse
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ## 2.2 Load Raw Data
# Load the dataset we prepared in the previous notebook.

# Where the raw CSV lives and where the processed artefacts will be written
raw_data_path = '../data/raw/dataset_phishing.csv'
processed_data_path = '../data/processed/'
# Create the output folder up front; exist_ok makes re-running the cell harmless
os.makedirs(processed_data_path, exist_ok=True)

# Read the raw CSV. A missing file is reported and signalled to the later
# sections by leaving df set to None.
print("Loading raw data...")
try:
    df = pd.read_csv(raw_data_path)
except FileNotFoundError:
    print(f"Error: The file {raw_data_path} was not found.")
    df = None
else:
    # Keep only the raw URL and its label: every feature used later is
    # derived from the URL string in this notebook.
    df = df[['url', 'status']]
    print("Raw data loaded successfully.")

# ## 2.3 Feature Engineering
# This is where we extract meaningful features from the raw URL strings.

if df is not None:
    print("\nStarting feature engineering...")

    # 1. Length-based Features
    def component_length(url, component):
        """Return the character count of one urlparse() component of `url`.

        `component` is the attribute name on the parse result, e.g. 'netloc' or 'path'.
        """
        return len(getattr(urlparse(url), component))

    # Character counts of the whole URL, of its network location (the netloc
    # part, which includes any port or userinfo) and of its path.
    df['url_length'] = df['url'].map(len)
    df['hostname_length'] = df['url'].map(lambda url: component_length(url, 'netloc'))
    df['path_length'] = df['url'].map(lambda url: component_length(url, 'path'))

    # 2. Count-based Features
    def count_char(char, text):
        """Return the number of non-overlapping occurrences of `char` in `text`."""
        occurrences = text.count(char)
        return occurrences

    # Output column -> the single character whose occurrences it records.
    # Insertion order is the order in which the columns are added to df.
    counted_chars = {
        'count_hyphens': '-',
        'count_dots': '.',
        'count_at': '@',
        'count_questionmark': '?',
        'count_equals': '=',
        'count_slashes': '/',
    }
    for column, symbol in counted_chars.items():
        df[column] = df['url'].apply(lambda x: count_char(symbol, x))
    # Digits are a character class rather than one symbol, so count them separately.
    df['count_digits'] = df['url'].apply(lambda x: sum(1 for c in x if c.isdigit()))

    # 3. Binary Features (Presence of suspicious elements)
    def has_ip_address(url):
        """Return 1 if the URL's host is a literal IP address, otherwise 0.

        Recognised host forms:
          * dotted-decimal IPv4, e.g. ``http://192.168.0.1/login``
          * dotted-hex IPv4, e.g. ``http://0xC0.0xA8.0x00.0x01/``
          * IPv6 literals, e.g. ``http://[2001:db8::1]/``

        The check is made against the parsed host only (port, userinfo and
        IPv6 brackets are stripped by ``urlparse(...).hostname``), and the
        whole host must be an address, so ``1.2.3.4.example.com`` is not
        flagged. URLs without a scheme have no netloc and yield 0.

        Note: the earlier single-regex version required a trailing '/' after a
        decimal IPv4 address while searching the netloc (which never contains
        '/'), and was missing a '|' between the hex-IPv4 and IPv6 alternatives,
        so ordinary IPv4 hosts were never detected.
        """
        import ipaddress  # local import: keeps this helper self-contained

        host = urlparse(url).hostname  # lower-cased; None when there is no netloc
        if not host:
            return 0

        # IPv4 written as four decimal octets (0-255) or four 0x-prefixed hex octets.
        octet = r'(?:25[0-5]|2[0-4]\d|[01]?\d\d?)'
        hex_octet = r'0x[0-9a-f]{1,2}'
        ipv4_forms = rf'(?:{octet}\.){{3}}{octet}|(?:{hex_octet}\.){{3}}{hex_octet}'
        if re.fullmatch(ipv4_forms, host):
            return 1

        # Let the standard library decide on IPv6 instead of a hand-written regex.
        try:
            ipaddress.IPv6Address(host)
        except ValueError:
            return 0
        return 1

    df['has_ip'] = df['url'].apply(has_ip_address)
    df['has_https'] = df['url'].apply(lambda x: 1 if urlparse(x).scheme == 'https' else 0)

    # Check for common shortening services.
    # The pattern is matched against the URL's host only and is anchored on
    # domain-label boundaries. Searching the whole URL unanchored would flag
    # ordinary sites: e.g. "t\.co" occurs inside "microsoft.com" and "x\.co"
    # inside "netflix.com". Subdomains of a shortener (www.bit.ly) still match.
    shortening_services = (
        r"(?:^|\.)(?:"
        r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|"
        r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|zpr\.io|tcrn\.ch|"
        r"filoops\.info|v\.gd|link\.zip\.net"
        r")$"
    )

    def is_shortened_url(url):
        """Return 1 if the URL's host is a known shortener (or a subdomain of one), else 0.

        URLs without a scheme have no parsed host and therefore yield 0.
        """
        host = urlparse(url).hostname or ''
        return 1 if re.search(shortening_services, host) else 0

    df['is_shortened'] = df['url'].apply(is_shortened_url)

    print("Feature engineering complete. Displaying new features:")
    print(df.head())


# ## 2.4 Data Preprocessing

if df is not None:
    print("\nStarting data preprocessing...")
    # Separate the label from the predictors. The raw 'url' text is dropped
    # together with the label, leaving only the engineered columns as inputs.
    y = df['status']
    X = df.drop(['url', 'status'], axis=1)

    # Encode the string labels as integers ('phishing' -> 1, 'legitimate' -> 0);
    # the mapping actually produced is printed below for confirmation.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    print("Target variable encoded.")
    # Show mapping
    print({label: code for label, code in zip(le.classes_, le.transform(le.classes_))})

    # Hold out 20% of the rows for testing. Stratifying on the encoded label
    # keeps the class proportions equal in both partitions, and the fixed
    # random_state makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_encoded,
        test_size=0.2,
        stratify=y_encoded,
        random_state=42,
    )
    print("Data split into training and testing sets.")
    for split_name, split in (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    ):
        print(f"{split_name} shape: {split.shape}")


# ## 2.5 Save Processed Data
# Guard on df, like sections 2.3 and 2.4, rather than on whether a name
# happens to exist: a stale X_train left in the kernel by an earlier run must
# not be written out (and reported as success) when this run failed to load data.
if df is not None:
    print("\nSaving processed data...")
    # Feature matrices are DataFrames: write them as CSV without the index column.
    X_train.to_csv(os.path.join(processed_data_path, 'X_train.csv'), index=False)
    X_test.to_csv(os.path.join(processed_data_path, 'X_test.csv'), index=False)
    # Save y_train and y_test as well, as they are numpy arrays now
    np.save(os.path.join(processed_data_path, 'y_train.npy'), y_train)
    np.save(os.path.join(processed_data_path, 'y_test.npy'), y_test)
    # Report the directory actually used instead of a hardcoded copy of it.
    print(f"Processed data saved successfully to {processed_data_path}")
else:
    print("\nPreprocessing was not completed. Skipping save step.")



Loading raw data...
Raw data loaded successfully.

Starting feature engineering...
Feature engineering complete. Displaying new features:
                                                 url      status  url_length  \
0              http://www.crestonwood.com/router.php  legitimate          37   
1  http://shadetreetechnology.com/V4/validation/a...    phishing          77   
2  https://support-appleld.com.secureupdate.duila...    phishing         126   
3                                 http://rgipt.ac.in  legitimate          18   
4  http://www.iracing.com/tracks/gateway-motorspo...  legitimate          55   

   hostname_length  path_length  count_hyphens  count_dots  count_at  \
0               19           11              0           3         0   
1               23           47              0           1         0   
2               50           20              1           4         0   
3               11            0              0           2         0   
4               15   