In [1]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import tldextract
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.utils import plot_model  # Added for visualization
import re
import pickle
import os

def _phishing_url_frame(urls):
    """Build a fresh ``['url', 'label']`` frame from a Series of URLs.

    Null URLs are dropped, the rest are converted to ``str`` and every row is
    labelled ``1`` (phishing). A new frame is built (instead of mutating a
    filtered slice) so no chained-assignment warnings can occur.
    """
    cleaned = urls.dropna().astype(str)
    return pd.DataFrame({'url': cleaned, 'label': 1})


def load_phishing_data(file_path1, file_path2, max_urls=1000):
    """Load phishing URL datasets from two sources and combine them.

    Args:
        file_path1: CSV with a ``url`` column; every row is treated as phishing.
        file_path2: CSV with a ``url`` (or ``URL``) column and a ``label``
            column; only rows whose label equals 1 are kept.
        max_urls: Currently unused by this function. It is kept in the
            signature so existing callers keep working.

    Returns:
        A DataFrame with columns ``['url', 'label']`` (label is always 1),
        de-duplicated on ``url``, or ``None`` when neither file could be
        loaded.
    """
    # Collect per-file frames and concatenate once at the end; concatenating
    # onto an empty, untyped placeholder frame relies on deprecated pandas
    # behaviour for deciding the result dtypes.
    frames = []

    if os.path.exists(file_path1):
        try:
            df1 = pd.read_csv(file_path1, encoding='utf-8', on_bad_lines='skip')
            print("Columns in online_valid.csv:")
            print(df1.columns.tolist())
            if 'url' not in df1.columns:
                print(f"Warning: Column 'url' not found in {file_path1}. Skipping this dataset.")
            else:
                print(f"Number of NaN URLs in online_valid.csv: {df1['url'].isna().sum()}")
                # Every URL in this source is phishing, so label them all 1.
                df1 = _phishing_url_frame(df1['url'])
                frames.append(df1)
                print(f"Loaded {len(df1)} URLs from {file_path1}")
        except Exception as e:
            # Best-effort loading: report the problem and fall through to the
            # second source rather than aborting.
            print(f"Error loading {file_path1}: {e}")
    else:
        print(f"Warning: File {file_path1} not found. Skipping this dataset.")

    if os.path.exists(file_path2):
        try:
            df2 = pd.read_csv(file_path2, encoding='utf-8', on_bad_lines='skip')
            print("Columns in phishing_urls.csv:")
            print(df2.columns.tolist())
            # Accept 'URL' as an alias, but never create a duplicate 'url'
            # column when both spellings are present.
            if 'URL' in df2.columns and 'url' not in df2.columns:
                df2 = df2.rename(columns={'URL': 'url'})
            if 'url' not in df2.columns:
                print(f"Warning: Column 'url' not found in {file_path2}. Skipping this dataset.")
            elif 'label' not in df2.columns:
                # Without labels we cannot tell which rows are phishing, so
                # skip explicitly instead of failing with a bare KeyError.
                print(f"Warning: Column 'label' not found in {file_path2}. Skipping this dataset.")
            else:
                print(f"Number of NaN URLs in phishing_urls.csv: {df2['url'].isna().sum()}")
                # Keep only the rows marked as phishing (label == 1).
                df2 = _phishing_url_frame(df2.loc[df2['label'] == 1, 'url'])
                frames.append(df2)
                print(f"Loaded {len(df2)} URLs from {file_path2}")
        except Exception as e:
            print(f"Error loading {file_path2}: {e}")
    else:
        print(f"Warning: File {file_path2} not found. Skipping this dataset.")

    if not frames:
        print("Error: Failed to load any phishing datasets.")
        return None

    combined_phishing_df = pd.concat(frames, ignore_index=True)
    # The same URL can appear in both feeds; keep the first occurrence.
    combined_phishing_df = combined_phishing_df.drop_duplicates(subset=['url'])
    print(f"Total phishing URLs after deduplication: {len(combined_phishing_df)}")

    return combined_phishing_df[['url', 'label']]

def load_alexa_data(file_path, max_urls=1000):
    """Read the Alexa Top 1M ranking and label every entry as legitimate.

    The CSV is read without a header row as two columns, ``rank`` and
    ``url``. Rows whose URL is missing are discarded, the remaining URLs are
    converted to strings, and a ``label`` column set to 0 is attached.

    ``max_urls`` is accepted but not used by this function.

    Returns a DataFrame with columns ``['url', 'label']``, or ``None`` when
    the file does not exist or cannot be read.
    """
    if not os.path.exists(file_path):
        print(f"Error: File {file_path} not found.")
        return None

    try:
        ranking = pd.read_csv(file_path, names=['rank', 'url'])
        print("Columns in top-1m.csv:")
        print(list(ranking.columns))

        if 'url' in ranking.columns:
            missing_total = ranking['url'].isna().sum()
            print(f"Number of NaN URLs in top-1m.csv: {missing_total}")

            # Keep only rows that actually carry a URL (original row index
            # is preserved), then normalise the type and attach the label.
            has_url = ranking['url'].notna()
            legit = ranking.loc[has_url, ['url']].copy()
            legit['url'] = legit['url'].astype(str)
            legit['label'] = 0  # Legitimate = 0

            print(f"Loaded {len(legit)} URLs from {file_path}")
            return legit

        print(f"Error: Column 'url' not found in {file_path}.")
        return None
    except Exception as err:
        print(f"Error loading Alexa data: {err}")
        return None

def load_legitimate_data(file_path, max_urls=1000):
    """Read the supplementary legitimate-URL CSV.

    The file must contain a ``url`` column. Rows with a missing URL are
    removed, URLs are converted to strings, and ``label`` is set to 0 for
    every row (overwriting any ``label`` column already in the file).

    ``max_urls`` is accepted but not used by this function.

    Returns a DataFrame with columns ``['url', 'label']``, or ``None`` when
    the file is missing, lacks a ``url`` column, or cannot be read.
    """
    result = None

    if not os.path.exists(file_path):
        print(f"Error: File {file_path} not found.")
    else:
        try:
            raw = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
            print("Columns in legitimate_urls.csv:")
            print(raw.columns.tolist())

            if 'url' in raw.columns:
                nan_urls = raw['url'].isna().sum()
                print(f"Number of NaN URLs in legitimate_urls.csv: {nan_urls}")

                # Drop rows without a URL, force label 0 (legitimate) and
                # make sure every URL is a plain string.
                present = raw.dropna(subset=['url']).assign(label=0)
                present = present.assign(url=present['url'].astype(str))

                print(f"Loaded {len(present)} URLs from {file_path}")
                result = present.loc[:, ['url', 'label']]
            else:
                print(f"Error: Column 'url' not found in {file_path}.")
        except Exception as exc:
            print(f"Error loading legitimate data: {exc}")

    return result

def extract_url_features(url):
    """Extract 13 hand-crafted numeric features from a single URL.

    Feature order (matches the column names used by the caller):
    url_length, domain_length, tld_length, subdomain_count, digit_count,
    special_char_count, has_https, has_query, has_path, has_fragment,
    has_ip, hyphen_count, dot_count.

    Args:
        url: The URL to analyse. Missing values (``None``/NaN) yield an
            all-zero feature vector; any other value is converted to ``str``.

    Returns:
        A ``pd.Series`` with 13 integer values.
    """
    if pd.isna(url):
        return pd.Series([0] * 13)
    url = str(url)

    try:
        parsed = urlparse(url)
        if not parsed.scheme and not parsed.netloc:
            # A scheme-less input such as "google.com/x" is parsed with the
            # host inside ``path``, which would make has_path 1 for every
            # bare domain. Re-parse it as a network-location reference so the
            # host lands in ``netloc`` and ``path`` is the real path.
            parsed = urlparse('//' + url)
        scheme, query, path, fragment = (parsed.scheme, parsed.query,
                                         parsed.path, parsed.fragment)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets). Treat their components as absent instead of letting one
        # bad row abort the whole feature-extraction pass.
        scheme = query = path = fragment = ''

    extracted = tldextract.extract(url)
    url_length = len(url)
    domain_length = len(extracted.domain)
    tld_length = len(extracted.suffix)
    subdomain_count = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    digit_count = sum(c.isdigit() for c in url)
    # Everything that is not a letter or digit counts as "special".
    special_char_count = sum(1 for c in url if not c.isalnum())
    has_https = 1 if scheme == 'https' else 0
    has_query = 1 if query else 0
    has_path = 1 if path else 0
    has_fragment = 1 if fragment else 0
    # Flags a domain part that starts with a dotted-quad number pattern.
    has_ip = 1 if re.match(r'\d+\.\d+\.\d+\.\d+', extracted.domain) else 0
    hyphen_count = url.count('-')
    dot_count = url.count('.')
    return pd.Series([url_length, domain_length, tld_length, subdomain_count,
                      digit_count, special_char_count, has_https, has_query,
                      has_path, has_fragment, has_ip, hyphen_count, dot_count])

def create_feature_extraction_model(vocab_size, max_length, embedding_dim=128,
                                    manual_feature_count=13, output_dim=32):
    """Create the two-input Keras model used for feature extraction.

    One branch embeds the character-index sequence of a URL and passes it
    through two stacked LSTM layers; the other branch feeds the manual
    feature vector through a dense layer. Both branches are concatenated and
    reduced by two dense layers to the final feature vector.

    The model is returned as built: this function neither compiles nor
    trains it.

    Args:
        vocab_size: Size of the embedding vocabulary (number of distinct
            token indices, including index 0).
        max_length: Length of the padded URL sequences.
        embedding_dim: Width of the character embedding (default 128).
        manual_feature_count: Number of manual features per URL (default 13).
        output_dim: Width of the final feature vector (default 32).

    Returns:
        A ``Model`` taking ``[url_input, manual_input]`` and producing a
        tensor of shape ``(batch, output_dim)``.
    """
    # Sequence branch: embedding followed by two stacked LSTMs.
    url_input = Input(shape=(max_length,), name='url_input')
    embedding = Embedding(vocab_size, embedding_dim, name='embedding')(url_input)
    lstm1 = LSTM(64, return_sequences=True, name='lstm_1')(embedding)
    lstm2 = LSTM(32, name='lstm_2')(lstm1)

    # Manual-feature branch.
    manual_input = Input(shape=(manual_feature_count,), name='manual_input')
    dense_manual = Dense(32, activation='relu', name='dense_manual')(manual_input)

    # Merge both branches and project to the output feature vector.
    concatenated = Concatenate(name='concatenate')([lstm2, dense_manual])
    dense1 = Dense(64, activation='relu', name='dense_1')(concatenated)
    output = Dense(output_dim, activation='relu', name='output')(dense1)
    return Model(inputs=[url_input, manual_input], outputs=output)

def process_datasets(phishing_file1, phishing_file2, alexa_file, legit_file, output_file, feature_extractor_path='feature_extractor.h5', tokenizer_path='tokenizer.pkl', model_plot_path='model_architecture.png', max_sequence_length=None):
    """Load all datasets, extract manual and deep features, and save them.

    Steps: load the phishing / Alexa / legitimate datasets, merge and
    de-duplicate them, compute the 13 manual URL features, tokenise URLs at
    character level, run the feature-extraction network, and write the
    features, the model and the tokenizer to disk.

    Note: the network is used exactly as built by
    ``create_feature_extraction_model``; no training step happens in this
    function, so the deep-feature column names below are labels only.

    Args:
        phishing_file1: Path of the first phishing CSV.
        phishing_file2: Path of the second phishing CSV.
        alexa_file: Path of the Alexa Top 1M CSV.
        legit_file: Path of the additional legitimate-URL CSV.
        output_file: CSV path the combined feature table is written to.
        feature_extractor_path: Path the Keras model is saved to.
        tokenizer_path: Path the pickled tokenizer and ``max_length`` are
            saved to.
        model_plot_path: Path of the architecture diagram image.
        max_sequence_length: Optional cap on the padded sequence length.
            ``None`` (default) pads every URL to the longest URL in the
            data. With a cap, longer URLs are truncated at the end, which
            bounds memory use and prediction time when a few URLs are
            extremely long.

    Returns:
        The final DataFrame: ``url``, ``label``, the manual features and the
        deep features.

    Raises:
        ValueError: If a dataset fails to load, no URLs remain after
            cleaning, or ``max_sequence_length`` is not a positive integer.
    """
    if max_sequence_length is not None and max_sequence_length < 1:
        raise ValueError("max_sequence_length must be a positive integer or None.")

    print("Loading datasets...")
    phishing_df = load_phishing_data(phishing_file1, phishing_file2)
    alexa_df = load_alexa_data(alexa_file)
    legit_df = load_legitimate_data(legit_file)

    if phishing_df is None or alexa_df is None or legit_df is None:
        raise ValueError("One or more datasets failed to load. Check the error messages above.")

    combined_df = pd.concat([phishing_df, alexa_df, legit_df], ignore_index=True)
    print(f"Total URLs: {len(combined_df)}")

    # Work on an explicit copy so the column assignment below cannot trigger
    # chained-assignment warnings, and end with a clean 0..n-1 index so the
    # feature frames built later line up row by row.
    combined_df = combined_df.dropna(subset=['url']).copy()
    combined_df['url'] = combined_df['url'].astype(str)
    combined_df = combined_df.drop_duplicates(subset=['url']).reset_index(drop=True)
    print(f"After removing duplicates and NaN URLs: {len(combined_df)}")

    if combined_df.empty:
        # Fail with a clear message instead of an opaque error from max()
        # on an empty sequence further down.
        raise ValueError("No URLs left to process after cleaning.")

    print("Extracting manual features...")
    feature_columns = ['url_length', 'domain_length', 'tld_length', 'subdomain_count',
                       'digit_count', 'special_char_count', 'has_https', 'has_query',
                       'has_path', 'has_fragment', 'has_ip', 'hyphen_count', 'dot_count']
    # Building the frame from a list of rows avoids the per-row Series
    # expansion that ``Series.apply`` performs.
    manual_features = pd.DataFrame(
        [extract_url_features(url).tolist() for url in combined_df['url']],
        columns=feature_columns,
    )

    print("Preparing URL sequences...")
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(combined_df['url'])
    sequences = tokenizer.texts_to_sequences(combined_df['url'])
    longest_sequence = max(len(seq) for seq in sequences)
    if max_sequence_length is None:
        max_length = longest_sequence
    else:
        max_length = min(longest_sequence, max_sequence_length)
    # With the default (no cap) nothing is truncated; with a cap the start
    # of each URL is kept and the tail is cut.
    padded_sequences = pad_sequences(sequences, maxlen=max_length,
                                     padding='post', truncating='post')

    print("Extracting deep learning features...")
    vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
    model = create_feature_extraction_model(vocab_size, max_length)

    # Visualize the model
    print("Visualizing model architecture...")
    model.summary()  # Print textual summary of the model
    plot_warning = "Warning: Could not generate model plot. Ensure graphviz and pydot are installed."
    # plot_model may print a message and return without raising when
    # graphviz is missing, so confirm the file was actually (re)written
    # before reporting success.
    previous_mtime = os.path.getmtime(model_plot_path) if os.path.exists(model_plot_path) else None
    try:
        plot_model(model, to_file=model_plot_path, show_shapes=True, show_layer_names=True, dpi=96)
    except Exception as e:
        print(f"{plot_warning} Error: {e}")
    else:
        plot_written = (os.path.exists(model_plot_path)
                        and os.path.getmtime(model_plot_path) != previous_mtime)
        if plot_written:
            print(f"Model architecture saved to {model_plot_path}")
        else:
            print(plot_warning)

    manual_features_array = manual_features.to_numpy()
    deep_features = model.predict([padded_sequences, manual_features_array])

    # Column names for the 32 deep learning features
    deep_feature_names = [
        'char_sequence_complexity_1', 'char_sequence_complexity_2',
        'char_transition_entropy_1', 'char_transition_entropy_2',
        'subdomain_pattern_score_1', 'subdomain_pattern_score_2',
        'domain_wordlikeness_1', 'domain_wordlikeness_2',
        'path_structure_depth_1', 'path_structure_depth_2',
        'query_param_density_1', 'query_param_density_2',
        'special_char_distribution_1', 'special_char_distribution_2',
        'numeric_sequence_ratio_1', 'numeric_sequence_ratio_2',
        'url_anomaly_score_1', 'url_anomaly_score_2',
        'phishing_syntax_indicator_1', 'phishing_syntax_indicator_2',
        'legit_pattern_similarity_1', 'legit_pattern_similarity_2',
        'domain_obfuscation_level_1', 'domain_obfuscation_level_2',
        'path_obfuscation_level_1', 'path_obfuscation_level_2',
        'query_suspicion_score_1', 'query_suspicion_score_2',
        'protocol_usage_pattern_1', 'protocol_usage_pattern_2',
        'combined_feature_entropy_1', 'combined_feature_entropy_2'
    ]

    # All three frames share the same 0..n-1 index, so a column-wise concat
    # lines the rows up one to one.
    feature_df = pd.DataFrame(deep_features, columns=deep_feature_names)
    final_df = pd.concat([combined_df, manual_features, feature_df], axis=1)

    print("Saving extracted features...")
    final_df.to_csv(output_file, index=False)
    model.save(feature_extractor_path)
    with open(tokenizer_path, 'wb') as f:
        # max_length is stored with the tokenizer so new URLs can be padded
        # to the length the saved model expects.
        pickle.dump({'tokenizer': tokenizer, 'max_length': max_length}, f)
    print(f"Features saved to {output_file}")
    print(f"Feature extractor saved to {feature_extractor_path}")
    print(f"Tokenizer and max_length saved to {tokenizer_path}")

    print("\nFeature Summary:")
    # Subtract the two non-feature columns: 'url' and 'label'.
    print(f"Total features extracted: {len(final_df.columns) - 2}")
    print(f"Manual features: {len(feature_columns)}")
    print(f"Deep learning features: {len(deep_feature_names)}")

    return final_df

if __name__ == "__main__":
    # Input datasets: two phishing feeds, the Alexa ranking and an extra
    # list of legitimate URLs.
    PHISHING_FILE_1 = "./data/online_valid.csv"
    PHISHING_FILE_2 = "./data/phishing_urls.csv"
    ALEXA_FILE = "./data/top-1m.csv"
    LEGIT_FILE = "./data/legitimate_urls.csv"

    # Destination of the combined feature table.
    OUTPUT_FILE = "./data/extracted_features1.csv"

    features_df = process_datasets(
        phishing_file1=PHISHING_FILE_1,
        phishing_file2=PHISHING_FILE_2,
        alexa_file=ALEXA_FILE,
        legit_file=LEGIT_FILE,
        output_file=OUTPUT_FILE,
    )

Loading datasets...
Columns in online_valid.csv:
['id', 'dateadded', 'url', 'url_status', 'last_online', 'threat', 'tags', 'urlhaus_link', 'reporter']
Number of NaN URLs in online_valid.csv: 0
Loaded 150530 URLs from ./data/online_valid.csv
Columns in phishing_urls.csv:
['URL', 'label']
Number of NaN URLs in phishing_urls.csv: 0
Loaded 134850 URLs from ./data/phishing_urls.csv
Total phishing URLs after deduplication: 285380
Columns in top-1m.csv:
['rank', 'url']
Number of NaN URLs in top-1m.csv: 0
Loaded 1000000 URLs from ./data/top-1m.csv
Columns in legitimate_urls.csv:
['url', 'label']
Number of NaN URLs in legitimate_urls.csv: 0
Loaded 2953 URLs from ./data/legitimate_urls.csv
Total URLs: 1288333
After removing duplicates and NaN URLs: 1288260
Extracting manual features...
Preparing URL sequences...
Extracting deep learning features...
Visualizing model architecture...


You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.
Model architecture saved to model_architecture.png
 1122/40259 ━━━━━━━━━━━━━━━━━━━━ 2:29:11 229ms/step

KeyboardInterrupt: 