In [7]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from urllib.parse import urlparse
import tldextract
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

def extract_manual_features(url):
    if pd.isna(url):
        return np.zeros(13)
    parsed = urlparse(url)
    extracted = tldextract.extract(url)
    url_length = len(url)
    domain_length = len(extracted.domain)
    tld_length = len(extracted.suffix)
    subdomain_count = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    digit_count = sum(c.isdigit() for c in url)
    special_char_count = sum(1 for c in url if not c.isalnum())
    has_https = 1 if parsed.scheme == 'https' else 0
    has_query = 1 if parsed.query else 0
    has_path = 1 if parsed.path else 0
    has_fragment = 1 if parsed.fragment else 0
    has_ip = 1 if re.match(r'\d+\.\d+\.\d+\.\d+', extracted.domain) else 0
    hyphen_count = url.count('-')
    dot_count = url.count('.')
    return np.array([url_length, domain_length, tld_length, subdomain_count,
                     digit_count, special_char_count, has_https, has_query,
                     has_path, has_fragment, has_ip, hyphen_count, dot_count])

class PhishingDetector:
    def __init__(self, model_path, feature_extractor_path, scaler_path='scaler.npy', tokenizer_path='tokenizer.pkl'):
        self.model = load_model(model_path)
        self.feature_extractor = load_model(feature_extractor_path)
        self.scaler = np.load(scaler_path, allow_pickle=True).item()
        with open(tokenizer_path, 'rb') as f:
            data = pickle.load(f)
            self.tokenizer = data['tokenizer']
            self.max_length = data['max_length']

    def preprocess_url(self, url):
        manual_features = extract_manual_features(url)
        sequence = self.tokenizer.texts_to_sequences([url])
        padded_sequence = pad_sequences(sequence, maxlen=self.max_length, padding='post')
        deep_features = self.feature_extractor.predict([padded_sequence, manual_features.reshape(1, -1)], verbose=0)
        combined_features = np.hstack([manual_features, deep_features[0]])
        combined_features = self.scaler.transform(combined_features.reshape(1, -1))
        return combined_features.reshape(1, -1, 1)

    def predict(self, url):
        features = self.preprocess_url(url)
        prediction = self.model.predict(features, verbose=0)[0][0]
        is_phishing = prediction > 0.5
        confidence = prediction if is_phishing else 1 - prediction
        return {'url': url, 'is_phishing': is_phishing, 'confidence': float(confidence), 'probability': float(prediction)}

    def predict_batch(self, urls):
        return [self.predict(url) for url in urls]

def detect_phishing(model_path, feature_extractor_path, urls, scaler_path='scaler.npy', tokenizer_path='tokenizer.pkl'):
    detector = PhishingDetector(model_path, feature_extractor_path, scaler_path, tokenizer_path)
    if isinstance(urls, str):
        result = detector.predict(urls)
        print(f"\nURL: {result['url']}")
        print(f"Prediction: {'Phishing' if result['is_phishing'] else 'Legitimate'}")
        print(f"Confidence: {result['confidence']:.4f}")
        print(f"Raw Probability: {result['probability']:.4f}")
        return result
    else:
        results = detector.predict_batch(urls)
        print("\nBatch Prediction Results:")
        for result in results:
            print(f"URL: {result['url']}")
            print(f"Prediction: {'Phishing' if result['is_phishing'] else 'Legitimate'}")
            print(f"Confidence: {result['confidence']:.4f}")
            print(f"Raw Probability: {result['probability']:.4f}")
            print("---")
        return results

if __name__ == "__main__":
    MODEL_PATH = "./models/best_model_cnn_model.h5"
    FEATURE_EXTRACTOR_PATH = "./feature_extractor.h5"
    test_urls = [
        "https://mail.google.com/mail/u/0/#inbox/FMfcgzQZTVmsrcSDgxtvlXVNmsmslNWW",
        "http://glaters.com/4nmFje21224QRSX959bkzzmivacg161",
        "http://consigneservices.com/I",
        "https://southadore.top/plpw806d/29335154517850877740d6023e?_t1740894072375",
        "https://in.linkedin.com/in/akshat-nautiyal-048527186",
    ]
    single_result = detect_phishing(MODEL_PATH, FEATURE_EXTRACTOR_PATH, test_urls[1])
    batch_results = detect_phishing(MODEL_PATH, FEATURE_EXTRACTOR_PATH, test_urls)




URL: http://glaters.com/4nmFje21224QRSX959bkzzmivacg161
Prediction: Phishing
Confidence: 0.9990
Raw Probability: 0.9990

Batch Prediction Results:
URL: https://mail.google.com/mail/u/0/#inbox/FMfcgzQZTVmsrcSDgxtvlXVNmsmslNWW
Prediction: Legitimate
Confidence: 0.7125
Raw Probability: 0.2875
---
URL: http://glaters.com/4nmFje21224QRSX959bkzzmivacg161
Prediction: Phishing
Confidence: 0.9990
Raw Probability: 0.9990
---
URL: http://consigneservices.com/I
Prediction: Phishing
Confidence: 1.0000
Raw Probability: 1.0000
---
URL: https://southadore.top/plpw806d/29335154517850877740d6023e?_t1740894072375
Prediction: Phishing
Confidence: 0.9215
Raw Probability: 0.9215
---
URL: https://in.linkedin.com/in/akshat-nautiyal-048527186
Prediction: Legitimate
Confidence: 0.9875
Raw Probability: 0.0125
---


In [8]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from urllib.parse import urlparse
import tldextract
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

def extract_manual_features(url):
    # Keeping this function for consistency, though it may not be used by the model
    if pd.isna(url):
        return np.zeros(13)
    parsed = urlparse(url)
    extracted = tldextract.extract(url)
    url_length = len(url)
    domain_length = len(extracted.domain)
    tld_length = len(extracted.suffix)
    subdomain_count = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
    digit_count = sum(c.isdigit() for c in url)
    special_char_count = sum(1 for c in url if not c.isalnum())
    has_https = 1 if parsed.scheme == 'https' else 0
    has_query = 1 if parsed.query else 0
    has_path = 1 if parsed.path else 0
    has_fragment = 1 if parsed.fragment else 0
    has_ip = 1 if re.match(r'\d+\.\d+\.\d+\.\d+', extracted.domain) else 0
    hyphen_count = url.count('-')
    dot_count = url.count('.')
    return np.array([url_length, domain_length, tld_length, subdomain_count,
                     digit_count, special_char_count, has_https, has_query,
                     has_path, has_fragment, has_ip, hyphen_count, dot_count])

class PhishingDetector:
    def __init__(self, model_path, tokenizer_path='tokenizer.pkl'):
        # Load the Attention-based model
        self.model = load_model(model_path)
        with open(tokenizer_path, 'rb') as f:
            data = pickle.load(f)
            self.tokenizer = data['tokenizer']
            self.max_length = data['max_length']

    def preprocess_url(self, url):
        # Convert URL to sequence and pad - single input for Attention model
        sequence = self.tokenizer.texts_to_sequences([url])
        padded_sequence = pad_sequences(sequence, maxlen=self.max_length, padding='post')
        return padded_sequence  # Return as single input

    def predict(self, url):
        # Get preprocessed input
        inputs = self.preprocess_url(url)
        
        # Make prediction using Attention model (single input)
        prediction = self.model.predict(inputs, verbose=0)[0][0]
        is_phishing = prediction > 0.5
        confidence = prediction if is_phishing else 1 - prediction
        
        return {
            'url': url,
            'is_phishing': is_phishing,
            'confidence': float(confidence),
            'probability': float(prediction)
        }

    def predict_batch(self, urls):
        # Process multiple URLs efficiently
        sequences = self.tokenizer.texts_to_sequences(urls)
        padded_sequences = pad_sequences(sequences, maxlen=self.max_length, padding='post')
        
        predictions = self.model.predict(padded_sequences, verbose=0)
        results = []
        for url, pred in zip(urls, predictions):
            pred = pred[0]
            is_phishing = pred > 0.5
            confidence = pred if is_phishing else 1 - pred
            results.append({
                'url': url,
                'is_phishing': is_phishing,
                'confidence': float(confidence),
                'probability': float(pred)
            })
        return results

def detect_phishing(model_path, urls, tokenizer_path='tokenizer.pkl'):
    detector = PhishingDetector(model_path, tokenizer_path)
    
    if isinstance(urls, str):
        result = detector.predict(urls)
        print(f"\nURL: {result['url']}")
        print(f"Prediction: {'Phishing' if result['is_phishing'] else 'Legitimate'}")
        print(f"Confidence: {result['confidence']:.4f}")
        print(f"Raw Probability: {result['probability']:.4f}")
        return result
    else:
        results = detector.predict_batch(urls)
        print("\nBatch Prediction Results:")
        for result in results:
            print(f"URL: {result['url']}")
            print(f"Prediction: {'Phishing' if result['is_phishing'] else 'Legitimate'}")
            print(f"Confidence: {result['confidence']:.4f}")
            print(f"Raw Probability: {result['probability']:.4f}")
            print("---")
        return results

if __name__ == "__main__":
    MODEL_PATH = "./models/model_attention_model.h5"  # Path to your saved Attention model
    test_urls = [
        "https://mail.google.com/mail/u/0/#inbox/FMfcgzQZTVmsrcSDgxtvlXVNmsmslNWW",
        "http://glaters.com/4nmFje21224QRSX959bkzzmivacg161",
        "https://in.linkedin.com/in/akshat-nautiyal-048527186",
    ]
    single_result = detect_phishing(MODEL_PATH, test_urls[1])
    batch_results = detect_phishing(MODEL_PATH, test_urls)



ValueError: Input 0 of layer "functional_5" is incompatible with the layer: expected shape=(None, 45, 1), found shape=(1, 1154)

In [15]:
df['label'].value_counts()

label
0    1001244
1     150530
Name: count, dtype: int64