In [13]:
!pip install python-whois fuzzywuzzy python-geoip-python3 geoip2
!pip install python-whois
!pip install tldextract
!pip install geoip2



In [29]:
# =====================================================
# 🧩 ALL FUNCTIONS USED FOR FEATURE EXTRACTION
# =====================================================

import hashlib
import socket
import ssl
import urllib.parse
from collections import Counter
from datetime import datetime, timezone

import geoip2.database
import numpy as np
import whois
from fuzzywuzzy import fuzz

# GeoIP database handle, opened once when this cell runs and used by get_geoip_data().
# The path is relative, so the .mmdb file must be in the kernel's working directory.
reader = geoip2.database.Reader("GeoLite2-City.mmdb")

# ---------------------------------------------
# 1️⃣ Resolve domain to IP
# ---------------------------------------------
def resolve_ip(domain):
    """
    Resolve a host name to an IPv4 address string.

    Returns None (after printing a diagnostic) when the lookup fails, and also
    when `domain` is empty: socket.gethostbyname("") does not fail but returns
    the wildcard address "0.0.0.0", which is not a usable host address.
    """
    if not domain:
        print(f"Failed to resolve {domain!r}: empty host name")
        return None
    try:
        return socket.gethostbyname(domain)
    except Exception as e:
        # Best-effort lookup: any failure (DNS error, bad type, bad encoding) means "no IP".
        print(f"Failed to resolve {domain}: {e}")
        return None

# ---------------------------------------------
# 2️⃣ WHOIS Information
# ---------------------------------------------
def get_whois_data(domain):
    """
    Look up the WHOIS record for `domain` and derive the domain's age.

    Returns a dict with the keys:
      registrar        - the record's `registrar` attribute
      creation_date    - creation time in UTC as "%Y-%m-%d %H:%M:%S", or None
      domain_age_days  - whole days between creation and now (UTC), or None

    If the lookup raises, a diagnostic is printed and all three values are None.
    """
    try:
        w = whois.whois(domain)
        created = w.creation_date
        # creation_date may be a list of timestamps; use the first (None if the list is empty).
        if isinstance(created, (list, tuple)):
            created = created[0] if created else None

        if isinstance(created, datetime):
            # Do the arithmetic in UTC. An aware value is converted; just dropping its
            # tzinfo would shift the result by the UTC offset. A naive value is assumed
            # to already be in UTC.
            if created.tzinfo is not None:
                created = created.astimezone(timezone.utc)
            else:
                created = created.replace(tzinfo=timezone.utc)
            domain_age_days = (datetime.now(timezone.utc) - created).days
            creation_date = created.strftime("%Y-%m-%d %H:%M:%S")
        else:
            # Missing or non-datetime creation date: the age is unknown, but the
            # registrar is still returned.
            domain_age_days = None
            creation_date = None

        return {
            "registrar": w.registrar,
            "creation_date": creation_date,
            "domain_age_days": domain_age_days
        }
    except Exception as e:
        print(f"WHOIS lookup failed for {domain}: {e}")
        return {"registrar": None, "creation_date": None, "domain_age_days": None}

# ---------------------------------------------
# 3️⃣ Brand Similarity
# ---------------------------------------------
# Brand names that host names are compared against.
brand_list = ["Google", "Facebook", "Microsoft", "Amazon", "PayPal", "Netflix"]

def compute_similarity(domain_name):
    """
    Score how closely a host name resembles one of the brands in `brand_list`.

    Every dot-separated label except the last one (the TLD) is compared,
    case-insensitively, with every brand using fuzz.ratio, and the highest score
    is returned. Checking every label matters because the brand is often not in
    the first one (for example "sites.google.com" or "www.paypal.com").

    Returns 0 for empty input or when scoring fails.
    """
    if not domain_name:
        return 0
    labels = [label for label in domain_name.lower().split('.') if label]
    # Drop the TLD when there is one; a bare host such as "localhost" is kept whole.
    candidates = labels[:-1] if len(labels) > 1 else labels
    try:
        return max(
            (fuzz.ratio(label, brand.lower()) for label in candidates for brand in brand_list),
            default=0,
        )
    except Exception as e:
        print(f"Brand similarity failed: {e}")
        return 0

# ---------------------------------------------
# 4️⃣ GeoIP Lookup
# ---------------------------------------------
def get_geoip_data(domain):
    """
    Resolve `domain` to an IP and look that IP up in the GeoLite2 City database.

    Returns a dict with the keys "ip", "country", "region" and "city". All four
    are None when the domain does not resolve, or when the lookup raises (in
    which case a diagnostic is printed as well).
    """
    blank = {"ip": None, "country": None, "region": None, "city": None}
    try:
        address = resolve_ip(domain)
        if address:
            record = reader.city(address)
            return {
                "ip": address,
                "country": record.country.name,
                "region": record.subdivisions.most_specific.name,
                "city": record.city.name,
            }
        return blank
    except Exception as exc:
        print(f"GeoIP lookup failed for {domain}: {exc}")
        return blank

# ---------------------------------------------
# 5️⃣ ASN Info (placeholder — you already have it)
# ---------------------------------------------
def get_asn_info_from_ip(ip):
    """
    Placeholder ASN lookup.

    The `ip` argument is currently ignored: every call returns the same
    hard-coded sample record. Replace the body with a real ASN database lookup
    before relying on the result.
    """
    sample_record = dict(asn="AS15169", asn_description="GOOGLE LLC")
    return sample_record

def get_asn_reputation(asn, asn_description):
    """
    Map an ASN description to a coarse reputation label.

    Returns "Bad" when there is no description, "Good" when the description
    contains "GOOGLE" (case-insensitive), and "Unknown" otherwise. The `asn`
    argument is accepted but not used by this sample logic.
    """
    if not asn_description:
        return "Bad"
    return "Good" if "GOOGLE" in asn_description.upper() else "Unknown"

# ---------------------------------------------
# 6️⃣ SSL Certificate Core Features
# ---------------------------------------------
def get_ssl_core_features(url, timeout=5):
    """
    Fetch the TLS certificate served for `url` and summarise it.

    The returned dict always has the keys fingerprint_sha256, issuer, subject,
    not_valid_before, not_valid_after and error. For an http URL (or one with no
    scheme) no connection is made and error is "no_ssl_for_http". Any exception
    is reported through error as its string form, with every other field None.
    """
    def _summary(error, fingerprint=None, issuer=None, subject=None, valid_from=None, valid_to=None):
        # Single place that fixes the shape and key order of the result.
        return {
            "fingerprint_sha256": fingerprint,
            "issuer": issuer,
            "subject": subject,
            "not_valid_before": valid_from,
            "not_valid_after": valid_to,
            "error": error,
        }

    def _flatten_name(parts):
        # Each part is iterated as a sequence of (key, value) pairs -> "k=v, k=v"; empty -> None.
        return ", ".join([f"{key}={value}" for part in parts for key, value in part]) or None

    date_format = "%b %d %H:%M:%S %Y %Z"

    try:
        pieces = urllib.parse.urlparse(url)
        host = pieces.hostname or url
        scheme = pieces.scheme or "http"
        port = pieces.port or (443 if scheme == "https" else 80)

        if scheme == "http":
            return _summary("no_ssl_for_http")

        tls_context = ssl.create_default_context()
        with socket.create_connection((host, port), timeout=timeout) as plain_sock:
            with tls_context.wrap_socket(plain_sock, server_hostname=host) as tls_sock:
                raw_cert = tls_sock.getpeercert(binary_form=True)
                decoded = tls_sock.getpeercert()

                digest = hashlib.sha256(raw_cert).hexdigest().upper()
                issuer_text = _flatten_name(decoded.get("issuer", []))
                subject_text = _flatten_name(decoded.get("subject", []))

                valid_from = valid_to = None
                try:
                    valid_from = datetime.strptime(decoded["notBefore"], date_format).isoformat()
                    valid_to = datetime.strptime(decoded["notAfter"], date_format).isoformat()
                except Exception:
                    # Validity dates are best-effort; a value that fails to parse stays None.
                    pass

                return _summary(None, digest, issuer_text, subject_text, valid_from, valid_to)
    except Exception as exc:
        return _summary(str(exc))
# ---------------------------------------------
# 7️⃣ Risk Score Calculation (based on your DB logic)
# ---------------------------------------------
def calculate_risk_score(domain, domain_age_days, brand_similarity, fingerprint_sha256, all_domains):
    """
    Combine four sub-scores, each in the range 0-25, into a 0-100 risk score.

    Sub-scores:
      feed   - log-scaled count of `domain` in `all_domains`, relative to the
               most frequent domain there (0 when `all_domains` is empty/unusable)
      age    - 25 for a brand-new or unknown-age domain, falling linearly to 0 at 5 years
      ssl    - 25 when no certificate fingerprint is available, otherwise 5
      brand  - brand similarity scaled to 0-25

    `brand_similarity` may be on a 0-1 or a 0-100 scale: values above 1 are
    treated as percentages. Missing, non-numeric or NaN age/similarity count as 0.

    Returns (score, level): score is a plain float rounded to two decimals and
    level is "High" (>= 70), "Medium" (>= 40) or "Low".
    """
    # --- Feed frequency -------------------------------------------------
    try:
        # Count every domain once instead of re-scanning the list per element.
        counts = Counter(all_domains if all_domains is not None else ())
        feed_freq = counts[domain] if domain else 1
        most_frequent = max(counts.values(), default=0)
    except (TypeError, ValueError):
        # Non-iterable or unhashable input: no feed signal.
        feed_freq, most_frequent = 0, 0
    if most_frequent > 0:
        feed_score = np.log1p(feed_freq) / np.log1p(most_frequent) * 25
    else:
        feed_score = 0.0

    # --- Domain age (younger = riskier) ----------------------------------
    try:
        age_days = float(domain_age_days or 0)
    except (TypeError, ValueError):
        age_days = 0.0
    if np.isnan(age_days):
        # NaN would otherwise propagate into the total and be labelled "Low".
        age_days = 0.0
    max_age = 365 * 5
    age_score = np.clip((1 - (age_days / max_age)) * 25, 0, 25)

    # --- SSL ---------------------------------------------------------------
    has_fingerprint = fingerprint_sha256 is not None and str(fingerprint_sha256).strip() != ""
    ssl_score = 5 if has_fingerprint else 25

    # --- Brand similarity --------------------------------------------------
    try:
        sim = float(brand_similarity or 0)
    except (TypeError, ValueError):
        sim = 0.0
    if np.isnan(sim):
        sim = 0.0
    elif sim > 1:
        sim /= 100.0
    brand_score = np.clip(sim, 0, 1) * 25

    total = float(np.clip(feed_score + age_score + ssl_score + brand_score, 0, 100))

    if total >= 70:
        level = "High"
    elif total >= 40:
        level = "Medium"
    else:
        level = "Low"

    return round(total, 2), level

In [42]:
# =====================================================
# 🎯 EXTRACT FEATURES FOR ONE INPUT URL + DB CHECK + RISK SCORE
# =====================================================

import urllib.parse
import sqlite3
from datetime import datetime
import phishing_module # Import the phishing_module

# =====================================================
# 1️⃣ Ask user for database path and URL to analyze
# =====================================================
# Both answers are whitespace-stripped. The URL is later matched against the
# database with an exact equality test, so it must be typed exactly as stored.
DB_PATH = input("📁 Enter path to your SQLite database (e.g., phishing_dataset.db): ").strip()
url_input = input("🌐 Enter a URL to analyze: ").strip()

# =====================================================
# 2️⃣ Helper function — check if URL already exists in DB
# =====================================================
def url_exists_in_db(url, db_path):
    """
    Fetch the stored record for `url` from the `urls` table.

    Returns a dict mapping column name to value for the first row whose `url`
    column equals the argument exactly, or None when there is no such row.
    The connection is closed even if the query raises (for example when the
    table does not exist); the exception itself is left to the caller.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute("""
            SELECT url_id, url, domain, date_added, verified, url_status, label,
                   whois_registrar, creation_date, domain_age_days, brand_similarity,
                   ip, country, region, city,
                   fingerprint_sha256, issuer, subject, not_valid_before, not_valid_after,
                   error, risk_score, risk_level
            FROM urls
            WHERE url = ?;
        """, (url,))
        row = cursor.fetchone()
        if row is None:
            return None
        columns = [desc[0] for desc in cursor.description]
        return dict(zip(columns, row))
    finally:
        conn.close()

# =====================================================
# 3️⃣ Main URL analysis logic
# =====================================================
# Main flow: show the stored record when the URL is already in the database;
# otherwise extract every feature, print it, and insert a new row.
if not url_input:
    print("⚠️ No URL entered.")
else:
    print("\n🔍 Checking database for existing record...\n")

    existing_data = url_exists_in_db(url_input, DB_PATH)

    if existing_data:
        print("✅ URL already exists in the database. Retrieving stored data...\n")

        print("🧾 STORED FEATURES FROM DATABASE")
        print("------------------------------------------------------------")
        for key, value in existing_data.items():
            print(f"{key:25s}: {value}")
        print("------------------------------------------------------------")
        print("💾 Retrieved successfully from the database.")

    else:
        print("❌ URL not found in database — extracting all features...\n")

        try:
            # Basic setup
            parsed = urllib.parse.urlparse(url_input)
            domain = parsed.hostname or url_input
            # NOTE(review): the stored row shown in this notebook's output uses an ISO
            # timestamp with a UTC offset and verified="yes"; confirm the formats below
            # match what the rest of the dataset expects.
            date_added = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            verified = "false"
            url_status = "active"
            label = "unknown"

            # Missing features are kept as None so they are stored as SQL NULL;
            # the literal text "null" is used only when printing.

            # WHOIS
            print("📜 Getting WHOIS info...")
            whois_info = phishing_module.get_whois_data(domain)
            # Accept either key spelling: get_whois_data as defined in this notebook
            # returns "registrar", while the database column is "whois_registrar".
            registrar = whois_info.get("whois_registrar") or whois_info.get("registrar") or None
            creation_date = whois_info.get("creation_date") or None
            domain_age_days = whois_info.get("domain_age_days") or 0

            # Brand Similarity
            print("🏷️ Calculating Brand Similarity...")
            brand_similarity = phishing_module.compute_brand_similarity(domain)

            # GeoIP
            print("🌍 Getting GeoIP data...")
            geo_data = phishing_module.get_geoip_data(domain)
            ip = geo_data.get("ip") or None
            country = geo_data.get("country") or None
            region = geo_data.get("region") or None
            city = geo_data.get("city") or None

            # ASN
            print("🔢 Getting ASN info...")
            # Placeholder - phishing_module does not have ASN functions currently.
            # These values are displayed only; the INSERT below has no ASN columns.
            asn = None
            asn_description = None
            asn_reputation = None

            # SSL
            print("🔒 Getting SSL certificate info...")
            ssl_info = phishing_module.get_ssl_core_features(url_input)
            fingerprint_sha256 = ssl_info.get("fingerprint_sha256") or None
            issuer = ssl_info.get("issuer") or None
            subject = ssl_info.get("subject") or None
            not_valid_before = ssl_info.get("not_valid_before") or None
            not_valid_after = ssl_info.get("not_valid_after") or None
            error = ssl_info.get("error") or None

            # Risk Score
            print("📊 Calculating Risk Score...")
            # NOTE(review): this list is hard-coded rather than read from the database,
            # so the feed-frequency input does not reflect the real feed.
            all_domains = [domain, "google.com", "facebook.com", "paypal.com"]
            risk_score, risk_level = phishing_module.calculate_risk_score(
                domain=domain,
                domain_age_days=domain_age_days,
                brand_similarity=brand_similarity,
                fingerprint_sha256=fingerprint_sha256,
                all_domains=all_domains
            )

            # Final Output
            display_rows = [
                ("domain", domain),
                ("date_added", date_added),
                ("verified", verified),
                ("url_status", url_status),
                ("label", label),
                ("whois_registrar", registrar or "null"),
                ("creation_date", creation_date or "null"),
                ("domain_age_days", domain_age_days),
                ("brand_similarity", brand_similarity),
                ("ip", ip or "null"),
                ("country", country or "null"),
                ("region", region or "null"),
                ("city", city or "null"),
                ("asn", asn or "null"),
                ("asn_description", asn_description or "null"),
                ("asn_reputation", asn_reputation or "null"),
                ("fingerprint_sha256", fingerprint_sha256 or "null"),
                ("issuer", issuer or "null"),
                ("subject", subject or "null"),
                ("not_valid_before", not_valid_before or "null"),
                ("not_valid_after", not_valid_after or "null"),
                ("error", error or "null"),
                ("risk_score", risk_score),
                ("risk_level", risk_level),
            ]
            print("\n🧾 FINAL EXTRACTED FEATURES")
            print("------------------------------------------------------------")
            for field_name, shown in display_rows:
                print(f"{field_name:25s}: {shown}")
            print("------------------------------------------------------------")
            print("✅ Feature extraction completed successfully.\n")

            # Insert into DB. Columns and values are kept as two parallel tuples and the
            # placeholders are generated from the column list, so the three cannot drift
            # apart (the previous statement named 22 columns but bound 25 values).
            insert_columns = (
                "url", "domain", "date_added", "verified", "url_status", "label",
                "whois_registrar", "creation_date", "domain_age_days", "brand_similarity",
                "ip", "country", "region", "city",
                "fingerprint_sha256", "issuer", "subject", "not_valid_before", "not_valid_after",
                "error", "risk_score", "risk_level",
            )
            insert_values = (
                url_input, domain, date_added, verified, url_status, label,
                registrar, creation_date, domain_age_days, brand_similarity,
                ip, country, region, city,
                fingerprint_sha256, issuer, subject, not_valid_before, not_valid_after,
                error, risk_score, risk_level,
            )
            placeholders = ", ".join("?" for _ in insert_columns)

            conn = sqlite3.connect(DB_PATH)
            try:
                # Column names are fixed constants above; all data goes through bound parameters.
                conn.execute(
                    f"INSERT INTO urls ({', '.join(insert_columns)}) VALUES ({placeholders});",
                    insert_values,
                )
                conn.commit()
            finally:
                # Release the connection even when the INSERT fails.
                conn.close()
            print("💾 Data successfully inserted into the database.")

        except Exception as e:
            print(f"❌ Extraction failed: {e}")

📁 Enter path to your SQLite database (e.g., phishing_dataset.db): /content/da.db
🌐 Enter a URL to analyze: https://sites.google.com/view/porepaotpaeotae/	

🔍 Checking database for existing record...

✅ URL already exists in the database. Retrieving stored data...

🧾 STORED FEATURES FROM DATABASE
------------------------------------------------------------
url_id                   : 1
url                      : https://sites.google.com/view/porepaotpaeotae/
domain                   : sites.google.com
date_added               : 2023-10-06T16:47:02+00:00
verified                 : yes
url_status               : None
label                    : phishing
whois_registrar          : MarkMonitor, Inc.
creation_date            : 1997-09-15 04:00:00
domain_age_days          : 10257
brand_similarity         : 100.0
ip                       : 142.250.152.102
country                  : United States
region                   : None
city                     : None
fingerprint_sha256       : CA9418CF3D