In [1]:
!pip install PyDrive
!pip install python-whois


Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5


In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import whois
import requests
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
from google.colab import files
from google.colab import drive

# Mount Google Drive (Colab only reports a notice if it is already mounted)
drive.mount('/content/drive')

# Use Google Colab's File Picker; the result is keyed by uploaded file name
uploaded = files.upload()  # Opens a file selection dialog
if not uploaded:
    raise ValueError("No file was uploaded.")

# Only the first uploaded file is loaded, even when several are selected
file_name = next(iter(uploaded))

# Check the file format (case-insensitively) and load accordingly.
# Fail loudly on anything else: continuing would either crash on an
# undefined `df` or silently reuse a stale `df` from an earlier run.
if file_name.lower().endswith('.csv'):
    df = pd.read_csv(file_name)
elif file_name.lower().endswith(('.xls', '.xlsx')):
    df = pd.read_excel(file_name)
else:
    raise ValueError(f"Unsupported file format: {file_name!r}")

# Display first few rows
print(df.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving 5.urldata.csv to 5.urldata.csv
Saving Detection.xlsx to Detection.xlsx
Saving web-page-phishing.csv to web-page-phishing.csv
             Domain  Have_IP  Have_At  URL_Length  URL_Depth  Redirection  \
0  graphicriver.net        0        0           1          1            0   
1         ecnavi.jp        0        0           1          1            1   
2      hubpages.com        0        0           1          1            0   
3   extratorrent.cc        0        0           1          3            0   
4     icicibank.com        0        0           1          3            0   

   https_Domain  TinyURL  Prefix/Suffix  DNS_Record  Web_Traffic  Domain_Age  \
0             0        0              0           0            1           1   
1             0        0              0           0            1           1   
2             0        0              0           0            1           0   
3             0        0              0           0            1           0   
4    

In [5]:
# Handle missing values: every NaN becomes 0.
# Rebind instead of using inplace=True so the cell does not mutate a frame
# that other cells may already have displayed.
df = df.fillna(0)

In [6]:

# Convert categorical columns to numerical using Label Encoding.
# The fitted encoder for each column is kept in `label_encoders`.
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    # Cast to str first: after missing values were filled with 0, an object
    # column can hold both strings and ints, which LabelEncoder cannot sort.
    df[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le


In [7]:
# Separate the label column (y) from the predictor columns (X)
target_column = 'Label'  # Ensure this matches your dataset

# Stop early with the available column names if the label is absent
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in dataset. Check column names: {df.columns}")

y = df[target_column]
X = df.drop(columns=[target_column])


In [8]:
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [9]:
# Configure the XGBoost classifier, then fit it on the training split
model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    eval_metric="logloss",
)

model.fit(X_train, y_train)

In [10]:
# Function to Extract Features from a URL
def extract_features(url):
    """Build a one-row DataFrame of lexical and lookup features for ``url``.

    Columns, in order: URL_Length, Have_At, Have_IP, URL_Depth, https_Domain,
    Prefix/Suffix, Contains_Numbers, Domain_Age, Blacklisted.

    Args:
        url: The URL string to analyse. A scheme is optional.

    Returns:
        pandas.DataFrame with a single row holding the features above.

    NOTE(review): the training frame printed by ``df.head()`` shows URL_Length
    as 0/1 values, while this function emits raw counts (and raw day counts for
    Domain_Age). Confirm the encoding the model was trained on.
    """
    import os  # local import so this cell does not depend on another cell's imports

    features = {}

    # urlparse leaves netloc empty for scheme-less input such as
    # "example.com/login"; retry with a default scheme so the domain-based
    # features are not computed on an empty string.
    parsed_url = urlparse(url)
    if not parsed_url.netloc:
        parsed_url = urlparse("http://" + url.strip())
    # hostname drops any "user@" prefix and ":port" suffix from the netloc
    domain = parsed_url.hostname or ""

    # Feature 1: URL Length
    features['URL_Length'] = len(url)

    # Feature 2: Presence of '@' symbol
    features['Have_At'] = 1 if "@" in url else 0

    # Feature 3: Presence of IP Address in URL
    ip_pattern = re.compile(r'(\d{1,3}\.){3}\d{1,3}')
    features['Have_IP'] = 1 if ip_pattern.search(url) else 0

    # Feature 4: Count of '/' in URL (URL Depth)
    features['URL_Depth'] = url.count('/')

    # Feature 5: Whether the URL scheme is https
    features['https_Domain'] = 1 if parsed_url.scheme == 'https' else 0

    # Feature 6: Presence of '-' in domain (prefix/suffix)
    features['Prefix/Suffix'] = 1 if '-' in domain else 0

    # Feature 7: Check if domain name contains numbers
    features['Contains_Numbers'] = 1 if any(char.isdigit() for char in domain) else 0

    # Feature 8: Domain age in days from WHOIS (best effort)
    try:
        domain_info = whois.whois(domain)
        features['Domain_Age'] = _domain_age_days(domain_info.creation_date)
    except Exception:
        features['Domain_Age'] = 0  # If WHOIS fails, assume suspicious

    # Feature 9: Check if the URL is blacklisted (Google Safe Browsing API).
    # The key is read from the environment rather than hard-coded; without a
    # key the lookup is skipped instead of sending a request that must fail.
    features['Blacklisted'] = 0
    api_key = os.environ.get("GOOGLE_SAFE_BROWSING_API_KEY", "")
    if api_key:
        try:
            safe_browsing_api = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
            payload = {
                "client": {"clientId": "phishing-detector", "clientVersion": "1.0"},
                "threatInfo": {
                    "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING"],
                    "platformTypes": ["ANY_PLATFORM"],
                    "threatEntryTypes": ["URL"],
                    "threatEntries": [{"url": url}],
                },
            }
            # A timeout keeps an unresponsive endpoint from hanging the notebook
            response = requests.post(
                safe_browsing_api, json=payload, params={"key": api_key}, timeout=10
            )
            features['Blacklisted'] = 1 if response.json().get("matches") else 0
        except Exception:
            features['Blacklisted'] = 0  # Assume safe if API call fails

    # Convert extracted features to DataFrame
    return pd.DataFrame([features])


def _domain_age_days(creation_date):
    """Return whole days since ``creation_date``, or 0 when it is unusable.

    Accepts a single date, a list of dates (the earliest valid one is used),
    a date string, or None. Values are normalised to UTC so timezone-aware and
    naive dates can both be subtracted from the current time.
    """
    created = pd.to_datetime(creation_date, utc=True, errors="coerce")
    if isinstance(created, pd.DatetimeIndex):
        created = created.min()  # earliest date; NaT if none is valid
    if created is None or pd.isna(created):
        return 0
    return (pd.Timestamp.now(tz="UTC") - created).days



In [11]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [12]:
import pandas as pd
import re
from urllib.parse import urlparse
from Levenshtein import distance as levenshtein_distance  # Install using pip install python-Levenshtein

# Allow-list: domains accepted as legitimate on an exact match
TRUSTED_DOMAINS = {
    "facebook.com",
    "google.com",
    "amazon.com",
    "microsoft.com",
    "wikipedia.org",
    "github.com",
    "apple.com",
    "netflix.com",
    "paypal.com",
    "bankofamerica.com",
    "domain.me",  # ✅ Added domain.me to prevent false positives
}

# Deny-list: domains known to be compromised
COMPROMISED_DOMAINS = {"rivercitybanking.com"}

# Largest Levenshtein distance to a trusted domain that is still treated as a
# typo-squat (a smaller value flags fewer look-alike domains)
TYPO_SQUATTING_THRESHOLD = 2

# Function to extract domain from any URL type
def extract_domain(url):
    """Return the lower-cased host of ``url`` without a leading ``www.``.

    Args:
        url: A URL string; the scheme may be omitted.

    Returns:
        The host name with any ``user@`` prefix, ``:port`` suffix and leading
        ``www.`` removed, or an empty string when no host can be parsed.
    """
    # Strip first so surrounding whitespace cannot defeat the scheme check
    url = url.strip()

    # Ensure scheme is included to properly parse URLs (case-insensitive check)
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url  # Add default scheme for parsing

    # hostname is already lower-cased and excludes credentials and port, so
    # "http://google.com@evil.com" resolves to "evil.com", not the decoy.
    domain = (urlparse(url).hostname or "").strip()

    # Remove only a leading "www."; replacing it everywhere would corrupt
    # hosts that merely contain that substring (e.g. "awww.example.com").
    if domain.startswith("www."):
        domain = domain[len("www."):]

    return domain

# Function to check if a URL is whitelisted
def is_whitelisted(url):
    """Return True when the domain extracted from ``url`` is exactly one of TRUSTED_DOMAINS."""
    return extract_domain(url) in TRUSTED_DOMAINS

# Function to check if a domain is compromised
def is_compromised(url):
    """Return True when the domain extracted from ``url`` is listed in COMPROMISED_DOMAINS."""
    return extract_domain(url) in COMPROMISED_DOMAINS

# Function to detect typosquatting (only for domains similar to trusted ones)
def is_typosquatting(url):
    """Return True when the URL's domain is a near-miss of a trusted domain.

    A domain counts as a near-miss when its Levenshtein distance to any entry
    of TRUSTED_DOMAINS is at most TYPO_SQUATTING_THRESHOLD.

    Args:
        url: A URL string; the scheme may be omitted.

    Returns:
        bool: True for a suspected typo-squat, False otherwise.
    """
    domain = extract_domain(url)

    # An exact trusted domain is the genuine site, not a look-alike of itself
    # (its distance of 0 would otherwise satisfy the threshold).
    if domain in TRUSTED_DOMAINS:
        return False

    # Ignore domains whose first label is very short to reduce false positives
    if len(domain.split(".")[0]) <= 3:
        return False

    # Only check typo similarity against known trusted domains
    return any(
        levenshtein_distance(domain, trusted_domain) <= TYPO_SQUATTING_THRESHOLD
        for trusted_domain in TRUSTED_DOMAINS
    )

# Function to Predict URL Legitimacy
def predict_url(url):
    """Classify ``url`` as "Phishing" or "Legitimate".

    Checks run in this order and the first match decides:
      1. domain in COMPROMISED_DOMAINS  -> "Phishing"
      2. domain in TRUSTED_DOMAINS      -> "Legitimate"
      3. typo-squat of a trusted domain -> "Phishing"
      4. otherwise the trained model decides.

    Args:
        url: A URL string; the scheme may be omitted.

    Returns:
        str: "Phishing" or "Legitimate".
    """
    # ✅ Block if the domain is compromised
    if is_compromised(url):
        return "Phishing"

    # ✅ Allow if the domain is whitelisted
    if is_whitelisted(url):
        return "Legitimate"

    # ✅ Detect typo-squatted phishing attempts (e.g., `go0gle.com`)
    if is_typosquatting(url):
        return "Phishing"

    # ✅ If not explicitly trusted or flagged, fall back to the ML model
    extracted_features = extract_features(url)

    # Align with the training columns in one step: training columns missing
    # here are filled with 0, extra columns are dropped, and the order follows
    # X.columns.
    # NOTE(review): a 0 fill for a label-encoded training column stands for
    # whichever category was encoded as 0 — confirm this is acceptable.
    extracted_features = extracted_features.reindex(columns=X.columns, fill_value=0)

    # Convert all object columns to numeric; unparseable values become 0
    extracted_features = extracted_features.apply(pd.to_numeric, errors='coerce').fillna(0)

    # Confidence score for class index 1, which this code treats as phishing
    phishing_confidence = model.predict_proba(extracted_features)[0][1]

    # ✅ Override: If phishing confidence is below 50%, classify as "Legitimate"
    if phishing_confidence < 0.5:
        return "Legitimate"

    # Only ask for the hard label when the override did not already decide
    prediction = model.predict(extracted_features)[0]
    return "Legitimate" if prediction == 0 else "Phishing"

# Get User Input and Predict
# Strip surrounding whitespace so a stray space typed at the prompt is not
# treated as part of the URL or echoed back in the result message.
user_url = input("Enter a website URL: ").strip()
result = predict_url(user_url)
print(f"The website '{user_url}' is classified as: {result}")


Enter a website URL: https://www.google.com 
The website 'https://www.google.com ' is classified as: Legitimate


In [13]:
# Report the shape of the loaded (and by now label-encoded) DataFrame
print(f"Dataset size: {df.shape}")  # (rows, columns)




Dataset size: (10000, 18)
