In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.sparse import hstack, csr_matrix

In [4]:
# Read the raw URL dump from JSON; the cell's output is its (rows, columns) shape.
# Note: `df` is reassigned by the CSV load in a later cell.
df = pd.read_json(
    r"E:\Phising_detection\dataset\urls.json"
)
df.shape

(835697, 2)

In [26]:
# Read the PhiUSIIL phishing-URL table, then discard rows with missing
# values and exact duplicate rows (original row labels are kept as-is).
df = (
    pd.read_csv(r"E:\Phising_detection\dataset\PhiUSIIL_Phishing_URL_Dataset.csv")
    .dropna()
    .drop_duplicates()
)
# Preview the leading rows and list every column name
print(df.head())
print(df.columns)

     FILENAME                                 URL  URLLength  \
0  521848.txt    https://www.southbankmosaics.com         31   
1   31372.txt            https://www.uni-mainz.de         23   
2  597387.txt      https://www.voicefmradio.co.uk         29   
3  554095.txt         https://www.sfnmjournal.com         26   
4  151578.txt  https://www.rewildingargentina.org         33   

                       Domain  DomainLength  IsDomainIP  TLD  \
0    www.southbankmosaics.com            24           0  com   
1            www.uni-mainz.de            16           0   de   
2      www.voicefmradio.co.uk            22           0   uk   
3         www.sfnmjournal.com            19           0  com   
4  www.rewildingargentina.org            26           0  org   

   URLSimilarityIndex  CharContinuationRate  TLDLegitimateProb  ...  Pay  \
0               100.0              1.000000           0.522907  ...    0   
1               100.0              0.666667           0.032650  ...    0   
2 

In [27]:
# Inspect the target column (per the note in the final cell: 1 = legitimate, 0 = phishing).
df['label']

0         1
1         1
2         1
3         1
4         1
         ..
235790    1
235791    1
235792    1
235793    0
235794    1
Name: label, Length: 235795, dtype: int64

In [12]:
# Show the frame's (rows, columns) dimensions.
df.shape

(235795, 56)

In [5]:
# Numeric dataset columns fed to the model alongside the TF-IDF features.
# The spellings ("Degits", "Spacial") are the dataset's own column names and
# must not be corrected here. Order matters: the scaler is fitted on it.
selected_features = [
    # lengths and host structure
    'URLLength',
    'DomainLength',
    'TLDLength',
    'NoOfSubDomain',
    'IsDomainIP',
    # character composition
    'NoOfLettersInURL',
    'NoOfDegitsInURL',
    'LetterRatioInURL',
    'DegitRatioInURL',
    # query-string punctuation
    'NoOfEqualsInURL',
    'NoOfQMarkInURL',
    'NoOfAmpersandInURL',
    'SpacialCharRatioInURL',
    # keyword flags
    'Bank',
    'Pay',
    'Crypto',
]

In [None]:
# Keep every row, but only the hand-picked numeric columns
X_numeric = df.loc[:, selected_features]

In [7]:
# Display the selected numeric feature frame as the cell output.
X_numeric

Unnamed: 0,URLLength,DomainLength,TLDLength,NoOfSubDomain,IsDomainIP,NoOfLettersInURL,NoOfDegitsInURL,LetterRatioInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,SpacialCharRatioInURL,Bank,Pay,Crypto
0,31,24,3,1,0,18,0,0.581,0.000,0,0,0,0.032,1,0,0
1,23,16,2,1,0,9,0,0.391,0.000,0,0,0,0.087,0,0,0
2,29,22,2,2,0,15,0,0.517,0.000,0,0,0,0.069,0,0,0
3,26,19,3,1,0,13,0,0.500,0.000,0,0,0,0.038,0,1,1
4,33,26,3,1,0,20,0,0.606,0.000,0,0,0,0.030,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,29,22,3,1,0,16,0,0.552,0.000,0,0,0,0.034,0,1,0
235791,28,21,2,2,0,14,0,0.500,0.000,0,0,0,0.071,0,1,0
235792,30,23,2,1,0,17,0,0.567,0.000,0,0,0,0.033,0,0,0
235793,55,47,3,2,0,39,3,0.709,0.055,0,0,0,0.091,0,0,0


In [13]:
# Standardise the numeric columns: learn the per-column statistics first,
# then apply them to the same frame.
scaler = StandardScaler().fit(X_numeric)
X_numeric_scaled = scaler.transform(X_numeric)

In [14]:
# Character n-gram TF-IDF over the raw URL text:
# 3- to 6-character grams, vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 6),
    max_features=3000,
)
X_tfidf = vectorizer.fit_transform(df['URL'].astype(str))

In [24]:
# Re-display the target column before it is used as y (same expression as the earlier label cell).
df['label']

0         1
1         1
2         1
3         1
4         1
         ..
235790    1
235791    1
235792    1
235793    0
235794    1
Name: label, Length: 235795, dtype: int64

In [15]:
# Combine both inputs and build the train/test split WITHOUT leakage.
#
# The scaler and the TF-IDF vectorizer were previously fitted on every row
# before the split, so test rows influenced the scaling statistics and the
# n-gram vocabulary / IDF weights. Here the train/test rows are chosen first,
# both transformers are re-fitted on the training rows only, and only then is
# the full matrix transformed. `scaler` and `vectorizer` keep their
# hyper-parameters; only their fitted state is replaced.
y = df['label']

# Split on row positions (not index labels) so the selection is valid for
# positional slicing of both pandas objects and the sparse matrix.
row_positions = list(range(len(df)))
train_rows, test_rows = train_test_split(
    row_positions, test_size=0.2, stratify=y, random_state=42
)

urls = df['URL'].astype(str)
scaler.fit(X_numeric.iloc[train_rows])
vectorizer.fit(urls.iloc[train_rows])

# Same column layout as before: [TF-IDF | scaled numeric]. CSR allows row slicing.
X_combined = hstack([
    vectorizer.transform(urls),
    csr_matrix(scaler.transform(X_numeric)),
]).tocsr()

# Train/test split
X_train, X_test = X_combined[train_rows], X_combined[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

In [16]:
# Fit a 100-tree random forest (fixed seed) on the training split
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     20189
           1       1.00      1.00      1.00     26970

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



In [17]:
import joblib

# Persist the fitted Random Forest classifier
joblib.dump(value=clf, filename="phishing_url_model.pkl")

# Persist the fitted TF-IDF vectorizer
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")

# Persist the fitted numeric-feature scaler
# (last expression of the cell, so its returned file list is the cell output)
joblib.dump(value=scaler, filename="feature_scaler.pkl")


['feature_scaler.pkl']

In [18]:
import tldextract
import ipaddress

def extract_safe_url_features(url):
    """Compute the numeric URL features expected by the saved scaler/model.

    The keys are returned in the same order as the training feature list
    (URLLength ... Crypto); callers that build a vector from ``.values()``
    rely on that order.

    Args:
        url: URL string to analyse. An empty string is accepted and yields
            zero counts and zero ratios.

    Returns:
        dict mapping feature name to an int count/flag or a float ratio
        rounded to 3 decimals.

    NOTE(review): in the training rows shown in this notebook, URLLength and
    NoOfLettersInURL do not equal a plain ``len``/letter count of the URL
    column (e.g. URLLength 31 for a 32-character URL), and NoOfSubDomain is 2
    for ``www.voicefmradio.co.uk``. Those definitions are not visible here,
    so these features are left as originally computed -- confirm against the
    dataset documentation.
    """
    ext = tldextract.extract(url)
    subdomain = ext.subdomain

    # Full host = sub-domain + registered domain + suffix, skipping empty
    # parts. The training data's DomainLength counts the whole host
    # (e.g. 24 for "www.southbankmosaics.com"), and an IP host has an empty
    # suffix, so blindly appending '.' + suffix would leave a trailing dot.
    host = '.'.join(part for part in (subdomain, ext.domain, ext.suffix) if part)

    # The training data's TLD is the last label only ("uk", length 2), while
    # tldextract's suffix may span several labels ("co.uk").
    tld = ext.suffix.rsplit('.', 1)[-1]

    try:
        # Brackets are stripped so a bracketed IPv6 host is recognised too.
        ipaddress.ip_address(host.strip('[]'))
        is_domain_ip = 1
    except ValueError:
        is_domain_ip = 0

    letters = sum(c.isalpha() for c in url)
    digits = sum(c.isdigit() for c in url)
    specials = sum(not c.isalnum() for c in url)

    # Avoid ZeroDivisionError on an empty URL; all counts are 0 in that case,
    # so every ratio comes out as 0.0.
    denom = max(len(url), 1)
    lowered = url.lower()

    return {
        "URLLength": len(url),
        "DomainLength": len(host),
        "TLDLength": len(tld),
        "NoOfSubDomain": subdomain.count('.') + (1 if subdomain else 0),
        "IsDomainIP": is_domain_ip,
        "NoOfLettersInURL": letters,
        "NoOfDegitsInURL": digits,
        "LetterRatioInURL": round(letters / denom, 3),
        "DegitRatioInURL": round(digits / denom, 3),
        "NoOfEqualsInURL": url.count('='),
        "NoOfQMarkInURL": url.count('?'),
        "NoOfAmpersandInURL": url.count('&'),
        "SpacialCharRatioInURL": round(specials / denom, 3),
        "Bank": int("bank" in lowered),
        "Pay": int("pay" in lowered),
        "Crypto": int("crypto" in lowered)
    }


In [20]:
import numpy as np
from scipy.sparse import hstack

def predict_url_phishing(url):
    """Score a single URL with the saved model, vectorizer and scaler.

    The three artefacts are loaded from the current working directory on
    every call, the URL is turned into the same [TF-IDF | scaled numeric]
    column layout used for training, and the classifier is evaluated once.

    Args:
        url: URL string to classify.

    Returns:
        tuple ``(prediction, probability)`` where ``prediction`` is the
        predicted label and ``probability`` is the model's probability of
        label 1, rounded to 4 decimals. Per the note in the demo cell, label
        1 is legitimate and 0 is phishing, so for a phishing verdict this
        value is NOT the confidence in that verdict (that is
        ``1 - probability``).

    Raises:
        KeyError: if the feature extractor does not produce a column the
            scaler was fitted on.
        ValueError: if the loaded classifier has no class labelled 1.
    """
    # SECURITY: joblib.load unpickles its input, which can execute arbitrary
    # code. Only load artefact files written by this notebook.
    clf = joblib.load("phishing_url_model.pkl")
    vectorizer = joblib.load("tfidf_vectorizer.pkl")
    scaler = joblib.load("feature_scaler.pkl")

    # Extract features
    features = extract_safe_url_features(url)
    fitted_columns = getattr(scaler, "feature_names_in_", None)
    if fitted_columns is None:
        # Scaler was fitted on an unnamed array: fall back to the
        # extractor's own key order.
        numeric = np.array([list(features.values())])
    else:
        # Order (and name) the columns exactly as the scaler saw them at fit
        # time instead of trusting dict insertion order.
        column_order = list(fitted_columns)
        numeric = pd.DataFrame(
            [[features[name] for name in column_order]], columns=column_order
        )
    numeric_scaled = scaler.transform(numeric)

    # TF-IDF transform
    tfidf_vector = vectorizer.transform([url])

    # Combine and predict (sparse on both sides, matching the training cell)
    X_combined = hstack([tfidf_vector, csr_matrix(numeric_scaled)]).tocsr()
    class_probabilities = clf.predict_proba(X_combined)[0]

    # Take the arg-max label from the single predict_proba pass rather than
    # evaluating the forest a second time with predict().
    prediction = clf.classes_[np.argmax(class_probabilities)]

    # Look up label 1's column instead of assuming it sits at index 1.
    label_one_index = list(clf.classes_).index(1)
    probability = class_probabilities[label_one_index]

    return prediction, round(probability, 4)


In [37]:
url = "https://www.youtube.com"
pred, prob = predict_url_phishing(url)
print(pred,prob)
# Label 1 corresponds to a legitimate URL, label 0 to a phishing URL.
# `prob` is always P(label 1), so the confidence in a phishing verdict is
# its complement; printing `prob` directly would under-report it.
confidence = prob if pred == 1 else round(1 - prob, 4)
print(f"Prediction: {'Phishing' if pred==0 else 'Legitimate'} (Confidence: {confidence})")



1 0.7
Prediction: Legitimate (Confidence: 0.7)


