In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.sparse import hstack, csr_matrix
import joblib

In [None]:
# NOTE: pandas has no `read_txt` reader, so the original `pd.read_txt()` call
# raised AttributeError and stopped "Run All" at this cell. The dataset is
# loaded from JSON in the next cell, so nothing needs to be read here.

In [None]:
# Read the URL dataset (JSON) into a DataFrame, then show its (rows, columns) shape.
dataset_path = r"E:\Phising_detection\dataset\urls.json"
df = pd.read_json(dataset_path)
df.shape

In [None]:
# Clean the loaded dataset
# (alternative source, currently disabled:
#  df = pd.read_csv(r"E:\Phising_detection\dataset\PhiUSIIL_Phishing_URL_Dataset.csv"))
# Rows with missing values are removed first, then exact duplicate rows.
df = df.dropna().drop_duplicates()
# Preview the first rows, followed by the column names
print(df.head(), df.columns, sep="\n")

In [None]:
# Display the target column
df["label"]

In [None]:
# Show the DataFrame's current (rows, columns) shape
df.shape

In [None]:
# selected_features = [
#     'URLLength', 'DomainLength', 'TLDLength', 'NoOfSubDomain', 'IsDomainIP',
#     'NoOfLettersInURL', 'NoOfDegitsInURL', 'LetterRatioInURL', 'DegitRatioInURL',
#     'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'SpacialCharRatioInURL',
#     'Bank', 'Pay', 'Crypto'
# ]

In [None]:
# X_numeric = df[selected_features]

In [None]:
# X_numeric

In [None]:
# scaler = StandardScaler()
# X_numeric_scaled = scaler.fit_transform(X_numeric)

In [None]:
# Character-level TF-IDF built from the raw URL strings in df['text']
vectorizer = TfidfVectorizer(
    analyzer='char',      # operate on characters rather than word tokens
    ngram_range=(3, 6),   # use 3- to 6-character n-grams
    max_features=3000,    # keep at most 3000 n-grams in the vocabulary
)
# Cast to str so every row is handed to the vectorizer as text.
url_strings = df['text'].astype(str)
# Learns the vocabulary/IDF weights from every row of df and returns the sparse matrix.
X_tfidf = vectorizer.fit_transform(url_strings)

In [None]:
# Display the target column
df["label"]

In [None]:
# Combine both inputs
# X_combined = hstack([X_tfidf, csr_matrix(X_numeric_scaled)])
y = df['label']

# Train/test split
# Split the raw URL strings rather than the already-vectorized matrix. The
# TF-IDF cell fits `vectorizer` on every row of df, which lets the held-out
# rows influence the vocabulary and IDF weights (data leakage). The same rows
# are selected as before because y, test_size, stratify and random_state are
# unchanged.
urls_train, urls_test, y_train, y_test = train_test_split(
    df['text'].astype(str), y, test_size=0.2, stratify=y, random_state=42
)

# Refit the vectorizer on the training rows only, then apply that fitted
# transform to the held-out rows. This costs one extra fit, but the reported
# metrics now describe genuinely unseen URLs, and the vectorizer that gets
# saved matches the features the model is trained on.
X_train = vectorizer.fit_transform(urls_train)
X_test = vectorizer.transform(urls_test)

In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out split and print the per-class classification report
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:


# Persist the fitted objects with joblib (paths are relative to the working directory)

# Random Forest classifier
joblib.dump(value=clf, filename="url_model.pkl")

# TF-IDF vectorizer
joblib.dump(value=vectorizer, filename="vectorizer.pkl")

# Feature scaler (currently disabled)
# joblib.dump(scaler, "feature_scaler.pkl")


In [None]:
import tldextract
import ipaddress

def extract_safe_url_features(url):
    """Compute simple lexical features from a URL string.

    Args:
        url: The URL as a string. An empty string is accepted and yields
            zero for every count and ratio.

    Returns:
        dict: Feature name -> numeric value, always in the same key order
        (callers may rely on ``list(result.values())``). The misspelled keys
        ("Degit", "Spacial") are intentional: they match the names in the
        commented-out ``selected_features`` list earlier in the notebook.
    """
    ext = tldextract.extract(url)
    # Join only the non-empty parts. Unconditionally concatenating
    # domain + '.' + suffix leaves a trailing dot when there is no public
    # suffix (IP hosts, "localhost"), which made the IP check below fail for
    # every input and inflated DomainLength by one.
    domain = ".".join(part for part in (ext.domain, ext.suffix) if part)
    subdomain = ext.subdomain
    try:
        # Brackets are stripped in case an IPv6 literal is returned as "[...]"
        # (TODO confirm against the installed tldextract version).
        ipaddress.ip_address(domain.strip("[]"))
        is_domain_ip = 1
    except ValueError:
        is_domain_ip = 0

    url_length = len(url)
    lowered = url.lower()  # lower-case once for the keyword flags

    letters = sum(c.isalpha() for c in url)
    digits = sum(c.isdigit() for c in url)
    specials = sum(not c.isalnum() for c in url)

    def ratio(count):
        # Guard against ZeroDivisionError on an empty URL.
        return round(count / url_length, 3) if url_length else 0.0

    return {
        "URLLength": url_length,
        "DomainLength": len(domain),
        "TLDLength": len(ext.suffix),
        # Number of dot-separated labels in the subdomain part (0 when absent).
        "NoOfSubDomain": subdomain.count('.') + (1 if subdomain else 0),
        "IsDomainIP": is_domain_ip,
        "NoOfLettersInURL": letters,
        "NoOfDegitsInURL": digits,
        "LetterRatioInURL": ratio(letters),
        "DegitRatioInURL": ratio(digits),
        "NoOfEqualsInURL": url.count('='),
        "NoOfQMarkInURL": url.count('?'),
        "NoOfAmpersandInURL": url.count('&'),
        "SpacialCharRatioInURL": ratio(specials),
        "Bank": int("bank" in lowered),
        "Pay": int("pay" in lowered),
        "Crypto": int("crypto" in lowered)
    }


In [None]:
import numpy as np
from scipy.sparse import hstack

def predict_url_phishing(url, model_path="url_model.pkl", vectorizer_path="vectorizer.pkl"):
    """Classify a single URL with the saved TF-IDF vectorizer and classifier.

    Args:
        url: The URL to classify. It is converted with ``str()`` before
            vectorizing, mirroring the ``astype(str)`` used at training time.
        model_path: Path of the joblib-saved classifier. Defaults to the file
            name written by the save cell of this notebook.
        vectorizer_path: Path of the joblib-saved TF-IDF vectorizer. Defaults
            to the file name written by the save cell of this notebook.

    Returns:
        tuple: ``(prediction, probability)``. ``prediction`` is the label
        returned by ``clf.predict``. ``probability`` is column index 1 of
        ``clf.predict_proba`` rounded to 4 decimals, i.e. the probability of
        the classifier's second class -- NOT necessarily the probability of
        the predicted class. Per the note in the demo cell, label 1 is
        "legitimate" and label 0 is "phishing" (assumes a binary 0/1 label
        column -- confirm against the dataset).
    """
    # Load saved objects.
    # NOTE: joblib.load unpickles its input; only load files you trust.
    clf = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)

    # The numeric URL features and their scaler are currently disabled, so the
    # classifier receives the TF-IDF vector on its own.

    # TF-IDF transform (a one-element batch)
    tfidf_vector = vectorizer.transform([str(url)])

    # Predict label and class-1 probability for that single row
    prediction = clf.predict(tfidf_vector)[0]
    probability = clf.predict_proba(tfidf_vector)[0][1]

    return prediction, round(probability, 4)


In [None]:
url = "https://chatgpt.com/c/68762bb9-04cc-800d-853f-b119f84ea6f8"
pred, prob = predict_url_phishing(url)
print(pred,prob)

# Label 1 corresponds to a legitimate URL, label 0 to a phishing URL.
# `prob` is always the probability of label 1 (column 1 of predict_proba), so
# for a phishing verdict the confidence in that verdict is 1 - prob. Printing
# `prob` directly would report e.g. "Phishing (Confidence: 0.02)" for a URL the
# model is 98% sure is phishing. (Assumes a binary 0/1 label column.)
is_phishing = pred == 0
confidence = round(1 - prob, 4) if is_phishing else prob
print(f"Prediction: {'Phishing' if is_phishing else 'Legitimate'} (Confidence: {confidence})")



In [None]:
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Reuse saved components
import joblib
import numpy as np

# Load the artifacts this notebook actually writes in its save cell
# ("url_model.pkl" and "vectorizer.pkl"). The previous file names
# ("phishing_url_model.pkl", "tfidf_vectorizer.pkl", "feature_scaler.pkl") are
# never produced here, so a fresh Restart & Run All failed on the first load --
# or, with stale files on disk, explained a different model than the one
# trained above.
# NOTE: joblib.load unpickles its input; only load files you trust.
model = joblib.load("url_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")
# scaler = joblib.load("feature_scaler.pkl")  # not saved while the numeric features are disabled

# Custom transformer for numerical feature extraction
class SafeURLFeatureExtractor(BaseEstimator, TransformerMixin):
    """Turn raw URL strings into the numeric features of ``extract_safe_url_features``.

    Args:
        scaler: Optional fitted scaler exposing ``transform``. When given, the
            feature matrix is passed through it; when ``None`` the unscaled
            features are returned.
    """

    def __init__(self, scaler=None):
        self.scaler = scaler

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, urls):
        """Return one row of numeric features per URL, scaled if a scaler was supplied."""
        features = np.asarray(
            [list(extract_safe_url_features(url).values()) for url in urls],
            dtype=float,
        )
        if self.scaler is None:
            return features
        return self.scaler.transform(features)

# Create a pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer

# The saved classifier is trained on the TF-IDF matrix alone, so the pipeline
# must feed it exactly those columns. Appending the numeric features here
# would give the model more columns than it was fitted on.
pipeline = make_pipeline(vectorizer, model)

# To bring the numeric features back: retrain on the combined matrix, save the
# fitted scaler, load it above, and build the pipeline as
# pipeline = make_pipeline(
#     FeatureUnion([
#         ("tfidf", vectorizer),
#         ("features", SafeURLFeatureExtractor(scaler))
#     ]),
#     model
# )


In [None]:
# class_names must follow the column order of predict_proba, which follows the
# classifier's sorted labels. Per the note in the demo cell, label 0 is phishing
# and label 1 is legitimate, so "Phishing" comes first (assumes a binary 0/1
# label column -- confirm against the dataset). random_state makes the sampled
# perturbations, and therefore the explanation, reproducible.
explainer = LimeTextExplainer(class_names=["Phishing", "Legitimate"], random_state=42)

# Sample URL (false positive case)
sample_url = "http://paypal-update-login-verify.com/?id=123"

# Explain prediction
# By default LIME explains class index 1 ("Legitimate" here): positive weights
# push towards legitimate, negative weights towards phishing.
exp = explainer.explain_instance(sample_url, pipeline.predict_proba, num_features=10)

# Show explanation
print(exp.as_list())