In [1]:
import numpy as np
import joblib
import re
from nltk.corpus import stopwords
from scipy.sparse import hstack
import sys
import json

# English stopword list, materialised once at import time as a set so that
# the per-word membership test in clean_text is a hash lookup.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand — confirm for the deployment environment.
stop_words = set(stopwords.words('english'))

# Text Cleaning Function
def clean_text(text):
    """Normalise raw text before it is handed to a TF-IDF vectorizer.

    Non-string input is converted with ``str()``. Every non-word character
    (regex ``\\W``) is replaced by a space, the result is split on
    whitespace, and tokens found in the module-level ``stop_words`` set are
    dropped. Tokens are compared as-is; no case folding is applied here.

    Returns the surviving tokens joined by single spaces (an empty string
    when nothing survives).
    """
    raw = text if isinstance(text, str) else str(text)
    spaced = re.sub(r'\W', ' ', raw)
    kept_tokens = [token for token in spaced.split() if token not in stop_words]
    return ' '.join(kept_tokens)

# Location of the serialized bundle on disk.
PIPELINE_PATH = r"C:\Users\may\Desktop\phishing_pipeline.joblib"

# Load the bundle and pull out the ensemble plus the two TF-IDF vectorizers
# (one for subjects, one for bodies) stored under their respective keys.
pipeline = joblib.load(PIPELINE_PATH)
ensemble_model, tfidf_subject, tfidf_body = (
    pipeline[key] for key in ('ensemble_model', 'tfidf_subject', 'tfidf_body')
)

# Fitted base models of the ensemble, keyed by estimator name
models = ensemble_model.named_estimators_

# Function to process input and return predictions
def predict_phishing(id, subject, body):
    """Classify one email and report which base model backs the verdict best.

    Args:
        id: Caller-supplied identifier, echoed back unchanged in the result.
            (The name shadows the builtin; it is kept so existing keyword
            callers continue to work.)
        subject: Email subject. Passed through ``clean_text``, which
            stringifies non-string values.
        body: Email body. Handled the same way as ``subject``.

    Returns:
        dict with the keys:
            ``id``, ``subject``, ``body`` -- the inputs, unmodified;
            ``prediction`` -- the ensemble's predicted label as ``int``;
            ``winner_model`` -- name of the base model that agrees with the
                ensemble and is most confident in its own prediction;
            ``winner_probability`` -- that model's probability as ``float``.
    """
    # Clean, then vectorize with the loaded TF-IDF vectorizers. Subject
    # features are stacked before body features.
    X_subject = tfidf_subject.transform([clean_text(subject)])
    X_body = tfidf_body.transform([clean_text(body)])
    X_combined = hstack((X_subject, X_body))

    # Only the ensemble's label is needed. Its class probabilities were
    # previously computed but never read, which cost an extra inference call
    # on every request.
    prediction = ensemble_model.predict(X_combined)[0]

    # For each base model: its own predicted label and the probability it
    # assigns to that label (column 1 for label 1, column 0 otherwise).
    model_scores = {}
    for name, model in models.items():
        pred = model.predict(X_combined)[0]
        probs = model.predict_proba(X_combined)[0]
        model_scores[name] = (pred, probs[1] if pred == 1 else probs[0])

    # The winner is the most confident model among those that agree with the
    # ensemble. If no base model agrees, fall back to the most confident
    # model overall instead of silently returning whichever model happens to
    # be listed first.
    agreeing = [
        name for name, (pred, _) in model_scores.items() if pred == prediction
    ]
    candidates = agreeing or list(model_scores)
    winner = max(candidates, key=lambda name: model_scores[name][1])
    winner_probability = model_scores[winner][1]

    return {
        "id": id,  # Pass through the id
        "subject": subject,
        "body": body,
        "prediction": int(prediction),
        "winner_model": winner,
        "winner_probability": float(winner_probability),
    }



Test Result:
{
    "id": "12345",
    "subject": "Urgent: Account Verification Required",
    "body": "Dear user, click this link to verify your account immediately or it will be suspended.",
    "prediction": 1,
    "winner_model": "logreg",
    "winner_probability": 0.9993839913868209
}
