In [17]:
import json
import os
import re
import sys

import joblib
import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import hstack

# English stop words from NLTK, held in a set so the membership test in
# clean_text is a hash lookup rather than a list scan.
stop_words = {*stopwords.words('english')}

# Text Cleaning Function
def clean_text(text):
    """Normalize raw text before it is handed to the TF-IDF vectorizers.

    Non-string input is converted with ``str()``. Every non-word character
    (regex ``\\W``) is replaced by a space, the result is split on
    whitespace, and tokens found in the module-level ``stop_words`` set are
    dropped. The comparison is case-sensitive: tokens are not lowercased
    here.

    Returns the surviving tokens joined by single spaces.
    """
    raw = text if isinstance(text, str) else str(text)
    spaced = re.sub(r'\W', ' ', raw)
    kept_tokens = [token for token in spaced.split() if token not in stop_words]
    return ' '.join(kept_tokens)

# Load the bundled pipeline.
# The location can be overridden with the PHISHING_PIPELINE_PATH environment
# variable so the script is not tied to one machine; the original path is
# kept as the default for backward compatibility.
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load pipeline files that come from a trusted source.
PIPELINE_PATH = os.environ.get(
    "PHISHING_PIPELINE_PATH",
    r"C:\Users\may\Desktop\phishing_pipeline.joblib",
)
pipeline = joblib.load(PIPELINE_PATH)
ensemble_model = pipeline['ensemble_model']
tfidf_subject = pipeline['tfidf_subject']
tfidf_body = pipeline['tfidf_body']

# Access fitted base models from the ensemble (mapping of name -> estimator)
models = ensemble_model.named_estimators_

# Helper: rank the features that are present in one email for one model
def _top_contributions(weights, values, feature_names, active, top_n=5):
    """Return the ``top_n`` (feature, score) pairs with the largest |score|.

    ``score`` is ``weights[i] * values[i]`` for every index ``i`` in
    ``active``. Indices beyond the shorter of ``weights`` and
    ``feature_names`` are ignored. ``active`` must be in ascending order so
    that ties are broken by feature position.
    """
    limit = min(len(feature_names), len(weights))
    contributions = {}
    for i in active[active < limit]:
        contributions[feature_names[i]] = weights[i] * values[i]
    return sorted(contributions.items(), key=lambda item: abs(item[1]), reverse=True)[:top_n]


# Function to process input and return predictions with top 5 words from all models
def predict_phishing(id, subject, body):
    """Classify one email and explain the decision.

    Parameters:
        id: identifier echoed back unchanged in the result (the name shadows
            the builtin but is kept so existing keyword callers still work).
        subject: raw subject line; cleaned with ``clean_text``.
        body: raw message body; cleaned with ``clean_text``.

    Returns:
        dict with the inputs, the ensemble label and probabilities, each base
        model's label and probabilities (keys prefixed with the lower-cased
        estimator name), the "winner" model, and a
        ``top_5_words_from_<model>`` mapping per base model.

    The winner is the base model that agrees with the ensemble label and
    assigns the highest probability to that label. If no base model agrees,
    the most confident model for that label is used instead.

    Probabilities are read positionally (index 0 / index 1), so the models
    are assumed to be binary with classes ordered [0, 1].
    """
    # Clean input
    cleaned_subject = clean_text(subject)
    cleaned_body = clean_text(body)

    # Vectorize using loaded TF-IDF vectorizers; subject columns come first
    X_subject = tfidf_subject.transform([cleaned_subject])
    X_body = tfidf_body.transform([cleaned_body])
    X_combined = hstack((X_subject, X_body))

    # Ensemble prediction
    prediction = ensemble_model.predict(X_combined)[0]
    ensemble_probs = ensemble_model.predict_proba(X_combined)[0]
    ensemble_prob_class0 = ensemble_probs[0]
    ensemble_prob_class1 = ensemble_probs[1]

    # Individual model predictions and probabilities
    model_keys = [name.lower() for name in models]
    model_scores = {}
    for key, model in zip(model_keys, models.values()):
        probs = model.predict_proba(X_combined)[0]
        model_scores[f"{key}_pred"] = model.predict(X_combined)[0]
        model_scores[f"{key}_prob_class0"] = probs[0]
        model_scores[f"{key}_prob_class1"] = probs[1]

    # Determine winner: rank by the probability of the label the ensemble
    # actually predicted. Ranking by class 1 regardless of the label would
    # pick the least confident model whenever the label is 0.
    prob_suffix = "prob_class1" if prediction == 1 else "prob_class0"
    agreeing = [key for key in model_keys if model_scores[f"{key}_pred"] == prediction]
    candidates = agreeing or model_keys
    winner = max(candidates, key=lambda key: model_scores[f"{key}_{prob_suffix}"])
    winner_probability = model_scores[f"{winner}_{prob_suffix}"]

    # Get feature names, in the same column order as X_combined
    subject_features = tfidf_subject.get_feature_names_out()
    body_features = tfidf_body.get_feature_names_out()
    all_features = np.concatenate([subject_features, body_features])

    # Only features present in this email can contribute, so collect their
    # indices once instead of scanning the whole vocabulary per model.
    X_combined_dense = X_combined.toarray()[0]
    active = np.flatnonzero(X_combined_dense > 0)

    # NOTE(review): a word that exists in both the subject and the body
    # vocabulary maps to the same dict key, so the later (body) score
    # replaces the subject score. Left as-is to keep the output keys stable.
    all_model_contributions = {}
    for key, model in zip(model_keys, models.values()):
        if hasattr(model, 'feature_importances_'):  # tree-based models
            weights = model.feature_importances_
        elif hasattr(model, 'coef_'):  # linear models
            weights = model.coef_[0] if len(model.coef_.shape) > 1 else model.coef_
        elif hasattr(model, 'feature_log_prob_'):  # Naive Bayes
            weights = model.feature_log_prob_[1]  # Phishing class
        else:
            weights = None  # no per-feature weights exposed -> empty mapping

        top_features = (
            _top_contributions(weights, X_combined_dense, all_features, active)
            if weights is not None else []
        )
        all_model_contributions[f"top_5_words_from_{key}"] = {
            feature: float(score) for feature, score in top_features
        }

    # Build result dictionary
    result = {
        "id": id,
        "subject": subject,
        "body": body,
        "prediction": int(prediction),
        "winner_model": winner,
        "winner_probability": float(winner_probability),
        "ensemble_predicted_label": int(prediction),
        "ensemble_prob_class0": float(ensemble_prob_class0),
        "ensemble_prob_class1": float(ensemble_prob_class1),
        **{f"{key}_pred": int(model_scores[f"{key}_pred"]) for key in model_keys},
        **{f"{key}_prob_class0": float(model_scores[f"{key}_prob_class0"])
           for key in model_keys},
        **{f"{key}_prob_class1": float(model_scores[f"{key}_prob_class1"])
           for key in model_keys},
        **all_model_contributions,  # Add top 5 words from all models
    }

    return result

def main():
    """Run one hand-written, phishing-style email through the predictor.

    Prints a heading followed by the prediction result as indented JSON.
    """
    # Test case 1: A potentially phishing email
    sample_id = "001"
    sample_subject = "Urgent: Verify Your Account Now!"
    sample_body = (
        "Dear user, your account will be suspended unless you click this "
        "link to verify: http://fake-login.com. Act now!"
    )

    outcome = predict_phishing(sample_id, sample_subject, sample_body)
    rendered = json.dumps(outcome, indent=4)
    print("Test Case 1 (Phishing Example):")
    print(rendered)


if __name__ == "__main__":
    main()

Test Case 1 (Phishing Example):
{
    "id": "001",
    "subject": "Urgent: Verify Your Account Now!",
    "body": "Dear user, your account will be suspended unless you click this link to verify: http://fake-login.com. Act now!",
    "prediction": 1,
    "winner_model": "xgb",
    "winner_probability": 0.9993987083435059,
    "ensemble_predicted_label": 1,
    "ensemble_prob_class0": 0.0622091823505867,
    "ensemble_prob_class1": 0.9377908176494133,
    "nb_pred": 1,
    "rf_pred": 1,
    "xgb_pred": 1,
    "knn_pred": 1,
    "logreg_pred": 1,
    "nb_prob_class0": 0.0023386330892434855,
    "rf_prob_class0": 0.14029850746268657,
    "xgb_prob_class0": 0.0006012916564941406,
    "knn_prob_class0": 0.16666666666666666,
    "logreg_prob_class0": 0.001140812877842623,
    "nb_prob_class1": 0.9976613669107564,
    "rf_prob_class1": 0.8597014925373134,
    "xgb_prob_class1": 0.9993987083435059,
    "knn_prob_class1": 0.8333333333333334,
    "logreg_prob_class1": 0.9988591871221574,
    "top