In [None]:
%pip install pandas numpy scikit-learn matplotlib seaborn xgboost joblib

In [None]:
# ======================================================
# 1. IMPORT LIBRARIES
# ======================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

# Notebook-wide plotting defaults: seaborn "whitegrid" theme and a
# 10x6 inch default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

print("‚úÖ Libraries Imported Successfully")


In [None]:
# ======================================================
# 2. LOAD DATA (LOCAL FILES)
# ======================================================
print("\nüìÇ Loading Datasets...")
df_list = []

# --- Load SMS Data ---
# Each split is read inside its own try/except: a missing or malformed
# CSV is reported (with its path) but no longer prevents the remaining
# splits from being loaded.
sms_files = ["sms/train.csv", "sms/test.csv", "sms/valid.csv"]
sms_loaded = 0
for f in sms_files:
    try:
        df_list.append(pd.read_csv(f))
        sms_loaded += 1
    except Exception as e:
        # Best-effort loading: report the failure and move on to the next file.
        print(f"   ‚ö†Ô∏è Error loading SMS ({f}): {e}")

# Make it explicit whether all, some, or none of the splits were read.
if sms_loaded == len(sms_files):
    print("   -> SMS files loaded.")
elif sms_loaded:
    print(f"   -> SMS files partially loaded ({sms_loaded}/{len(sms_files)}).")

# --- Load Email Data ---
try:
    # rename() ignores mapping keys that are not present, so a file
    # without a 'prompt' column passes through unchanged.
    email_df = pd.read_json("phishing_email.jsonl", lines=True).rename(
        columns={'prompt': 'text'}
    )
    email_subset = email_df[['text', 'label']]
    df_list.append(email_subset)
    print("   -> Email file loaded.")
except Exception as err:
    print(f"   ‚ö†Ô∏è Error loading Email: {err}")

# --- Combine & Clean ---
# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when every loader above failed.
if not df_list:
    raise RuntimeError(
        "No datasets were loaded; check the SMS/Email file paths before continuing."
    )
df_final = pd.concat(df_list, ignore_index=True)

# Standardize Labels (Ensure 0 = Safe, 1 = Phishing)
# Some datasets use strings 'ham'/'spam', others use ints 0/1. Once such
# sources are concatenated the column holds BOTH kinds of value, so the
# mapping must also accept 0/1 (and their float/string spellings);
# otherwise the numeric rows would be mapped to NaN and dropped below.
labels = df_final['label']
if not pd.api.types.is_numeric_dtype(labels):
    label_map = {
        'ham': 0, 'spam': 1,
        '0': 0, '1': 1,
        '0.0': 0, '1.0': 1,
    }
    # Values outside the map (including missing ones) become NaN.
    labels = labels.astype(str).str.strip().str.lower().map(label_map)

# Only 'text' and 'label' are used downstream, so drop rows missing one
# of those rather than rows with a NaN in any unrelated column.
df_final = (
    df_final
    .assign(label=labels)
    .dropna(subset=['text', 'label'])
    .astype({'label': int})
)

In [None]:
# ======================================================
# 3. DATA STATISTICS & GRAPHS
# ======================================================
# Row counts per class, plus each class's share of the whole dataset.
is_safe = df_final['label'] == 0
is_phish = df_final['label'] == 1

total = len(df_final)
safe = len(df_final[is_safe])
phish = len(df_final[is_phish])

safe_pct = (safe / total) * 100
phish_pct = (phish / total) * 100

print("\nüìä DATASET STATISTICS")
print("----------------------")
print(f"Total Samples: {total}")
print(f"Safe (0):      {safe} ({safe_pct:.1f}%)")
print(f"Phishing (1):  {phish} ({phish_pct:.1f}%)")

# --- Graph 1: Class Distribution ---
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='label', data=df_final, palette=['#2ecc71', '#e74c3c'])
plt.title('Distribution: Safe vs Phishing', fontsize=15)
plt.xticks([0, 1], ['Safe Messages', 'Phishing Attempts'])
plt.ylabel('Count')

# Add counts on bars.
# The label is anchored at the top-centre of each bar and nudged upward by
# a few *points* (screen units) instead of a fixed data offset, so it stays
# centred and just above the bar whatever the dataset size.
ax.margins(y=0.1)  # extra headroom so the tallest bar's label fits
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f'{int(height)}',
        xy=(p.get_x() + p.get_width() / 2, height),
        xytext=(0, 4),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontweight='bold',
    )
plt.show()

In [None]:
# ======================================================
# 4. TEXT PRE-PROCESSING
# ======================================================
def clean_text(text):
    """Normalize a raw message into the form fed to the TF-IDF vectorizer.

    Steps, in order:
      1. Coerce to ``str`` and lowercase.
      2. Replace URLs (words starting with ``http`` or ``www``) with a
         generic placeholder so the model learns "contains a link" rather
         than individual addresses.
      3. Turn every run of whitespace (newlines, tabs, non-breaking
         spaces, ...) into a single space so words on adjacent lines are
         not glued together by the next step.
      4. Drop every character that is not a-z, 0-9, ``!``, ``?``, a
         currency sign (dollar, pound, euro) or a space.

    Args:
        text: Any object; it is converted with ``str()``.

    Returns:
        str: The cleaned text. Step 4 also removes the underscore from the
        URL placeholder, so a link ends up as the single word ``httptoken``.
    """
    text = str(text).lower()
    # \b stops the pattern from firing in the middle of a word
    # (e.g. "awwww" is not a URL).
    text = re.sub(r'\bhttp\S+', 'http_token', text)
    text = re.sub(r'\bwww\S+', 'http_token', text)
    # Whitespace other than a plain space would be deleted by the
    # character filter below, merging neighbouring words.
    text = re.sub(r'\s+', ' ', text)
    # Currency signs are written as escapes (\u00a3 = pound, \u20ac = euro)
    # so the pattern cannot be corrupted by a file-encoding mix-up.
    text = re.sub(r'[^a-z0-9!?$\u00a3\u20ac ]', '', text)
    return text

print("\nüßπ Cleaning Text Data...")
# Run the cleaner over every raw message and keep the result in a new
# column next to the original text.
raw_messages = df_final['text']
df_final['clean_text'] = raw_messages.apply(clean_text)

In [None]:
# ======================================================
# 5. BUILD VOTING ENSEMBLE MODEL
# ======================================================
print("‚öôÔ∏è Building Voting Ensemble Model...")

# 80/20 split with a fixed seed; stratifying on the label keeps the
# Safe/Phishing ratio the same in both partitions.
messages = df_final['clean_text']
targets = df_final['label']
split_options = dict(test_size=0.2, random_state=42, stratify=targets)

X_train, X_test, y_train, y_test = train_test_split(messages, targets, **split_options)

# Define the 3 Classifiers
# Random forest and logistic regression re-weight classes to offset imbalance.
clf1 = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1, class_weight='balanced')
# `use_label_encoder` is no longer passed: it is deprecated/removed in
# current XGBoost releases, where supplying it only triggers an
# "unused parameter" warning. Labels are already integer 0/1.
clf2 = XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1)
clf3 = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

# Combine into Voting Classifier (Soft Voting = Average Probabilities)
voting_clf = VotingClassifier(
    estimators=[('rf', clf1), ('xgb', clf2), ('lr', clf3)],
    voting='soft'
)

# Create Pipeline with TF-IDF
# The vectorizer keeps the 12,000 highest-ranked terms over 1- to 3-word
# phrases (English stop words removed) and feeds them to the ensemble.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=12000,
    stop_words='english',
)
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('ensemble', voting_clf),
])

print("üöÄ Training Model... (This is robust, give it a moment)")
pipeline.fit(X_train, y_train)
print("‚úÖ Training Complete!")


In [None]:
# ======================================================
# 6. EVALUATION & ACCURACY GRAPHS
# ======================================================
# Score the held-out split: overall accuracy first, then the per-class
# precision / recall / F1 report.
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)

banner = "=" * 60
print("\n" + banner)
print(f"üèÜ FINAL MODEL ACCURACY: {acc:.4f} ({acc*100:.2f}%)")
print(banner)

print("\nDetailed Classification Report:")
report = classification_report(y_test, y_pred, target_names=['Safe', 'Phishing'])
print(report)

# --- Graph 2: Confusion Matrix ---
# Rows are the true classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)

cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    linewidths=1,
    linecolor='black',
    xticklabels=['Predicted Safe', 'Predicted Phishing'],
    yticklabels=['Actual Safe', 'Actual Phishing'],
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix (Where did the model make mistakes?)', fontsize=14)
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# ======================================================
# 7. SAVE MODEL & TEST
# ======================================================
# Persist the fitted pipeline (vectorizer + ensemble) to disk with joblib.
model_filename = "best_phishing_model.pkl"
joblib.dump(value=pipeline, filename=model_filename)
print(f"\nüíæ Model saved successfully to: {model_filename}")

def live_test(text):
    """Classify one raw message with the trained pipeline and print the verdict.

    The message is passed through ``clean_text`` and scored by the
    module-level ``pipeline``. The printed line shows the predicted class,
    the model's probability for that class, and the original text.

    Args:
        text: The raw message to classify.

    Returns:
        None. The result is printed only.
    """
    clean = clean_text(text)
    # Single inference pass: the predicted class is the one with the highest
    # probability, so a separate predict() call would just run the whole
    # ensemble a second time.
    prob = pipeline.predict_proba([clean])[0]
    best = int(prob.argmax())
    # Look the label up through classes_ rather than assuming column order.
    pred = pipeline.classes_[best]
    confidence = prob[best] * 100

    if pred == 1:
        print(f"üî¥ PHISHING ({confidence:.2f}%): '{text}'")
    else:
        print(f"üü¢ SAFE ({confidence:.2f}%): '{text}'")

print("\n--- üß™ LIVE TEST EXAMPLES ---")
# One benign message followed by two typical phishing lures.
sample_messages = (
    "Hey, can we meet for lunch?",
    "URGENT: Your bank account is locked. Update immediately at http://fake-bank.com",
    "You have won a $1000 prize! Click here to claim.",
)
for message in sample_messages:
    live_test(message)