# In [None]:
import numpy as np
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report

# Enhanced feature extraction function
# All patterns below are applied to lower-cased text, except the special
# character counter, which runs on the original text.
_URGENCY_RE = re.compile(r'\b(urgent|immediately|now|quick|action required|expire)\b')
# '$' is matched outside the \b(...)\b group: there is no word boundary between
# whitespace (or start of text) and '$', so keeping it inside the group would
# miss amounts such as "get $100".
_MONEY_RE = re.compile(r'\$|\b(win|prize|cash|free|bonus|reward|money|dollar|million|billion)\b')
_LINK_RE = re.compile(r'http|www|\.com|\.net|\.org|click|\.link|bit\.ly|goo\.gl')
_SPECIAL_CHAR_RE = re.compile(r'[!@#$%^&*()_+\-=\[\]{};:\'"\\|,.<>\/?]')
# 'hello' and 'hi' are anchored at word boundaries so that words merely
# containing them ("delhi ", "othello") are not counted, while "hi," and "hi!"
# are. 'dear' is only recognised at the very start of the text.
_GREETING_RE = re.compile(r'^dear|\bhello|\bhi\b|greetings|valued customer')
_THREAT_RE = re.compile(r'\b(suspend|close|terminate|verify|confirm|account|security|alert)\b')


def extract_features(texts):
    """Build a table of handcrafted indicators for a collection of messages.

    Args:
        texts: Iterable of strings (list, pandas Series, generator, ...).

    Returns:
        pandas.DataFrame with one row per input text, in input order, and the
        columns ``length`` (characters), ``word_count`` (whitespace-separated
        tokens), ``has_urgency``, ``has_money``, ``has_link``,
        ``special_char_count``, ``has_greeting`` and ``has_threat``. The
        ``has_*`` columns hold 1 when the corresponding pattern is found in
        the lower-cased text and 0 otherwise.
    """
    # Materialise once: the input is traversed several times, which would
    # silently exhaust a generator after the first column.
    originals = list(texts)
    lowered = [text.lower() for text in originals]

    def flag(pattern):
        # 1/0 indicator per message for a compiled pattern.
        return [1 if pattern.search(text) else 0 for text in lowered]

    return pd.DataFrame({
        'length': [len(text) for text in originals],
        'word_count': [len(text.split()) for text in originals],
        'has_urgency': flag(_URGENCY_RE),
        'has_money': flag(_MONEY_RE),
        'has_link': flag(_LINK_RE),
        'special_char_count': [len(_SPECIAL_CHAR_RE.findall(text)) for text in originals],
        'has_greeting': flag(_GREETING_RE),
        'has_threat': flag(_THREAT_RE),
    })

# Custom transformer wrapper
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that exposes ``extract_features`` to a pipeline."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, texts):
        """Return the handcrafted feature table computed from ``texts``."""
        return extract_features(texts)

# Read the raw CSV and clean it in one chain:
#   1. rename the v1/v2 columns to label/text,
#   2. discard rows missing either field,
#   3. encode the label as ham -> 0, spam -> 1 (any other value becomes NaN),
#   4. discard rows whose label could not be encoded,
#   5. store the label as an integer column.
sms = (
    pd.read_csv('dataDeployment/PhishingData.csv', encoding='latin-1')
    .rename(columns={"v1": "label", "v2": "text"})
    .dropna(subset=['text', 'label'])
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
    .dropna(subset=['label'])
    .astype({'label': int})
)


# Model definition: TF-IDF over 1- to 3-grams (English stop words removed,
# vocabulary capped at 5000 terms) combined with the handcrafted indicators,
# feeding a class-weighted random forest.
text_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
    stop_words='english',
)
combined_features = FeatureUnion([
    ('text', text_vectorizer),
    ('manual', FeatureExtractor()),
])
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
pipeline = Pipeline([
    ('features', combined_features),
    ('classifier', forest),
])

# Hold out 20% of the rows for testing, stratified on the label, then fit the
# pipeline on the remaining rows.
labels = sms['label']
X_train, X_test, Y_train, Y_test = train_test_split(
    sms['text'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
pipeline.fit(X_train, Y_train)

# Persist the fitted pipeline to disk.
# NOTE(review): the pickle refers to FeatureExtractor by its import path, so
# whatever loads this file presumably needs that class available under the
# same module name — confirm against the consumer.
with open('phishingModel.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Score the held-out rows and print the per-class metrics.
predY = pipeline.predict(X_test)
print("\nClassification Report:")
report = classification_report(Y_test, predY)
print(report)
