In [3]:
import pandas as pd
import numpy as np
import re
import string
import joblib
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score

nltk.download('stopwords')
from nltk.corpus import stopwords


df = pd.read_csv("SMSSpamCollection", sep='\t', names=['label', 'text'])
df['label'] = df['label'].map({'ham': 0, 'spam': 1})


def extract_features(text):
    text = str(text)
    features = {
        'has_urgent_words': int(bool(re.search(r"immediate|urgent|action required|24 hours", text, re.IGNORECASE))),
        'has_links': int("http" in text or "www" in text or "bit.ly" in text),
        'has_attachment_terms': int(bool(re.search(r"\.zip|\.pdf|attachment", text, re.IGNORECASE))),
        'phishing_keywords': int(bool(re.search(r"invoice|bank|login|payment|verify|credentials", text, re.IGNORECASE))),
        'capital_ratio': sum(1 for c in text if c.isupper()) / (len(text) + 1),
        'has_html_tags': int(bool(re.search(r"<.*?>", text))),
        'excessive_punctuation': int(text.count('!') > 1 or text.count('.') > 3),
        'text_len': len(text),
    }
    return pd.Series(features)

feature_df = df['text'].apply(extract_features)


tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['text'])

from scipy.sparse import hstack
X = hstack([tfidf_matrix, feature_df])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

joblib.dump(model, "improved_spam_model.pkl")
joblib.dump(tfidf, "improved_vectorizer.pkl")
joblib.dump(feature_df.columns.tolist(), "engineered_feature_names.pkl")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91784\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


F1 Score: 0.9090909090909092
ROC AUC: 0.9892867564300304
[[959   7]
 [ 19 130]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.95      0.87      0.91       149

    accuracy                           0.98      1115
   macro avg       0.96      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



['engineered_feature_names.pkl']