In [1]:
# =================
# 0. INITIAL SETUP
# =================
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

# Global containers to store components between versions
version_components = {
    'vectorizers': {},
    'models': {},
    'features': {},
    'performance': []
}



In [2]:

# ===============================
# 1. DATA LOADING & PREPROCESSING
# ===============================
def load_and_preprocess():
    df = pd.read_csv("CEAS_08.csv")
    df['subject'] = df['subject'].fillna('')
    df['body'] = df['body'].fillna('')

    # Define clean_text INSIDE the function
    def clean_text(text):
        text = str(text).lower()
        text = re.sub(r'\s+', ' ', text)  # Fix extra spaces
        text = re.sub(r'[^\w\s]', '', text)  # Remove special chars
        return text


    df['clean_body'] = df['body'].apply(clean_text)
    df['clean_subject'] = df['subject'].apply(clean_text)

    return df

df = load_and_preprocess()
print("Data loaded and preprocessed!")

Data loaded and preprocessed!


In [3]:
# ======================
# VERSION CONTROL SYSTEM
# ======================
def run_version(version_name, df, use_previous=True):
    """
    Modular version runner
    Parameters:
        version_name: '1.0', '1.1', '1.3', or '2.0'
        use_previous: Whether to reuse components from earlier versions
    """
    if version_name == '1.1':
        return version_1_1(df, use_previous)
    elif version_name == '1.2':
        return version_1_2(df, use_previous)
    elif version_name == '2.0':
        return version_2_0(df, use_previous)
    else:
        raise ValueError("Invalid version name")

def print_performance():
    """Display all recorded results"""
    print("\n=== PERFORMANCE COMPARISON ===")
    for result in version_components['performance']:
        print(f"\nVersion {result['Version']} - {result['Model']}")
        print(f"Accuracy: {result['Accuracy']:.4f}")
        print(f"F1 Score: {result['F1 Score']:.4f}")

In [4]:
# ===========================
# VERSION 1.1 - SENDER DOMAIN
# ===========================
def version_1_1(df, use_previous=True):
    # Feature engineering
    df['sender_domain'] = df['sender'].apply(lambda x: x.split('@')[-1] if pd.notnull(x) else 'missing')
    sender_domain_freq = df['sender_domain'].value_counts().to_dict()
    df['sender_domain_freq'] = df['sender_domain'].map(sender_domain_freq)

    X = df[['clean_body', 'clean_subject', 'sender_domain_freq']]
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorization (reuse if available)
    if use_previous and '1.0' in version_components['vectorizers']:
        vectorizers = version_components['vectorizers']['1.0']
        print("Reusing vectorizers from Version 1.0")
    else:
        vectorizers = {
            'body': TfidfVectorizer().fit(X_train['clean_body']),
            'subject': TfidfVectorizer().fit(X_train['clean_subject'])
        }

    X_train_body = vectorizers['body'].transform(X_train['clean_body'])
    X_test_body = vectorizers['body'].transform(X_test['clean_body'])
    X_train_subject = vectorizers['subject'].transform(X_train['clean_subject'])
    X_test_subject = vectorizers['subject'].transform(X_test['clean_subject'])

    # Combine features
    X_train_combined = hstack([
        X_train_body,
        X_train_subject,
        X_train['sender_domain_freq'].values.reshape(-1, 1)
    ])
    X_test_combined = hstack([
        X_test_body,
        X_test_subject,
        X_test['sender_domain_freq'].values.reshape(-1, 1)
    ])

    # Model training
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_combined, y_train)
    version_components['models']['1.1'] = model

    # Evaluation
    y_pred = model.predict(X_test_combined)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    result = {
        'Version': '1.1',
        'Model': 'Logistic Regression + Sender Domain',
        'Accuracy': accuracy,
        'F1 Score': f1
    }
    version_components['performance'].append(result)

    print(f"\nVersion 1.1 Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f'{matrix} \n')
    return result

In [5]:
# ==========================
# VERSION 1.2 - URL FEATURES
# ==========================
def version_1_2(df, use_previous=True):
    # Feature engineering
    df['url_count'] = df['clean_body'].apply(lambda x: len(re.findall(r'http[s]?://', x)))
    df['url_presence'] = (df['url_count'] > 0).astype(int)

    X = df[['clean_body', 'clean_subject', 'url_count', 'url_presence']]
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorization (reuse if available)
    if use_previous and '1.0' in version_components['vectorizers']:
        vectorizers = version_components['vectorizers']['1.0']
        print("Reusing vectorizers from Version 1.0")
    else:
        vectorizers = {
            'body': TfidfVectorizer().fit(X_train['clean_body']),
            'subject': TfidfVectorizer().fit(X_train['clean_subject'])
        }

    X_train_body = vectorizers['body'].transform(X_train['clean_body'])
    X_test_body = vectorizers['body'].transform(X_test['clean_body'])
    X_train_subject = vectorizers['subject'].transform(X_train['clean_subject'])
    X_test_subject = vectorizers['subject'].transform(X_test['clean_subject'])

    # Combine features
    X_train_combined = hstack([
        X_train_body,
        X_train_subject,
        X_train[['url_count', 'url_presence']].values
    ])
    X_test_combined = hstack([
        X_test_body,
        X_test_subject,
        X_test[['url_count', 'url_presence']].values
    ])

    # Model training
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_combined, y_train)
    version_components['models']['1.2'] = model

    # Evaluation
    y_pred = model.predict(X_test_combined)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    result = {
        'Version': '1.2',
        'Model': 'Logistic Regression + URL Features',
        'Accuracy': accuracy,
        'F1 Score': f1
    }
    version_components['performance'].append(result)

    print(f"\nVersion 1.2 Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f'{matrix} \n')
    return result

In [6]:
# ===========================
# VERSION 2.0 - RANDOM FOREST
# ===========================
def version_2_0(df, use_previous=True):
    # Combine all features from previous versions
    df['sender_domain'] = df['sender'].apply(lambda x: x.split('@')[-1] if pd.notnull(x) else 'missing')
    sender_domain_freq = df['sender_domain'].value_counts().to_dict()
    df['sender_domain_freq'] = df['sender_domain'].map(sender_domain_freq)
    df['url_count'] = df['clean_body'].apply(lambda x: len(re.findall(r'http[s]?://', x)))
    df['url_presence'] = (df['url_count'] > 0).astype(int)

    X = df[['clean_body', 'clean_subject', 'sender_domain_freq', 'url_count', 'url_presence']]
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorization
    if use_previous and '1.0' in version_components['vectorizers']:
        vectorizers = version_components['vectorizers']['1.0']
        print("Reusing vectorizers from Version 1.0")
    else:
        vectorizers = {
            'body': TfidfVectorizer().fit(X_train['clean_body']),
            'subject': TfidfVectorizer().fit(X_train['clean_subject'])
        }

    X_train_body = vectorizers['body'].transform(X_train['clean_body'])
    X_test_body = vectorizers['body'].transform(X_test['clean_body'])
    X_train_subject = vectorizers['subject'].transform(X_train['clean_subject'])
    X_test_subject = vectorizers['subject'].transform(X_test['clean_subject'])

    # Combine features
    X_train_combined = hstack([
        X_train_body,
        X_train_subject,
        X_train[['sender_domain_freq', 'url_count', 'url_presence']].values
    ])
    X_test_combined = hstack([
        X_test_body,
        X_test_subject,
        X_test[['sender_domain_freq', 'url_count', 'url_presence']].values
    ])

    # Model training
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_combined, y_train)
    version_components['models']['2.0'] = model

    # Evaluation
    y_pred = model.predict(X_test_combined)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    result = {
        'Version': '2.0',
        'Model': 'Random Forest (All Features)',
        'Accuracy': accuracy,
        'F1 Score': f1
    }
    version_components['performance'].append(result)

    print(f"\nVersion 2.0 Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f'{matrix} \n')
    return result

In [7]:
# =================
# EXECUTION CONTROL
# =================

run_version('1.1', df)
run_version('1.2', df)
run_version('2.0', df)

print_performance()


Version 1.1 Results:
Accuracy: 0.9941
F1 Score: 0.9947
[[3466   24]
 [  22 4319]] 


Version 1.2 Results:
Accuracy: 0.9945
F1 Score: 0.9950
[[3468   22]
 [  21 4320]] 


Version 2.0 Results:
Accuracy: 0.9920
F1 Score: 0.9927
[[3460   30]
 [  33 4308]] 


=== PERFORMANCE COMPARISON ===

Version 1.1 - Logistic Regression + Sender Domain
Accuracy: 0.9941
F1 Score: 0.9947

Version 1.2 - Logistic Regression + URL Features
Accuracy: 0.9945
F1 Score: 0.9950

Version 2.0 - Random Forest (All Features)
Accuracy: 0.9920
F1 Score: 0.9927


In [12]:
# ======================
# VERSION 1.0 - BASELINE
# ======================

df = pd.read_csv("CEAS_08.csv")


def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "link", text)
    text = re.sub(r'[^\w\s./-]', '', text)
    return text

df["clean_body"] = df["body"].apply(clean_text)



df["email_length"] = df["body"].str.len()

# B. URL Features
df["has_urls"] = (df["urls"] > 0).astype(int)


df["body_cap_ratio"] = df["body"].str.findall(r'[A-Z]').str.len() / df["email_length"]
df["subject_cap_ratio"] = df["subject"].str.findall(r'[A-Z]').str.len() / df["subject"].str.len()


revised_words = ["click", "urgent", "bank", "immediately"]
for word in revised_words:
    df[f"has_{word}"] = df["clean_body"].str.contains(word).astype(int)
df["suspicious_score"] = df[[f"has_{w}" for w in revised_words]].sum(axis=1)


df.drop(columns=["urls", "special_chars"], inplace=True, errors="ignore")

final_features = [
    "email_length",
    "body_cap_ratio",
    "subject_cap_ratio",
    "suspicious_score",
    "has_urls"
]


# Fill any NaNs caused by division or missing data
df["body_cap_ratio"].fillna(0, inplace=True)
df["subject_cap_ratio"].fillna(0, inplace=True)



from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


X = df[final_features]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)

print("\n=== Accuracy ===")
print(accuracy_score(y_test, y_pred))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))


=== Accuracy ===
0.661345932831056

=== Confusion Matrix ===
[[1975 1515]
 [1137 3204]]

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.63      0.57      0.60      3490
           1       0.68      0.74      0.71      4341

    accuracy                           0.66      7831
   macro avg       0.66      0.65      0.65      7831
weighted avg       0.66      0.66      0.66      7831

