In [1]:
import pandas as pd
import pickle
import re
from collections import Counter

In [2]:
# Load the trained model from 'model.pkl' (path is relative to the kernel's
# working directory).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a model.pkl that comes from a trusted source.
# NOTE(review): the unpickled object is assumed to expose .predict(), which a
# later cell calls; its concrete type is not visible here.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [3]:
# Define the 57 feature columns: 48 'word_freq_*' names, 6 'char_freq_*' names
# and 3 capital-run statistics, in that order.
columns = [
    *(f'word_freq_{token}' for token in (
        'make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
        'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses',
        'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000',
        'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
        'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
        'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
        'conference',
    )),
    *(f'char_freq_{symbol}' for symbol in ';([!$#'),
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
]

In [4]:
# Words and chars to extract. The order matters: it determines the position
# of each frequency in the feature vector.
target_words = """
    make address all 3d our over remove internet order mail receive will
    people report addresses free business email you credit your font 000
    money hp hpl george 650 lab labs telnet 857 data 415 85 technology
    1999 parts pm direct cs meeting original project re edu table conference
""".split()
target_chars = list(';([!$#')

In [5]:
# Function to extract features from email text
def extract_features(email_text, vocabulary=None, tracked_chars=None):
    """Convert raw email text into a flat list of numeric features.

    The result is laid out as: one word-frequency percentage per entry of
    ``vocabulary``, one character-frequency percentage per entry of
    ``tracked_chars``, then the average, longest and total length of the
    runs of consecutive uppercase characters.

    Args:
        email_text: The email body as a string, in its ORIGINAL casing.
        vocabulary: Words whose frequencies are reported. Defaults to the
            module-level ``target_words``.
        tracked_chars: Characters whose frequencies are reported. Defaults
            to the module-level ``target_chars``.

    Returns:
        list: ``len(vocabulary) + len(tracked_chars) + 3`` numbers. The three
        capital-run values are 0 when the text has no uppercase characters.
    """
    if vocabulary is None:
        vocabulary = target_words
    if tracked_chars is None:
        tracked_chars = target_chars

    # Word matching is case-insensitive, so tokenise a lowercased COPY and
    # leave email_text untouched for the capital-run scan below.
    tokens = re.findall(r'\w+', email_text.lower())
    total_words = len(tokens) or 1  # avoid division by zero on empty text
    word_counts = Counter(tokens)

    # Word frequencies (percentage of all tokens)
    word_features = [(word_counts[word] / total_words) * 100 for word in vocabulary]

    # Character frequencies (percentage of all characters)
    total_chars = len(email_text) or 1
    char_features = [(email_text.count(char) / total_chars) * 100 for char in tracked_chars]

    # Capital run features. The scan must see the original casing: the
    # previous version lowercased the text and then uppercased all of it, so
    # every letter counted as a capital and the runs were plain letter runs.
    capital_runs = []
    current_run = 0
    for char in email_text:
        if char.isupper():
            current_run += 1
        elif current_run > 0:
            capital_runs.append(current_run)
            current_run = 0
    if current_run > 0:
        capital_runs.append(current_run)

    avg_capital_run = sum(capital_runs) / len(capital_runs) if capital_runs else 0
    longest_capital_run = max(capital_runs) if capital_runs else 0
    total_capital_run = sum(capital_runs) if capital_runs else 0

    # Combine features
    features = word_features + char_features + [avg_capital_run, longest_capital_run, total_capital_run]
    return features

In [None]:
# Sample email used as a smoke test for the feature extraction and the model.
sample_email = "Win a FREE iPhone NOW! Click here!!! $500 prize awaits YOU."
# Flat list of numeric features for the sample text.
features = extract_features(sample_email)
# Wrap the vector as a single-row DataFrame labelled with `columns`; pandas
# raises if the number of features does not match the number of column names.
# NOTE(review): presumably the model was fitted on a frame with these same
# column names and order -- confirm against the training notebook.
sample_df = pd.DataFrame([features], columns=columns)

In [7]:

# Predict: run the loaded model on the single-row frame and report the label.
prediction = model.predict(sample_df)
# A predicted class of 1 is reported as "Spam"; anything else as "Ham".
predicted_label = "Spam" if prediction[0] == 1 else "Ham"
print("Email:", sample_email)
print("Prediction:", predicted_label)

Email: Win a FREE iPhone NOW! Click here!!! $500 prize awaits YOU.
Prediction: Spam
