In [11]:
import pandas as pd
import os

def read_spam():
    """
    Reads and extracts information from spam emails.

    Returns:
    list: A list of dictionaries containing email information.
        Each dictionary has the following keys:
        - 'name': The filename.
        - 'content': The email content.
        - 'category': The category (spam).
    """
    category = 'spam'
    directory = './enron1/enron1/spam'
    return read_category(category, directory)

def read_ham():
    """
    Reads and extracts information from ham (non-spam) emails.

    Returns:
    list: A list of dictionaries containing email information.
        Each dictionary has the following keys:
        - 'name': The filename.
        - 'content': The email content.
        - 'category': The category (ham).
    """
    category = 'ham'
    directory = './enron1/enron1/ham'
    return read_category(category, directory)

def read_category(category, directory):
    """
    Reads and extracts information from emails in a specified category.

    Args:
    category (str): The category of emails (ham or spam).
    directory (str): The directory containing the email files.

    Returns:
    list: A list of dictionaries containing email information.
        Each dictionary has the following keys:
        - 'name': The filename.
        - 'content': The email content.
        - 'category': The category (ham or spam).
    """
    emails = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r') as fp:
            try:
                content = fp.read()
                emails.append({'name': filename, 'content': content, 'category': category})
            except:
                print(f'skipped {filename}')
    return emails

# Read ham (non-spam) and spam emails
ham = read_ham()
spam = read_spam()

# Create a DataFrame to store the email data
df = pd.DataFrame.from_records(ham)
df = pd.concat([df, pd.DataFrame.from_records(spam)], ignore_index=True)

"""
Email Classifier Data:
- 'name': Filename
- 'content': Email content
- 'category': Category of the email (ham/spam)

ham: List of dictionaries containing information from ham (non-spam) emails.
spam: List of dictionaries containing information from spam emails.
"""


skipped 2248.2004-09-23.GP.spam.txt
skipped 2526.2004-10-17.GP.spam.txt
skipped 2698.2004-10-31.GP.spam.txt
skipped 4566.2005-05-24.GP.spam.txt


"\nEmail Classifier Data:\n- 'name': Filename\n- 'content': Email content\n- 'category': Category of the email (ham/spam)\n\nham: List of dictionaries containing information from ham (non-spam) emails.\nspam: List of dictionaries containing information from spam emails.\n"

In [12]:
df.head()

Unnamed: 0,name,content,category
0,0001.1999-12-10.farmer.ham.txt,Subject: christmas tree farm pictures\n,ham
1,0002.1999-12-13.farmer.ham.txt,"Subject: vastar resources , inc .\ngary , prod...",ham
2,0003.1999-12-14.farmer.ham.txt,Subject: calpine daily gas nomination\n- calpi...,ham
3,0004.1999-12-14.farmer.ham.txt,Subject: re : issue\nfyi - see note below - al...,ham
4,0005.1999-12-14.farmer.ham.txt,Subject: meter 7268 nov allocation\nfyi .\n- -...,ham


# Data cleaning 
The 'preprocessor' function accepts a string as input, and its purpose is to replace all characters that are not part of the alphabet with a space. Subsequently, it converts the entire string to lowercase.

In [13]:
import re

def preprocessor(df, column_name):
    df[column_name] = df[column_name].apply(lambda text: re.sub(r'[^a-zA-Z]', ' ', text).lower())


preprocessor(df,'content')


In [14]:
df.head()

Unnamed: 0,name,content,category
0,0001.1999-12-10.farmer.ham.txt,subject christmas tree farm pictures,ham
1,0002.1999-12-13.farmer.ham.txt,subject vastar resources inc gary produ...,ham
2,0003.1999-12-14.farmer.ham.txt,subject calpine daily gas nomination calpin...,ham
3,0004.1999-12-14.farmer.ham.txt,subject re issue fyi see note below alr...,ham
4,0005.1999-12-14.farmer.ham.txt,subject meter nov allocation fyi ...,ham


# Data cleaning end.

In [17]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Definition of the preprocessor function
def preprocessor(text):
    """
    Preprocesses text by removing non-alphabet characters and converting to lowercase.

    Args:
    text (str): The input text to be preprocessed.

    Returns:
    str: The preprocessed text.
    """
    return re.sub(r'[^a-zA-Z]', ' ', text).lower()

# Creating a text vectorizer
vectorizer = CountVectorizer(preprocessor=preprocessor)

# Transforming text data into a feature matrix
X = vectorizer.fit_transform(df['content'])
y = df['category']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creating and training the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test_scaled)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Printing the results
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Accuracy: 0.9622823984526112
Confusion Matrix:
[[714  15]
 [ 24 281]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.98      0.97       729
        spam       0.95      0.92      0.94       305

    accuracy                           0.96      1034
   macro avg       0.96      0.95      0.95      1034
weighted avg       0.96      0.96      0.96      1034



Conclusion:
# dataset:enron1 
According to the results, the model achieves an accuracy of approximately 96.23% in classifying emails as "ham" or "spam." The classification report provides additional details about the model's performance for each category.

In [19]:
# Retrieving the feature names from the vectorizer
features = vectorizer.get_feature_names_out()

# Extracting 
top_positive_features = [features[i] for i in coefficients.argsort()[-10:][::-1]]
top_negative_features = [features[i] for i in coefficients.argsort()[:10]]
top_spam_negative_features = [features[i] for i in coefficients.argsort()[:10]]
top_ham_positive_features = [features[i] for i in coefficients.argsort()[-10:][::-1]]

# Printing the top positive (spam-related) features
print("Top 10 Positive Features (Spam-related):")
for feature in top_positive_features:
    print(feature)

# Printing the top negative (ham-related) features
print("\nTop 10 Negative Features (Ham-related):")
for feature in top_negative_features:
    print(feature)

# Printing the top negative (spam-related) features
print("\nTop 10 Negative Features (Spam-negative):")
for feature in top_spam_negative_features:
    print(feature)

# Printing the top positive (ham-related) features
print("\nTop 10 Positive Features (Ham-positive):")
for feature in top_ham_positive_features:
    print(feature)


Top 10 Positive Features (Spam-related):
cards
document
removed
smart
laptop
women
enjoy
school
note
read

Top 10 Negative Features (Ham-related):
revised
actuals
attached
check
see
xls
hpl
nom
tap
for

Top 10 Negative Features (Spam-negative):
revised
actuals
attached
check
see
xls
hpl
nom
tap
for

Top 10 Positive Features (Ham-positive):
cards
document
removed
smart
laptop
women
enjoy
school
note
read
