## Emails - Spam Classifier Model

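**Accuracy: 95%** — Multinomial Naive Bayes on word counts, measured on a 20% hold-out test set (see the final section).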

In [11]:
import os
import pandas as pd

def read_files(path, classification):
    """Load every file under ``path`` as one labelled email message.

    The directory tree rooted at ``path`` is walked recursively. Each file is
    decoded as latin-1; everything up to and including the first blank line is
    treated as the header block and dropped, and the remainder is kept as the
    message text.

    Args:
        path: Root directory to search for email files.
        classification: Label written to the ``class`` column of every row
            (for example ``"spam"`` or ``"ham"``).

    Returns:
        A pandas DataFrame with the columns ``message`` and ``class``, one row
        per file. Both columns are present even when no file is found.
    """
    records = []
    for root, _, filenames in os.walk(path):
        for filename in filenames:
            with open(os.path.join(root, filename), 'r', encoding='latin1') as file:
                text = file.read()
            # Headers end at the first blank line. Indexing the result of
            # split('\n\n', 1) raised IndexError for a file without one, which
            # aborted the whole load; keep that file's full text instead.
            _, separator, body = text.partition('\n\n')
            records.append({
                'message': body if separator else text,
                'class': classification,
            })
    # Naming the columns keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=['message', 'class'])

# Assemble the corpus in one expression: an empty frame that pins the column
# layout, followed by the spam folder and then the ham folder, with the rows
# renumbered from 0.
data = pd.concat(
    [pd.DataFrame(columns=['message', 'class'])]
    + [
        read_files(folder, label)
        for folder, label in (("emails/spam", "spam"), ("emails/ham", "ham"))
    ],
    ignore_index=True,
)


In [12]:
# Preview the first rows to confirm that messages and labels loaded as expected
data.head()

Unnamed: 0,message,class
0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,spam
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,spam
3,##############################################...,spam
4,I thought you might like these:\n1) Slim Down ...,spam


##  Building and Training a Spam Classifier with Naive Bayes

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Labels the model learns to predict ("spam" or "ham")
targets = data['class'].values

# Step 1: turn the raw message text into a numeric matrix the model can train on.
# The vectorizer is fitted on every message in the dataset.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# Step 2: Multinomial Naive Bayes suits discrete features such as word counts.
classifier = MultinomialNB()
classifier.fit(counts, targets)


In [14]:
# Step 3: spot-check the classifier on the first five emails.
# These rows were part of the data the model was fitted on, so this is a
# sanity check rather than an evaluation.
sample_data = data['message'].iloc[:5].values

# Encode with the already-fitted vectorizer so the features match training
sample_counts = vectorizer.transform(sample_data)
predictions = classifier.predict(sample_counts)

# Show a 60-character preview of each email followed by its predicted label
for email, prediction in zip(sample_data, predictions):
    print("Email:", email[:60], "...")
    print("Predicted Class:", prediction, "\n")


Email: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN ...
Predicted Class: spam 

Email: 1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=3 ...
Predicted Class: spam 

Email: 1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=3 ...
Predicted Class: spam 

Email: ##################################################
#         ...
Predicted Class: spam 

Email: I thought you might like these:
1) Slim Down - Guaranteed to ...
Predicted Class: spam 



##  Building and Training a Spam Classifier with Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# Step 1: Create and train the Logistic Regression classifier
# Logistic Regression is a statistical model that estimates probabilities using a logistic function.
# With the default settings the solver stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT",
# i.e. it ran out of iterations before converging on these raw word counts. Give it a larger
# budget; if the warning still appears, raise max_iter further or scale the features.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(counts, targets)  # Train the classifier on the vectorized data and the corresponding labels


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Step 2: take the first five emails and encode them with the fitted vectorizer
sample_emails = data['message'].head(5).values
sample_counts = vectorizer.transform(sample_emails)

# Step 3: ask the current `classifier` for a label per email
# (presumably the Logistic Regression model fitted in the previous cell — confirm run order)
predictions = classifier.predict(sample_counts)

# Step 4: print a 60-character preview of each email with its predicted label
for email, prediction in zip(sample_emails, predictions):
    print("Email:", email[:60], "...")
    print("Predicted Class:", prediction, "\n")


Email: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN ...
Predicted Class: spam 

Email: 1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=3 ...
Predicted Class: spam 

Email: 1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=3 ...
Predicted Class: spam 

Email: ##################################################
#         ...
Predicted Class: spam 

Email: I thought you might like these:
1) Slim Down - Guaranteed to ...
Predicted Class: spam 



##  Accuracy Score - Spam Classifier Training and Evaluation

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Step 1: hold out 20% of the emails for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['class'],
    test_size=0.2,
    random_state=42,
)

# Step 2: fit the vectorizer on the training emails only, then encode both
# splits with it so the test set does not influence the learned vocabulary
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Step 3: train Naive Bayes on the training split and label the held-out emails
classifier = MultinomialNB().fit(X_train_counts, y_train)
predictions = classifier.predict(X_test_counts)

# Step 4: compare the predicted labels with the true ones and report the accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.95
