Spam Email Detector using Machine Learning
Course: CT-361: Artificial Intelligence & Expert System  
Project Type: AIES CCP Project — Spring 2025  
Developed By:
Maaria Shaikh – [CT-22019]
---

This project demonstrates the use of machine learning techniques to automatically classify emails as *Spam* or *Not Spam (Ham)* using Natural Language Processing and a Naive Bayes classifier. The goal is to reduce the risk of phishing attacks, scams, and spam overload in digital communication.

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# For preprocessing
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# For vectorizing text
from sklearn.feature_extraction.text import CountVectorizer

# Machine learning model
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Fetch NLTK's 'stopwords' corpus; the preprocessing step reads the English
# stop-word list from it via `stopwords.words('english')`.
nltk.download('stopwords')

In [None]:
import pandas as pd

# Location of the tab-separated spam/ham SMS dataset. The file has no header
# row, so the column names are supplied explicitly when reading it.
SMS_DATA_URL = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
COLUMN_NAMES = ['label', 'message']

df = pd.read_csv(SMS_DATA_URL, sep='\t', header=None, names=COLUMN_NAMES)

# Preview the first rows as a quick sanity check that the load worked.
df.head()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df['label'].value_counts())

# Bar chart of how many messages carry each label.
sns.countplot(x='label', data=df)
plt.title("Class Distribution: Ham vs Spam")
plt.show()

In [None]:
# Make sure the stop-word corpus is available before it is loaded below.
nltk.download('stopwords')

# Built once rather than inside `preprocess`: reloading the stop-word list for
# every message and testing membership against a list is needlessly slow when
# the function is applied to a whole column.
_STOP_WORDS = frozenset(stopwords.words('english'))
# Translation table that deletes every character in `string.punctuation`.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise a raw message for bag-of-words vectorisation.

    Lowercases the text, removes the characters in ``string.punctuation``,
    splits on whitespace, and drops English stop words.

    Args:
        text: The raw message string.

    Returns:
        The remaining words joined by single spaces.
    """
    words = text.lower().translate(_PUNCTUATION_TABLE).split()
    return ' '.join(word for word in words if word not in _STOP_WORDS)


df['cleaned'] = df['message'].apply(preprocess)
df.head()

In [None]:
# Clean every message with the `preprocess` helper defined earlier in this
# notebook; the later train/test split reads this `clean_message` column.
df['clean_message'] = df['message'].apply(preprocess)
df.head()

In [None]:
# Numeric encoding of the target: ham -> 0, spam -> 1.
LABEL_TO_NUM = {'ham': 0, 'spam': 1}

df['label_num'] = df['label'].map(LABEL_TO_NUM)
df.head()

In [None]:
# DataFrame.info() prints its own summary and returns None, so it is not
# wrapped in print().
df.info()
print(df['label'].value_counts())

sns.countplot(x='label', data=df)
plt.title("Class Distribution: Ham vs Spam")
plt.show()

# Features are the cleaned message text; the target is the numeric label.
X = df['clean_message']
y = df['label_num']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Learn the vocabulary from the training messages only, then encode both
# splits as word-count matrices using that same vocabulary.
vectorizer = CountVectorizer().fit(X_train)
X_train_counts = vectorizer.transform(X_train)
X_test_counts = vectorizer.transform(X_test)


In [None]:
# Multinomial Naive Bayes classifier trained on the word-count features.
clf = MultinomialNB()
clf.fit(X=X_train_counts, y=y_train)

In [None]:
# Predict labels for the held-out messages.
y_pred = clf.predict(X_test_counts)

# Headline metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# Confusion matrix drawn as an annotated heatmap (rows: actual, columns: predicted).
class_names = ['Ham', 'Spam']
cm = confusion_matrix(y_test, y_pred)
axis = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
axis.set_xlabel('Predicted')
axis.set_ylabel('Actual')
axis.set_title('Confusion Matrix')
plt.show()

In [None]:
# Hand-written messages used as a quick smoke test of the trained model.
test_messages = [
    "Congratulations! You won a free ticket.",
    "Hey, are we still meeting for lunch today?",
    "URGENT! Your account has been compromised. Call now!"
]

# The model was trained on text cleaned by `preprocess`, so new messages go
# through the same cleaning before being vectorized.
cleaned_messages = [preprocess(msg) for msg in test_messages]
test_counts = vectorizer.transform(cleaned_messages)
predictions = clf.predict(test_counts)

# Report against the original (uncleaned) text so the output stays readable.
for msg, pred in zip(test_messages, predictions):
    label = 'Spam' if pred == 1 else 'Ham'
    print(f"Message: {msg}\nPredicted label: {label}\n")