In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the labelled e-mail table and print its structural summary
# (row count, column names, non-null counts, dtypes).
EMAIL_CSV = "email_classification.csv"
dataset = pd.read_csv(EMAIL_CSV)
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   179 non-null    object
 1   label   179 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB


In [4]:
# Working copy of the e-mail table used by every later cell.
# NOTE(review): this reads the same CSV file that the previous cell already
# loaded into `dataset`; the two frames are independent objects.
df = pd.read_csv(filepath_or_buffer='email_classification.csv')
df.head(n=5)  # preview the first five rows


Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


In [5]:
import nltk
from nltk.corpus import stopwords
import string
# Make sure the NLTK stop-word corpus is available locally; nltk prints its
# own status lines while doing so.
nltk.download('stopwords')

# Keep the English stop words in a set so the per-token membership checks in
# preprocess_text are fast.
stop_words = {word for word in stopwords.words('english')}

# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on each call, because the function is applied row by row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, custom_stop_words=None):
    """Normalise one e-mail body into a space-separated string of content words.

    The text is lowercased, stripped of ASCII punctuation
    (``string.punctuation``), split on whitespace, filtered against a
    stop-word collection and re-joined with single spaces.

    Args:
        text: Raw e-mail text. ``None`` and float NaN (what pandas produces
            for an empty CSV cell) are treated as empty text instead of
            raising ``AttributeError`` on ``.lower()``.
        custom_stop_words: Optional container of lowercase words to drop.
            When omitted, the module-level ``stop_words`` set is used, which
            matches the original behaviour.

    Returns:
        str: The cleaned text; ``''`` when nothing survives the filtering or
        the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    active_stop_words = stop_words if custom_stop_words is None else custom_stop_words

    lowered = text.lower()
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)

    # Punctuation is removed before the stop-word check, so contractions are
    # compared without their apostrophe (e.g. "don't" is looked up as "dont").
    kept_tokens = [
        token for token in without_punctuation.split()
        if token not in active_stop_words  # drops words such as "the", "a", "an", "in"
    ]
    return ' '.join(kept_tokens)

# Clean every e-mail body and store the result next to the raw text in a new
# 'processed_email' column.
processed_emails = df['email'].apply(preprocess_text)
df['processed_email'] = processed_emails


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.model_selection import train_test_split

# Features are the cleaned e-mail texts; targets are the spam/ham labels.
X, y = df['processed_email'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): the split is not stratified by label - consider whether
# `stratify=y` is wanted for a dataset this small.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the training texts only, so no
# information from the held-out e-mails leaks into the features.
vectorizer = CountVectorizer().fit(X_train)

# Encode both splits as token-count matrices over that same vocabulary.
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [8]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB cannot consume the sparse matrices produced by the vectorizer,
# so both splits are expanded to dense arrays first.
X_train_dense, X_test_dense = (
    sparse_counts.toarray()
    for sparse_counts in (X_train_vectorized, X_test_vectorized)
)

# NOTE(review): the features are word counts; MultinomialNB is the Naive Bayes
# variant usually paired with counts and accepts sparse input - worth comparing.
model = GaussianNB()
model.fit(X_train_dense, y_train)


In [9]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted classifier on the held-out e-mails.
y_pred = model.predict(X_test_dense)

test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", per_class_report)


Accuracy: 0.9722222222222222

Classification Report:
               precision    recall  f1-score   support

         ham       0.93      1.00      0.97        14
        spam       1.00      0.95      0.98        22

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36



In [12]:
def predict_email(text):
    """Classify a single raw e-mail string with the fitted pipeline.

    The text goes through the same steps as the training data:
    ``preprocess_text`` -> the fitted ``vectorizer`` -> dense array -> ``model``.

    Args:
        text: Raw e-mail body.

    Returns:
        The first (and only) element of the model's prediction for this text.
    """
    cleaned = preprocess_text(text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    count_row = vectorizer.transform([cleaned])
    return model.predict(count_row.toarray())[0]

# Example usage: classify one hand-written message.
# NOTE(review): this text begins exactly like the first row shown by df.head(),
# so it is presumably a training example rather than unseen data - confirm.
new_email = "Upgrade to our premium plan for exclusive access to premium content and features."
predicted_label = predict_email(new_email)
print(f"The email is classified as: {predicted_label}")


The email is classified as: ham
