## Importing required libraries and dataset

In [2]:
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer data
# used by word_tokenize and the English stop-word list. Recent NLTK releases
# load the tokenizer from the separate 'punkt_tab' package, so request it too.
# NOTE(review): on older NLTK versions the 'punkt_tab' id may be unknown —
# confirm the downloader only reports an error there rather than raising.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\niran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\niran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
DATA_PATH = 'C:/Users/niran/Desktop/spam_normal_emails.csv'
data = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


## Data Preprocessing

In [9]:
# Features are the 'text' column, labels are the 'spam' column
X, y = data['text'], data['spam']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


### Text Preprocessing using nltk

In [10]:
# Filled on first use so the stop-word corpus is read once, not once per email.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Reduce one document to a space-separated string of lowercase content words.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) are dropped, the rest are
    lowercased, and stop words are removed.

    Parameters
    ----------
    text : str
        The raw document. ``None`` and float NaN are treated as an empty
        document instead of being passed to the tokenizer.
    stop_words : collection of str, optional
        Lowercase words to discard. Defaults to NLTK's English stop-word list,
        which is loaded once and reused across calls.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if nothing survives).
    """
    # Missing values would otherwise make the tokenizer raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stop_words is None:
        stop_words = _english_stop_words()

    kept = []
    for token in word_tokenize(str(text)):
        # isalpha() is checked on the original token, before lowercasing,
        # exactly as the filter was applied originally.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)

    return ' '.join(kept)

# Run the same cleaning function over the training and the test texts
X_train_preprocessed, X_test_preprocessed = (
    split.apply(preprocess_text) for split in (X_train, X_test)
)

In [11]:
# Creating a CountVectorizer to convert text data into numerical features
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_preprocessed)
X_test_vectorized = vectorizer.transform(X_test_preprocessed)

In [12]:
# (rows, columns) of the train and test count matrices, in that order
tuple(matrix.shape for matrix in (X_train_vectorized, X_test_vectorized))

((4582, 30326), (1146, 30326))

## Example: word counts for one vectorized training email

In [13]:
# Column index -> word lookup for the fitted vocabulary
feature_names = vectorizer.get_feature_names_out()

# Densify the first training row so its non-zero counts can be located
sample_vectorized = X_train_vectorized[0]
sample_vectorized_array = sample_vectorized.toarray()
non_zero_indices = np.nonzero(sample_vectorized_array)

# One line per word that occurs in the sample, with its count
for column in non_zero_indices[1]:
    print(f"Word: {feature_names[column]}, Frequency: {sample_vectorized_array[0, column]}")


Word: akhave, Frequency: 1
Word: allen, Frequency: 1
Word: ann, Frequency: 1
Word: approved, Frequency: 4
Word: billie, Frequency: 1
Word: bradley, Frequency: 1
Word: carmen, Frequency: 1
Word: carol, Frequency: 1
Word: cc, Frequency: 2
Word: chavira, Frequency: 1
Word: click, Frequency: 1
Word: coats, Frequency: 1
Word: company, Frequency: 1
Word: corp, Frequency: 1
Word: document, Frequency: 1
Word: ect, Frequency: 26
Word: ely, Frequency: 1
Word: enron, Frequency: 1
Word: epsc, Frequency: 3
Word: following, Frequency: 1
Word: form, Frequency: 1
Word: galvan, Frequency: 1
Word: gary, Frequency: 1
Word: hargrave, Frequency: 1
Word: holloway, Frequency: 3
Word: hou, Frequency: 13
Word: indicated, Frequency: 1
Word: information, Frequency: 1
Word: jeff, Frequency: 1
Word: jo, Frequency: 1
Word: joann, Frequency: 3
Word: kaminski, Frequency: 2
Word: kinneman, Frequency: 1
Word: link, Frequency: 1
Word: louis, Frequency: 1
Word: mccumber, Frequency: 1
Word: michael, Frequency: 1
Word: mic

## Model Building and Performance Analysis

In [14]:
# Initializing and training the Naive Bayes classifier
# Build the multinomial Naive Bayes model and fit it on the training counts
# (fit returns the estimator, so construction and training can be chained)
nb_clf = MultinomialNB().fit(X_train_vectorized, y_train)

# Predicted labels for the held-out test matrix
y_pred = nb_clf.predict(X_test_vectorized)

In [15]:
# Calculating accuracy and generating classification report
# Compare the test-set predictions with the true labels: a per-class
# precision/recall/F1 table plus the overall accuracy
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.9895287958115183

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       856
           1       0.98      0.98      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



### Model Summary

The Naive Bayes classifier performed very well in this scenario, achieving high accuracy and balanced precision and recall for both spam and ham classes. The results suggest that the classifier is effective at distinguishing between spam and non-spam emails.

**Accuracy:** The classifier is about 98.95% accurate in predicting whether an email is spam or not.

**Precision:** About 98% of the predicted spam emails were actually spam, and about 99% of the predicted non-spam (ham) emails were actually non-spam.

**Recall or Sensitivity:** The classifier identified about 98% of the actual spam emails and about 99% of the actual non-spam emails.

**F1-Score:** The F1-score, the harmonic mean of precision and recall, is around 0.98 for spam and 0.99 for non-spam emails.

**Support:** The test set included 856 non-spam (ham) emails and 290 spam emails.

**Macro Avg:** The unweighted mean of each metric across both classes (spam and ham), which treats the two classes equally regardless of size, is around 0.99.

**Weighted Avg:** When each class's metrics are weighted by its support (the number of test emails in that class), the average performance is around 0.99.