## Import Important Libraries

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

## Load the Dataset and Display the Data

In [7]:
# Location of the CSV file, given as a path relative to the working directory
file_path = 'spam_or_not_spam.csv'

# Read the CSV into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell's output
dataset.head(n=5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


## Preprocess the Text: Define a Text-Cleaning Function

In [11]:
# Text Preprocessing
import re

# Function to clean the text data
def clean_text(text):
    """Normalize a raw email body into lowercase, space-separated tokens.

    Processing steps, in order:
      1. Non-string input (for example a NaN from an empty CSV cell)
         is mapped to the empty string.
      2. Every non-word character (regex ``\\W``) is replaced by a space.
      3. The text is lowercased.
      4. Stand-alone single ASCII letters are removed.
      5. Runs of whitespace collapse to one space and both ends are trimmed.

    Parameters
    ----------
    text : object
        The raw value to clean; anything that is not a ``str`` yields ''.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    # Missing / non-text cells are treated as empty documents
    if not isinstance(text, str):
        return ''

    # Punctuation and other non-word characters become token separators
    text = re.sub(r'\W', ' ', text)
    text = text.lower()

    # Drop isolated single letters. Lookarounds are used instead of
    # consuming the surrounding whitespace so that adjacent single letters
    # ("a b c") and letters at the very start or end are all removed;
    # a pattern like r'\s+[a-z]\s+' skips every other one because each
    # match swallows the space the next match would need.
    text = re.sub(r'(?<!\S)[a-z](?!\S)', ' ', text)

    # Collapse whitespace runs and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text



## Apply the Cleaning Function, Split the Data, and Display the Training Set

In [12]:
# Run clean_text over every email and write the result back to the 'email' column
cleaned_emails = dataset['email'].apply(clean_text)
dataset['email'] = cleaned_emails

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    dataset['email'],
    dataset['label'],
    test_size=0.2,
    random_state=42,
)

# Preview the first few cleaned training emails
X_train.head()

642                        chuck murcko wrote stuff yawn 
700      some interesting quotes url thomas jefferson ...
226      in forteana martin adamson martin wrote for a...
1697     skip montanaro to anthony baxter accordingly ...
1010     on fri number sep number tony tony nugent wro...
Name: email, dtype: object

## Train and Evaluate the Model

In [14]:
# Bag-of-words features: fit the vectorizer on the training emails only,
# then encode the test emails with that same fitted vectorizer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the training counts
model = MultinomialNB().fit(X_train_counts, y_train)

# Predict labels for the held-out emails
y_pred = model.predict(X_test_counts)

# Compute the evaluation summaries first, then print them
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print('Confusion Matrix:\n', conf_matrix)
print('\nClassification Report:\n', class_report)

Confusion Matrix:
 [[505   0]
 [  6  89]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       505
           1       1.00      0.94      0.97        95

    accuracy                           0.99       600
   macro avg       0.99      0.97      0.98       600
weighted avg       0.99      0.99      0.99       600

