In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Data

In [86]:
# Load spam email dataset

df = pd.read_csv('/kaggle/input/spam-email-dataset/emails.csv')




In [87]:
data = df.where((pd.notnull(df)),'')

In [88]:
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [89]:
data.shape

(5728, 2)

In [90]:
# Convert 'text' column to string type
df['text'] = df['text'].astype(str)


In [91]:
# Ensure 'spam' column is numeric
df['spam'] = df['spam'].astype(int)

In [92]:
# Check unique values in 'Category' column before encoding
print("Unique values in 'Body' before encoding:", data['text'].unique())

# Encode 'Category' column
#data['Category'] = data['Category'].map({'spam': 1, 'ham': 0})

Unique values in 'Body' before encoding: ["Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within thre

In [93]:
# Replace NaN values with the mode of the column
mode = data['text'].mode()[0]

data['text'].fillna(mode)

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object

In [94]:
# Remove duplicate rows based on 'spam' column
data.drop_duplicates(subset=['spam'], inplace=True)

# Drop rows with NaN values in 'spam' or 'text' columns
data.dropna(subset=['spam', 'text'], inplace=True)

In [95]:
#filling missing messages with empty string

data['text'].fillna('') 

0       Subject: naturally irresistible your corporate...
1368    Subject: hello guys ,  i ' m " bugging you " f...
Name: text, dtype: object

In [96]:
# # Transform the messages into TF-IDF features
X = data['text']
y = data['spam']

In [97]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform the messages into TF-IDF features
X = vectorizer.fit_transform(data['text']).toarray()
y = data['spam']


In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [99]:
# Initialize the classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)


In [100]:
# Predict the labels of the test set
y_pred = clf.predict(X_test)


In [101]:
# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

In [102]:
# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Confusion Matrix:\n{cm}")
print(f"Accuracy: {accuracy}")

Confusion Matrix:
[[0 1]
 [0 0]]
Accuracy: 0.0


In [103]:
# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
