In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('spam_ham_dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
df = df.drop(['Unnamed: 0','label'], axis=1)

In [5]:
df.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label_num,5171.0,0.289886,0.453753,0.0,0.0,0.0,1.0,1.0


In [7]:
df.isnull().sum()

text         0
label_num    0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       5171 non-null   object
 1   label_num  5171 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 80.9+ KB


In [9]:
df.duplicated().sum()

178

In [10]:
dp = df.duplicated()
duplicated = df[dp]
dublicated_sorted = duplicated.sort_values(by=['text'])

In [11]:
dublicated_sorted.head(10)

Unnamed: 0,text,label_num
182,Subject: \r\n,1
296,Subject: \r\n,1
363,Subject: \r\n,1
2538,Subject: \r\n,1
2665,Subject: \r\n,1
2680,Subject: \r\n,1
1369,Subject: \r\n,1
4748,Subject: \r\n,1
2903,Subject: \r\n,1
4081,Subject: \r\n,1


In [12]:
df_cleaned = df.drop_duplicates()

In [13]:
df_cleaned.shape

(4993, 2)

In [14]:
df['label_num'].value_counts()

label_num
0    3672
1    1499
Name: count, dtype: int64

In [15]:
# Data Preprocessing

# Convert text to lowercase → Ensures uniformity.
# Remove email addresses, URLs, and numbers → Spammers often use links and numbers in messages.
# Remove special characters and punctuation → Cleans unnecessary symbols like @, #, $, %.
# Remove stopwords → Reduces common words that don’t contribute to spam detection.
# Tokenization → Splits email text into words.
# Apply stemming or lemmatization → Converts words to their base form (better for pattern recognition).
# Detect and remove repeated characters → Spammers often use "Frrreeee" instead of "Free".
# Vectorization (TF-IDF or Word Embeddings) → Converts text into numerical form for ML models.

In [16]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')  
nltk.download('stopwords') 
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hamad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hamad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hamad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hamad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hamad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [18]:
stop_words = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()

def preprocess_email(text):

    # Convert text to lowercase
    text = text.lower()

    # Remove email addresses
    text = re.sub(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters and punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    words = nltk.word_tokenize(text)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Apply stemming
    words = [stemmer.stem(word) for word in words]

    # Remove repeated characters (e.g., "Frrreeee" → "Free")
    words = [re.sub(r'(.)\1+', r'\1\1', word) for word in words]

    # Convert list of words back to text
    return ' '.join(words)

In [19]:
df_final = df_cleaned

In [20]:
df_final['text'] = df_final['text'].apply(preprocess_email)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['text'] = df_final['text'].apply(preprocess_email)


In [42]:
df_final.head()

Unnamed: 0,text,label_num
0,subject enron methanol meter follow note gave ...,0
1,subject hpl nom januari see attach file hplnol...,0
2,subject neon retreat ho ho ho around wonder ti...,0
3,subject photoshop window offic cheap main tren...,1
4,subject indian spring deal book teco pvr reven...,0


In [46]:
X_train, X_test, y_train, y_test = train_test_split(df_final['text'], df_final['label_num'], random_state=42, stratify=df_final['label_num'])

In [50]:
tfid = TfidfVectorizer(max_features=5000)

X_train_tf = tfid.fit_transform(X_train)
X_test_tf = tfid.transform(X_test)

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [60]:
model = MultinomialNB()
model.fit(X_train_tf, y_train)

y_pred = model.predict(X_test_tf)

In [64]:
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.9519615692554043
