## Email spam detection

Email spam refers to unsolicited email messages, usually send in bulk to a large list of recipients. It can choke email inboxes if not properly filtered and regularly deleted. It can also be a threat as email spam senders regularly alter their methods and messages to trick potential victims into downloading malware,sharing data or sending money.

### Objective:

To implement machine learning algorithms and data analysis techniques to automatically identify and filter out spam emails, thereby improving user experience, privacy, and security in the digital communication landscape.

In [29]:
!pip install tensorflow



In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import warnings
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

data = pd.read_csv('C:/Users/Admin/Desktop/emails.csv')


In [31]:
data


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [32]:
data.columns

Index(['text', 'spam'], dtype='object')

In [33]:
data = data.drop_duplicates() # Removing Duplicates

In [34]:
data = data.dropna()

### Text Preprocessing


In [35]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Remove special characters, numbers, and extra white spaces

In [36]:
def clean_text(text):
    
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = ' '.join(text.split())
    return text


In [37]:
def tokenize_and_lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [38]:
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

In [39]:
data['text'] = data['text'].apply(clean_text)
data['text'] = data['text'].apply(tokenize_and_lemmatize)
data['text'] = data['text'].apply(remove_stopwords)

In [40]:
data['text']

0       Subject naturally irresistible corporate ident...
1       Subject stock trading gunslinger fanny merrill...
2       Subject unbelievable new home made easy im wan...
3       Subject color printing special request additio...
4       Subject money get software cd software compati...
                              ...                        
5723    Subject research development charge gpg forwar...
5724    Subject receipt visit jim thanks invitation vi...
5725    Subject enron case study update wow day super ...
5726    Subject interest david please call shirley cre...
5727    Subject news aurora update aurora version fast...
Name: text, Length: 5695, dtype: object

In [41]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['spam'], test_size=0.2, random_state=42)


### Count Vectorizer


In [42]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

### Naive Bayes classifier


In [43]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"Classification Report:\n{report}")

Accuracy: 0.990342405618964
Confusion Matrix:
[[839   4]
 [  7 289]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       843
           1       0.99      0.98      0.98       296

    accuracy                           0.99      1139
   macro avg       0.99      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139



### Logistic Regression

In [44]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"Classification Report:\n{report}")

Accuracy: 0.9885864793678666
Confusion Matrix:
[[838   5]
 [  8 288]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       843
           1       0.98      0.97      0.98       296

    accuracy                           0.99      1139
   macro avg       0.99      0.98      0.99      1139
weighted avg       0.99      0.99      0.99      1139



### AdaBoost

In [45]:
base_classifier = DecisionTreeClassifier(max_depth=1)
adaboost_classifier = AdaBoostClassifier(base_classifier, n_estimators=50)
adaboost_classifier.fit(X_train, y_train)
y_pred = adaboost_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"Classification Report:\n{report}")

Accuracy: 0.961369622475856
Confusion Matrix:
[[828  15]
 [ 29 267]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       843
           1       0.95      0.90      0.92       296

    accuracy                           0.96      1139
   macro avg       0.96      0.94      0.95      1139
weighted avg       0.96      0.96      0.96      1139



### SVM

In [46]:
svm_classifier = SVC(kernel='linear') 
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"Classification Report:\n{report}")


Accuracy: 0.9806848112379281
Confusion Matrix:
[[836   7]
 [ 15 281]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       843
           1       0.98      0.95      0.96       296

    accuracy                           0.98      1139
   macro avg       0.98      0.97      0.97      1139
weighted avg       0.98      0.98      0.98      1139



### CNN

In [47]:
X = data['text']
y = data['spam']



In [48]:
tokenizer = Tokenizer(num_words=10000)  
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

max_sequence_length = 200 
X_pad = pad_sequences(X_seq, maxlen=max_sequence_length)
X_train, X_temp, y_train, y_temp = train_test_split(X_pad, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


model = tf.keras.Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=max_sequence_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))
test_loss, test_acc = model.evaluate(X_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [49]:
test_loss, test_acc = model.evaluate(X_test, y_test)




In [50]:
new_data = ['Deals for Your Last-Minute Long-Weekend Trip']
new_data_seq = tokenizer.texts_to_sequences(new_data)
new_data_pad = pad_sequences(new_data_seq, maxlen=max_sequence_length)
predictions = model.predict(new_data_pad)
predictions



array([[0.8707125]], dtype=float32)

In [51]:
predicted_labels = (predictions > 0.5).astype(int) #applying threshold

In [52]:
predicted_labels

array([[1]])

#### CNN successfully detected a spam email as spam.