In [125]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

data = pd.read_csv("Spam_Email_Data.csv")

stop_words = set(stopwords.words('english'))
# wordnet lemmatizer
lemmatizer = WordNetLemmatizer()

def data_preprocessing(row):
    # text cleansing
    row = row.lower()
    row = re.sub(r'\b\S+@\S+\b', ' ', row)
    row = re.sub(r'<[^>]*>', ' ', row)
    row = re.sub(r'[^a-zA-Z\s]', ' ', row)
    # nltk tokenizer
    tokens = word_tokenize(row)
    # lemmatization
    clean_rows = []
    for token in tokens:
        if token not in stop_words:
            clean_token = lemmatizer.lemmatize(token)
            clean_rows.append(clean_token)
    # join row
    clean_row = ' '.join(clean_rows)
    return clean_row

# apply data preprocessing to each row in data['text']
data['clean_text'] = data['text'].apply(data_preprocessing)
# print(data['clean_text'][0])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [126]:
from sklearn.model_selection import train_test_split
# splitting portion is 60 train - 40 test with random_state=42
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['target'], test_size=0.4, random_state=42)

In [127]:
# list to append models performance in it
models_performances = []

In [128]:
print('Tfidf')
print('-----')

Tfidf
-----


In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer
# tf-idf text embedding technique (not neural networks based)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
# Logistic Regression with regularization parameter C=0.5,set random state to ensure reproducibility
logistic_model = LogisticRegression(C=0.5,random_state=42)
# fit model to train data
logistic_model.fit(X_train_tfidf, y_train)
# model predict test data
y_pred_logistic_tfidf = logistic_model.predict(X_test_tfidf)

# Tfidf Logistic Regression Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_tfidf_logistic = "Tfidf Logistic Regression"
accuracy_tfidf_logistic = accuracy_score(y_test, y_pred_logistic_tfidf)
precision_tfidf_logistic = precision_score(y_test, y_pred_logistic_tfidf)
recall_tfidf_logistic = recall_score(y_test, y_pred_logistic_tfidf)
f1_tfidf_logistic = f1_score(y_test, y_pred_logistic_tfidf)
# Print Tfidf Logistic Regression Evaluation Metrics
print("Model Name:", model_name)
print("Accuracy (Tfidf Logistic):", accuracy_tfidf_logistic)
print("Precision (Tfidf Logistic):", precision_tfidf_logistic)
print("Recall (Tfidf Logistic):", recall_tfidf_logistic)
print("F1-score (Tfidf Logistic):", f1_tfidf_logistic)
# Append Tfidf Logistic Regression Evaluation Metrics to models_performances List
models_performances.append([model_name_tfidf_logistic, accuracy_tfidf_logistic, precision_tfidf_logistic, recall_tfidf_logistic, f1_tfidf_logistic])

Model Name: Tfidf Logistic Regression
Accuracy (Tfidf Logistic): 0.9805950840879689
Precision (Tfidf Logistic): 0.9957805907172996
Recall (Tfidf Logistic): 0.944
F1-score (Tfidf Logistic): 0.9691991786447639


In [131]:
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
# SVM
svm_model = SVC(random_state=42)
# fit model to train data
svm_model.fit(X_train_tfidf, y_train)
# model predict test data
y_pred_svm_tfidf = svm_model.predict(X_test_tfidf)

# Tfidf SVM Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_tfidf_svm = "Tfidf SVM"
accuracy_tfidf_svm = accuracy_score(y_test, y_pred_svm_tfidf)
precision_tfidf_svm = precision_score(y_test, y_pred_svm_tfidf)
recall_tfidf_svm = recall_score(y_test, y_pred_svm_tfidf)
f1_tfidf_svm = f1_score(y_test, y_pred_svm_tfidf)
# Print Tfidf SVM Evaluation Metrics
print("Model Name:", model_name_tfidf_svm)
print("Accuracy (Tfidf SVM):", accuracy_tfidf_svm)
print("Precision (Tfidf SVM):", precision_tfidf_svm)
print("Recall (Tfidf SVM):", recall_tfidf_svm)
print("F1-score (Tfidf SVM):", f1_tfidf_svm)
# Append Tfidf SVM Evaluation Metrics to models_performances List
models_performances.append([model_name_tfidf_svm, accuracy_tfidf_svm, precision_tfidf_svm, recall_tfidf_svm, f1_tfidf_svm])

Model Name: Tfidf SVM
Accuracy (Tfidf SVM): 0.9922380336351876
Precision (Tfidf SVM): 0.9972826086956522
Recall (Tfidf SVM): 0.9786666666666667
F1-score (Tfidf SVM): 0.9878869448183042


In [132]:
print('Bag of Words')
print('------------')

Bag of Words
------------


In [133]:
from sklearn.feature_extraction.text import CountVectorizer
# bag of words text embedding technique (not neural networks based)
count_vectorizer = CountVectorizer()
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)

In [134]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
# Logistic Regression with regularization parameter C=0.5,set random state to ensure reproducibility
logistic_model = LogisticRegression(max_iter=1000,C=0.5,random_state=42)
# fit model to train data
logistic_model.fit(X_train_bow, y_train)
# model predict test data
y_pred_logistic_bow = logistic_model.predict(X_test_bow)

# BOW Logistic Regression Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_bow_logistic = "Bag of Words Logistic Regression"
accuracy_bow_logistic = accuracy_score(y_test, y_pred_logistic_bow)
precision_bow_logistic = precision_score(y_test, y_pred_logistic_bow)
recall_bow_logistic = recall_score(y_test, y_pred_logistic_bow)
f1_bow_logistic = f1_score(y_test, y_pred_logistic_bow)
# Print BOW Logistic Regression Evaluation Metrics
print("Model Name:", model_name_bow_logistic)
print("Accuracy (Bag of Words Logistic):", accuracy_bow_logistic)
print("Precision (Bag of Words Logistic):", precision_bow_logistic)
print("Recall (Bag of Words Logistic):", recall_bow_logistic)
print("F1-score (Bag of Words Logistic):", f1_bow_logistic)
# Append BOW Logistic Regression Evaluation Metrics to models_performances List
models_performances.append([model_name_bow_logistic, accuracy_bow_logistic, precision_bow_logistic, recall_bow_logistic, f1_bow_logistic])

Model Name: Bag of Words Logistic Regression
Accuracy (Bag of Words Logistic): 0.9922380336351876
Precision (Bag of Words Logistic): 0.9986376021798365
Recall (Bag of Words Logistic): 0.9773333333333334
F1-score (Bag of Words Logistic): 0.9878706199460917


In [135]:
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
# SVM
svm_model = SVC(random_state=42)
# fit model to train data
svm_model.fit(X_train_bow, y_train)
# model predict test data
y_pred_svm_bow = svm_model.predict(X_test_bow)

# BOW SVM Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_bow_svm = "Bag of Words SVM"
accuracy_bow_svm = accuracy_score(y_test, y_pred_svm_bow)
precision_bow_svm = precision_score(y_test, y_pred_svm_bow)
recall_bow_svm = recall_score(y_test, y_pred_svm_bow)
f1_bow_svm = f1_score(y_test, y_pred_svm_bow)
# Print BOW SVM Evaluation Metrics
print("Model Name:", model_name_bow_svm)
print("Accuracy (Bag of Words SVM):", accuracy_bow_svm)
print("Precision (Bag of Words SVM):", precision_bow_svm)
print("Recall (Bag of Words SVM):", recall_bow_svm)
print("F1-score (Bag of Words SVM):", f1_bow_svm)
# Append BOW SVM Evaluation Metrics to models_performances List
models_performances.append([model_name_bow_svm, accuracy_bow_svm, precision_bow_svm, recall_bow_svm, f1_bow_svm])

Model Name: Bag of Words SVM
Accuracy (Bag of Words SVM): 0.9870633893919794
Precision (Bag of Words SVM): 0.9945054945054945
Recall (Bag of Words SVM): 0.9653333333333334
F1-score (Bag of Words SVM): 0.9797023004059541


In [136]:
print('Word2Vec')
print('--------')

Word2Vec
--------


In [137]:
from gensim.models import Word2Vec
import numpy as np
# Word2Vec text embedding technique (neural networks based)
# Train Word2Vec model
word2vec_model = Word2Vec([x_train.split() for x_train in X_train])

# Function to transform text to word2vec embeddings
def word2vec_transform(x_train, model):
    word_tokens = x_train.split()
    word_vectors = []
    for word in word_tokens:
        if word in model.wv:
            word_vectors.append(model.wv[word])
    if word_vectors:
        # Take the mean of word vectors to get document vector
        text_embedding = np.mean(word_vectors, axis=0)
        return text_embedding
    else:
        return None

# Transform training data to word2vec embeddings
X_train_word2vec = []
for x_train in X_train:
    x_train_embedding = word2vec_transform(x_train, word2vec_model)
    X_train_word2vec.append(x_train_embedding)

X_train_word2vec = np.array(X_train_word2vec)

# Transform test data to word2vec embeddings
X_test_word2vec = []
for x_test in X_test:
    x_test_embedding = word2vec_transform(x_test, word2vec_model)
    X_test_word2vec.append(x_test_embedding)

X_test_word2vec = np.array(X_test_word2vec)

In [138]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
# Logistic Regression with regularization parameter C=0.5,set random state to ensure reproducibility
logistic_model = LogisticRegression(C=0.5,random_state=42)
# fit model to train data
logistic_model.fit(X_train_word2vec, y_train)
# model predict test data
y_pred_logistic_word2vec = logistic_model.predict(X_test_word2vec)

# Word2Vec Logistic Regression Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_word2vec_logistic = "Word2Vec Logistic Regression"
accuracy_word2vec_logistic = accuracy_score(y_test, y_pred_logistic_word2vec)
precision_word2vec_logistic = precision_score(y_test, y_pred_logistic_word2vec)
recall_word2vec_logistic = recall_score(y_test, y_pred_logistic_word2vec)
f1_word2vec_logistic = f1_score(y_test, y_pred_logistic_word2vec)
# Print Word2Vec Logistic Regression Evaluation Metrics
print("Model Name:", model_name_word2vec_logistic)
print("Accuracy (Word2Vec Logistic Regression):", accuracy_word2vec_logistic)
print("Precision (Word2Vec Logistic Regression):", precision_word2vec_logistic)
print("Recall (Word2Vec Logistic Regression):", recall_word2vec_logistic)
print("F1-score (Word2Vec Logistic Regression):", f1_word2vec_logistic)
# Append Word2Vec Logistic Regression Evaluation Metrics to models_performances List
models_performances.append([model_name_word2vec_logistic, accuracy_word2vec_logistic, precision_word2vec_logistic, recall_word2vec_logistic, f1_word2vec_logistic])

Model Name: Word2Vec Logistic Regression
Accuracy (Word2Vec Logistic Regression): 0.9849072876239758
Precision (Word2Vec Logistic Regression): 0.9917469050894085
Recall (Word2Vec Logistic Regression): 0.9613333333333334
F1-score (Word2Vec Logistic Regression): 0.976303317535545


In [139]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
# SVM
svm_model = SVC(random_state=42)
# fit model to train data
svm_model.fit(X_train_word2vec, y_train)
# model predict test data
y_pred_svm_word2vec = svm_model.predict(X_test_word2vec)

# Word2Vec SVM Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_word2vec_svm = "Word2Vec SVM"
accuracy_word2vec_svm = accuracy_score(y_test, y_pred_svm_word2vec)
precision_word2vec_svm = precision_score(y_test, y_pred_svm_word2vec)
recall_word2vec_svm = recall_score(y_test, y_pred_svm_word2vec)
f1_word2vec_svm = f1_score(y_test, y_pred_svm_word2vec)
# Print Word2Vec SVM Evaluation Metrics
print("Model Name:", model_name_word2vec_svm)
print("Accuracy (Word2Vec SVM):", accuracy_word2vec_svm)
print("Precision (Word2Vec SVM):", precision_word2vec_svm)
print("Recall (Word2Vec SVM):", recall_word2vec_svm)
print("F1-score (Word2Vec SVM):", f1_word2vec_svm)
# Append Word2Vec SVM Evaluation Metrics to models_performances List
models_performances.append([model_name_word2vec_svm, accuracy_word2vec_svm, precision_word2vec_svm, recall_word2vec_svm, f1_word2vec_svm])

Model Name: Word2Vec SVM
Accuracy (Word2Vec SVM): 0.9853385079775765
Precision (Word2Vec SVM): 0.9917582417582418
Recall (Word2Vec SVM): 0.9626666666666667
F1-score (Word2Vec SVM): 0.9769959404600812


In [140]:
print('Doc2Vec')
print('--------')

Doc2Vec
--------


In [141]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
# Doc2Vec text embedding technique (neural networks based)
# Prepare data for Doc2Vec
labeled_data = [
    TaggedDocument(words=text.split(), tags=[i]) for i, text in enumerate(X_train)
]

# Train Doc2Vec model
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(labeled_data)
model.train(labeled_data, total_examples=model.corpus_count, epochs=model.epochs)

# Transform training data to Doc2Vec embeddings
X_train_doc2vec = [model.infer_vector(x_train.split()) for x_train in X_train]

# Transform test data to Doc2Vec embeddings
X_test_doc2vec = [model.infer_vector(x_test.split()) for x_test in X_test]

In [142]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
# Logistic Regression with regularization parameter C=0.5,set random state to ensure reproducibility
logistic_model = LogisticRegression(C=0.5,random_state=42)
# fit model to train data
logistic_model.fit(X_train_doc2vec, y_train)
# model predict test data
y_pred_logistic_doc2vec = logistic_model.predict(X_test_doc2vec)

# Doc2Vec Logistic Regression Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_doc2vec_logistic = "Doc2Vec Logistic Regression"
accuracy_doc2vec_logistic = accuracy_score(y_test, y_pred_logistic_doc2vec)
precision_doc2vec_logistic = precision_score(y_test, y_pred_logistic_doc2vec)
recall_doc2vec_logistic = recall_score(y_test, y_pred_logistic_doc2vec)
f1_doc2vec_logistic = f1_score(y_test, y_pred_logistic_doc2vec)
# Print Doc2Vec Logistic Regression Evaluation Metrics
print("Model Name:", model_name_doc2vec_logistic)
print("Accuracy (Doc2Vec Logistic Regression):", accuracy_doc2vec_logistic)
print("Precision (Doc2Vec Logistic Regression):", precision_doc2vec_logistic)
print("Recall (Doc2Vec Logistic Regression):", recall_doc2vec_logistic)
print("F1-score (Doc2Vec Logistic Regression):", f1_doc2vec_logistic)
# Append Doc2Vec Logistic Regression Evaluation Metrics to models_performances List
models_performances.append([model_name_doc2vec_logistic, accuracy_doc2vec_logistic, precision_doc2vec_logistic, recall_doc2vec_logistic, f1_doc2vec_logistic])

Model Name: Doc2Vec Logistic Regression
Accuracy (Doc2Vec Logistic Regression): 0.9616213885295386
Precision (Doc2Vec Logistic Regression): 0.9635343618513323
Recall (Doc2Vec Logistic Regression): 0.916
F1-score (Doc2Vec Logistic Regression): 0.9391660970608339


In [143]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score
# SVM
svm_model = SVC(random_state=42)
# fit model to train data
svm_model.fit(X_train_doc2vec, y_train)
# model predict test data
y_pred_svm_doc2vec = svm_model.predict(X_test_doc2vec)

# Doc2Vec SVM Evaluation Metrics (Accuracy, Recall, Precision, f1-score) 
model_name_doc2vec_svm = "Doc2Vec SVM"
accuracy_doc2vec_svm = accuracy_score(y_test, y_pred_svm_doc2vec)
precision_doc2vec_svm = precision_score(y_test, y_pred_svm_doc2vec)
recall_doc2vec_svm = recall_score(y_test, y_pred_svm_doc2vec)
f1_doc2vec_svm = f1_score(y_test, y_pred_svm_doc2vec)
# Print Doc2Vec SVM Evaluation Metrics
print("Model Name:", model_name_doc2vec_svm)
print("Accuracy (Doc2Vec SVM):", accuracy_doc2vec_svm)
print("Precision (Doc2Vec SVM):", precision_doc2vec_svm)
print("Recall (Doc2Vec SVM):", recall_doc2vec_svm)
print("F1-score (Doc2Vec SVM):", f1_doc2vec_svm)
# Append Doc2Vec SVM Evaluation Metrics to models_performances List
models_performances.append([model_name_doc2vec_svm, accuracy_doc2vec_svm, precision_doc2vec_svm, recall_doc2vec_svm, f1_doc2vec_svm])

Model Name: Doc2Vec SVM
Accuracy (Doc2Vec SVM): 0.9598965071151359
Precision (Doc2Vec SVM): 0.9607293127629734
Recall (Doc2Vec SVM): 0.9133333333333333
F1-score (Doc2Vec SVM): 0.9364319890635681


In [144]:
# Summarization of all models performances in a models_performances dataframe
models_performances_df = pd.DataFrame(models_performances, columns=["Model", "Accuracy", "Precision", "Recall", "f1-score"])
print(models_performances_df)

                              Model  Accuracy  Precision    Recall  f1-score
0         Tfidf Logistic Regression  0.980595   0.995781  0.944000  0.969199
1                         Tfidf SVM  0.992238   0.997283  0.978667  0.987887
2  Bag of Words Logistic Regression  0.992238   0.998638  0.977333  0.987871
3                  Bag of Words SVM  0.987063   0.994505  0.965333  0.979702
4      Word2Vec Logistic Regression  0.984907   0.991747  0.961333  0.976303
5                      Word2Vec SVM  0.985339   0.991758  0.962667  0.976996
6       Doc2Vec Logistic Regression  0.961621   0.963534  0.916000  0.939166
7                       Doc2Vec SVM  0.959897   0.960729  0.913333  0.936432
