**Team Members**

| Name            | ID       |
| :---            | :---     |
| Somaya Mohammed | 20200234 |
| Mariem Shehab   | 20200844 |
| Eman Ibrahim    | 20201038 |
| Dina Ahmed      | 20201061 |
| Norhan Sayed    | 20201200 |

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')
# Notebook files are then reachable under: drive/MyDrive/Colab Notebooks/..

Mounted at /content/drive


# Load the dataset and perform initial data exploration.

In [2]:
import pandas as pd

In [3]:
# Read the spam e-mail dataset from the mounted Drive folder into a DataFrame.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (where Drive is mounted) — confirm.
email_data = pd.read_csv('drive/MyDrive/Colab Notebooks/Spam_Email_Data.csv')

In [4]:
# Show the first rows of the raw data as a GitHub-style markdown table
# (index column omitted).
print('Spam Email Data Head: ')
print(email_data.head().to_markdown(tablefmt="github", index=False))

Spam Email Data Head: 
| text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [5]:
# Summarise columns, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print('Discover Spam Email Data Information: \n')
email_data.info()

Discover Spam Email Data Information: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB
None


In [6]:
# Class balance: number of e-mails for each value of the `target` column.
print(email_data.target.value_counts())

target
0    3900
1    1896
Name: count, dtype: int64


# Data Preprocessing & Feature Extraction

In [7]:
import nltk
# Fetch the NLTK resources used by the preprocessing cells below.
nltk.download('stopwords')                   # stop-word lists read via nltk.corpus.stopwords
nltk.download('wordnet')                     # WordNet data, presumably required by WordNetLemmatizer
nltk.download('punkt')                       # tokenizer models, presumably required by word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger model; pos_tag only appears in commented-out code in the visible cells

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [9]:
def text_cleaning(text):
    """Strip e-mail addresses, HTML tags and non-letter characters from ``text``.

    The substitutions run in a fixed order: e-mail addresses are deleted,
    then HTML tags are replaced by a space, and finally every character that
    is neither an ASCII letter nor whitespace is replaced by a space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string (case and whitespace runs are left untouched).
    """
    substitutions = (
        # e-mail addresses are removed outright
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ''),
        # HTML tags become a single space
        (r'<[^>]+>', ' '),
        # punctuation, digits and any other non-letter become a single space
        (r'[^A-Za-z\s]', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

# Resources shared by every preprocessing() call; filled on first use.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Return the (English stop-word set, lemmatizer) pair, building it once."""
    if not _PREPROCESSING_RESOURCES:
        # Build both objects before storing either, so a failure cannot leave
        # the cache half-initialised.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESSING_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESSING_RESOURCES['stop_words'], _PREPROCESSING_RESOURCES['lemmatizer']


def preprocessing(text):
    """Clean, tokenize, de-stopword and lemmatize one document.

    Steps, in order: ``text_cleaning`` -> lowercase -> ``word_tokenize`` ->
    drop English stop words -> lemmatize each remaining token.

    Args:
        text: the raw document string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # The stop-word set and the lemmatizer are identical for every document,
    # so they are created once and reused instead of being rebuilt on each
    # call (this function is applied to every row of the dataset).
    stop_words, lemmatizer = _get_preprocessing_resources()

    # perform text cleaning (removing e-mail addresses, HTML tags, symbols, etc.)
    clean_text = text_cleaning(text)
    # tokenize the lowercased text
    text_tokens = word_tokenize(clean_text.lower())
    # remove all stop words
    text_rmstop = [token for token in text_tokens if token not in stop_words]
    # lemmatize all remaining words
    text_lemm = [lemmatizer.lemmatize(token) for token in text_rmstop]
    # POS-tag filtering is not applied; all lemmatized tokens are kept.

    return ' '.join(text_lemm)

In [10]:
# Replace the raw text column with its preprocessed form.
# NOTE: this overwrites email_data['text'] in place, so the raw text is no
# longer available after this cell; re-running it processes already-cleaned text.
email_data['text'] = email_data['text'].apply(preprocessing)

In [11]:
# Show the first rows again to inspect the effect of preprocessing.
print('Spam Email Data Head after Preprocessing: ')
print(email_data.head().to_markdown(tablefmt="github", index=False))

Spam Email Data Head after Preprocessing: 
| text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

# Text Embedding Techniques

## Neural-network-based techniques (at least two)

*  Word2Vec
*  Doc2Vec

In [12]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

### word2vec

In [13]:
# create a Word2Vec model from tokenized sentences (lists of words), with a vector size of 200
def word2vec_model(word_of_sent, vector_size=200):
    """Build a gensim Word2Vec model from tokenized sentences.

    Args:
        word_of_sent: iterable of sentences, each given as a list of word tokens.
        vector_size: dimensionality of the word vectors. Defaults to 200, the
            value previously hard-coded here, so existing calls are unaffected.

    Returns:
        The Word2Vec model constructed from ``word_of_sent``.
    """
    return Word2Vec(sentences=word_of_sent, vector_size=vector_size)

In [14]:
# get average embedding for one sentence
def get_sentence_vector(model, sentence, vocab):
    """Average the word vectors of the in-vocabulary words of one sentence.

    Args:
        model: object exposing ``model.wv[word]`` (the word's vector) and
            ``model.wv.vector_size`` (the embedding dimension).
        sentence: iterable of word tokens.
        vocab: container of known words; tokens not in it are skipped.

    Returns:
        A 1-D NumPy array of length ``model.wv.vector_size`` holding the mean
        of the matched word vectors, or all zeros if no token is in ``vocab``.
    """
    # Size the accumulator from the model rather than hard-coding 200, so the
    # function keeps working when the embedding dimension is changed.
    sentence_vec = np.zeros(model.wv.vector_size)
    count = 0
    for word in sentence:
        # ensure word is in the vocabulary
        if word in vocab:
            sentence_vec += model.wv[word]
            count += 1
    # get average embedding (left as zeros when nothing matched)
    if count != 0:
        sentence_vec /= count

    return sentence_vec

In [15]:
# split sentence into words
def get_word_of_sent(sentences):
    """Split every sentence on whitespace and return the list of token lists."""
    tokenized = []
    for sentence in sentences:
        tokenized.append(sentence.split())

    return tokenized

In [16]:
# train a Word2Vec model on the training data; return the trained model and its vocabulary
def train_word2vec_embedding(text_data):
    """Fit Word2Vec on ``text_data`` and return the model with its vocabulary.

    Args:
        text_data: iterable of whitespace-separated document strings.

    Returns:
        ``(model, vocab)`` where ``vocab`` is the list of keys of the model's
        ``wv.key_to_index`` mapping.
    """
    trained_model = word2vec_model(get_word_of_sent(text_data))
    known_words = list(trained_model.wv.key_to_index)

    return trained_model, known_words

In [17]:
# get word2vec embedding for text data either training or testing data
def get_word2vec_embedding(text_data, model, vocab):
    """Embed each document as the average of its in-vocabulary word vectors.

    Args:
        text_data: iterable of whitespace-separated document strings
            (training or testing data).
        model: the trained Word2Vec model.
        vocab: collection of words known to ``model``.

    Returns:
        A list with one sentence vector per document, in input order.
    """
    word_of_sent = get_word_of_sent(text_data)
    # `vocab` arrives as a list, where every `word in vocab` test is a linear
    # scan. Convert it once so each per-word lookup is a constant-time hash
    # lookup; containers that are already hash-based are used as they are.
    if isinstance(vocab, (set, frozenset, dict)):
        vocab_lookup = vocab
    else:
        vocab_lookup = set(vocab)
    # get average sentence embedding for each sentence in the text data
    embedding = [get_sentence_vector(model, sentence, vocab_lookup) for sentence in word_of_sent]

    return embedding

### Doc2Vec

In [18]:
def train_Doc2Vec(train_data):
  """Fit a Doc2Vec model (vector_size=200) on the training texts.

  Each text is split on whitespace and tagged with its position in
  ``train_data`` (as a string). The vocabulary is built from these tagged
  documents and the model is then trained on them for 15 epochs.

  Returns:
    The trained Doc2Vec model.
  """
  tagged_docs = []
  for position, text in enumerate(train_data):
    tagged_docs.append(TaggedDocument(words=text.split(), tags=[str(position)]))

  model = Doc2Vec(vector_size=200)
  model.build_vocab(tagged_docs)
  model.train(tagged_docs, total_examples=model.corpus_count, epochs=15)

  return model

In [19]:
def get_Doc2Vec(doc2vec_model, corpus):
  """Infer one document vector per text in ``corpus``.

  Each text is split on whitespace and passed to
  ``doc2vec_model.infer_vector``; the results are returned as a list in
  corpus order.
  """
  return [doc2vec_model.infer_vector(text.split()) for text in corpus]

## Techniques that aren’t based on neural networks (at least two)
*   TF-IDF
*   Bag of Words



In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

### TF-IDF

In [21]:
def train_TFIDF(train_data):
  """Fit a default-configured TfidfVectorizer on ``train_data`` and return it."""
  return TfidfVectorizer().fit(train_data)

In [22]:
def get_TFIDF(tfidf_vectorizer, corpus):
  """Return ``corpus`` transformed by the given (already fitted) TF-IDF vectorizer."""
  return tfidf_vectorizer.transform(corpus)

### Bag of Words

In [23]:
def train_BagOfWords(train_data):
  """Fit a default-configured CountVectorizer on ``train_data`` and return it."""
  return CountVectorizer().fit(train_data)

In [24]:
def get_BagOfWords(bow_vectorizer, corpus):
  """Return ``corpus`` transformed by the given (already fitted) bag-of-words vectorizer."""
  return bow_vectorizer.transform(corpus)

# Data Splitting

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Split the preprocessed texts and labels: 60% for training, the rest for testing.
# random_state=42 fixes the shuffle so the split is repeatable.
# NOTE(review): no `stratify=` argument is passed, so class proportions in the
# two parts are not forced to match — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(email_data['text'], email_data['target'], train_size=0.6, random_state=42)
print('Training Shape: x=', x_train.shape, ' y=', y_train.shape)
print('Testing Shape:  x=', x_test.shape,  ' y=', y_test.shape)

Training Shape: x= (3477,)  y= (3477,)
Testing Shape:  x= (2319,)  y= (2319,)


In [27]:
# Label counts in each split, to check the class balance after splitting.
print('y_train:')
print(y_train.value_counts())
print('y_test:')
print(y_test.value_counts())

y_train:
target
0    2331
1    1146
Name: count, dtype: int64
y_test:
target
0    1569
1     750
Name: count, dtype: int64


# Model Training
(at least two)

*    Logistic Regression
*    Decision Tree
*    Support Vector Machine

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [29]:
def LogisticRegression_Model(x_train, y_train):
  """Fit and return a LogisticRegression classifier (max_iter=1000, random_state=42)."""
  classifier = LogisticRegression(max_iter=1000, random_state=42)
  classifier.fit(x_train, y_train)
  return classifier

In [30]:
def DecisionTree_Model(x_train, y_train):
  """Fit and return a DecisionTreeClassifier (criterion="entropy", random_state=42)."""
  classifier = DecisionTreeClassifier(criterion="entropy", random_state=42)
  classifier.fit(x_train, y_train)
  return classifier

In [31]:
def SVM_Model(x_train, y_train):
  """Fit and return an SVC classifier (default settings, random_state=42)."""
  classifier = SVC(random_state=42)
  classifier.fit(x_train, y_train)
  return classifier

# Model Evaluation

In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [33]:
def model_result_evaluation(embedding_name, model_name, y_true, y_pred):
  """Score one model's predictions and return the results as a dict.

  Accuracy, precision, recall and F1 are each multiplied by 100.

  Args:
    embedding_name: label of the text embedding technique used.
    model_name: label of the classification model used.
    y_true: ground-truth labels.
    y_pred: predicted labels.

  Returns:
    A dict with keys 'Text Embedding Technique', 'Classification Model',
    'Accuracy', 'Precision', 'Recall' and 'F1-Score', in that order.
  """
  scorers = (
      ('Accuracy', accuracy_score),
      ('Precision', precision_score),
      ('Recall', recall_score),
      ('F1-Score', f1_score),
  )

  result = {'Text Embedding Technique': embedding_name, 'Classification Model': model_name}
  for label, scorer in scorers:
    result[label] = scorer(y_true, y_pred)*100

  return result

# Creating 12 models (4 text embedding techniques × 3 classifiers) and evaluating them

In [34]:
# Experiment grid: every embedding technique is combined with every classifier.
# The strings are the exact names matched by getTextEmbeddingTechnique and
# getClassificationModel.
TextEmbeddingTechniques = ['Word2Vec','Doc2Vec', 'TF-IDF', 'Bag of Words']
ClassificationModels = ['Logistic Regression', 'Decision Tree', 'Support Vector Machine']
# Metric names, matching the keys produced by model_result_evaluation
# (not referenced elsewhere in the visible cells).
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

In [35]:
def getTextEmbeddingTechnique(name, x_train, x_test):
  """Embed the train and test texts with the technique called ``name``.

  The embedding model is fitted on ``x_train`` only and then applied to both
  splits.

  Args:
    name: one of 'TF-IDF', 'Bag of Words', 'Word2Vec' or 'Doc2Vec'.
    x_train: training texts.
    x_test: testing texts.

  Returns:
    ``(new_train, new_test)``: the embedded training and testing data.

  Raises:
    ValueError: if ``name`` is not one of the supported techniques. Without
      this check an unknown name would return ``(None, None)`` and only fail
      later, inside the classifier.
  """
  if name == 'TF-IDF':
    model = train_TFIDF(x_train)
    new_train = get_TFIDF(model, x_train)
    new_test = get_TFIDF(model, x_test)
  elif name == 'Bag of Words':
    model = train_BagOfWords(x_train)
    new_train = get_BagOfWords(model, x_train)
    new_test = get_BagOfWords(model, x_test)
  elif name == 'Word2Vec':
    model, vocab = train_word2vec_embedding(x_train)
    new_train = get_word2vec_embedding(x_train, model, vocab)
    new_test = get_word2vec_embedding(x_test, model, vocab)
  elif name == 'Doc2Vec':
    model = train_Doc2Vec(x_train)
    new_train = get_Doc2Vec(model, x_train)
    new_test = get_Doc2Vec(model, x_test)
  else:
    raise ValueError(
        f"Unknown text embedding technique: {name!r} "
        "(expected 'TF-IDF', 'Bag of Words', 'Word2Vec' or 'Doc2Vec')")

  return new_train, new_test

In [36]:
def getClassificationModel(name, x_train, y_train):
  """Train and return the classifier called ``name``.

  Args:
    name: one of 'Logistic Regression', 'Decision Tree' or
      'Support Vector Machine'.
    x_train: training features.
    y_train: training labels.

  Returns:
    The fitted classifier.

  Raises:
    ValueError: if ``name`` is not one of the supported models. Without this
      check an unknown name would return ``None`` and only fail later, when
      ``predict`` is called on it.
  """
  if name == 'Logistic Regression':
    return LogisticRegression_Model(x_train, y_train)
  elif name == 'Decision Tree':
    return DecisionTree_Model(x_train, y_train)
  elif name == 'Support Vector Machine':
    return SVM_Model(x_train, y_train)

  raise ValueError(
      f"Unknown classification model: {name!r} "
      "(expected 'Logistic Regression', 'Decision Tree' or 'Support Vector Machine')")

In [37]:
def create_evaluate_models(x_train, x_test, y_train, y_test):
  """Train and score every (embedding technique, classifier) combination.

  For each name in ``TextEmbeddingTechniques`` the texts are embedded once;
  each name in ``ClassificationModels`` is then trained on the embedded
  training data and evaluated on the embedded test data.

  Returns:
    A list of result dicts (one per combination) as produced by
    ``model_result_evaluation``, ordered technique-major.
  """
  results = []

  for technique_name in TextEmbeddingTechniques:
    # embed once per technique and reuse the features for every classifier
    train_features, test_features = getTextEmbeddingTechnique(technique_name, x_train, x_test)

    for model_name in ClassificationModels:
      classifier = getClassificationModel(model_name, train_features, y_train)
      predictions = classifier.predict(test_features)
      results.append(model_result_evaluation(technique_name, model_name, y_test, predictions))

  return results

In [38]:
# Run the full experiment: every embedding technique combined with every
# classifier, trained on the training split and scored on the testing split.
evaluation_results = create_evaluate_models(x_train, x_test, y_train, y_test)

In [39]:
# Collect the per-model result dicts into one DataFrame (one row per model).
final_results = pd.DataFrame(evaluation_results)

# Print the comparison as a GitHub-style markdown table (index column omitted).
print('All Models Performance Results (based on testing set):')
print('------------------------------------------------------')
print(final_results.to_markdown(tablefmt="github", index=False))

All Models Performance Results (based on testing set):
------------------------------------------------------
| Text Embedding Technique   | Classification Model   |   Accuracy |   Precision |   Recall |   F1-Score |
|----------------------------|------------------------|------------|-------------|----------|------------|
| Word2Vec                   | Logistic Regression    |    98.5339 |     98.9071 |  96.5333 |    97.7058 |
| Word2Vec                   | Decision Tree          |    97.1971 |     95.1252 |  96.2667 |    95.6925 |
| Word2Vec                   | Support Vector Machine |    98.4476 |     99.0385 |  96.1333 |    97.5643 |
| Doc2Vec                    | Logistic Regression    |    97.1539 |     97.2376 |  93.8667 |    95.5224 |
| Doc2Vec                    | Decision Tree          |    86.2009 |     77.9221 |  80      |    78.9474 |
| Doc2Vec                    | Support Vector Machine |    97.3696 |     97.7809 |  94      |    95.8532 |
| TF-IDF                     | Log