In [None]:
# Import libraries
import string
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import warnings
import plotly.express as px
import string
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.metrics import roc_curve, roc_auc_score,auc
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Download the NLTK resources this notebook relies on: tokenizer models,
# the stop-word lists and the WordNet data (see preprocess_text below,
# which tokenizes, removes stop words and lemmatizes).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 1. Data Loading and Initial Exploration

In [None]:
data_path = "/kaggle/input/spam-email-dataset/emails.csv"
data = pd.read_csv(data_path)

In [None]:
spam_df = data.copy()
spam_df.head()

## Dataset Information

In [None]:
spam_df.info()

## Statistical Analysis

In [None]:
spam_df.describe().T

In [None]:
spam_df.dtypes

In [None]:
len(spam_df['text'].unique())

# 2. Data Cleaning

## Missing Values

In [None]:
#Check Null values
spam_df.isnull().sum()

In [None]:
#To check if there are any missing values
is_any_missing_data=spam_df.isna().any().any()
is_any_missing_data

**No Missing Values**

## Duplicate Values

In [None]:
spam_df.duplicated().sum()

**There are 33 duplicated rows that have to be removed**

In [None]:
#Fetch all duplicate values
duplicate=spam_df[spam_df.duplicated(keep='last')]
duplicate.head()

In [None]:
# Drop exact duplicate rows (the first occurrence is kept), then confirm
# that no duplicates remain.
spam_df = spam_df.drop_duplicates()
spam_df.duplicated().sum()

# 3. Data Preprocessing

**Rename Column names**

In [None]:
# Rename names columns
spam_df=spam_df.rename(columns={'spam':'label'})

In [None]:
spam_df["length"] = spam_df["text"].apply(len)
spam_df.sort_values(by='length', ascending=False).head(3)

## Distribution of Spam and Ham Emails

In [None]:
# Count the emails per class.
counts = spam_df['label'].value_counts().reset_index()
counts.columns = ['Label', 'Count']

# Replace the numeric 0/1 codes with readable class names. With a numeric
# colour column Plotly Express switches to a continuous colour scale and
# ignores color_discrete_sequence; a string column is treated as categorical,
# so each class gets its own discrete colour and the axis ticks need no
# manual relabelling.
counts['Label'] = counts['Label'].map({0: 'Legitimate', 1: 'Spam'})

# Bar chart of class frequencies, one discrete colour per class.
fig = px.bar(
    counts,
    x='Label',
    y='Count',
    color='Label',
    color_discrete_sequence=px.colors.qualitative.Plotly,
)

# Title and axis titles.
fig.update_layout(title='Number of Spam and Legitimate Emails', xaxis_title='Label', yaxis_title='Count')

# Show the figure
fig.show()


In [None]:
spam_df.hist(column = 'length', by ='label',figsize=(12,4), bins = 10,color='blue')

## Further Processing

* **Remove Punctuations:** Strip out punctuation marks from the text to focus on the words themselves.
* **Lowering the Case:** Convert all text to lowercase to ensure uniformity and improve matching.
* **Remove Stop Words:** Eliminate common, non-informative words (e.g., "and", "the") that don't contribute to the analysis.
* **Lemmatization:** Reduce words to their base or root form (e.g., "running" to "run") for better consistency.
* **Identify Spam and Legit Words Using N-Gram Model:** Use N-grams (sequences of N words) to detect patterns and frequently occurring word combinations in spam and legitimate (ham) emails.

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
def preprocess_text(text):
    """Normalize one email body into a space-separated string of lemmas.

    Steps: strip punctuation, lowercase, tokenize, drop English stop words
    and tokens that are not purely alphabetic, lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces (empty string
        when nothing survives).
    """
    # Remove punctuation, then lowercase the text
    no_punctuation_lower = ''.join(
        char for char in text if char not in string.punctuation
    ).lower()

    # Tokenize the text into words
    words = nltk.word_tokenize(no_punctuation_lower)

    # Build the stop-word set once per call. The original evaluated
    # stopwords.words('english') inside the comprehension, i.e. re-read the
    # list and did a linear search for every single token.
    stop_words = set(stopwords.words('english'))

    # Remove stopwords and non-alphabetic tokens, and lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word.lower() not in stop_words and word.isalpha()
    ]

    # Join the lemmatized words back into a sentence
    return ' '.join(lemmatized_words)


In [None]:
spam_df["preprocessed_text"] = spam_df["text"].apply(preprocess_text)
spam_df.head()

## N-gram Plot

Let's define a function `plot_ngrams` to visualize the most frequent N-grams (sequences of N words) in a given set of texts. We then apply this function to the spam and non-spam messages in the dataset to compare their top bigrams (N-grams of size 2).

### Function: `plot_ngrams`
1. **Initialize Count Vectorizer**: Sets up a count vectorizer to extract N-grams from the texts.
2. **Fit and Transform Texts**: Converts the texts into a matrix of N-gram counts.
3. **Get Feature Names**: Retrieves the N-gram feature names.
4. **Sum N-gram Occurrences**: Totals the occurrences of each N-gram.
5. **Create N-gram Dictionary**: Maps N-grams to their counts.
6. **Sort N-grams**: Orders the N-grams by frequency in descending order.
7. **Select Top N-grams**: Chooses the top N N-grams based on their counts.
8. **Plot N-grams**: Plots the top N-grams using Seaborn's barplot.

### Visualization
1. **Filter Messages**: Separates spam and non-spam messages.
2. **Create Subplots**: Sets up side-by-side plots.
3. **Plot Top Bigrams**: Uses `plot_ngrams` to visualize the top bigrams in spam and non-spam messages.
4. **Display Plots**: Adjusts layout and displays the plots.


In [None]:
def plot_ngrams(ax, texts, ngram_range=(2, 2), num_top_ngrams=25, title=''):
    """Draw a horizontal bar chart of the most frequent n-grams in ``texts``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes the bars are drawn on.
    texts : iterable of str
        Documents to count n-grams in.
    ngram_range : tuple of (int, int)
        Passed to ``CountVectorizer``; the default ``(2, 2)`` counts bigrams.
    num_top_ngrams : int
        Number of most frequent n-grams to show.
    title : str
        Title set on ``ax``.
    """
    # Count every n-gram occurrence across all documents.
    counter = CountVectorizer(ngram_range=ngram_range)
    count_matrix = counter.fit_transform(texts)
    totals = count_matrix.sum(axis=0).A1

    # Rank (n-gram, total) pairs from most to least frequent and keep the head.
    ranked = sorted(
        zip(counter.get_feature_names_out(), totals),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top = ranked[:num_top_ngrams]
    labels = [name for name, _ in top]
    frequencies = [total for _, total in top]

    # Horizontal bars: frequency on x, n-gram text on y.
    sns.barplot(
        ax=ax,
        x=frequencies,
        y=labels,
        orient="h",
        width=0.5,
        palette='Spectral',
    )
    ax.set(xlabel='Frequency', ylabel='N-gram', title=title)

In [None]:
# Split the preprocessed corpus by class (label 1 = spam, label 0 = non-spam)
spam_texts = spam_df.loc[spam_df['label'] == 1, 'preprocessed_text']
non_spam_texts = spam_df.loc[spam_df['label'] == 0, 'preprocessed_text']

# Side-by-side panels: top bigrams of each class, with vertical grid lines
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = [
    (spam_texts, 'Top Bigrams in Spam Messages'),
    (non_spam_texts, 'Top Bigrams in Non-Spam Messages'),
]
for panel_ax, (panel_texts, panel_title) in zip(axes, panels):
    plot_ngrams(panel_ax, panel_texts, title=panel_title)
    panel_ax.grid(axis='x')
plt.tight_layout()
plt.show()

# 4. Feature Engineering

### 1. Count Vectorizer
Transforms text data into a matrix of token (word) counts. Each entry in the matrix represents the frequency of a word in a specific document, providing a straightforward numerical representation of the text suitable for machine learning models.

In [None]:
# Initialize count vectorizer
vectorizer = CountVectorizer()

# Bag of words
bow_text = vectorizer.fit_transform(spam_df["preprocessed_text"])

# Fetch the vocabulary set
print(f"10 Bag Of Words Features: {vectorizer.get_feature_names_out()[100:110]}")
print(f"Total number of vocab words: {len(vectorizer.vocabulary_)}")

In [None]:
# Convert strings to vectors using BoW
transformed_bow = vectorizer.transform(spam_df["preprocessed_text"])

# Print the shape of the sparse matrix and count the number of non-zero occurrences
print(f"Shape of sparse matrix: {transformed_bow.shape}")
print(f"Amount of non-zero occurrences: {transformed_bow.nnz}")

### 2. TF-IDF Vectorizer (Term Frequency-Inverse Document Frequency)
Converts text data into a matrix where each entry reflects the importance of a word in a document relative to the entire corpus. This is calculated by multiplying term frequency (how often a word appears in a document) by inverse document frequency (how common or rare a word is across all documents). This technique helps emphasize unique words and diminish the influence of commonly occurring terms, improving model performance by highlighting significant features.

In [None]:
# TF-IDF: fit the transformer on the bag-of-words count matrix.
# NOTE(review): the transformer (like the CountVectorizer above) is fitted on
# the full corpus before the train/test split, so test documents influence
# the learned weights — consider fitting on the training split only.
tfidf_transformer = TfidfTransformer().fit(transformed_bow)

# Transform entire BoW into tf-idf corpus (same shape as the count matrix)
text_tfidf = tfidf_transformer.transform(transformed_bow)
print(text_tfidf.shape)

# 5. Model Building

In this notebook, I am going to use the following classification models:

1. **Support Vector Machine**
2. **Random Forest**
3. **Decision Tree**
4. **Naive Bayes**
5. **XGBoost Classifier**

## Split the dataset into training and testing sets

In [None]:
# Split the dataset into train (80%) and test (20%) sets.
# random_state pins the shuffle so the split — and every metric computed from
# it — is identical on each Restart & Run All; stratify keeps the
# spam/legitimate ratio the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    text_tfidf,
    spam_df["label"],
    test_size=0.2,
    random_state=42,
    stratify=spam_df["label"],
)

print(f"train dataset features size: {x_train.shape}")
print(f"train dataset label size: {y_train.shape}")

print(f"test dataset features size: {x_test.shape}")
print(f"test dataset label size: {y_test.shape}")

# Define the Model - with Metrics, Classification Report, Confusion Matrix Visualization

Let's define the `evaluate_model` function to assess the performance of a given machine learning model on test data, calculating and printing various metrics. Here's a detailed breakdown:

### Function: `evaluate_model`
1. **Predict Labels**: Uses the model to predict labels for the test data (`x_test`).
2. **Confusion Matrix**: Computes the confusion matrix to derive true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).
3. **Performance Metrics**:
    - **Accuracy**: Proportion of correct predictions.
    - **Recall**: Proportion of actual positives correctly identified.
    - **Precision**: Proportion of positive predictions that are correct.
    - **F1-score**: Harmonic mean of precision and recall.
    - **Specificity**: Proportion of actual negatives correctly identified.
    - **Miss Rate**: Proportion of actual positives incorrectly identified as negatives.
    - **Mean Accuracy**: Average accuracy from cross-validation.
    - **Mean Square Error (MSE)**: Included but not applicable for classification tasks.
4. **Classification Report**: Generates a detailed report with precision, recall, and F1-score for each class.
5. **Print Metrics**: Displays the metrics and confusion matrix in a formatted output.
6. **Plot Confusion Matrix**: Visualizes the confusion matrix using `ConfusionMatrixDisplay`.


In [None]:
def evaluate_model(model, x_test, y_test, model_name="Model", x_cv=None, y_cv=None, cv=10):
    """Evaluate a fitted binary classifier, print a report and plot its confusion matrix.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict``. It is also re-fitted on clones
        during cross-validation.
    x_test, y_test
        Held-out features and true labels.
    model_name : str
        Name shown in the printed header.
    x_cv, y_cv : optional
        Data used for the cross-validated accuracy. When either is omitted the
        notebook-level ``x_train`` / ``y_train`` are used, which is what the
        function always did before these parameters existed.
    cv : int
        Number of cross-validation folds (default 10, as before).

    Returns
    -------
    tuple
        ``(results, y_pred)`` where ``results`` is a dict of the scalar metrics
        plus the classification report (as a dict) under
        ``"Classification Report"``, and ``y_pred`` are the predictions for
        ``x_test``.
    """
    # Local import: cross_val_score is not among the notebook's top-level imports.
    from sklearn.model_selection import cross_val_score

    if x_cv is None or y_cv is None:
        # Fall back to the notebook-level training split.
        x_cv, y_cv = x_train, y_train

    # Predict labels on testing data
    y_pred = model.predict(x_test)

    # Confusion matrix; for binary labels ravel() yields TN, FP, FN, TP in that order
    conf_matrix = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = conf_matrix.ravel()

    # Test-set performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    specificity = TN / (TN + FP)
    miss_rate = FN / (FN + TP)

    # Mean cross-validated accuracy. The previous code averaged the output of
    # cross_val_predict, i.e. the mean *predicted label* (the predicted spam
    # fraction), which is not an accuracy at all.
    mean_accuracy = cross_val_score(model, x_cv, y_cv, cv=cv, scoring="accuracy").mean()

    # Mean squared error is a regression metric; for 0/1 labels it equals the
    # misclassification rate. Kept so the returned dict keeps all its keys.
    mse = mean_squared_error(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, output_dict=True)

    # Named `results` rather than `metrics` so the imported sklearn `metrics`
    # module is not shadowed inside this function.
    results = {
        "Accuracy": accuracy,
        "Recall": recall,
        "Precision": precision,
        "F1-score": f1,
        "Specificity": specificity,
        "Miss Rate": miss_rate,
        "Mean Accuracy": mean_accuracy,
        "Mean Square Error": mse,
        "Classification Report": classification_rep
    }

    # Separator line. The original strings contained "\033 " (ESC followed by
    # a space), a malformed ANSI sequence; "\033[1m" is the bold code already
    # used for the headers.
    rule = " \033[1m ---------------------------------------------------------------- \033[0m"

    # Print evaluation metrics
    print(f"\033[1m EVALUATION METRICS ({model_name}) \033[0m")
    print(f"\n{rule}")
    print("\nTrue Positives (TP):", TP)
    print("True Negatives (TN):", TN)
    print("False Positives (FP):", FP)
    print("False Negatives (FN):", FN)
    print("\nAccuracy   : ", accuracy)
    print("Recall       : ", recall)
    print("Precision    : ", precision)
    print("F1-score     : ", f1)
    print("Specificity  : ", specificity)
    print("Miss Rate    : ", miss_rate)
    print("Mean Accuracy: ", mean_accuracy)
    print("Mean Square Error:", mse)  # Not applicable for classification
    print(rule)

    # Print classification report
    print("\033[1m Classification Report: \033[0m")
    print(classification_report(y_test, y_pred))
    print(f"\n{rule}")
    print("\033[1m Confusion Matrix \033[0m")

    # Plot the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
    disp.plot(cmap='bone_r')  # specify the colormap for better visualization
    plt.show()
    return results, y_pred

### 1. Support Vector Machine (SVM)
A powerful classification algorithm that finds the optimal hyperplane separating different classes by maximizing the margin between them. SVM is effective in high-dimensional spaces and is used for its accuracy and efficiency in text classification tasks.


In [None]:
# Train SVM model
svm_model = SVC()
svm_model.fit(x_train, y_train)

# Evaluate SVM model
svm_metrics,y_pred_svm = evaluate_model(svm_model, x_test, y_test, model_name="SVM")

### 2. Random Forest
An ensemble learning method that constructs multiple decision trees during training and outputs the mode of their predictions for classification. It improves accuracy and robustness by reducing overfitting and variance, making it well-suited for handling large datasets with complex patterns.

In [None]:
# Train Random Forest model. The forest is built from random bootstrap samples
# and feature subsets, so a fixed random_state is needed for the reported
# metrics to be reproducible between runs.
randomForest_model = RandomForestClassifier(random_state=42)
randomForest_model.fit(x_train, y_train)

# Evaluate Random Forest model
rf_metrics,y_pred_rf = evaluate_model(randomForest_model, x_test, y_test, model_name="Random Forest")

### 3. Decision Tree
A tree-structured model where nodes represent feature decisions, and branches represent outcomes, leading to a final decision at the leaf nodes. Decision Trees are easy to interpret and visualize but can be prone to overfitting if not properly pruned.

In [None]:
# Train Decision Tree model. Ties between equally good splits are broken
# randomly, so a fixed random_state keeps the fitted tree (and its metrics)
# reproducible between runs.
decisionTree_model = DecisionTreeClassifier(random_state=42)
decisionTree_model.fit(x_train, y_train)

# Evaluate
dt_metrics,y_pred_dt = evaluate_model(decisionTree_model, x_test, y_test, model_name="Decision Tree")

### 4. Naive Bayes
A probabilistic classifier based on Bayes' theorem with an assumption of independence between features. It is highly efficient and works particularly well for text classification, such as spam detection, due to its simplicity and effectiveness in handling large vocabularies.

In [None]:
# Imports kept in this cell so it runs on its own; they are not among the
# notebook's top-level imports.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def _to_dense(features):
    """Return ``features`` as a dense array (GaussianNB does not accept sparse input)."""
    return features.toarray() if hasattr(features, "toarray") else features


# Define the Naive Bayes model. GaussianNB needs dense input, so it is wrapped
# in a pipeline that densifies whatever it receives. This lets the model be
# fitted, cross-validated (inside evaluate_model, on the sparse training
# matrix) and evaluated on the same sparse TF-IDF data as the other classifiers.
NaiveBayes_model = make_pipeline(
    FunctionTransformer(_to_dense, accept_sparse=True),
    GaussianNB(),
)
NaiveBayes_model.fit(x_train, y_train)

# Evaluate the Naive Bayes model itself. The original cell passed
# decisionTree_model here, so the "Naive Bayes" results were actually the
# Decision Tree's.
nb_metrics,y_pred_nb = evaluate_model(NaiveBayes_model, x_test, y_test, model_name="Naive Bayes")

### 5. XGBoost Classifier
An optimized gradient boosting algorithm designed for speed and performance. XGBoost builds an ensemble of weak learners (typically decision trees) in a sequential manner, focusing on reducing errors from previous iterations. It is known for its high accuracy and scalability in various classification tasks.

In [None]:
# Create the XGBoost model: 100 shallow trees (depth 3) with learning rate
# 0.1, trained with a binary logistic objective and log-loss as eval metric.
xgboost_model = XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    objective='binary:logistic',  # Use 'multi:softmax' for multi-class classification
    eval_metric='logloss'
)

# Fit the model to the training data
xgboost_model.fit(x_train, y_train)

# Evaluate the model; evaluate_model also returns the test-set predictions,
# so no separate predict call is needed.
xgb_metrics, y_pred_xgb = evaluate_model(xgboost_model, x_test, y_test, model_name="XGBoost")


## Model Comparison

In [None]:
# ROC points and AUC for every classifier, computed from the hard test-set
# predictions returned by evaluate_model.
# NOTE(review): hard 0/1 predictions give a single operating point per model;
# passing decision_function / predict_proba scores would trace a full curve.
#
# Each AUC now has its own variable: previously the Decision Tree and Naive
# Bayes results were both assigned to `auc_rfc`, overwriting the Random
# Forest value.
svm_fpr, svm_tpr, threshold = roc_curve(y_test, y_pred_svm)
auc_svm = auc(svm_fpr, svm_tpr)
rf_fpr, rf_tpr, threshold = roc_curve(y_test, y_pred_rf)
auc_rfc = auc(rf_fpr, rf_tpr)
dt_fpr, dt_tpr, threshold = roc_curve(y_test, y_pred_dt)
auc_dt = auc(dt_fpr, dt_tpr)
nb_fpr, nb_tpr, threshold = roc_curve(y_test, y_pred_nb)
auc_nb = auc(nb_fpr, nb_tpr)
xgb_fpr, xgb_tpr, threshold = roc_curve(y_test, y_pred_xgb)
auc_xgb = auc(xgb_fpr, xgb_tpr)

## ROC and AUC Visualization

In [None]:
# Per-classifier bundle: (test-set predictions, FPR points, TPR points)
classifiers = {
    'Naive Bayes': (y_pred_nb, nb_fpr, nb_tpr),
    'Decision Tree Classifier': (y_pred_dt, dt_fpr, dt_tpr),
    'SVM': (y_pred_svm, svm_fpr, svm_tpr),
    'RandomForest Classifier': (y_pred_rf, rf_fpr, rf_tpr),
    'XGBoost Classifier': (y_pred_xgb, xgb_fpr, xgb_tpr)
}
colors = ['crimson', 'orange', 'steelblue', 'limegreen', 'red']

plt.figure(figsize=(8, 6), dpi=100)

# One ROC line per classifier; the legend entry carries its AUC score
for line_color, (clf_name, (y_pred, fpr, tpr)) in zip(colors, classifiers.items()):
    auc_score = roc_auc_score(y_test, y_pred)
    legend_label = f'{clf_name} (AUC = {auc_score:.3f})'
    sns.lineplot(x=fpr, y=tpr, marker='.', label=legend_label, color=line_color)

# Axis labels, title, grid and legend on the current axes
roc_ax = plt.gca()
roc_ax.set(
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR)',
    title='ROC Curves for Classifiers',
)
roc_ax.grid(True)
roc_ax.legend()
plt.show()

In [None]:
# Names of every entry in the metrics dictionaries. Note this list still
# contains 'Classification Report'; it is filtered out when values are taken.
metric_names = list(svm_metrics.keys())


def _scalar_values(model_metrics):
    """Return the metric values in ``metric_names`` order, skipping the nested classification report."""
    return [model_metrics[name] for name in metric_names if name != 'Classification Report']


svm_values = _scalar_values(svm_metrics)
rf_values = _scalar_values(rf_metrics)
dt_values = _scalar_values(dt_metrics)
nb_values = _scalar_values(nb_metrics)
xgb_values = _scalar_values(xgb_metrics)

# Sanity check: every model must contribute one value per scalar metric
length = len(metric_names) - 1
length_svm, length_rf, length_dt, length_nb, length_xgb = map(
    len, (svm_values, rf_values, dt_values, nb_values, xgb_values)
)

print(length, length_svm, length_rf, length_dt, length_nb, length_xgb)
print()

## Bar Chart Comparison of Metrics

In [None]:
# Grouped bar chart: one group per scalar metric, one bar per model.
# metric_names still includes 'Classification Report' (a nested dict, not a
# score), so the plotted names are filtered here explicitly.
plot_names = [name for name in metric_names if name != 'Classification Report']
model_series = [
    ('SVM', svm_values, 'limegreen'),
    ('Random Forest', rf_values, 'gold'),
    ('Decision Tree', dt_values, 'tomato'),
    ('Naive Bayes', nb_values, 'deepskyblue'),
    ('XGBoost', xgb_values, 'mediumorchid'),
]

x = np.arange(len(plot_names))
# Each group occupies 80% of its unit-wide slot. With the previous fixed
# width of 0.2, five bars filled the slot completely and neighbouring metric
# groups touched each other.
width = 0.8 / len(model_series)

plt.figure(figsize=(10, 6))

# Plot bars for each model, centred around the metric's tick position
for position, (model_label, values, bar_color) in enumerate(model_series):
    offset = (position - (len(model_series) - 1) / 2) * width
    plt.bar(x + offset, values, width, label=model_label, color=bar_color)

plt.xticks(x, plot_names, rotation=45, ha='right')
plt.ylabel('Metric Score')
plt.title('Comparison of Model Metrics')
plt.legend()

# Show the plot
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


# **|| END OF NOTEBOOK ||**