### Importing necessary libraries

In [1]:
import nltk
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn import metrics
import random
random.seed(42)
from urllib.parse import urlparse
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
from sklearn.metrics import auc, precision_recall_curve

%matplotlib inline
import matplotlib.pyplot as plt

nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading the datasets

In [2]:
train = pd.read_csv("train.csv")
val = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")

### Preparing the attributes and labels for training

In [3]:
X_train, y_train = train["processed_text"], train["spam"]
X_val, y_val = val["processed_text"], val["spam"]
X_test, y_test = test["processed_text"], test["spam"]

### Some predefined functions

In [4]:
def accuracy(predictions, y_test):
    predictions = np.array(predictions)
    y_test = np.array(y_test)
    TP = np.sum((predictions == 1)[y_test==1])
    TN = np.sum((predictions == 0)[y_test==0])
    FP = np.sum((predictions == 1)[y_test == 0])
    FN = np.sum((predictions == 0)[y_test == 1])
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    return accuracy

def precision(predictions, y_test):
    predictions = np.array(predictions)
    y_test = np.array(y_test)
    TP = np.sum((predictions == 1)[y_test==1])
    FP = np.sum((predictions == 1)[y_test == 0])
    precision = TP/(TP + FP)
    return precision

def recall(predictions, y_test):
    predictions = np.array(predictions)
    y_test = np.array(y_test)
    TP = np.sum((predictions == 1)[y_test==1])
    FN = np.sum((predictions == 0)[y_test == 1])
    recall = TP/(TP+FN)
    return recall

def f1_score(predictions, y_test):
    predictions = np.array(predictions)
    y_test = np.array(y_test)
    TP = np.sum((predictions == 1)[y_test==1])
    FP = np.sum((predictions == 1)[y_test == 0])
    FN = np.sum((predictions == 0)[y_test==1])
    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    if precision == 0 and recall == 0:
        return 0
    f1 = 2 * precision * recall/(precision + recall)
    return f1

def AUCPR(predictions, y_test):
    precision, recall, _ = precision_recall_curve(y_test, predictions)
    return auc(recall, precision)

### Training the 3 benchmark models

In [5]:
# Naive Bayes
print("Naive Bayes\n\n")
pipeline_nb = make_pipeline(CountVectorizer(), MultinomialNB(alpha = 0.1))
pipeline_nb.fit(X_train, y_train)

# Evaluate the model on validation set
predictions = pipeline_nb.predict(X_val)
print("On validation Dataset:", end = "\n")
print("Accuracy : " + str(round(accuracy(predictions, y_val)*100, 2)) + "%")
print("Precision : " + str(round(precision(predictions, y_val)*100, 2)) + "%")
print("Recall : " + str(round(recall(predictions, y_val)*100, 2)) + "%")
print("f1 score : " + str(round(f1_score(predictions, y_val)*100, 2)) + "%")
print("AUCPR : " + str(round(AUCPR(predictions, y_val)*100, 2)) + "%")
print("\n\n\n\n")



# Logistic Regression
print("Logistic Regression\n\n")
pipeline_lr = make_pipeline(CountVectorizer(), LogisticRegression(random_state = 42))
pipeline_lr.fit(X_train, y_train)

# Evaluate the model on validation set
predictions = pipeline_lr.predict(X_val)
print("On validation Dataset:", end = "\n")
print("Accuracy : " + str(round(accuracy(predictions, y_val)*100, 2)) + "%")
print("Precision : " + str(round(precision(predictions, y_val)*100, 2)) + "%")
print("Recall : " + str(round(recall(predictions, y_val)*100, 2)) + "%")
print("f1 score : " + str(round(f1_score(predictions, y_val)*100, 2)) + "%")
print("AUCPR : " + str(round(AUCPR(predictions, y_val)*100, 2)) + "%")
print("\n\n\n\n")



# Random Forest Classifier
print("Random Forest Classifier\n\n")
pipeline_rf = make_pipeline(CountVectorizer(), RandomForestClassifier(random_state = 42, max_depth=60, n_jobs=-1))
pipeline_rf.fit(X_train, y_train)

# Make predictions on validation set
predictions = pipeline_rf.predict(X_val)
print("On validation Dataset:", end = "\n")
print("Accuracy : " + str(round(accuracy(predictions, y_val)*100, 2)) + "%")
print("Precision : " + str(round(precision(predictions, y_val)*100, 2)) + "%")
print("Recall : " + str(round(recall(predictions, y_val)*100, 2)) + "%")
print("f1 score : " + str(round(f1_score(predictions, y_val)*100, 2)) + "%")
print("AUCPR : " + str(round(AUCPR(predictions, y_val)*100, 2)) + "%")

Naive Bayes


On validation Dataset:
Accuracy : 98.72%
Precision : 98.1%
Recall : 96.73%
f1 score : 97.41%
AUCPR : 97.82%





Logistic Regression


On validation Dataset:
Accuracy : 98.49%
Precision : 97.63%
Recall : 96.26%
f1 score : 96.94%
AUCPR : 97.41%





Random Forest Classifier


On validation Dataset:
Accuracy : 94.99%
Precision : 100.0%
Recall : 79.91%
f1 score : 88.83%
AUCPR : 92.46%


### Logging models to mlflow

In [6]:
# Naive Bayes
with mlflow.start_run(run_name="Naive Bayes"):
    y_pred = pipeline_nb.predict(X_test)
    mlflow.log_param("model_name", "Naive Bayes")
    mlflow.log_metric("accuracy", accuracy(y_pred, y_test))
    mlflow.log_metric("precision", precision(y_pred, y_test))
    mlflow.log_metric("recall", recall(y_pred, y_test))
    mlflow.log_metric("f1 score", f1_score(y_pred, y_test))
    mlflow.log_metric("AUCPR", AUCPR(y_pred, y_test))
    mlflow.log_dict(np.array(confusion_matrix(y_test, y_pred)).tolist(), "confusion_matrix.json")
    mlflow.sklearn.log_model(pipeline_nb, "model")
    
    tracking_url_type = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=pipeline_nb,
        artifact_path="sklearn-model",
        registered_model_name="Naive Bayes model"
    )
    if tracking_url_type != "file":
        mlflow.sklearn.log_model(pipeline_nb, "model", registered_model_name="Naive Bayes")
    else:
        mlflow.sklearn.log_model(pipeline_nb, "model")


# Logistic Regression
with mlflow.start_run(run_name="Logistic Regression"):
    y_pred = pipeline_lr.predict(X_test)
    mlflow.log_param("model_name", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy(y_pred, y_test))
    mlflow.log_metric("precision", precision(y_pred, y_test))
    mlflow.log_metric("recall", recall(y_pred, y_test))
    mlflow.log_metric("f1 score", f1_score(y_pred, y_test))
    mlflow.log_metric("AUCPR", AUCPR(y_pred, y_test))
    mlflow.log_dict(np.array(confusion_matrix(y_test, y_pred)).tolist(), "confusion_matrix.json")
    mlflow.sklearn.log_model(pipeline_lr, "model")
    
    tracking_url_type = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=pipeline_nb,
        artifact_path="sklearn-model",
        registered_model_name="Logistic Regression model"
    )
    if tracking_url_type != "file":
        mlflow.sklearn.log_model(pipeline_lr, "model", registered_model_name="Logistic Regression")
    else:
        mlflow.sklearn.log_model(pipeline_lr, "model")


# Random Forest
with mlflow.start_run(run_name="Random Forest"):
    y_pred = pipeline_rf.predict(X_test)
    mlflow.log_param("model_name", "Random Forest")
    mlflow.log_metric("accuracy", accuracy(y_pred, y_test))
    mlflow.log_metric("precision", precision(y_pred, y_test))
    mlflow.log_metric("recall", recall(y_pred, y_test))
    mlflow.log_metric("f1 score", f1_score(y_pred, y_test))
    mlflow.log_metric("AUCPR", AUCPR(y_pred, y_test))
    mlflow.log_dict(np.array(confusion_matrix(y_test, y_pred)).tolist(), "confusion_matrix.json")
    mlflow.sklearn.log_model(pipeline_rf, "model")
    
    tracking_url_type = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=pipeline_rf,
        artifact_path="sklearn-model",
        registered_model_name="Random Forest model"
    )
    if tracking_url_type != "file":
        mlflow.sklearn.log_model(pipeline_rf, "model", registered_model_name="Random Forest")
    else:
        mlflow.sklearn.log_model(pipeline_rf, "model")

Successfully registered model 'Naive Bayes model'.
Created version '1' of model 'Naive Bayes model'.
Successfully registered model 'Logistic Regression model'.
Created version '1' of model 'Logistic Regression model'.
Successfully registered model 'Random Forest model'.
Created version '1' of model 'Random Forest model'.
