In [5]:
#!pip install --user --upgrade pip

In [6]:
#!pip install mlflow --user

In [4]:
#! pip install jinja2==3.0.3

In [1]:
import os
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [2]:
import mlflow
import logging
from urllib.parse import urlparse
from markupsafe import escape

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# loading the data sets

In [3]:
#reading the data sets
train = pd.read_csv("data/train.csv")
validation = pd.read_csv("data/validation.csv")
test = pd.read_csv("data/test.csv")

In [4]:
y_train,X_train = train["target"],train["text"]
y_validation,X_validation = validation["target"],validation ["text"]
y_test,X_test = test["target"],test["text"]

# Data modelling 

# Multinomial NB

In [5]:
# fitting and transforming X_train using a tfid vectorizer, ignoring terms with a document frequency lower than 3.
vect = TfidfVectorizer(min_df=3).fit(X_train)
X_train_vectorized = vect.transform(X_train)

In [8]:
mlflow.sklearn.autolog()

clf = sklearn.naive_bayes.MultinomialNB()
clf.fit(X_train_vectorized, y_train)

y_pred = clf.predict(vect.transform(X_test))
aucscore = roc_auc_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

with mlflow.start_run(run_name="Multinomial Naive Bayes"):

    mlflow.sklearn.log_model(clf, "model")

    print("\nMultinomial Naive Bayes")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
             columns=['Predicted Spam', "Predicted Ham"], index=['Actual Spam', 'Actual Ham']))
    print(f'\nTrue Positives: {tp}')
    print(f'False Positives: {fp}')
    print(f'True Negatives: {tn}')
    print(f'False Negatives: {fn}')

    print(f'True Positive Rate: { (tp / (tp + fn))}')
    print(f'Specificity: { (tn / (tn + fp))}')
    print(f'False Positive Rate: { (fp / (fp + tn))}')
    print(f"Auc Score: {aucscore} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multinomial-nb-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="multinomial-nb-model")
    else:
      mlflow.sklearn.log_model(clf, "model")

2023/02/28 01:48:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '28168eb9a7e544329f750afe37ce49ea', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Multinomial Naive Bayes
             Predicted Spam  Predicted Ham
Actual Spam             731              0
Actual Ham               20             85

True Positives: 85
False Positives: 0
True Negatives: 731
False Negatives: 20
True Positive Rate: 0.8095238095238095
Specificity: 1.0
False Positive Rate: 0.0
Auc Score: 0.9047619047619048 




Registered model 'multinomial-nb-model' already exists. Creating a new version of this model...
2023/02/28 01:48:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: multinomial-nb-model, version 4
Created version '4' of model 'multinomial-nb-model'.


In [9]:
mlflow.tracking.MlflowClient().get_model_version("multinomial-nb-model","1")

<ModelVersion: creation_timestamp=1677519465514, current_stage='None', description=None, last_updated_timestamp=1677519465514, name='multinomial-nb-model', run_id='13c8ed0e7a75496cbe29286d74e45e01', run_link=None, source='file:///C:/Users/hp/Downloads/mlruns/0/13c8ed0e7a75496cbe29286d74e45e01/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

# Support Vector Classifier Model 

In [10]:
#defining an additional function
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [11]:
#fit and transfor x_train and X_test
vectorizer = TfidfVectorizer(min_df=5)

X_train_transformed = vectorizer.fit_transform(X_train)
X_train_transformed_with_length = add_feature(X_train_transformed, X_train.str.len())

X_test_transformed = vectorizer.transform(X_test)
X_test_transformed_with_length = add_feature(X_test_transformed, X_test.str.len())

In [12]:
mlflow.sklearn.autolog()

# SVM creation
clf = SVC(C=10000)
clf.fit(X_train_transformed_with_length, y_train)

y_pred = clf.predict(X_test_transformed_with_length)
aucscore = roc_auc_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

with mlflow.start_run(run_name="Support Vector Classifier"):

    mlflow.sklearn.log_model(clf, "model")

    print("\nSupport Vector Classifier")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
             columns=['Predicted Spam', "Predicted Ham"], index=['Actual Spam', 'Actual Ham']))
    print(f'\nTrue Positives: {tp}')
    print(f'False Positives: {fp}')
    print(f'True Negatives: {tn}')
    print(f'False Negatives: {fn}')

    print(f'True Positive Rate: { (tp / (tp + fn))}')
    print(f'Specificity: { (tn / (tn + fp))}')
    print(f'False Positive Rate: { (fp / (fp + tn))}')
    print(f"Auc Score: {aucscore} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="support-vector-classifier"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="support-vector-classifie")
    else:
      mlflow.sklearn.log_model(clf, "model")

2023/02/28 01:48:57 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e13c4dc2a1104362874091db09a8dd60', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Support Vector Classifier
             Predicted Spam  Predicted Ham
Actual Spam             729              2
Actual Ham                6             99

True Positives: 99
False Positives: 2
True Negatives: 729
False Negatives: 6
True Positive Rate: 0.9428571428571428
Specificity: 0.9972640218878249
False Positive Rate: 0.0027359781121751026
Auc Score: 0.9700605823724839 




Registered model 'support-vector-classifier' already exists. Creating a new version of this model...
2023/02/28 01:49:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: support-vector-classifier, version 3
Created version '3' of model 'support-vector-classifier'.


In [13]:
mlflow.tracking.MlflowClient().get_model_version("support-vector-classifier","1")

<ModelVersion: creation_timestamp=1677520302888, current_stage='None', description=None, last_updated_timestamp=1677520302888, name='support-vector-classifier', run_id='a844c8b0b173495fa819918b25b8e351', run_link=None, source='file:///C:/Users/hp/Downloads/mlruns/0/a844c8b0b173495fa819918b25b8e351/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

# Logistic Regression Model with ngrams

In [14]:
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(min_df=5, ngram_range=[1,3])

X_train_transformed = vectorizer.fit_transform(X_train)
X_train_transformed_with_length = add_feature(X_train_transformed, [X_train.str.len(),
                                                                    X_train.apply(lambda x: len(''.join([a for a in x if a.isdigit()])))])

X_test_transformed = vectorizer.transform(X_test)
X_test_transformed_with_length = add_feature(X_test_transformed, [X_test.str.len(),
                                                                  X_test.apply(lambda x: len(''.join([a for a in x if a.isdigit()])))])

In [15]:
mlflow.sklearn.autolog()

clf = LogisticRegression(C=100)
clf.fit(X_train_transformed_with_length, y_train)

y_pred = clf.predict(X_test_transformed_with_length)
aucscore = roc_auc_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

with mlflow.start_run(run_name="Logistic Regression"):

    mlflow.sklearn.log_model(clf, "model")

    print("\nLogistic Regression")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
             columns=['Predicted Spam', "Predicted Ham"], index=['Actual Spam', 'Actual Ham']))
    print(f'\nTrue Positives: {tp}')
    print(f'False Positives: {fp}')
    print(f'True Negatives: {tn}')
    print(f'False Negatives: {fn}')

    print(f'True Positive Rate: { (tp / (tp + fn))}')
    print(f'Specificity: { (tn / (tn + fp))}')
    print(f'False Positive Rate: { (fp / (fp + tn))}')
    print(f"Auc Score: {aucscore} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="logistic-regression-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="logistic-regression-model")
    else:
      mlflow.sklearn.log_model(clf, "model")

2023/02/28 01:49:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a9cd5624eb2a47e189dc5cf17af63bb6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression
             Predicted Spam  Predicted Ham
Actual Spam             728              3
Actual Ham                7             98

True Positives: 98
False Positives: 3
True Negatives: 728
False Negatives: 7
True Positive Rate: 0.9333333333333333
Specificity: 0.9958960328317373
False Positive Rate: 0.004103967168262654
Auc Score: 0.9646146830825354 




Registered model 'logistic-regression-model' already exists. Creating a new version of this model...
2023/02/28 01:49:48 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: logistic-regression-model, version 4
Created version '4' of model 'logistic-regression-model'.


In [16]:
mlflow.tracking.MlflowClient().get_model_version("logistic-regression-model","1")

<ModelVersion: creation_timestamp=1677520573207, current_stage='None', description=None, last_updated_timestamp=1677520573207, name='logistic-regression-model', run_id='4c1591cd393c4f5bae059c13c2cce610', run_link=None, source='file:///C:/Users/hp/Downloads/mlruns/0/4c1591cd393c4f5bae059c13c2cce610/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

## The resultant best performing model was the SVC Model, although it should be noted that all 3 models performed reasonably well at detecting spam messages (all AUC > 0.9).