<h1 style="text-align: center; font-size: 50px;">🚫 Spam Detection and NLP (Natural Language Processing) with Python </h1>
This notebook shows how to detect spam using Natural Language Processing (NLP).

## Notebook Overview
- Imports
- Configurations
- Verify Assets
- Loading Data
- Exploratory Data Analysis
- Text Pre-processing
- Vectorization
- Training a model
- Model Evaluation
- Train Test Split
- Creating a Data Pipeline
- Logging Model to MLflow
- Fetching the Latest Model Version from MLflow
- Loading the Model and Running Inference

## Imports

In [None]:
# Install NLTK into the kernel's environment; the import cell below downloads and uses its stop-word corpus.
%pip install nltk

In [None]:
# ------------------------ System Utilities ------------------------
import warnings
import logging
from pathlib import Path

# ------------------------ Data Manipulation ------------------------
import pandas as pd

# ------------------------ Data Visualization ------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------ Text Preprocessing ------------------------
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# ------------------------ Machine Learning tools ------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# ------------------------ MLflow for Experiment Tracking and Model Management ------------------------
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

## Configurations

In [None]:
# Suppress Python warnings
warnings.filterwarnings("ignore")

In [None]:
# Create (or fetch) the notebook logger
logger = logging.getLogger("nlp_logger")
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s",
                              datefmt="%Y-%m-%d %H:%M:%S")

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise attach a second StreamHandler and print every record
# twice. Remove handlers left over from a previous execution first.
for existing_handler in list(logger.handlers):
    logger.removeHandler(existing_handler)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Do not hand records to ancestor loggers, so each one is emitted only by the handler above
logger.propagate = False

In [None]:
# ------------------------- Paths -------------------------
# Location of the CSV dataset read by the loading cells
DATA_PATH = "/home/jovyan/datafabric/tutorial/spam_utf8.csv"

# ------------------------ MLflow Integration ------------------------
# Experiment, run and registered-model names used by the MLflow cells
EXPERIMENT_NAME, RUN_NAME, MODEL_NAME = (
    "Spam_Detection_Experiment",
    "Spam_Detection_Run",
    "Spam_Detection_Model",
)

## Verify Assets

In [None]:
# Check whether the Dataset file exists
is_dataset_available = Path(DATA_PATH).exists()

# Log the configuration status of the dataset
if is_dataset_available:
    logger.info("The Dataset is properly configured.")
else:
    # The loading cells below read DATA_PATH, so a missing file is a problem
    # the reader must act on: report it as a warning, not as routine info.
    logger.warning(
        "The Dataset is not properly configured. Please create and download the required assets "
        "in your project on AI Studio."
    )

## Loading Data

In [None]:
# Read the raw file line by line for a first look at its contents.
# The context manager closes the file handle once the lines are collected, and
# the encoding is stated explicitly instead of depending on the platform default.
with open(DATA_PATH, encoding="utf-8") as data_file:
    messages = [line.rstrip() for line in data_file]
logger.info(len(messages))

In [None]:
# Preview the first three raw lines together with their position.
for message_no, message in enumerate(messages[:3]):
    # logging uses its first argument as a %-style format string and the rest
    # as its values, so both items must go through placeholders; passing
    # (message_no, message) directly triggers a formatting error inside logging.
    logger.info("%s %s", message_no, message)
    logger.info('\n')

In [None]:
# Parse the same file as a comma-separated table, naming the five columns explicitly
column_names = ["label", "message", "v3", "v4", "v5"]
messages = pd.read_csv(DATA_PATH, sep=',', names=column_names)
messages.head()

## Exploratory Data Analysis

In [None]:
# Summary statistics for the columns of the frame
messages.describe()

In [None]:
# The same summary, computed separately for each value of `label`
messages.groupby('label').describe()

In [None]:
# Add a `length` column holding len() of each entry in `message`
messages['length'] = messages['message'].apply(len)
messages.head()

In [None]:
# Display the frame, now including the `length` column
messages

### Data Visualization

In [None]:
# Histogram of message lengths using 50 bins
messages['length'].plot(bins=50, kind='hist');

In [None]:
# Summary statistics of the `length` column
messages.length.describe()

In [None]:
# One length histogram per value of `label`
messages.hist(column='length', by='label', bins=50,figsize=(12,4));

## Text Pre-processing

In [None]:
mess = 'Sample message! Notice: it has punctuation.'

# Delete every punctuation character in one pass using a translation table
strip_punctuation = str.maketrans('', '', string.punctuation)
nopunc = mess.translate(strip_punctuation)

In [None]:
stopwords.words('english')[0:10] # Show the first ten English stop words

In [None]:
# Split the punctuation-free sample into words on whitespace
nopunc.split()

In [None]:
# Now just remove any stopwords.
# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension re-evaluates it for every word being tested.
english_stopwords = set(stopwords.words('english'))
clean_mess = [word for word in nopunc.split() if word.lower() not in english_stopwords]

In [None]:
# Display the words that remain after stop-word removal
clean_mess

In [None]:
def text_process(mess):
    """
    Turn a raw message into a list of cleaned words.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text

    Args:
        mess: The message text to clean.

    Returns:
        list: The whitespace-separated words of the punctuation-free text whose
        lower-cased form is not an English stop word (original casing is kept).
    """
    # Drop punctuation characters and rebuild the string
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Evaluate the stop-word list once per call and keep it in a set; the
    # original re-evaluated stopwords.words('english') for every single word.
    english_stopwords = set(stopwords.words('english'))

    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in english_stopwords]

In [None]:
# Preview the frame before tokenizing
messages.head()

In [None]:
# Check that text_process works on the first five messages.
# The result of apply is only displayed; `messages` itself is not modified.
messages['message'].head(5).apply(text_process)

In [None]:
# Show original dataframe
messages.head()

## Vectorization

In [None]:
# Might take awhile...
# Learn the vocabulary from every message, tokenizing each one with text_process
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(messages['message'])

# Print total number of vocab words
vocabulary_size = len(bow_transformer.vocabulary_)
logger.info(vocabulary_size)

In [None]:
# Take the message stored under index label 3 as a worked example
message4 = messages['message'][3]
logger.info(message4)

In [None]:
# Vectorize the single example; it is wrapped in a list because transform takes a collection of documents
bow4 = bow_transformer.transform([message4])
logger.info(bow4)
logger.info(bow4.shape)

In [None]:
# Look up the vocabulary terms stored at two feature indices.
# NOTE(review): 4073 and 9570 are hard-coded; presumably copied from the bow4 output of an earlier run - confirm they still match.
logger.info(bow_transformer.get_feature_names_out()[4073])
logger.info(bow_transformer.get_feature_names_out()[9570])

In [None]:
# Vectorize the whole `message` column with the fitted vocabulary
messages_bow = bow_transformer.transform(messages['message'])

In [None]:
# logging formats its first argument with %-style placeholders; an extra
# argument without a matching placeholder triggers a formatting error instead
# of being printed, so each value gets an explicit %s.
logger.info('Shape of Sparse Matrix: %s', messages_bow.shape)
logger.info('Amount of Non-Zero occurences: %s', messages_bow.nnz)

In [None]:
# Percentage of cells in the bag-of-words matrix that hold a non-zero count
total_cells = messages_bow.shape[0] * messages_bow.shape[1]
sparsity = 100.0 * messages_bow.nnz / total_cells
logger.info('sparsity: {}'.format(round(sparsity)))

In [None]:
# Fit the TF-IDF transformer on the corpus counts, then apply it to the single example
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
logger.info(tfidf4)

In [None]:
# Fitted idf_ value for two vocabulary entries, located through the vectorizer's word-to-index mapping
logger.info(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
logger.info(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

In [None]:
# Apply the TF-IDF weighting to the whole corpus
messages_tfidf = tfidf_transformer.transform(messages_bow)
logger.info(messages_tfidf.shape)

## Training a model

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of all messages, using `label` as the target
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])

In [None]:
# Compare the prediction for the example message with its stored label.
# The values go through %s placeholders because logging uses its first argument
# as a format string; without a placeholder the call fails to format.
logger.info('predicted: %s', spam_detect_model.predict(tfidf4)[0])
logger.info('expected: %s', messages.label[3])

## Model Evaluation

In [None]:
# Predict on the same feature matrix the classifier was fitted on
all_predictions = spam_detect_model.predict(messages_tfidf)
logger.info(all_predictions)

In [None]:
# Classification report with the stored labels first and the predictions second.
# These scores are measured on the data used for fitting, not on held-out data.
logger.info(classification_report(messages['label'], all_predictions))

## Train Test Split

In [None]:
# Hold out 20% of the messages for testing. A fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel restarts.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'], messages['label'], test_size=0.2, random_state=42
)

# Pass the sizes through %-style placeholders: logging uses its first argument
# as the format string, so passing the integers directly fails to format.
logger.info("%s %s %s", len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

## Creating a Data Pipeline

In [None]:
# Chain vectorization, TF-IDF weighting and the classifier so raw message strings can be passed in directly
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [None]:
# Fit every step of the pipeline on the training split only
pipeline.fit(msg_train,label_train)

In [None]:
# Predict labels for the held-out test messages
predictions = pipeline.predict(msg_test)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# the predictions are treated as ground truth and precision/recall trade places.
logger.info(classification_report(label_test, predictions))

In [None]:
class SpamDetectionModel(mlflow.pyfunc.PythonModel):
    """
    MLflow pyfunc wrapper around a fitted spam-detection pipeline.

    The wrapped pipeline receives the raw strings of the ``text`` input column
    and returns one predicted label per row.
    """

    def __init__(self, pipeline):
        """
        Args:
            pipeline: Fitted pipeline whose ``predict`` accepts raw message strings.
        """
        self.pipeline = pipeline

    def preprocess(self, text):
        """
        Preprocesses the message, performing:
        1. Removal of all punctuation
        2. Removal of all stopwords
        3. Return of a list of the cleaned text

        Kept as a standalone helper; ``predict`` does not call it (see there).
        """
        # Drop punctuation characters and rebuild the string
        nopunc = ''.join(char for char in text if char not in string.punctuation)

        # Evaluate the stop-word list once per call instead of once per word
        english_stopwords = set(stopwords.words('english'))

        # Now just remove any stopwords
        return [word for word in nopunc.split() if word.lower() not in english_stopwords]

    def predict(self, context, model_input, params=None):
        """
        Computes the prediction of whether it is ham or spam.

        Args:
            context: MLflow model context (unused).
            model_input: Input with a ``text`` column of raw messages.
            params: Optional inference parameters (unused).

        Returns:
            The labels returned by the wrapped pipeline, one per input row.
        """
        # The pipeline built in this notebook vectorizes with
        # CountVectorizer(analyzer=text_process), i.e. it tokenizes raw strings
        # itself. Tokenizing here as well would hand that analyzer lists of
        # words instead of text, so the raw column is passed through unchanged.
        return self.pipeline.predict(model_input['text'])

    @classmethod
    def log_model(cls, model_name, fitted_pipeline=None):
        """
        Logs the model to MLflow with appropriate artifacts and schema.

        Args:
            model_name: Artifact path under the active run. The registration
                cell resolves the model through ``runs:/<run_id>/<model_name>``,
                so the artifact must be stored under this exact name.
            fitted_pipeline: Pipeline to wrap. When omitted, the notebook-level
                ``pipeline`` variable is used.
        """
        if fitted_pipeline is None:
            fitted_pipeline = pipeline  # notebook-level pipeline fitted in the cells above

        # Define input and output schema
        input_schema = Schema([
            ColSpec("string", "text"),
        ])
        output_schema = Schema([
            ColSpec("string"),
        ])

        # Define model signature
        signature = ModelSignature(inputs=input_schema, outputs=output_schema)

        # Log the model in MLflow.
        # nltk is listed because the pipeline's analyzer uses its stop-word corpus.
        # NOTE(review): the corpus data itself must also be present wherever the
        # model is loaded - confirm for the serving environment.
        mlflow.pyfunc.log_model(
            artifact_path=model_name,
            python_model=cls(fitted_pipeline),
            signature=signature,
            pip_requirements=["mlflow", "pandas", "scikit-learn", "numpy", "nltk"],
        )

## Logging Model to MLflow

In [None]:
logger.info(f'Starting the experiment: {EXPERIMENT_NAME}')

# Set the MLflow experiment name
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# Start an MLflow run
with mlflow.start_run(run_name=RUN_NAME) as run:
    # Report the artifact URI through the notebook logger. The module-level
    # logging.info() goes to the root logger, whose default WARNING level
    # discards INFO records, so the message was never shown.
    logger.info(f"Run's Artifact URI: {run.info.artifact_uri}")

    # Log the model to MLflow
    SpamDetectionModel.log_model(model_name=MODEL_NAME)

    # Register the logged model in MLflow Model Registry
    mlflow.register_model(
        model_uri=f"runs:/{run.info.run_id}/{MODEL_NAME}",
        name=MODEL_NAME
    )

logger.info(f'Registered the model: {MODEL_NAME}')

## Fetching the Latest Model Version from MLflow

In [None]:
# Initialize the MLflow client
client = MlflowClient()

# Retrieve the latest version of the registered spam-detection model (not yet in a specific stage)
model_metadata = client.get_latest_versions(MODEL_NAME, stages=["None"])

# Fail with a clear message instead of an IndexError when nothing has been registered yet
if not model_metadata:
    raise RuntimeError(
        f"No version of '{MODEL_NAME}' without a stage was found in the MLflow Model Registry; "
        "run the model logging cell first."
    )
latest_model_version = model_metadata[0].version  # Extract the latest model version

# Fetch model information, including its signature
model_info = mlflow.models.get_model_info(f"models:/{MODEL_NAME}/{latest_model_version}")

# Print the latest model version and its signature
print(f"Latest Model Version: {latest_model_version}")
print(f"Model Signature: {model_info.signature}")

## Loading the Model and Running Inference

In [None]:
# Load the registered model version resolved in the previous cell
model_uri = f"models:/{MODEL_NAME}/{latest_model_version}"
model = mlflow.pyfunc.load_model(model_uri=model_uri)

# Build a one-row input frame with the `text` column used for inference
text = pd.DataFrame({'text': ["You have won a free ticket!"]})

# Run inference and log the returned labels
result = model.predict(text)
logger.info(result)

In [None]:
# Final marker: reaching this line means every cell above ran without raising
logger.info('Notebook execution completed.')

Built with ❤️ using [**Z by HP AI Studio**](https://zdocs.datascience.hp.com/docs/aistudio/overview).