# SMS spam classification [sklearn]
Binary classification of SMS messages as spam or not spam.
Reference: <https://www.kaggle.com/code/faressayah/natural-language-processing-nlp-for-beginners>

## Install necessary packages

In [None]:
# Install NLTK (used for its stop-word corpus) and Giskard (used to wrap and scan the models).
!pip install nltk giskard

## Import libraries

In [None]:
import os
from typing import Iterable

import nltk
import string
import sklearn
import pandas as pd
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

import giskard
from giskard import wrap_dataset, wrap_model

## Download NLTK corpus

In [None]:
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads during preprocessing.
nltk.download("stopwords")

## Notebook-level settings

In [None]:
# Ask scikit-learn transformers to emit pandas containers where they support it.
# Necessary for the custom preprocessing function transformers, which select the
# text column by name.
sklearn.set_config(transform_output="pandas")

## Define constants

In [None]:
# Constants: the column names assigned to the loaded dataframe.
TARGET_NAME = "label"
TEXT_COLUMN_NAME = "message"

# Paths.
# NOTE: despite its name, DATA_DIRECTORY is the path of the CSV file itself
# (it is passed directly to `pd.read_csv`), not of the containing directory.
DATA_DIRECTORY = os.path.join(".", "datasets", "sms_spam_classification_dataset", "spam.csv")

## Load and initially preprocess data

In [None]:
def load_data() -> pd.DataFrame:
    """Read the SMS dataset from ``DATA_DIRECTORY`` into a two-column dataframe.

    Columns containing at least one missing value are discarded, and the
    remaining columns are renamed to ``TARGET_NAME`` and ``TEXT_COLUMN_NAME``
    (in that order).
    """
    raw = pd.read_csv(DATA_DIRECTORY, encoding='latin-1')
    # Drop whole columns (axis=1) that contain any missing value.
    complete_columns = raw.dropna(axis=1, how="any")
    complete_columns.columns = [TARGET_NAME, TEXT_COLUMN_NAME]
    return complete_columns

# Load the dataset and preview its first rows.
messages_df = load_data()
messages_df.head()

In [None]:
def preprocess_label(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the target column integer-encoded.

    The string labels in the ``TARGET_NAME`` column are replaced by the integer
    codes produced by ``LabelEncoder`` (classes are numbered in sorted order).

    Args:
        df: Dataframe containing a ``TARGET_NAME`` column.

    Returns:
        A new dataframe with the encoded target; the caller's dataframe is
        left untouched.
    """
    # Work on a copy so the function has no side effects on its argument.
    encoded = df.copy()
    # Index by the shared constant rather than the hard-coded `.label` attribute,
    # so the function keeps working if the target column is renamed.
    encoded[TARGET_NAME] = LabelEncoder().fit_transform(encoded[TARGET_NAME])
    return encoded

# Encode the labels as integers and preview the result.
messages_df = preprocess_label(messages_df)
messages_df.head()

## Train-test split

In [None]:
# Split into train and test parts. The double brackets keep the features as a
# one-column DataFrame, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(messages_df[[TEXT_COLUMN_NAME]], messages_df[TARGET_NAME], random_state=1)

## Define preprocessing pipeline

In [None]:
def remove_punctuation(df: pd.DataFrame) -> pd.DataFrame:
    """Remove punctuation from the text column.

    Every character listed in ``string.punctuation`` is deleted from each entry
    of the ``TEXT_COLUMN_NAME`` column.

    Args:
        df: Dataframe containing a ``TEXT_COLUMN_NAME`` column of strings.

    Returns:
        A new dataframe with the cleaned text; the input is not modified.
    """
    # Build the deletion table once instead of once per row.
    deletion_table = str.maketrans('', '', string.punctuation)

    # Copy first: this function runs as a pipeline step, and writing into the
    # dataframe handed to fit/predict would silently alter the caller's data.
    cleaned = df.copy()
    cleaned[TEXT_COLUMN_NAME] = cleaned[TEXT_COLUMN_NAME].apply(
        lambda text: text.translate(deletion_table)
    )
    return cleaned

# Expose the function as a scikit-learn transformer so it can be used as a Pipeline step.
remove_punctuation_transformer = FunctionTransformer(remove_punctuation)

In [None]:
def remove_stop_words(df: pd.DataFrame) -> pd.DataFrame:
    """Remove stop words from the text column.

    Each entry of the ``TEXT_COLUMN_NAME`` column is split on whitespace; words
    whose lower-cased form is an NLTK English stop word or one of the extra
    SMS-style tokens below are dropped, and the rest are re-joined with single
    spaces.

    Args:
        df: Dataframe containing a ``TEXT_COLUMN_NAME`` column of strings.

    Returns:
        A new dataframe with the filtered text; the input is not modified.
    """
    # A set gives constant-time membership tests; the original list was scanned
    # linearly for every single word.
    stop_words = frozenset(stopwords.words('english')) | {
        'u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure',
    }

    def _strip(text: str) -> str:
        return ' '.join(word for word in text.split() if word.lower() not in stop_words)

    # Copy first: this function runs as a pipeline step, and writing into the
    # dataframe handed to fit/predict would silently alter the caller's data.
    filtered = df.copy()
    filtered[TEXT_COLUMN_NAME] = filtered[TEXT_COLUMN_NAME].apply(_strip)
    return filtered

# Expose the function as a scikit-learn transformer so it can be used as a Pipeline step.
remove_stop_words_transformer = FunctionTransformer(remove_stop_words)

In [None]:
def adapt_vectorizer_input(df: pd.DataFrame) -> Iterable:
    """Extract the first column of ``df`` as a one-dimensional Series.

    Vectorizers expect a one-dimensional iterable of documents (such as a
    Series) rather than a DataFrame, so the text dataframe has to be reduced
    to a single dimension before it reaches them.
    Issue reference: https://stackoverflow.com/questions/50665240/valueerror-found-input-variables-with-inconsistent-numbers-of-samples-1-3185
    """
    # Positional selection: all rows, first column, whatever its name is.
    return df.iloc[:, 0]

# Expose the function as a scikit-learn transformer so it can be used as a Pipeline step.
adapt_vectorizer_input_transformer = FunctionTransformer(adapt_vectorizer_input)

## Define Naive Bayes model pipeline

In [None]:
# Define data preprocessor pipeline.
# Steps run in order: text cleaning on the DataFrame, conversion to a Series,
# bag-of-words counts, then TF-IDF weighting of those counts.
preprocessor = Pipeline(steps=[
    ("punctuation_remover", remove_punctuation_transformer),
    ("stop_words_remover", remove_stop_words_transformer),
    ("text_vectorizer_adapter", adapt_vectorizer_input_transformer),
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
])

# Define general pipeline with data preprocessing and model.
pipeline_naive_bayes = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', MultinomialNB())
])

# Fit model.
pipeline_naive_bayes.fit(X_train, y_train)
# Keep column 1 of the predicted probabilities, i.e. the second class
# (presumably spam - confirm against the label encoding).
y_pred_prob = pipeline_naive_bayes.predict_proba(X_test)[:, 1]

# Get test metric.
metric = metrics.roc_auc_score(y_test, y_pred_prob)
print(f"Test ROC-AUC score: {metric}")

## Define Logistic Regression model pipeline

In [None]:
# Define general pipeline with data preprocessing and model.
# The preprocessor is cloned so this pipeline owns an independent, unfitted copy:
# reusing the very same object would refit the preprocessing steps of every
# other pipeline that holds it whenever this pipeline is fitted.
pipeline_logistic_regression = Pipeline(steps=[
    ('preprocessing', sklearn.clone(preprocessor)),
    ("logistic_regression", LogisticRegression())
])

# Fit model.
pipeline_logistic_regression.fit(X_train, y_train)
# Keep column 1 of the predicted probabilities, i.e. the second class
# (presumably spam - confirm against the label encoding).
y_pred_prob = pipeline_logistic_regression.predict_proba(X_test)[:, 1]

# Get test metric.
metric = metrics.roc_auc_score(y_test, y_pred_prob)
print(f"Test ROC-AUC score: {metric}")

## Wrap data and models

In [None]:
# Wrap test dataset.
# Column names come from the shared constants so they cannot drift from the
# names assigned when the data was loaded.
raw_dataset = pd.concat([X_test, y_test], axis=1)
wrapped_dataset = wrap_dataset(raw_dataset,
                               name="sms_spam",
                               target=TARGET_NAME,
                               column_types={TEXT_COLUMN_NAME: "text"})

In [None]:
# Wrap Naive-Bayes model.
# The whole fitted pipeline is wrapped, so the wrapper takes the text column
# as its only feature and does the preprocessing itself.
wrapped_model_naive_bayes = wrap_model(pipeline_naive_bayes,
                                       model_type="classification",
                                       name="spam_classifier_naive_bayes",
                                       feature_names=X_test.columns,
                                       classification_threshold=0.5)
# Run the wrapped model on the wrapped dataset (presumably a smoke test of the wrapper).
wrapped_model_naive_bayes.predict(wrapped_dataset)

In [None]:
# Wrap Logistic Regression model.
# The whole fitted pipeline is wrapped, so the wrapper takes the text column
# as its only feature and does the preprocessing itself.
wrapped_model_logistic_regression = wrap_model(pipeline_logistic_regression,
                                               model_type="classification",
                                               name="spam_classifier_logistic_regression",
                                               feature_names=X_test.columns,
                                               classification_threshold=0.5)
# Run the wrapped model on the wrapped dataset (presumably a smoke test of the wrapper).
wrapped_model_logistic_regression.predict(wrapped_dataset)

## Scan models

In [None]:
# Scan Naive Bayes model.
# The scan runs the wrapped model against the wrapped test dataset; the result is kept for display.
naive_bayes_scan = giskard.scan(wrapped_model_naive_bayes, wrapped_dataset)

In [None]:
# Render the Naive Bayes scan result in the notebook output.
display(naive_bayes_scan)

In [None]:
# Scan Logistic regression model.
# The scan runs the wrapped model against the wrapped test dataset; the result is kept for display.
logistic_regression_scan = giskard.scan(wrapped_model_logistic_regression, wrapped_dataset)

In [None]:
# Render the Logistic Regression scan result in the notebook output.
display(logistic_regression_scan)