In [1]:
from pathlib import Path
import string
from nltk.corpus import stopwords
import xgboost as xgb
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                             confusion_matrix, accuracy_score, f1_score)

In [6]:
# Stopwords
stop = set(stopwords.words())

In [92]:
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))

In [33]:
def calc_metrics(y_test, pred, proba=None, labels=["ham", "spam"], print_=True):
    output = {}
    if proba is not None:
        roc_auc = roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = recall_score(y_test, pred)
    output["Precision"] = precision_score(y_test, pred)
    output["F1"] = f1_score(y_test, pred)
    output["accuracy"] = accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + el for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(confusion_matrix(y_test, pred), 
                                         columns=columns, index=index)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
    return output

In [766]:
class Preprocessor(object):
    """
    Preprocess input document
    """
    def __init__(self, stopwords=stop, remove_stop=True, min_length=0):
        self.stopwords = stopwords
        self.remove_stop = remove_stop
        self.min_length = min_length

    def __call__(self, doc):
        if not isinstance(doc, (str, bytes)):
            doc = str(doc)
        if self.remove_stop:
            tokens = [el.strip(' \n\t\r').lower() for el in doc.split() if el.strip().lower() not in self.stopwords]
        else:
            tokens = [el.strip(' \n\t\r').lower() for el in doc.split()]
        return ' '.join(filter(lambda x: len(x)>=self.min_length, tokens))

#### Read the data

In [3]:
filename = "SMSSpamCollection.txt"
df = pd.read_csv(Path() / "data" / filename, sep="\t", header=None, names=["label", "sms"])
mapping = {"ham": 0,
           "spam": 1}
df["label"] = df["label"].map(mapping)
df["label"].value_counts(normalize=True)

0    0.865937
1    0.134063
Name: label, dtype: float64

#### Train - test split

In [4]:
X = df["sms"]
y = df["label"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4457, Num. of test: 1115


#### Let's implement NB

In [10]:
def calc_likelihood(data, vectorizer, y):
    tfidf =  vectorizer.fit_transform(data).toarray()
    prob_spam = tfidf[y > 0].sum(axis=0) / np.sum(tfidf)
    prob_ham = tfidf[y < 1].sum(axis=0) / np.sum(tfidf)
    return np.vstack((prob_ham, prob_spam)).T

In [14]:
def predict(X_test, vectorizer, priors, likelihood):
    tf = vectorizer.transform(X_test).toarray() 
    prob = tf[:, :, np.newaxis] * likelihood[np.newaxis, :, :]
    prob[prob==0] = 1
    score = np.exp(np.log(prob).sum(axis=1)) * priors
    score[score.max(axis=1)==0] = eps
    return score / score.sum(axis=1)[:, np.newaxis]

In [24]:
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": stop,
             "ngram_range": (3, 3),
             "min_df": 1,
             "max_df": 1.0,
             "preprocessor": None,#Preprocessor(),
             "max_features": 3500,
             "norm": None,
             "use_idf": True
             }

In [25]:
eps = 1e-15
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)

In [26]:
likelihood = calc_likelihood(X_train, vectorizer, y_train)
likelihood[likelihood==0] = eps
likelihood[likelihood==1] = 1 - eps
assert np.amax(np.fabs(np.sum(likelihood, axis=1) - train.toarray().sum(axis=0) / np.sum(train))) <= 1e-5
pred_probs = predict(X_test, vectorizer, priors, likelihood)
proba = pred_probs[:, 1]
pred = np.zeros_like(proba)
pred[proba>=0.5] = 1
metrics = calc_metrics(y_test, pred, proba, labels=["ham", "spam"])

AUC: 0.940
Recall: 0.839
Precision: 0.806
F1: 0.822
accuracy: 0.952
      pred_ham  pred_spam
ham        936         30
spam        24        125


#### Fit Scikit learn NB

In [30]:
clf = MultinomialNB(alpha=2.5, class_prior=[0.5, 0.5])
clf.fit(train.toarray(), y_train)
pred = clf.predict(test.toarray())
proba = clf.predict_proba(test.toarray())[:, 1]
metrics = calc_metrics(y_test, pred, proba, labels=["ham", "spam"])

AUC: 0.984
Recall: 0.953
Precision: 0.928
F1: 0.940
accuracy: 0.984
      pred_ham  pred_spam
ham        955         11
spam         7        142


#### Fit XGBoost

In [846]:
params = {}
params['scale_pos_weight'] = sum(y_train==0) / sum(y_train==1)
params['learning_rate'] = 0.1
params['n_estimators'] = 1000
params['max_depth'] = 5
params['min_child_weight'] = 100
params['gamma'] = 0
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8
params['objective'] = 'binary:logistic'
params['seed'] = 27
params['n_jobs'] = -1
params["eval_metric"] = ["error","auc"]
params["early_stopping_rounds"] = 50

In [847]:
dtrain = xgb.DMatrix(train, y_train)
dtest = xgb.DMatrix(test, y_test)
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [848]:
model = xgb.train(dtrain=dtrain, num_boost_round=params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-error:0.189141	train-auc:0.918843	eval-error:0.202691	eval-auc:0.904508
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
[50]	train-error:0.056316	train-auc:0.982994	eval-error:0.063677	eval-auc:0.973439
[100]	train-error:0.049585	train-auc:0.984981	eval-error:0.061883	eval-auc:0.972814
Stopping. Best iteration:
[64]	train-error:0.05295	train-auc:0.983661	eval-error:0.061883	eval-auc:0.974009



In [849]:
pred_proba_xgb = model.predict(dtest)
pred_xgb = np.zeros_like(pred_proba_xgb)
pred_xgb[pred_proba_xgb>=0.5] = 1
xgb_metrics = calc_metrics(y_test, pred_xgb, pred_proba_xgb)

AUC: 0.973
Recall: 0.886
Precision: 0.725
F1: 0.798
accuracy: 0.940
      pred_ham  pred_spam
ham        916         50
spam        17        132
