In [13]:
%load_ext autoreload

%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
from pathlib import Path
import string
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords
import xgboost as xgb
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, 
                             confusion_matrix, accuracy_score, f1_score)

In [36]:
%aimport src.config
from src.config import data_dir

In [6]:
# Stopwords
stop = set(stopwords.words())

In [4]:
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))

In [4]:
def calc_metrics(y_test, pred, proba=None, labels=["ham", "spam"], print_=True):
    output = {}
    if proba is not None:
        roc_auc = roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = recall_score(y_test, pred)
    output["Precision"] = precision_score(y_test, pred)
    output["F1"] = f1_score(y_test, pred)
    output["accuracy"] = accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + el for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(confusion_matrix(y_test, pred), 
                                         columns=columns, index=index)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
    return output

In [7]:
class Preprocessor(object):
    """
    Preprocess input document
    """
    def __init__(self, stopwords=stop, remove_stop=True, min_length=0):
        self.stopwords = stopwords
        self.remove_stop = remove_stop
        self.min_length = min_length

    def __call__(self, doc):
        if not isinstance(doc, (str, bytes)):
            doc = str(doc)
        if self.remove_stop:
            tokens = [el.strip(' \n\t\r').lower() for el in doc.split() if el.strip().lower() not in self.stopwords]
        else:
            tokens = [el.strip(' \n\t\r').lower() for el in doc.split()]
        return ' '.join(filter(lambda x: len(x)>=self.min_length, tokens))

#### Read the data

In [49]:
mapping = {"ham": 0,
           "spam": 1}

In [165]:
# SMS Collection
filename = "SMSSpamCollection.txt"
df_coll = pd.read_csv(data_dir / filename, sep="\t", header=None, names=["label", "text"])
df_coll["label"] = df_coll["label"].map(mapping)
df_coll["source"] = "coll"
df_coll["label"].value_counts(normalize=True)

0    0.865937
1    0.134063
Name: label, dtype: float64

In [166]:
# SMS from DIT - only spam
tree = ET.parse(data_dir / 'spam.xml')
root = tree.getroot()
data = []
for sms in root.iterfind("sms"):
    record = {child.tag:child.text for child in sms.getchildren()}
    data.append(record)
df_dit = pd.DataFrame(data).rename(columns={"class": "label"})
df_dit["label"] = df_dit["label"].map(lambda x: mapping.get(x, x))
df_dit["label"].value_counts()
# only spam

1    1353
Name: label, dtype: int64

In [167]:
df = pd.concat([df_coll, df_dit], axis=0)

In [138]:
# Clean up duplicates
s = df.text.map(lambda x: x.strip(" \t\n\r").lower())
duplicates = df.loc[s.duplicated(), "text"]
df.loc[df.text.isin(duplicates), ["source", "label", "text"]].sort_values(by="text").head(10)

Unnamed: 0,source,label,text
476,almeida,1,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3991,coll,1,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
572,almeida,1,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4903,coll,1,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
568,almeida,1,**FREE MESSAGE**Thanks for using the Auction S...
4863,coll,1,**FREE MESSAGE**Thanks for using the Auction S...
2124,coll,1,+123 Congratulations - in this week's competit...
69,almeida,1,+123 Congratulations - in this week's competit...
505,coll,1,+123 Congratulations - in this week's competit...
100,almeida,1,+449071512431 URGENT! This is the 2nd attempt ...


In [173]:
df_clean = df.loc[~s.duplicated()]
df_clean.label.value_counts(normalize=True)

0    0.768903
1    0.231097
Name: label, dtype: float64

#### Train - test split

In [174]:
X = df_clean["text"]
y = df_clean["label"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4697, Num. of test: 1175


#### Let's implement NB

In [39]:
def calc_likelihood(data, vectorizer, y):
    tfidf =  vectorizer.fit_transform(data).toarray()
    prob_spam = tfidf[y > 0].sum(axis=0)
    prob_ham = tfidf[y < 1].sum(axis=0)
    prob_spam /= np.sum(prob_spam)
    prob_ham /= np.sum(prob_ham)
    return np.vstack((prob_ham, prob_spam)).T

In [40]:
def predict(X_test, vectorizer, priors, likelihood):
    tf = vectorizer.transform(X_test).toarray() 
    prob = tf[:, :, np.newaxis] * likelihood[np.newaxis, :, :]
    prob[prob==0] = 1
    score = np.exp(np.log(prob).sum(axis=1)) * priors
    score[score.max(axis=1)==0] = eps
    return score / score.sum(axis=1)[:, np.newaxis]

In [189]:
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": stop,
             "ngram_range": (3, 3),
             "min_df": 1,
             "max_df": 1.0,
             "preprocessor": None,#Preprocessor(),
             "max_features": 3500,
             "norm": "l2"*0,
             "use_idf": 1
             }

In [190]:
eps = 1e-15
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)

In [192]:
likelihood = calc_likelihood(X_train, vectorizer, y_train)
likelihood[likelihood==0] = eps
likelihood[likelihood==1] = 1 - eps
#assert np.amax(np.fabs(np.sum(likelihood, axis=1) - train.toarray().sum(axis=0) / np.sum(train))) <= 1e-5
pred_probs = predict(X_test, vectorizer, priors, likelihood)
proba = pred_probs[:, 1]
pred = np.zeros_like(proba)
pred[proba>=0.5] = 1
metrics = calc_metrics(y_test, pred, proba, labels=["ham", "spam"])

AUC: 0.960
Recall: 0.890
Precision: 0.906
F1: 0.898
accuracy: 0.953
      pred_ham  pred_spam
ham        878         25
spam        30        242


#### Fit Scikit learn NB

In [193]:
clf = MultinomialNB(alpha=2.5, class_prior=[0.5, 0.5])
clf.fit(train.toarray(), y_train)
pred = clf.predict(test.toarray())
proba = clf.predict_proba(test.toarray())[:, 1]
metrics = calc_metrics(y_test, pred, proba, labels=["ham", "spam"])

AUC: 0.965
Recall: 0.908
Precision: 0.969
F1: 0.937
accuracy: 0.972
      pred_ham  pred_spam
ham        895          8
spam        25        247


#### Fit XGBoost

In [45]:
params = {}
params['scale_pos_weight'] = sum(y_train==0) / sum(y_train==1)
params['learning_rate'] = 0.1
params['n_estimators'] = 1000
params['max_depth'] = 5
params['min_child_weight'] = 100
params['gamma'] = 0
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8
params['objective'] = 'binary:logistic'
params['seed'] = 27
params['n_jobs'] = -1
params["eval_metric"] = ["error","auc"]
params["early_stopping_rounds"] = 50

In [154]:
dtrain = xgb.DMatrix(train, y_train)
dtest = xgb.DMatrix(test, y_test)
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [155]:
model = xgb.train(dtrain=dtrain, num_boost_round=params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-error:0.168831	train-auc:0.918823	eval-error:0.16766	eval-auc:0.924514
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
[50]	train-error:0.100277	train-auc:0.977128	eval-error:0.108085	eval-auc:0.972736
[100]	train-error:0.086864	train-auc:0.980495	eval-error:0.097021	eval-auc:0.973377
[150]	train-error:0.08729	train-auc:0.981587	eval-error:0.091915	eval-auc:0.973621
[200]	train-error:0.080264	train-auc:0.983037	eval-error:0.08766	eval-auc:0.973951
[250]	train-error:0.076858	train-auc:0.984178	eval-error:0.090213	eval-auc:0.974546
[300]	train-error:0.076432	train-auc:0.98498	eval-error:0.092766	eval-auc:0.973963
Stopping. Best iteration:
[257]	train-error:0.076006	train-auc:0.984216	eval-error:0.091064	eval-auc:0.974713



In [156]:
pred_proba_xgb = model.predict(dtest)
pred_xgb = np.zeros_like(pred_proba_xgb)
pred_xgb[pred_proba_xgb>=0.5] = 1
xgb_metrics = calc_metrics(y_test, pred_xgb, pred_proba_xgb)

AUC: 0.974
Recall: 0.926
Precision: 0.741
F1: 0.824
accuracy: 0.908
      pred_ham  pred_spam
ham        815         88
spam        20        252
