In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport src.config
%aimport src.helpers

In [1460]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np
import re
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from functools import partial
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier

In [1045]:
from src.config import data_dir
from src.helpers import calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class

#### Process raw SMS data

In [51]:
# Parse the raw SMS backup (XML) into a list of flat dict records.
filename = "karim-sms-allow.xml"
source = data_dir / filename
data = []
for event, elem in iterparse(source):
    # iterparse yields "end" events by default, so each element is complete
    # when it is seen here. Only <sms> nodes carry message data.
    if elem.tag != "sms":
        continue
    # Duplicate message bodies are not filtered out here.
    record = {
        "text": elem.attrib["body"],
        "contact_name": elem.attrib["contact_name"],
        "address": elem.attrib["address"],
        "timestamp": int(elem.attrib["date"]),
        "type": elem.attrib["type"],
    }
    data.append(record)
    # The values have been copied into `record`; release the element so the
    # in-memory tree does not keep every parsed message alive.
    elem.clear()

In [55]:
# Tabulate the parsed SMS records and write them to an Excel file
# (without the index column).
df = pd.DataFrame(data)
df.to_excel(data_dir / "karim-sms-allow.xlsx", index=False)

#### Read labeled data

In [6]:
# Load the labelled messages from the "total sms" sheet.
labeled_filename = "karim-sms-allow-labeled.xlsx"
labeled = pd.read_excel(data_dir / labeled_filename, sheet_name="total sms")
# The stored timestamp is divided by 1000 before conversion
# (presumably epoch milliseconds -- TODO confirm).
# NOTE(review): datetime.fromtimestamp yields naive local-time datetimes, so
# the result depends on the timezone of the machine running the notebook.
labeled["timestamp"] = (labeled["timestamp"] / 1000).map(datetime.fromtimestamp)
# resp = 0 tags rows that come from this labelled file.
labeled["resp"] = 0

In [7]:
# Numeric encoding of the textual labels: ham -> 0, spam -> 1.
mapp = {"ham": 0, "spam": 1}

In [8]:
# Load the survey responses and align their column names with `labeled`.
responses_filename = "SMS Data Collection (Responses).xlsx"
responses = (
    pd.read_excel(data_dir / responses_filename)
    .rename(columns={
        "SMS text": "text",
        "Is it a spam or ham?": "label",
        "Timestamp": "timestamp",
    })
)
# resp = 1 tags rows that come from the responses file.
responses["resp"] = 1
# Encode "ham"/"spam" via `mapp`; any other value is kept unchanged.
responses["label"] = responses["label"].map(lambda value: mapp.get(value, value))

In [21]:
# Stack both sources into one frame with a fresh 0..n-1 index and save a copy.
total = pd.concat([labeled, responses], ignore_index=True)
total.to_excel(data_dir / "sms-uk-total.xlsx")

In [22]:
# Check dimensionality and class imbalance
total.shape
# Share of each label, in percent.
total.label.value_counts(normalize=True).round(5)*100
# Number of rows whose text is missing.
total.text.isnull().sum()
# Keep only rows that have a text.
# NOTE(review): this rebinds `total`, so re-running the cell reports the
# already-filtered shape on the first line.
total = total.loc[total.text.notnull()]
total.shape

(3497, 8)

0    78.81
1    21.19
Name: label, dtype: float64

3

(3494, 8)

#### Train-test split

In [1938]:
# Replace phone-number-like runs (a digit or "(" followed by 8-15 digits,
# whitespace, parentheses or dashes, ending in a digit) with a placeholder.
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False,
# which would treat the pattern as a literal string and mask nothing.
total["text_rep"] = total["text"].str.replace(
    r"[\(\d][\d\s\(\)-]{8,15}\d", "PHONE_NUMBER", flags=re.I, regex=True
)

In [1939]:
# Features are the phone-masked texts; the target is the label column.
X = total["text_rep"]
y = total["label"]
# Fraction of the rows held out for testing.
test_size = 0.3
# Stratify on the label so both parts keep the same class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 2445, Num. of test: 1049


#### Build features

In [1950]:
def build_features(X_train, X_test, var="text", features=None, vectorizer=None):
    """Build dense train/test feature matrices from raw text series.

    Parameters
    ----------
    X_train, X_test : pandas.Series of str
        Training and test texts.
    var : str
        Unused; kept so existing calls that pass it keep working.
    features : sequence of str
        Feature groups to build, in output column order. Recognised names:
        "tfidf"  - dense output of `vectorizer` (fitted on X_train only);
        "length" - number of non-zero vectorizer terms per text when "tfidf"
                   is also requested, otherwise the character length;
        "patt"   - 1 if the text matches the keyword pattern, else 0;
        "phone"  - 1 if the text contains the PHONE_NUMBER placeholder, else 0.
        Unrecognised names are ignored.
    vectorizer : object with fit_transform/transform, optional
        Required when "tfidf" is requested.

    Returns
    -------
    (train, test) : tuple of numpy.ndarray
        Column-wise concatenation of the requested feature groups.

    Raises
    ------
    ValueError
        If `features` is empty/None, or "tfidf" is requested without a
        vectorizer.
    """
    if not features:
        raise ValueError("`features` must be a non-empty sequence of feature names")

    # Vectorise up front so that "length" can use the term matrix regardless
    # of where "tfidf" appears in `features` (previously "length" listed
    # before "tfidf" hit an unbound variable).
    use_tfidf = "tfidf" in features
    if use_tfidf:
        if vectorizer is None:
            raise ValueError("a `vectorizer` is required for the 'tfidf' feature")
        tf_train = vectorizer.fit_transform(X_train).toarray()
        tf_test = vectorizer.transform(X_test).toarray()

    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf":
            train, test = tf_train, tf_test
        elif feature == "length":
            if use_tfidf:
                train = (tf_train > 0).sum(axis=1)[:, np.newaxis]
                test = (tf_test > 0).sum(axis=1)[:, np.newaxis]
            else:
                train = X_train.map(len).values[:, np.newaxis]
                test = X_test.map(len).values[:, np.newaxis]
        elif feature == "patt":
            patt = "%|taxi|скидк|цін"
            train = (X_train.str.contains(patt, regex=True, flags=re.I)
                     .astype(int).values[:, np.newaxis])
            test = (X_test.str.contains(patt, regex=True, flags=re.I)
                    .astype(int).values[:, np.newaxis])
        elif feature == "phone":
            patt = "PHONE_NUMBER"
            # Emitted as 0/1 integers, consistent with the "patt" flag.
            train = X_train.map(lambda x: int(re.search(patt, x) is not None)).values[:, np.newaxis]
            test = X_test.map(lambda x: int(re.search(patt, x) is not None)).values[:, np.newaxis]
        else:
            continue
        f_train.append(train)
        f_test.append(test)
    return np.concatenate(f_train, axis=1), np.concatenate(f_test, axis=1)

In [1951]:
# Keyword arguments for TfidfVectorizer: character 4-grams inside word
# boundaries, capped at 3500 terms, with IDF weighting and normalisation
# switched off.
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": None,
             "ngram_range": (4, 4),
             "min_df": 0.0,
             "max_df": 1.0,
             "preprocessor": None,#Preprocessor(),
             "max_features": 3500,
             # No normalisation. Previously written as "l2"*0, i.e. an empty
             # string, which current scikit-learn rejects during validation.
             "norm": None,
             # A real bool: newer scikit-learn rejects the integer 0 here.
             "use_idf": False
             }

In [1130]:
# Candidate vocabulary with the `top` heaviest terms removed.
# NOTE(review): this cell relies on `tfidf_train` and on `features` holding
# the vectorizer's term names, both of which are produced by cells further
# down -- it cannot run on a fresh kernel in file order.
top = 100
# Total weight per term. The sum must run over documents (axis=0); summing
# over axis=1 gave per-document totals that were then misused as term indices.
r = tfidf_train.toarray().sum(axis=0)
topn_ids = np.argsort(r)[::-1][:top]
top_set = set(topn_ids.tolist())  # O(1) membership tests
voc = [f for i, f in enumerate(features) if i not in top_set]
# The pruned vocabulary is currently not applied.
tf_params["vocabulary"] = None#voc

In [1952]:
# Fit the vectorizer on the training texts only, then transform the test texts.
vectorizer = TfidfVectorizer(**tf_params)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
# Feature groups to assemble, in output column order.
# NOTE(review): the next cell rebinds `features` to the vectorizer's term
# names, so re-running this cell afterwards needs this list re-assigned first.
features = [
            "tfidf", 
            "length",
            "phone",
            "patt",
]
# build_features fits `vectorizer` again internally on X_train.
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer, var="text")

In [None]:
# Term names of the fitted vectorizer, used to label the per-class plots.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both, and keep `features` a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
dfs = top_feats_by_class(tfidf_train, y_train, features, min_tfidf=0.1, top_n=25)
plot_tfidf_classfeats_h(dfs)

#### Fit Naive Bayes

In general, it is much worse to misclassify a ham SMS as spam than to let
a spam SMS pass the filter. It is therefore desirable to be able to bias
the filter towards classifying SMS as ham, yielding higher spam precision at the expense of spam recall.

In [655]:
def predict_class(tf, X_test, clf, w=1.5):
    """Classify with a length-dependent threshold on the log-odds.

    A sample is labelled 1 only when log(P(class 1) / P(class 0)) exceeds
    `n_terms * log(w)`, where `n_terms` is the number of non-zero entries in
    that sample's row of `tf`. With w > 1 this biases predictions towards
    class 0.

    Parameters
    ----------
    tf : sparse matrix (anything exposing ``toarray()``)
        Term matrix whose rows correspond to the rows of `X_test`.
    X_test : array-like
        Feature matrix passed to ``clf.predict_proba``.
    clf : fitted classifier exposing ``predict_proba``.
    w : float
        Per-term odds factor; ``log(w)`` is the threshold contributed by each
        non-zero term.

    Returns
    -------
    (y_pred, ratios, thresholds) : tuple of numpy.ndarray
        Integer 0/1 predictions, log-odds, and per-sample thresholds.
    """
    probas = clf.predict_proba(X_test)
    ratios = np.log(probas[:, 1]) - np.log(probas[:, 0])
    lengths = (tf.toarray() > 0).sum(axis=1).T
    thresholds = lengths * np.log(w)
    # Derive the output from the inputs rather than from the notebook-level
    # `y_test`, which made the function depend on hidden kernel state.
    y_pred = (ratios > thresholds).astype(int)
    return y_pred, ratios, thresholds

In [1492]:
# Random-forest candidate with balanced class weights.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" itself was
# removed in scikit-learn 1.3. random_state makes the forest repeatable.
clf = RandomForestClassifier(min_samples_leaf=5, min_samples_split=15,
                             n_estimators=100, max_depth=20, max_features="sqrt",
                             class_weight="balanced", random_state=25)

In [1966]:
# Fit a class-balanced, L2-regularised logistic regression on the assembled
# features (the Naive Bayes alternative is left commented out below).
clf = LogisticRegression(random_state=25, class_weight="balanced", 
                         C=1, penalty="l2")
#clf = MultinomialNB(alpha=4)#, class_prior=[0.5, 0.5])
clf.fit(train, y_train)
# Length-thresholded predictions.
# NOTE(review): `pred` from this call is overwritten on the next line, so the
# reported metrics use the classifier's default predictions; only `ratios`
# and `thresholds` survive.
pred, ratios, thresholds = predict_class(tfidf_test, test, clf, w=1.2)
pred = clf.predict(test)
# Probability of class 1 for every test sample.
proba = clf.predict_proba(test)[:, 1]
output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
                                           print_=True, mode="binary")

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

AUC: 0.994
Recall: 0.937
Precision: 0.986
F1: 0.961
Accuracy: 0.984

Confusion matrix:
      pred_ham  pred_spam
ham        824          3
spam        14        208

Report:
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       827
          1       0.99      0.94      0.96       222

avg / total       0.98      0.98      0.98      1049



In [1050]:
# Positions (within the test set) of false positives: predicted 1, labelled 0.
fp_i = np.where((pred==1) & (y_test==0))[0]
# Positions of false negatives: predicted 0, labelled 1.
fn_i = np.where((pred==0) & (y_test==1))[0]


array([ 32, 281, 310, 327, 394, 424, 456, 457, 668, 804, 943])

#### Fit XGBoost

In [762]:
# Settings for the XGBoost run. 'n_estimators' and 'early_stopping_rounds'
# are read back out of this dict when xgb.train is called.
params = {
    #'scale_pos_weight': sum(y_train==0) / sum(y_train==1),
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 100,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'seed': 27,
    'n_jobs': -1,
    "eval_metric": ["error", "auc"],
    "early_stopping_rounds": 50,
}

In [763]:
# Wrap the feature matrices and labels in XGBoost's DMatrix containers.
dtrain = xgb.DMatrix(train, y_train)
dtest = xgb.DMatrix(test, y_test)
# Both sets are monitored during training under the names "train" and "eval".
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [764]:
# Train the booster; the number of rounds and the early-stopping patience are
# taken from `params`, and progress is logged every 50 rounds.
# NOTE(review): early stopping monitors `dtest`, the same data later used for
# the reported metrics, so those metrics are optimistic.
model = xgb.train(dtrain=dtrain, num_boost_round=params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-error:0.212165	train-auc:0.817627	eval-error:0.211731	eval-auc:0.80761
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
[50]	train-error:0.188551	train-auc:0.843192	eval-error:0.197425	eval-auc:0.825158
Stopping. Best iteration:
[20]	train-error:0.212165	train-auc:0.842316	eval-error:0.211731	eval-auc:0.828377



In [765]:
# Score the test set and cut the predicted scores at 0.5 to obtain 0/1 labels
# (kept in the same dtype as the scores).
pred_proba_xgb = model.predict(dtest)
pred_xgb = np.where(pred_proba_xgb >= 0.5, 1, 0).astype(pred_proba_xgb.dtype)
xgb_metrics = calc_metrics(
    y_test,
    pred_xgb,
    pred_proba_xgb,
    mode="binary",
    labels=["ham", "spam"],
    print_=True,
)

AUC: 0.825
Recall: 0.345
Precision: 0.554
F1: 0.425
Accuracy: 0.803

Confusion matrix:
      pred_ham  pred_spam
ham        510         41
spam        97         51

Report:
             precision    recall  f1-score   support

          0       0.84      0.93      0.88       551
          1       0.55      0.34      0.43       148

avg / total       0.78      0.80      0.78       699



In [1165]:
def unsquash(X):
    """Turn a flat or single-row input into a column: (n,) or (1, n) -> (n, 1).

    Inputs that already have more than one row are returned unchanged.
    Lists and scalars are accepted in addition to arrays.
    """
    arr = np.asarray(X)
    if arr.ndim <= 1 or arr.shape[0] == 1:
        # reshape(-1, 1) also handles a (1, n) row vector; the previous
        # reshape((len(X), 1)) raised for n > 1 because len(X) is 1 there.
        return arr.reshape(-1, 1)
    return X

In [1174]:
def squash(X):
    """Drop length-1 axes: (n, 1) -> (n,).

    The result always has at least one dimension, so a single-sample input
    of shape (1, 1) yields shape (1,) rather than a 0-d scalar array.
    """
    return np.atleast_1d(np.squeeze(np.asarray(X)))

In [1175]:
class Transformer(TransformerMixin):
    """Base class for pure (stateless) transformers.

    Subclasses normally override only `transform`.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return the input unchanged."""
        return X

    def get_params(self, deep=True):
        """Report no parameters."""
        return {}

In [1178]:
class ModelTransformer(TransformerMixin):
    """Expose a model's predictions as a single-column transformer output."""

    def __init__(self, model, probs=True):
        # model: the wrapped estimator.
        # probs: when truthy, emit the second predict_proba column instead of
        #        the output of predict.
        self.model = model
        self.probs = probs

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict."""
        return {"model": self.model, "probs": self.probs}

    def fit(self, *args, **kwargs):
        """Fit the wrapped model with the given arguments; return self."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the model's output for X as an (n, 1) column."""
        output = (
            self.model.predict_proba(X)[:, 1]
            if self.probs
            else self.model.predict(X)
        )
        return unsquash(output)

In [1305]:
class Converter(Transformer):
    """Normalise the pipeline input to something array-like."""

    def __init__(self):
        pass

    def transform(self, X, **kwargs):
        """Unwrap a Series to its values and wrap a bare string in an array.

        Anything else (including NumPy arrays) is passed through untouched.
        """
        if isinstance(X, pd.Series):
            return X.values
        if isinstance(X, str):
            # A single message becomes a one-element batch.
            return np.array([X])
        return X

In [1687]:
class TfIdfLen(Transformer):
    """TF-IDF vectorizer that can append a "number of matched terms" column.

    Parameters
    ----------
    add_len : bool
        When truthy, `transform` appends one column holding the count of
        non-zero vectorizer entries per document.
    **tfidf_params
        Forwarded unchanged to ``TfidfVectorizer``.
    """

    def __init__(self, add_len=True, **tfidf_params):
        self.add_len = add_len
        self.tfidf_params = tfidf_params
        self.vectorizer = TfidfVectorizer(**self.tfidf_params)

    def get_params(self, deep=True):
        """Return the vectorizer arguments plus ``add_len`` as a new dict."""
        # Copy first: updating self.tfidf_params in place used to inject an
        # "add_len" key into the arguments meant for TfidfVectorizer.
        output = dict(self.tfidf_params)
        output["add_len"] = self.add_len
        return output

    def fit(self, X, y=None):
        """Fit the underlying vectorizer on X; return self."""
        self.vectorizer.fit(X)
        return self

    def transform(self, X, **kwargs):
        """Return the dense term matrix, optionally with the length column."""
        res = self.vectorizer.transform(X).toarray()
        if self.add_len:
            lens = unsquash(np.count_nonzero(res, axis=1))
            res = np.concatenate((res, lens), axis=1)
        return res

In [1692]:
# Scratch objects: a one-row slice of the test texts and an unfitted
# TfIdfLen built from tf_params. Neither is used by the later cells shown here.
a = X_test.iloc[:1]#.values
l = TfIdfLen(add_len=1, **tf_params)

In [1751]:
class MatchPattern(Transformer):
    """Regex feature: per text, either a match count or a 0/1 match flag.

    Parameters
    ----------
    pattern : str
        Regular expression to look for.
    is_len : bool
        Truthy -> emit the number of matches (``re.findall``);
        falsy -> emit 1 if the pattern occurs anywhere, else 0.
    flags : int
        ``re`` flags used for matching.
    """

    def __init__(self, pattern, is_len, flags=re.U):
        self.pattern = pattern
        self.is_len = is_len
        self.flags = flags

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict."""
        return dict(pattern=self.pattern, is_len=self.is_len, flags=self.flags)

    def transform(self, X, **kwargs):
        """Apply the pattern to every text in X; return an (n, 1) int column."""
        if self.is_len:
            func = lambda text: len(re.findall(self.pattern, text, self.flags))
        else:
            func = lambda text: bool(re.search(self.pattern, text, self.flags))
        # otypes fixes the output dtype so np.vectorize does not have to probe
        # the first element; without it an empty batch raises ValueError.
        rez = np.vectorize(func, otypes=[int])(X).astype(int)
        return unsquash(rez)

In [1768]:
class TargetEncoder(object):
    """Encode column groups by the mean of the target within each group.

    The first `transform` after `fit` (as done by `fit_transform`) encodes
    each row with the mean target of the *previous* rows in the same group
    (cumulative sum minus the row's own target, divided by the running
    count). Later `transform` calls look up the per-group means stored during
    `fit`. Groups without history, or unseen groups, fall back to the global
    target mean.

    Parameters
    ----------
    type_ : str
        Stored only; the code does not branch on it.
    na_as_category, na_value
        Accepted and stored, but NA filling is currently disabled, so they
        have no effect.
    columns : iterable, optional
        Column groups to encode; defaults to every column of X.
        NOTE(review): entries appear to be expected as tuples of column names
        (they are joined with "_" and used as ``X[list(col)]``) -- confirm.
    feature_names : dict, optional
        Maps a column group to its output column name; groups not listed get
        ``"_".join(col) + "_tmp"``.

    NOTE(review): `get_len`, `get_slice` and `get_target_name` are not defined
    in the visible part of the notebook and must be provided elsewhere.
    """

    def __init__(self, type_="expanding", na_as_category=True, na_value=-999.0,
                 columns=None, feature_names=None):
        self.type_ = type_
        self.na_as_category = na_as_category
        self.na_value = na_value
        self.columns = columns
        self.feature_names = feature_names
        self.mapping = None
        # Initialise the state flags so that transform() before fit() raises
        # the intended ValueError instead of an AttributeError.
        self._fitted = False
        self._train = False
        self._mean = None

    def fit(self, X, y):
        """Compute per-group target statistics from X and y; return self."""
        assert X.shape[0] == y.shape[0]
        self._train = True
        # Force target_encode into its "learn the mapping" branch.
        self._fitted = False
        _, mapping = self.target_encode(X, y, mapping=self.mapping, cols=self.columns,
                                        na_as_category=self.na_as_category, na_value=self.na_value)
        self.mapping = mapping
        self._fitted = True
        return self

    def transform(self, X, y=None):
        """Return a copy of X with the encoded columns added.

        Raises
        ------
        ValueError
            If the encoder has not been fitted.
        """
        if not self._fitted:
            raise ValueError('Must train encoder before it can be used to transform data.')
        assert (y is None or X.shape[0] == y.shape[0])
        X, mapping = self.target_encode(X, y, mapping=self.mapping, cols=self.columns,
                                        na_as_category=self.na_as_category,
                                        na_value=self.na_value)
        return X

    def target_encode(self, X_in, y, mapping=None, cols=None, na_as_category=None, na_value="-999"):
        """Learn the mapping (when unfitted) or apply it (when fitted).

        Returns
        -------
        (X, mapping_out)
            A deep copy of `X_in` (with encoded columns when applying) and
            the mapping that was used or built.
        """
        X = X_in.copy(deep=True)
        if cols is None:
            cols = X.columns.values
        if mapping is not None and self._fitted:
            mapping_out = mapping
            index = X.index
            # A missing feature_names dict means "use the default name".
            names = self.feature_names or {}
            for el in self.mapping:
                col = el["col"]
                length = get_len(col)
                default = "_".join(col)+"_tmp"
                new_col = names.get(col, default)
                if self._train:
                    # The grouper is consumed here, so this branch can run
                    # only once per fit.
                    grouper = el.pop("grouper")
                    cumsum = grouper.cumsum() - y
                    cumcnt = grouper.cumcount()
                    X[new_col] = cumsum * 1.0 / cumcnt
                    # The first row of each group divides by zero; use the
                    # global mean there.
                    X[new_col] = X[new_col].fillna(self._mean)
                else:
                    X[new_col] = [el["mean"].get(get_slice(record, length), self._mean) for record in
                                  X[list(col)].itertuples(index=False, name=None)]
            if self._train:
                self._train = False
        else:
            self._mean = y.mean()
            mapping_out = []
            for col in cols:
                grouper = pd.concat([X, y], axis=1).groupby(col)[get_target_name(y)]
                means = grouper.agg("mean").to_dict()
                mapping_out.append({"col": col, "grouper": grouper, "mean": means})
        return X, mapping_out

    def fit_transform(self, X, y):
        """Fit on (X, y), then return the training-mode encoding of X."""
        return self.fit(X, y).transform(X, y)

In [1434]:
class EnsembleBinaryClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Combine per-model scores (one column per model) into one prediction.

    Parameters
    ----------
    mode : str
        'average' -> threshold the weighted average score at 0.5;
        any other value -> weighted majority vote over the per-model
        decisions (each score thresholded at 0.5).
    weights : sequence of float, optional
        One weight per input column.
    """

    def __init__(self, mode, weights=None):
        self.mode = mode
        self.weights = weights

    def fit(self, X, y=None):
        """Nothing to learn; return self."""
        return self

    def predict_proba(self, X):
        ''' Predict (weighted) probabilities '''
        probs = np.average(X, axis=1, weights=self.weights)
        return np.column_stack((1-probs, probs))

    def predict(self, X):
        ''' Predict class labels as a 1-D integer array. '''
        # `threshold` is passed by keyword: it is keyword-only in current
        # scikit-learn, and the keyword form also works on older releases.
        if self.mode == 'average':
            # Flatten to shape (n,) so both modes return the same shape
            # (this branch used to return an (n, 1) float column).
            positive = binarize(self.predict_proba(X)[:, [1]], threshold=0.5)
            return positive.ravel().astype(int)
        else:
            res = binarize(X, threshold=0.5)
            return np.apply_along_axis(lambda x: np.bincount(x.astype(int), self.weights).argmax(), axis=1, arr=res)

In [1420]:
def build_ensemble(model_list, estimator=None):
    """Wrap each model as a prediction-emitting transformer and union them.

    Returns the FeatureUnion alone when no (truthy) `estimator` is given;
    otherwise a Pipeline that feeds the union's output into `estimator`.
    """
    union = FeatureUnion([
        ('model_transform' + str(idx), ModelTransformer(member))
        for idx, member in enumerate(model_list)
    ])
    if estimator:
        return Pipeline([
            ('features', union),
            ('estimator', estimator),
        ])
    return union

In [1182]:
class Squash(Transformer):
    """Pipeline step that applies `squash` to its input."""

    def transform(self, X, **transform_params):
        squashed = squash(X)
        return squashed

class Unsquash(Transformer):
    """Pipeline step that applies `unsquash` to its input."""

    def transform(self, X, **transform_params):
        column = unsquash(X)
        return column

In [1669]:
def get_vec_pipe(add_len=True, **tfidf_params):
    """Return a one-step Pipeline ('vectorizer') wrapping a TfIdfLen.

    `add_len` and `tfidf_params` are forwarded to TfIdfLen.
    """
    return Pipeline([('vectorizer', TfIdfLen(add_len, **tfidf_params))])

In [1745]:
def get_pattern_pipe(patterns):
    """Turn (regex, options) pairs into named MatchPattern steps.

    Each options dict must contain "name"; the step is called
    "<name>_<position>" and the remaining options are passed to MatchPattern.
    The input dicts are not modified.
    """
    steps = []
    for position, (regex, options) in enumerate(patterns):
        ctor_kwargs = options.copy()
        label = ctor_kwargs.pop("name") + "_" + str(position)
        steps.append((label, MatchPattern(pattern=regex, **ctor_kwargs)))
    return steps

In [1971]:
# Display the phone-number regex (the same literal used to create "text_rep").
r"[\(\d][\d\s\(\)-]{8,15}\d"

'[\\(\\d][\\d\\s\\(\\)-]{8,15}\\d'

In [1972]:
# Regex features for the pipeline: (pattern, MatchPattern options) pairs.
# "name" labels the step; is_len=0 requests a 0/1 match flag, not a count.
PATTERNS = [
    # Placeholder inserted where phone numbers were masked.
    (r"PHONE_NUMBER", dict(name="phone", is_len=0)),
    # Hand-picked keywords, matched case-insensitively.
    (r"%|taxi|скидк|цін", dict(name="custom", is_len=0, flags=re.I | re.U)),
]

In [1981]:
# Feature branches: TF-IDF (plus length column) and the regex flags.
vec_pipe = get_vec_pipe(**tf_params)

patt_pipe = get_pattern_pipe(PATTERNS)

# Candidate estimators; only clf_log is wired into the pipeline below.
clf_log = LogisticRegression(random_state=25, class_weight="balanced",
                             C=0.1, penalty="l2")
clf_nb = MultinomialNB(alpha=0.1)#, class_prior=[0.5, 0.5])
# mode was misspelled "averag", which silently selected the majority-vote
# branch of EnsembleBinaryClassifier instead of weighted averaging.
clf_e = EnsembleBinaryClassifier(mode="average", weights=[5, 5])

# Converter normalises the input, the union stacks all feature branches
# side by side, and the estimator is appended last.
chain = [
    ('converter', Converter()),
    ('union', FeatureUnion([
    ('vec', vec_pipe),
    *patt_pipe
    ])),
    #("ff", build_ensemble([clf_log, clf_nb], clf_e))
]
chain.append(('estimator', clf_log))
pipe = Pipeline(chain)

In [1997]:
# Score a single message with the pipeline.
# NOTE(review): in file order this cell comes before the cell that fits
# `pipe`; on a fresh kernel the pipeline must be fitted first.
sms = "привіт заходь до нас у ввечері додому"
# The bare string is wrapped into a one-element batch by the Converter step;
# row 0 holds the probabilities of class 0 and class 1.
ham, spam = pipe.predict_proba(sms)[0]
print(f"Probability ham: {ham*100:0.3f}%\nProbability spam: {spam*100:.3f}%")

Probability ham: 80.978%
Probability spam: 19.022%


In [1987]:
# Fit the full pipeline on the raw training texts and evaluate on the test set.
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Probability of class 1 for every test sample.
proba = pipe.predict_proba(X_test)[:, 1]
output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
                                           print_=True, mode="binary")

Pipeline(memory=None,
     steps=[('converter', <__main__.Converter object at 0x7fecd12a3438>), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('vec', Pipeline(memory=None,
     steps=[('vectorizer', <__main__.TfIdfLen object at 0x7fecd12a3208>)])), ('phone_0', <__main__.MatchPattern object at 0x7fecd12a3358>), ('c...alty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

AUC: 0.994
Recall: 0.941
Precision: 0.968
F1: 0.954
Accuracy: 0.981

Confusion matrix:
      pred_ham  pred_spam
ham        820          7
spam        13        209

Report:
             precision    recall  f1-score   support

          0       0.98      0.99      0.99       827
          1       0.97      0.94      0.95       222

avg / total       0.98      0.98      0.98      1049

