In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport src.config
%aimport src.helpers

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
import numpy as np
import re
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier
from functools import partial
from scipy import sparse

In [6]:
from src.config import data_dir
from src.helpers import calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class

#### Process raw SMS data

In [7]:
filename = "karim-sms-allow.xml"
source = data_dir / filename

# Walk the XML export element by element and keep one flat record per <sms> tag.
data = []
for event, elem in iterparse(source):
    if elem.tag != "sms":
        continue
    # De-duplication by message body is left disabled:
    # if any(elem.attrib["body"]==r["text"] for r in data):
    #     continue
    record = {
        "text": elem.attrib["body"],
        "contact_name": elem.attrib["contact_name"],
        "address": elem.attrib["address"],
        "timestamp": int(elem.attrib["date"]),  # stored as the integer found in the export
        "type": elem.attrib["type"],
    }
    data.append(record)

In [55]:
# Tabulate the parsed SMS records and write them to a workbook under data_dir
# (index column omitted).
df = pd.DataFrame(data)
df.to_excel(data_dir / "karim-sms-allow.xlsx", index=False)

#### Read labeled data

In [131]:
def build_dataset(filenames, file_out, date_format="%m-%d-%Y %H:%M:%S", is_save=1):
    """Combine the labelled SMS workbooks into a single DataFrame.

    Parameters
    ----------
    filenames : dict
        Maps a source kind to a workbook name under ``data_dir``:

        * ``"labeled"``   - read from the sheet ``"total sms"``; ``timestamp``
          is divided by 1000 and passed to ``datetime.fromtimestamp``.
        * ``"labeled_1"`` - ``timestamp`` is parsed as text with ``date_format``;
          rows whose ``contact_name`` is in the exclusion list are dropped.
        * any other key   - treated as survey responses: columns are renamed to
          ``text`` / ``label`` / ``timestamp`` and labels are mapped through the
          module-level ``LABEL_MAP`` (unknown labels are kept unchanged).
    file_out : str
        Workbook name (under ``data_dir``) used when ``is_save`` is truthy.
    date_format : str
        ``strptime`` format for the ``"labeled_1"`` timestamps.
    is_save : int or bool
        When truthy, the combined frame is written to ``data_dir / file_out``.

    Returns
    -------
    pandas.DataFrame
        All sources concatenated with a fresh index. ``resp`` is 0 for the two
        labelled sources and 1 for the survey responses.
    """
    output = []
    for k, v in filenames.items():
        if k == "labeled":
            df = pd.read_excel(data_dir / v, sheet_name="total sms")
            df["timestamp"] = (df["timestamp"] / 1000).map(datetime.fromtimestamp)
            df["resp"] = 0
            output.append(df)
        elif k == "labeled_1":
            df = pd.read_excel(data_dir / v)
            df["resp"] = 0
            df["timestamp"] = df["timestamp"].map(lambda x: datetime.strptime(x, date_format))
            exclude = ["Karimushka"]
            df = df.loc[~(df.contact_name.isin(exclude))]
            output.append(df)
        else:
            df = pd.read_excel(data_dir / v)
            df = df.rename(columns={"SMS text": "text",
                                    "Is it a spam or ham?": "label",
                                    "Timestamp": "timestamp"})
            df["resp"] = 1
            df["label"] = df["label"].map(lambda x: LABEL_MAP.get(x, x))
            output.append(df)
    df = pd.concat(output, ignore_index=True)
    if is_save:
        # Save the frame built here. The previous code wrote the notebook-level
        # ``total`` variable, which does not exist on a fresh kernel and is
        # stale on any re-run.
        df.to_excel(data_dir / file_out)
    return df

In [133]:
# Numeric encoding applied by build_dataset to the textual survey labels.
LABEL_MAP = {"ham": 0, "spam": 1}
# Input workbooks keyed by the source kind that build_dataset dispatches on.
FILES = {"labeled": "karim-sms-allow-labeled.xlsx",
         "labeled_1": "tanya-sms-all.xlsx",
         "responses": "SMS Data Collection (Responses).xlsx"}
# Name of the combined workbook (relative to data_dir).
file_out = "sms-uk-total.xlsx"
total = build_dataset(FILES, file_out=file_out)

In [134]:
# Check dimensionality and class imbalance
total.shape
# Class shares in percent.
total.label.value_counts(normalize=True).round(5)*100
# Number of rows without a message body.
total.text.isnull().sum()
# Drop rows without text; note that this rebinds `total` in place.
total = total.loc[total.text.notnull()]
total.shape

(6107, 8)

0    80.138
1    19.862
Name: label, dtype: float64

3

(6104, 8)

#### Train-test split

In [135]:
# Reload the combined workbook from disk and drop rows without a message body.
total = pd.read_excel(data_dir / file_out)
total = total.loc[total.text.notnull()]

In [137]:
#total["text_rep"] = total["text"].str.replace(r"[\(\d][\d\s\(\)-]{8,15}\d", "PHONE_NUMBER", flags=re.I)

In [138]:
# Remove line breaks from the message bodies. The pattern is a regular
# expression, so regex=True is passed explicitly: pandas >= 2.0 treats the
# pattern as a literal string by default and would leave the text unchanged.
total["text"] = total["text"].str.replace(r"[\n\r]+", "", regex=True)

In [139]:
# Features are the raw message texts, the target is the numeric label.
X = total["text"]
y = total["label"]
# Fraction of rows held out for testing.
test_size = 0.3
# Stratify on the label so both parts keep the same class proportions;
# random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


#### Build features

In [48]:
def build_features(X_train, X_test, var="text", features=None, vectorizer=None):
    """Build dense train/test feature matrices from SMS texts.

    Parameters
    ----------
    X_train, X_test : pandas.Series of str
        Message texts.
    var : str
        Unused; kept for backward compatibility.
    features : sequence of str
        Feature groups to build, concatenated column-wise in the given order.
        Recognised names:

        * ``"tfidf"``  - ``vectorizer`` fitted on ``X_train`` and densified.
        * ``"length"`` - number of non-zero tf-idf terms per message when
          ``"tfidf"`` is also requested, otherwise the character length.
        * ``"patt"``   - 1 if the text matches ``%|taxi|скидк|цін``
          (case-insensitive), else 0.
        * ``"phone"``  - whether the text contains a phone-number-like run.

        Unrecognised names are ignored.
    vectorizer : object, optional
        Must provide ``fit_transform`` / ``transform`` returning objects with
        ``toarray()``. Required when ``"tfidf"`` is requested.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Train and test matrices with one row per message.

    Raises
    ------
    ValueError
        If ``features`` is missing/empty, or ``"tfidf"`` is requested without
        a vectorizer.
    """
    if features is None or len(features) == 0:
        raise ValueError("`features` must name at least one feature group")

    use_tfidf = "tfidf" in features
    if use_tfidf:
        if vectorizer is None:
            raise ValueError("a vectorizer is required for the 'tfidf' feature")
        # Computed up front so that "length" can rely on it regardless of the
        # order in which the feature names are listed.
        tf_train = vectorizer.fit_transform(X_train).toarray()
        tf_test = vectorizer.transform(X_test).toarray()

    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf":
            train, test = tf_train, tf_test
        elif feature == "length":
            if use_tfidf:
                train = (tf_train > 0).sum(axis=1)[:, np.newaxis]
                test = (tf_test > 0).sum(axis=1)[:, np.newaxis]
            else:
                train = X_train.map(len).values[:, np.newaxis]
                test = X_test.map(len).values[:, np.newaxis]
        elif feature == "patt":
            patt = "%|taxi|скидк|цін"
            train = (X_train.str.contains(patt, regex=True, flags=re.I)
                     .astype(int).values[:, np.newaxis])
            test = (X_test.str.contains(patt, regex=True, flags=re.I)
                    .astype(int).values[:, np.newaxis])
        elif feature == "phone":
            patt = r"[\(\d][\d\s\(\)-]{8,15}\d"
            train = X_train.map(lambda x: len(re.findall(patt, x)) > 0).values[:, np.newaxis]
            test = X_test.map(lambda x: len(re.findall(patt, x)) > 0).values[:, np.newaxis]
        else:
            # Unknown names were silently skipped before; keep that behaviour.
            continue
        f_train.append(train)
        f_test.append(test)
    return np.concatenate(f_train, axis=1), np.concatenate(f_test, axis=1)

In [49]:
# Keyword arguments for TfidfVectorizer: character 4-grams taken inside word
# boundaries, vocabulary capped at 4000 entries, idf weighting on.
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": None,
             "ngram_range": (4, 4),
             "min_df": 0.0,
             "max_df": 1.0,
             "preprocessor": None,#Preprocessor(),
             "max_features": 4000,
             # Row normalisation is switched off. This used to be spelled
             # "l2"*0 (an empty string); None is the explicit way to say the
             # same thing. Set to "l2" to re-enable.
             "norm": None,
             "use_idf": True
             }

In [27]:
# Remove Top N features
# top = 100
# r = tfidf_train.toarray().sum(axis=1)
# topn_ids = np.argsort(r)[::-1][:top]
# voc = [f for i,f in enumerate(features) if i not in topn_ids]
# tf_params["vocabulary"] = None#voc

In [50]:
# Fit the tf-idf vocabulary on the training texts and keep the sparse matrices.
vectorizer = TfidfVectorizer(**tf_params)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
# Feature groups to concatenate, in column order.
features = [
            "tfidf", 
            "length",
            "phone",
            "patt",
]
# build_features calls vectorizer.fit_transform on X_train again, so the
# vectorizer is fitted twice in this cell.
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer, var="text")

In [26]:
# features = vectorizer.get_feature_names()
# dfs = top_feats_by_class(tfidf_train, y_train, features, min_tfidf=0.1, top_n=25)
# plot_tfidf_classfeats_h(dfs)

#### Fit Naive Bayes

In general, it is much worse to misclassify a ham SMS as spam
than to let a spam SMS pass the filter. It is therefore desirable to be able to bias
the filter towards classifying SMS as ham, yielding higher spam precision at the expense of spam recall.

In [51]:
def predict_class(tf, X_test, clf, w=1.5):
    """Predict spam with a length-dependent log-odds threshold.

    A message is labelled spam (1) only when
    ``log P(spam) - log P(ham) > n_terms * log(w)``, where ``n_terms`` is the
    number of non-zero entries in the message's row of ``tf``. Values of ``w``
    above 1 therefore bias the decision towards ham.

    Parameters
    ----------
    tf : sparse matrix
        Must provide ``toarray()``; one row per row of ``X_test``.
    X_test : array-like
        Feature matrix passed to ``clf.predict_proba``.
    clf : classifier
        Must provide ``predict_proba`` with the spam probability in column 1.
    w : float
        Per-term odds factor used to build the thresholds.

    Returns
    -------
    (y_pred, ratios, thresholds) : tuple of numpy.ndarray
        Predicted labels, log-odds and the per-message thresholds.
    """
    probas = clf.predict_proba(X_test)
    ratios = np.log(probas[:, 1]) - np.log(probas[:, 0])
    lengths = (tf.toarray() > 0).sum(axis=1).T
    thresholds = lengths * np.log(w)
    # Size the output from the scores computed here, not from the notebook
    # global `y_test` the function used to read.
    y_pred = np.zeros(ratios.shape, dtype=int)
    y_pred[ratios > thresholds] = 1
    return y_pred, ratios, thresholds

In [1492]:
# Random-forest candidate with balanced class weights.
# max_features="sqrt" is what "auto" meant for classifiers; the "auto" alias
# is no longer accepted by current scikit-learn. random_state makes the fit
# reproducible.
clf = RandomForestClassifier(min_samples_leaf=5, min_samples_split=15,
                             n_estimators=100, max_depth=20, max_features="sqrt",
                             class_weight="balanced", random_state=25)

In [242]:
# L2-regularised logistic regression with balanced class weights; this rebinds
# `clf`, replacing any classifier assigned earlier.
clf = LogisticRegression(random_state=25, class_weight="balanced", 
                         C=0.02, penalty="l2")
#clf = MultinomialNB(alpha=0.01)#, class_prior=[0.5, 0.5])
clf.fit(train, y_train)
#pred, ratios, thresholds = predict_class(tfidf_test, test, clf, w=1.2)
# Hard labels plus the probability of the positive (spam) column.
pred = clf.predict(test)
proba = clf.predict_proba(test)[:, 1]
# calc_metrics comes from src.helpers.
output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
                                           print_=True, mode="binary")

LogisticRegression(C=0.02, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

AUC: 0.997
Recall: 0.967
Precision: 0.962
F1: 0.964
Accuracy: 0.986

Confusion matrix:
      pred_ham  pred_spam
ham       1453         14
spam        12        352

Report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1467
          1       0.96      0.97      0.96       364

avg / total       0.99      0.99      0.99      1831



In [203]:
# Positions (within the test split) of spam messages predicted as ham.
# Computed here so the cell does not depend on a variable that is only
# defined further down the notebook.
fn_i = np.where((pred == 0) & (y_test == 1))[0]
X_test.iloc[fn_i[:2]]
# Full record of one of the missed spam messages.
total.loc[3469]

3956    Ждем на выходных в Кувшине!Барашек на вертеле,...
3469     "OSCHADBANK"  VASHU KARTKU ZABLOKOVANO.  Dlya...
Name: text, dtype: object

address                                                       NaN
contact_name                                                  NaN
label                                                           1
resp                                                            1
service                                                       NaN
text             "OSCHADBANK"  VASHU KARTKU ZABLOKOVANO.  Dlya...
timestamp                              2018-04-12 16:55:09.238000
type                                                          NaN
text_rep         "OSCHADBANK"  VASHU KARTKU ZABLOKOVANO.  Dlya...
Name: 3469, dtype: object

In [71]:
# Positions within the test split of false positives (predicted 1, labelled 0)
# and false negatives (predicted 0, labelled 1).
fp_i = np.where((pred==1) & (y_test==0))[0]
fn_i = np.where((pred==0) & (y_test==1))[0]
# Print every false-positive message followed by a blank line.
for el in X_test.iloc[fp_i].values:
    print(el+"\n")

LEGO №ттн Нової Пошти 20400068786210 www.constructors.com.ua

lifecell вітає в Україні! Дякуємо, що обрали нас для закордонної подорожі. Сподіваємось, Вам сподобались зручні та вигідні тарифи на роумінг від lifecell  та інноваційний додаток BiP разом з послугою "Безкоштовний BiP у роумінгу". Більше деталей: s.lifecell.ua/roam

Вмем подарили автономные зарядки с логотипом айдиалс * crazy *

Христос Воскрес!Нехай полишають негаразди у дні світлості цього свята!

Prof-kosmetika.com.ua новая почта ттн 20450059818655

LEGO №ттн Нової Пошти 20400065998296 www.constructors.com.ua

Тарифи підвищено через збільшення попиту. Ви можете скасувати поїздку протягом 5 хвилин без плати за скасування. Будьте готові за п ять хвилин до прибуття автомобіля.

Строк дії нарахованих МБ Інтернету для BiP за послугою "Безкоштовний BiP у роумінгу" закінчується 18.03.2018 включно (за київським часом). Далі будуть діяти базові тарифи роумінгу відповідно до країни перебування. Деталі: s.lifecell.ua/r

Извините, за

#### Build Pipeline

In [73]:
def unsquash(X):
    """Turn a flat vector into a single column: (n,) -> (n, 1).

    Inputs that are already multi-dimensional with more than one row are
    returned unchanged; everything else is reshaped to ``(len(X), 1)``.
    """
    already_matrix = len(X.shape) != 1 and X.shape[0] != 1
    if already_matrix:
        return X
    column = np.asarray(X)
    return column.reshape((len(X), 1))

In [74]:
def squash(X):
    """Drop all length-one axes: (n, 1) -> (n,)."""
    as_array = np.asarray(X)
    return np.squeeze(as_array)

In [76]:
class Transformer(TransformerMixin):
    """Stateless base transformer.

    Subclasses override ``transform`` (and ``get_params`` when they take
    constructor arguments); the defaults here learn nothing, pass the data
    through and report no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        # Nothing is learned from the data.
        return self

    def transform(self, X, **transform_params):
        # Identity transform.
        return X

    def get_params(self, deep=True):
        # No constructor parameters to report.
        return {}

In [77]:
class Squash(Transformer):
    """Transformer wrapper around the module-level ``squash`` helper."""

    def transform(self, X, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        return squash(X)

In [78]:
class Unsquash(Transformer):
    """Transformer wrapper around the module-level ``unsquash`` helper."""

    def transform(self, X, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        return unsquash(X)

In [79]:
class ModelTransformer(TransformerMixin):
    """Expose a model's predictions as a single-column feature.

    Parameters
    ----------
    model : estimator
        Wrapped model; it is fitted by ``fit`` and queried by ``transform``.
    probs : bool
        When true, ``transform`` emits column 1 of ``predict_proba``;
        otherwise it emits the output of ``predict``.
    """

    def __init__(self, model, probs=True):
        self.model = model
        self.probs = probs

    def get_params(self, deep=True):
        return {"model": self.model, "probs": self.probs}

    def fit(self, *args, **kwargs):
        # All arguments are forwarded to the wrapped model unchanged.
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        if not self.probs:
            return unsquash(self.model.predict(X))
        return unsquash(self.model.predict_proba(X)[:, 1])

In [80]:
class Converter(Transformer):
    """Normalise the pipeline input container.

    A pandas Series becomes its underlying values, a single string becomes a
    one-element array, and anything else (NumPy arrays included) is passed
    through untouched.
    """

    def __init__(self):
        pass

    def transform(self, X, **kwargs):
        if isinstance(X, pd.Series):
            return X.values
        if isinstance(X, str):
            return np.array([X])
        return X

In [81]:
class Length(Transformer):
    """Emit a one-column "length" feature.

    Parameters
    ----------
    use_tfidf : bool
        When true, the input is treated as a term matrix and the feature is
        the count of positive entries per row; otherwise ``len`` is applied
        to every element of the input.
    """

    def __init__(self, use_tfidf=True):
        self.use_tfidf = use_tfidf

    def get_params(self, deep=True):
        return dict(use_tfidf=self.use_tfidf)

    def transform(self, X, **kwargs):
        res = (X>0).sum(axis=1) if self.use_tfidf else np.vectorize(len)(X)
        return unsquash(res)

In [82]:
class TfIdfLen(Transformer):
    """Tf-idf vectoriser that can append a per-document term count.

    Parameters
    ----------
    add_len : bool
        When true, ``transform`` appends one extra column holding the number
        of positive tf-idf entries in each row.
    **tfidf_params
        Keyword arguments forwarded to ``TfidfVectorizer`` at fit time.
    """

    def __init__(self, add_len=True, **tfidf_params):
        self.add_len = add_len
        self.tfidf_params = tfidf_params.copy()

    def get_params(self, deep=True):
        # Return a fresh dict so callers cannot mutate the stored vectoriser
        # parameters (and so "add_len" never leaks into them).
        output = dict(self.tfidf_params)
        output["add_len"] = self.add_len
        return output

    def set_params(self, **params):
        # "add_len" belongs to this wrapper; everything else is a vectoriser
        # setting.
        params = dict(params)
        if "add_len" in params:
            self.add_len = params.pop("add_len")
        self.tfidf_params.update(params)
        # Estimator convention: set_params returns the estimator.
        return self

    def fit(self, X, y=None):
        vec_params = dict(self.tfidf_params)
        # Tolerate an "add_len" entry stored by older code paths without
        # mutating the stored parameters.
        self.add_len = vec_params.pop("add_len", self.add_len)
        self.vectorizer = TfidfVectorizer(**vec_params)
        self.vectorizer.fit(X)
        return self

    def transform(self, X, **kwargs):
        res = self.vectorizer.transform(X)
        if self.add_len:
            # Count of positive entries per row, appended as the last column.
            lens = (res > 0).sum(axis=1)
            res = sparse.hstack([res, lens]).tocsr()
        return res

In [108]:
# Smoke test: fit and transform TfIdfLen on the first test message only.
a = X_test.iloc[:1]#.values
l = TfIdfLen(add_len=1, **tf_params)
l.fit_transform(a)

<1x65 sparse matrix of type '<class 'numpy.float64'>'
	with 65 stored elements in Compressed Sparse Row format>

In [84]:
class MatchPattern(Transformer):
    """Regex-based single-column feature.

    Parameters
    ----------
    pattern : str
        Regular expression applied to every text.
    is_len : bool
        When true, the feature is the number of matches found by
        ``re.findall``; otherwise it is 1 if ``re.search`` finds a match and
        0 if not.
    flags : int
        ``re`` flags used for matching.
    """

    def __init__(self, pattern, is_len, flags=re.U):
        self.pattern = pattern
        self.is_len = is_len
        self.flags = flags

    def get_params(self, deep=True):
        return dict(pattern=self.pattern, is_len=self.is_len, flags=self.flags)

    def transform(self, X, **kwargs):
        if self.is_len:
            func = lambda text: len(re.findall(self.pattern, text, self.flags))
        else:
            func = lambda text: bool(re.search(self.pattern, text, self.flags))
        # otypes fixes the output dtype up front. Without it np.vectorize has
        # to call `func` on the first element to infer the type and raises on
        # empty input.
        rez = np.vectorize(func, otypes=[int])(X)
        return unsquash(rez)

In [96]:
class EnsembleBinaryClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Combine per-model positive-class probabilities into one prediction.

    The input ``X`` is expected to hold one column per base model.

    Parameters
    ----------
    mode : str
        ``'average'`` thresholds the (weighted) mean probability at 0.5; any
        other value takes a (weighted) majority vote over the per-model
        decisions.
    weights : sequence of float, optional
        One weight per column of ``X``; ``None`` means equal weights.
    """

    def __init__(self, mode, weights=None):
        self.mode = mode
        self.weights = weights

    def fit(self, X, y=None):
        # Nothing is learned; the combination rule is fixed by the parameters.
        return self

    def predict_proba(self, X):
        ''' Predict (weighted) probabilities '''
        probs = np.average(X, axis=1, weights=self.weights)
        return np.column_stack((1-probs, probs))

    def predict(self, X):
        ''' Predict class labels. '''
        # `threshold` is passed by keyword: it is keyword-only in current
        # scikit-learn, and the keyword form also works on older releases.
        if self.mode == 'average':
            # Kept as a single-column 2-D array, as before.
            return binarize(self.predict_proba(X)[:, [1]], threshold=0.5)
        else:
            res = binarize(X, threshold=0.5)
            return np.apply_along_axis(lambda x: np.bincount(x.astype(int), self.weights).argmax(), axis=1, arr=res)

In [85]:
def build_ensemble(model_list, estimator=None):
    """Stack model predictions as features, optionally topped by an estimator.

    Each model is wrapped in a ``ModelTransformer`` named
    ``model_transform<i>``. Without ``estimator`` the bare ``FeatureUnion`` is
    returned; otherwise a two-step ``Pipeline`` (``features`` -> ``estimator``).
    """
    models = [
        ('model_transform' + str(i), ModelTransformer(model))
        for i, model in enumerate(model_list)
    ]

    if estimator:
        return Pipeline([
            ('features', FeatureUnion(models)),
            ('estimator', estimator)
            ])
    return FeatureUnion(models)

In [86]:
def get_vec_pipe(add_len=True, tfidf_params={}):
    """Return a one-step Pipeline (``vec``) holding a ``TfIdfLen`` vectoriser."""
    steps = [('vec', TfIdfLen(add_len, **tfidf_params))]
    return Pipeline(steps)

In [109]:
def get_pattern_pipe(patterns):
    """Turn ``(pattern, params)`` pairs into named ``MatchPattern`` steps.

    ``params`` must contain a ``"name"`` entry; the step is called
    ``<name>_<position>`` and the remaining entries are passed to
    ``MatchPattern`` as keyword arguments. The input dicts are not modified.
    """
    pipes = []
    for position, (patt, params) in enumerate(patterns):
        options = params.copy()
        label = options.pop("name")
        step_name = label + "_" + str(position)
        pipes.append((step_name, MatchPattern(pattern=patt, **options)))
    return pipes

In [88]:
def get_len_pipe(use_tfidf=True, vec_pipe=None):
    """Build a Pipeline ending in a ``Length`` step.

    When ``use_tfidf`` is true, ``vec_pipe`` is placed in front as the ``vec``
    step.
    """
    length_step = ("length", Length(use_tfidf))
    if use_tfidf:
        steps = [("vec", vec_pipe), length_step]
    else:
        steps = [length_step]
    return Pipeline(steps)

In [140]:
# (regex, MatchPattern options) pairs consumed by get_pattern_pipe; "name" is
# used for the step name, the remaining keys become MatchPattern arguments.
# Entry 0: phone-number-like run of digits, spaces, parentheses and dashes.
# Entry 1: custom keyword list, matched case-insensitively.
PATTERNS = [(r"[\(\d][\d\s\(\)-]{8,15}\d", {"name": "phone",
                                            "is_len": 0}),
           (r"%|taxi|скидк|цін", {"name": "custom",
                                  "is_len": 0,
                                  "flags": re.I | re.U})
           ]

In [99]:
def build_transform_pipe(tf_params, add_len=True, vec_mode="add"):
    """Assemble the feature-extraction part of a pipeline.

    With ``vec_mode == "only"`` the tf-idf Pipeline itself is returned. For
    any other value the result is a *list* of steps: a ``converter`` followed
    by a ``union`` of the tf-idf pipe and the pattern features built from the
    module-level ``PATTERNS``.
    """
    vec_pipe = get_vec_pipe(add_len, tf_params)
    if vec_mode == "only":
        return vec_pipe

    union_steps = [('vec', vec_pipe)]
    union_steps.extend(get_pattern_pipe(PATTERNS))
    return [
        ('converter', Converter()),
        ('union', FeatureUnion(union_steps)),
    ]

In [243]:
def build_classifier(name, seed=25):
    """Create a classifier together with its hyper-parameter grids.

    Parameters
    ----------
    name : str
        ``"logit"`` for logistic regression or ``"nb"`` for multinomial
        naive Bayes.
    seed : int
        Random state for the logistic regression.

    Returns
    -------
    estimator
        The model with three extra attributes: ``name``, ``grid_s`` (search
        grid) and ``grid_b`` (single-point grid). Grid keys are prefixed with
        ``<name>__`` so they address a pipeline step of that name.

    Raises
    ------
    ValueError
        If ``name`` is not a supported classifier.
    """
    if name == "logit":
        model = LogisticRegression(C=1, class_weight="balanced", random_state=seed, penalty="l2")
        model.grid_s = {f'{name}__C': (0.1, 0.2, 0.3, 0.4, 0.5, 1, 5, 10)}
        model.grid_b = {f'{name}__C': [1]}
    elif name == "nb":
        model = MultinomialNB(alpha=0.1) #class_prior=[0.5, 0.5])
        model.grid_s = {f'{name}__alpha': (0.1, 0.5, 1, 5, 10)}
        model.grid_b = {f'{name}__alpha': [1]}
    else:
        # Previously an unknown name fell through to an UnboundLocalError.
        raise ValueError(f"Unknown classifier name: {name!r}; expected 'logit' or 'nb'")
    model.name = name
    return model

In [92]:
def get_estimator_pipe(name, model, tf_params, vec_mode="add"):
    """Build a full pipeline: feature extraction followed by ``model``.

    Parameters
    ----------
    name : str
        Name of the final pipeline step; also stored as ``pipe.name``.
    model : estimator
        Final estimator.
    tf_params : dict
        Tf-idf settings forwarded to ``build_transform_pipe``.
    vec_mode : str
        Forwarded to ``build_transform_pipe``.

    Returns
    -------
    Pipeline
    """
    transform = build_transform_pipe(tf_params, vec_mode=vec_mode)
    if isinstance(transform, Pipeline):
        # vec_mode == "only" yields a ready Pipeline rather than a list of
        # steps; wrap it as a single step instead of calling .append on it.
        chain = [("vec", transform)]
    else:
        chain = list(transform)
    chain.append((name, model))
    pipe = Pipeline(chain)
    pipe.name = name
    return pipe

In [93]:
def get_all_classifiers(names):
    """Build one classifier per name, preserving the order of ``names``."""
    classifiers = []
    for name in names:
        classifiers.append(build_classifier(name))
    return classifiers

In [144]:
# Classifier names understood by build_classifier; used as the default set
# of models to tune.
NAMES = ["logit", "nb"]

In [145]:
def build_all_pipes(tf_params, vec_mode="add", names=NAMES):
    """Return one estimator pipeline per classifier name, in order."""
    pipes = []
    for clf in get_all_classifiers(names):
        pipes.append(get_estimator_pipe(clf.name, clf, tf_params, vec_mode))
    return pipes

In [152]:
def preprocess(data):
    """Drop rows without text and strip line breaks from the remaining texts.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.
    """
    # Copy the filtered slice so the assignment below writes to an independent
    # frame instead of a view of the caller's data (SettingWithCopyWarning).
    data = data.loc[data.text.notnull()].copy()
    # regex=True is explicit: pandas >= 2.0 defaults to literal replacement.
    data["text"] = data["text"].str.replace(r"[\n\r]+", "", regex=True)
    return data

In [154]:
def load_data(filename=file_out):
    """Read a workbook from ``data_dir`` and run it through ``preprocess``."""
    raw = pd.read_excel(data_dir / filename)
    return preprocess(raw)

In [244]:
def grid_search(tf_params, filename=file_out, random_state=25, vec_mode="all",
                n_splits=5, log=True, grid="grid_s", transformer_grid={},
                scoring="f1", estimator_names=NAMES):
    """Tune every configured classifier pipeline with cross-validated grid search.

    Parameters
    ----------
    tf_params : dict
        Tf-idf settings used to build the pipelines.
    filename : str
        Workbook (under ``data_dir``) loaded through ``load_data``.
    random_state : int
        Seed of the shuffled ``StratifiedKFold`` splitter.
    vec_mode : str
        Forwarded to ``build_all_pipes``.
    n_splits : int
        Number of cross-validation folds.
    log : bool
        Print progress and per-candidate scores.
    grid : str
        Name of the grid attribute read from each pipeline's final estimator
        (``"grid_s"`` or ``"grid_b"``).
    transformer_grid : dict
        Extra parameter grid entries merged into every estimator grid.
    scoring : str
        Scoring name passed to ``GridSearchCV``.
    estimator_names : list of str
        Classifiers to tune.

    Returns
    -------
    (best, best_scores) : (list, list of dict)
        The refitted best pipeline per classifier and, for each, a dict with
        ``params``, ``mean``, ``scores`` (per-fold test scores) and ``std``.
    """
    data = load_data(filename)
    X, y = data["text"], data["label"]
    cv_splitter = StratifiedKFold(n_splits=n_splits, random_state=random_state,
                                  shuffle=True)
    # Build pipelines
    pipes = build_all_pipes(tf_params, names=estimator_names, vec_mode=vec_mode)

    best = []
    best_scores = []
    for i, pipe in enumerate(pipes):
        if log:
            print(f"Hypertuning model {i+1} out of {len(pipes)}: {pipe.name}")
            print("================================================================================")

        # Work on a copy so the grid stored on the estimator is not modified.
        current_grid = dict(getattr(pipe.steps[-1][1], grid))
        current_grid.update(transformer_grid)
        gs = GridSearchCV(pipe, current_grid, scoring=scoring, cv=cv_splitter, n_jobs=-1, verbose=False)
        gs.fit(X, y)

        # `grid_scores_` no longer exists; rebuild the same per-candidate view
        # from `cv_results_` (one row per candidate, one column per fold).
        results = gs.cv_results_
        fold_scores = np.column_stack(
            [results[f"split{k}_test_score"] for k in range(n_splits)]
        )

        if log:
            print(f"Best score on training set (CV): {gs.best_score_:0.3f}" )
            print("Best parameters set:")
            for params, mean_score, scores in zip(results["params"],
                                                  results["mean_test_score"],
                                                  fold_scores):
                print(f"{mean_score:0.4f} (+/-{scores.std() / 2:0.4f}) for {params}: {scores}")
        best.append(gs.best_estimator_)
        best_fold_scores = fold_scores[gs.best_index_]
        best_scores.append({"params": gs.best_params_,
                            "mean": results["mean_test_score"][gs.best_index_],
                            "scores": best_fold_scores,
                            "std": best_fold_scores.std()})
    return best, best_scores

In [224]:
# Extra grid entries for the tf-idf step, merged into every estimator grid by
# grid_search. All candidates are currently commented out, so the grid is empty.
grid_tf = {#"union__vec__vec__use_idf": [0, 1],
           #"union__vec__vec__ngram_range": [(3,3), (4,4), (5,5), (3,5), (3,4)],
           #"union__vec__vec__max_features": range(2000, 4500, 500)
          }

In [245]:
# Tune all default classifiers; the other grid_search arguments keep their defaults.
best_estimators, best_scores = grid_search(transformer_grid=grid_tf, tf_params=tf_params)

Hypertuning model 1 out of 2: logit
Best score on training set (CV): 0.955
Best parameters set:
0.9552 (+/-0.0022) for {'logit__C': 0.1}: [0.95454545 0.95       0.9519833  0.96296296 0.95634096]
0.9547 (+/-0.0035) for {'logit__C': 0.2}: [0.95670103 0.94560669 0.9519833  0.96694215 0.95218295]
0.9547 (+/-0.0035) for {'logit__C': 0.3}: [0.95670103 0.94560669 0.9519833  0.96694215 0.95218295]
0.9550 (+/-0.0036) for {'logit__C': 0.4}: [0.95867769 0.94537815 0.9519833  0.96694215 0.95218295]
0.9550 (+/-0.0036) for {'logit__C': 0.5}: [0.95867769 0.94537815 0.9519833  0.96694215 0.95218295]
0.9533 (+/-0.0042) for {'logit__C': 1}: [0.95652174 0.94291755 0.9539749  0.96694215 0.94605809]
0.9503 (+/-0.0038) for {'logit__C': 5}: [0.95435685 0.93842887 0.9519833  0.96049896 0.94605809]
0.9502 (+/-0.0039) for {'logit__C': 10}: [0.95435685 0.93842887 0.9539749  0.96033403 0.94409938]
Hypertuning model 2 out of 2: nb
Best score on training set (CV): 0.852
Best parameters set:
0.8518 (+/-0.0102) for {

In [246]:
best_scores

[{'params': {'logit__C': 0.1},
  'mean': 0.9551650645693002,
  'scores': array([0.95454545, 0.95      , 0.9519833 , 0.96296296, 0.95634096]),
  'std': 0.004458147006864015},
 {'params': {'nb__alpha': 0.1},
  'mean': 0.851805146456668,
  'scores': array([0.83450704, 0.84135472, 0.86131387, 0.8342246 , 0.88764045]),
  'std': 0.02045529803302403}]

In [237]:
# sms = "привіт заходь до нас у ввечері додому"
# ham, spam = pipe.predict_proba(sms)[0]
# print(f"Probability ham: {ham*100:0.3f}%\nProbability spam: {spam*100:.3f}%")

In [238]:
# clf_log = LogisticRegression(random_state=25, class_weight="balanced",                          
#                              C=0.1, penalty="l2")
# clf_nb = MultinomialNB(alpha=0.1)#, class_prior=[0.5, 0.5])
# clf_ensem = EnsembleBinaryClassifier(mode="averag", weights=[5, 5])  

# chain = build_transform_pipe(tf_params)
# chain.append(('estimator', clf_log))
# pipe = Pipeline(chain)

In [239]:
# pipe.fit(X_train, y_train)
# pred = pipe.predict(X_test)
# proba = pipe.predict_proba(X_test)[:, 1]
# output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
#                                            print_=True, mode="binary")