In [2]:
%load_ext autoreload

In [3]:
%autoreload 1

In [4]:
%aimport src.transformers
%aimport src.pipeline

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
import numpy as np
import re
import regex
import nltk
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer, word_tokenize
import dill
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from functools import partial
from scipy import sparse

In [3]:
from src.config import data_dir, models_dir
from src.helpers import (calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class, 
                         init_dir, save_model, load_model, print_dict)
from src.transformers import (TfIdfLen, ModelTransformer, MatchPattern, Length, 
                              Converter, Transformer, unsquash, Select)
from src.pipeline import (grid_search, analyze_model, load_data, build_transform_pipe, TF_PARAMS, PATTERNS,
                          get_vec_pipe, get_pattern_pipe, TOKEN_FEATURES, build_all_pipes)

In [4]:
data = load_data()

In [5]:
data["tokens"] = data["text"].map(word_tokenize)

In [6]:
X = data[["text", "tokens"]]
y = data["label"]
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


#### Features

In [7]:
GRUBER_URLINTEXT_PAT = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)
                                  (?:[^\s()<>]|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+
                                  (?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>
                                  ?\xab\xbb\u201c\u201d\u2018\u2019]))""", re.X)
WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro
                |tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh
                |bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy
                |cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi
                |gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo
                |jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk
                |ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe
                |pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl
                |sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug
                |uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?
                \([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|
                [^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.]
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post
                |pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|
                bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co
                |cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga
                |gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in
                |io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu
                |lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng
                |ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa
                |sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk
                |tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za
                |zm|zw)\b/?(?!@)))"""
CURRENCY_PATT = u"[$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6]"

In [84]:
tf_params = {'lowercase': True,
 'analyzer': 'char_wb',
 'stop_words': None,
 'ngram_range': (4, 4),
 'min_df': 0.0,
 'max_df': 1.0,
 'preprocessor': None,
 'max_features': 4000,
 'norm': '',
 'use_idf': 1}
patterns = [(r"[\(\d][\d\s\(\)-]{8,15}\d", {"name": "phone",
                                            "is_len": 0}),
           (r"%|taxi|скид(?:к|очн)|ц[іе]н|знижк|такс[иі]|промо|акц[іи]|bonus|бонус", {"name": "custom",
                                  "is_len": 0,
                                  "flags": re.I | re.U}),
#            (r"[+-<>/^]", {"name": "math_ops", "is_len": 0}),
            (r"[.]", {"name": "dot", "is_len": 0}),
#            (WEB_URL_REGEX, {"name": "url", "is_len": 0, "flags": re.X}),
            (CURRENCY_PATT, {"name": "currency", "is_len": 0, "flags": re.U}),
#            (r"[*]", {"name": "special_symbols", "is_len": 0})
            (r":\)|:\(|-_-|:p|:v|:\*|:o|B-\)|:’\(", {"name": "emoji", "is_len": 0, "flags": re.U}),
            (r"[0-9]{2,4}[.-/][0-9]{2,4}[.-/][0-9]{2,4}", {"name": "date", "is_len": 0})
           ]

In [10]:
def get_tokens_pipe(tokenizer=word_tokenize, features=TOKEN_FEATURES):
    token_features = TokenFeatures(tokenizer, features=features)
    tok_pipe = [
        ("selector", Select(["tokens"], to_np=0)),
        ('tok', token_features)]
    return Pipeline(tok_pipe)

def get_vec_pipe(add_len=True, tfidf_params={}):
    vectorizer = TfIdfLen(add_len, **tfidf_params)
    vec_pipe = [
        ('vec', vectorizer)]
    return Pipeline(vec_pipe)

In [11]:
def get_pattern_pipe(patterns):
    pipes = []
    for i, (patt, params) in enumerate(patterns):
        kwargs = params.copy()
        name = kwargs.pop("name") + "_" + str(i)
        transformer = MatchPattern(pattern=patt, **kwargs)
        pipes.append((name, transformer))
    return pipes

def get_len_pipe(use_tfidf=True, vec_pipe=None):
    len_pipe = [("length", Length(use_tfidf))]
    if use_tfidf:
        len_pipe.insert(0, ("vec", vec_pipe))
    return Pipeline(len_pipe)

In [12]:
def build_transform_pipe(tf_params=tf_params, add_len=True, vec_mode="add", patterns=patterns,
                         tokenizer=word_tokenize, features=TOKEN_FEATURES):
    vec_pipe = get_vec_pipe(add_len, tf_params)
    if vec_mode == "only":
        return vec_pipe
    patt_pipe = get_pattern_pipe(patterns)
    chain = [
        ('selector', Select(["text"], to_np=0)),
        ('converter', Converter()),
        ('union', FeatureUnion([
            ('vec', vec_pipe),
            *patt_pipe
        ]))
    ]
    tok_pipe = get_tokens_pipe(tokenizer, features)
    final_chain = FeatureUnion([("chain", Pipeline(chain)),
                                ("tok", tok_pipe)])
    return [("final_chain", final_chain)]

def build_classifier(name, seed=25):
    if name == "logit":
        model = LogisticRegression(C=1, class_weight="balanced", random_state=seed, penalty="l2")
        model.grid_s = {f'{name}__C' : (0.1, 0.2, 0.3, 0.4, 0.5, 1, 5, 10)}
        model.grid_b = {f'{name}__C' : [(1)]}
    elif name == "nb":
        model = MultinomialNB(alpha=0.1) #class_prior=[0.5, 0.5])
        model.grid_s = {f'{name}__alpha' : (0.1, 0.5, 1, 5, 10)}
        model.grid_b = {f'{name}__alpha' : [(1)]}
    model.name = name
    return model

def get_estimator_pipe(name, model, tf_params, vec_mode="add", patterns=patterns, features=TOKEN_FEATURES):
    chain = build_transform_pipe(tf_params, vec_mode=vec_mode, patterns=patterns, features=features)
    chain.append((name, model))
    pipe = Pipeline(chain)
    pipe.name = name
    return pipe

In [13]:
vec_pipe = get_vec_pipe(True, tf_params)
patt_pipe = get_pattern_pipe(patterns)
chain = [
    ('selector', Select(["text"], to_np=0)),
      ('converter', Converter()),
    ('union', FeatureUnion([
        ('vec', vec_pipe),
        *patt_pipe
   ]))
]
pipe = Pipeline(chain)
pipe.fit_transform(X_test)

<1831x4007 sparse matrix of type '<class 'numpy.float64'>'
	with 89252 stored elements in Compressed Sparse Row format>

In [14]:
def is_lower(tokens):
    return any(token.islower() for token in tokens)

def is_upper(tokens):
    return any(token.isupper() for token in tokens)

In [15]:
class TokenFeatures(Transformer):

    def __init__(self, tokenizer=word_tokenize, features=None):
        self.tokenizer = tokenizer
        self.features = features

    def get_params(self, deep=True):
        return dict()

    def _get_features(self, tokens):
        output = []
        for f in self.features:
            output.append(eval(f)(tokens))
        return np.array(output)
    
    def _job(self, text):
        tokens = self.tokenizer(text)
        return self._get_features(tokens)
        
    def transform(self, X, **kwargs):  
        rez = []
        for record in X:
            temp = self._get_features(record)
            rez.append(temp)
        return np.array(rez)

In [13]:
trf = build_transform_pipe()

In [43]:
clf = build_classifier("logit")
pipe = get_estimator_pipe(clf.name, clf, tf_params)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('final_chain', FeatureUnion(n_jobs=1,
       transformer_list=[('chain', Pipeline(memory=None,
     steps=[('selector', <src.transformers.Select object at 0x7f7c265a4278>), ('converter', <src.transformers.Converter object at 0x7f7c266157b8>), ('union', FeatureUnion(n_jobs=1,
       transform...alty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [70]:
sms = "поїдем до них на таксі?"
sms_df = pd.DataFrame({"text": sms, "tokens": word_tokenize(sms)})
ham, spam = pipe.predict_proba(sms_df)[0]
print(f"Probability ham: {ham*100:0.3f}%\nProbability spam: {spam*100:.3f}%")

Probability ham: 41.728%
Probability spam: 58.272%


In [53]:
p = r"[0-9]{2,4}[.-/][0-9]{2,4}[.-/][0-9]{2,4}"
repr(p)
re.findall(p, "21.04.2016", re.U)

"'[0-9]{2,4}[.-/\\\\][0-9]{2,4}[.-/\\\\][0-9]{2,4}'"

['21.04.2016']

#### Grid Search CV

In [85]:
best_estimators, best_scores = grid_search(patterns=patterns, estimator_names=["logit"])

Hypertuning model 1 out of 1: logit
Best score on training set (CV): 0.960
Best parameters set:
0.9587 (+/-0.0029) for {'logit__C': 0.1}: [0.9694501  0.95652174 0.95218295 0.95867769 0.95687885]
0.9599 (+/-0.0041) for {'logit__C': 0.2}: [0.97352342 0.95652174 0.95       0.96465696 0.95473251]
0.9590 (+/-0.0041) for {'logit__C': 0.3}: [0.97142857 0.95652174 0.94780793 0.96465696 0.95473251]
0.9573 (+/-0.0037) for {'logit__C': 0.4}: [0.96734694 0.95652174 0.94560669 0.9625     0.95473251]
0.9569 (+/-0.0034) for {'logit__C': 0.5}: [0.96523517 0.95652174 0.94560669 0.9625     0.95473251]
0.9556 (+/-0.0035) for {'logit__C': 1}: [0.96523517 0.95652174 0.94560669 0.96033403 0.95041322]
0.9548 (+/-0.0037) for {'logit__C': 5}: [0.96326531 0.95850622 0.94560669 0.96049896 0.94605809]
0.9527 (+/-0.0032) for {'logit__C': 10}: [0.95705521 0.95850622 0.94560669 0.95833333 0.94409938]


In [86]:
best_scores

[{'params': {'logit__C': 0.2},
  'mean': 0.9598908445178538,
  'scores': array([0.97352342, 0.95652174, 0.95      , 0.96465696, 0.95473251]),
  'std': 0.008298807992172147}]

In [37]:
best_scores

[{'params': {'logit__C': 0.2},
  'mean': 0.9599238561276642,
  'scores': array([0.97154472, 0.95867769, 0.95      , 0.9625    , 0.95687885]),
  'std': 0.007085036990868852}]

In [87]:
scores, results, conf_matrix, fnp = analyze_model(model=best_estimators[0], log_fold=False)


Overall results
AUC: 1.00 +/- 0.0011
Accuracy: 0.98 +/- 0.0036
F1: 0.96 +/- 0.0093
Precision: 0.96 +/- 0.0074
Recall: 0.96 +/- 0.0150

Averaged confusion matrix
      pred_ham  pred_spam
ham      968.8        8.6
spam      10.8      232.4

Mean metrics
accuracy: 0.984
specificity: 0.964
recall: 0.989
precision: 0.991
f1: 0.990


In [88]:
fn, fp = fnp["fn"], fnp["fp"]
for el in X.iloc[fn]["text"]:
    print(el+"\n")

Tviy kod 90F7C416K. Obminyay yogo na Pepsi-Bonus v KFC Vokzal Pivdenniy z 16.04.2015 do 16.05.2015

Z 04/07 centr VOLIA po vul.Kikvidze 1/2 bude zachineno na remont. Najblizhchi adresy dlya zvernen: vul.Vasylkivska 4, vul.Knazhyi Zaton 2/30

Women's Day When: 7 March 22:00 Where: TAO Restaurant & Dance Bar 06735

Lyubyi druzhe! Cherez nadzvychaino velyku kilkist otrymanyh lystiv - rozigrash pryziv vidbudetsya 27.05. Dyakuemo za uchast! Bazhaemo peremohy! hwclub.com.ua

Магазин "Карфур" стал больше, новый товар. Открытие 22.10 в 11:00

Время подарков пришло! Цитрус продлил работу на час! time.citrus.ua

Київ, ми знизили ціни на uberX! Відтепер поїздки по місту - від 25 грн

Використай нагоду! На вашому рахунку 31 бонусів. Витрачай до 05.03!

Заказ№4999 ожидает Вас по адресу пр.ЛесяКурбаса,16-а до 19.03

Tomorrow's forecast in SOMA South Park, San Francisco is Clear. https://m.twil.io/kYotCFy

Новые график в КАРФУР!Пт-80%Чт-70%Ср-60%Вт-50%Пн-30%Вс-10%,Сб-ЗАВОЗ!

Москитные сетки Регулиров

In [83]:
(data
 .assign(l=lambda x: x["text"].str.findall(r"%|taxi|скид(?:к|очн)|ц[іе]н|знижк|такс[иі]|промо|акц[іи]|bonus|бонус", flags=re.I|re.U).map(len))
).groupby("l")["label"].agg(["mean", "count"])

Unnamed: 0_level_0,mean,count
l,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.109932,5276
1,0.636704,534
2,1.0,220
3,1.0,63
4,1.0,7
5,1.0,1
6,1.0,1
7,1.0,1
