In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport src.transformers
%aimport src.pipeline

In [1]:
# Install a blanket "ignore" filter so no warning is shown for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings — confirm that is wanted.
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
import numpy as np
import re
import regex
import nltk
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer, StanfordTokenizer, word_tokenize
import dill
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from functools import partial
from scipy import sparse

In [4]:
from src.config import data_dir, models_dir
from src.helpers import (calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class, 
                         init_dir, save_model, load_model, print_dict)
from src.transformers import TfIdfLen, ModelTransformer, MatchPattern, Length, Converter, Transformer, unsquash
from src.pipeline import (grid_search, analyze_model, load_data, build_transform_pipe, TF_PARAMS, PATTERNS,
                          get_vec_pipe, get_pattern_pipe, TOKEN_FEATURES, build_all_pipes)

In [4]:
# Take the transformer out of the last (name, transformer) step of the transform chain.
t = build_transform_pipe()[-1][-1]
# Presumably `t` is the FeatureUnion; its last member is a pipeline whose final
# step exposes the configured token feature names.
last_member = t.transformer_list[-1][-1]
last_member.steps[-1][-1].features

['is_upper', 'is_lower']

In [5]:
# Load the dataset through the project helper; later cells read data["text"] and data["label"].
data = load_data()

In [6]:
# Hold out a stratified 30% test split with a fixed seed so the split is repeatable.
test_size = 0.3
X, y = data["text"], data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
    stratify=y,
)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


#### Features

In [7]:
# Compiled URL-in-text matcher: an "http(s)://", "www." or "domain.tld/" prefix followed by
# non-space characters with up to two levels of balanced parentheses.
# Compiled with re.X, so the line breaks/indentation between groups are ignored; the break
# inside the final character class only adds whitespace to a set that already excludes \s.
# Defined here but not referenced by any other cell in this notebook.
GRUBER_URLINTEXT_PAT = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)
                                  (?:[^\s()<>]|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+
                                  (?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>
                                  ?\xab\xbb\u201c\u201d\u2018\u2019]))""", re.X)
# Uncompiled URL pattern with an explicit list of top-level domains.
# The string contains literal line breaks and indentation, so it must be compiled with
# re.X (verbose); the commented-out "url" entry in `patterns` passes that flag.
# NOTE(review): "Ja" appears in both TLD lists (after "sj") and looks like a typo — confirm.
WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro
                |tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh
                |bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy
                |cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi
                |gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo
                |jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk
                |ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe
                |pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl
                |sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug
                |uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?
                \([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|
                [^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.]
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post
                |pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|
                bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co
                |cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga
                |gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in
                |io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu
                |lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng
                |ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa
                |sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk
                |tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za
                |zm|zw)\b/?(?!@)))"""
# Single-character class of currency signs: literal symbols, the range U+20A0-U+20BD and a
# few escaped code points. Used by the "currency" entry in `patterns`.
CURRENCY_PATT = u"[$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6]"

In [8]:
# Vectorizer settings handed to get_vec_pipe by build_transform_pipe.
# The keys look like sklearn TfidfVectorizer arguments — presumably forwarded to it; confirm in src.
# NOTE(review): 'norm' is an empty string and 'use_idf' is the integer 1 — verify the consumer
# accepts these values.
tf_params = {'lowercase': True,
 'analyzer': 'char_wb',
 'stop_words': None,
 'ngram_range': (4, 4),
 'min_df': 0.0,
 'max_df': 1.0,
 'preprocessor': None,
 'max_features': 4000,
 'norm': '',
 'use_idf': 1}
# (regex, options) pairs consumed by get_pattern_pipe: "name" becomes the step-name prefix,
# every other option is passed through to MatchPattern as a keyword argument.
# Entries that are commented out are disabled alternatives.
patterns = [(r"[\(\d][\d\s\(\)-]{8,15}\d", {"name": "phone",
                                            "is_len": 0}),
           (r"%|taxi|скид(?:к|очн)|ц[іе]н|знижк", {"name": "custom",
                                  "is_len": 0,
                                  "flags": re.I | re.U}),
#            (r"[+-<>/^]", {"name": "math_ops", "is_len": 0}),
            (r"[.]", {"name": "dot", "is_len": 0}),
#            (WEB_URL_REGEX, {"name": "url", "is_len": 0, "flags": re.X}),
            (CURRENCY_PATT, {"name": "currency", "is_len": 0, "flags": re.U}),
#            (r"[*]", {"name": "special_symbols", "is_len": 0})
            (r":\)|:\(|-_-|:p|:v|:\*|:o|B-\)|:’\(", {"name": "emoji", "is_len": 0, "flags": re.U})
           ]

In [30]:
def get_tokens_pipe(tokenizer=word_tokenize, features=TOKEN_FEATURES):
    """Return a one-step Pipeline whose single step, "tok", is a TokenFeatures transformer.

    tokenizer -- callable handed to TokenFeatures as its tokenizer.
    features  -- feature specification handed to TokenFeatures.
    """
    return Pipeline([("tok", TokenFeatures(tokenizer, features=features))])

def get_pattern_pipe(patterns):
    """Turn (regex, options) pairs into a list of (step_name, MatchPattern) tuples.

    The step name is the options' "name" value joined with the pair's position
    ("<name>_<index>"); the remaining options are forwarded to MatchPattern as
    keyword arguments. The option dicts themselves are not modified.
    """
    def _step(position, expr, options):
        # Work on a copy so popping "name" does not touch the caller's dict.
        extra = options.copy()
        label = "_".join((extra.pop("name"), str(position)))
        return label, MatchPattern(pattern=expr, **extra)

    return [_step(position, expr, options)
            for position, (expr, options) in enumerate(patterns)]

def get_len_pipe(use_tfidf=True, vec_pipe=None):
    """Build a Pipeline ending in a Length step, optionally preceded by a vectorizer.

    use_tfidf -- passed to Length; when truthy, ("vec", vec_pipe) is placed
                 in front of the "length" step.
    vec_pipe  -- the vectorizer step to prepend; it is used as given (including
                 the default None) whenever use_tfidf is truthy.
    """
    length_step = ("length", Length(use_tfidf))
    steps = [("vec", vec_pipe), length_step] if use_tfidf else [length_step]
    return Pipeline(steps)

def build_transform_pipe(tf_params=tf_params, add_len=True, vec_mode="add", patterns=patterns,
                         tokenizer=word_tokenize, features=TOKEN_FEATURES):
    """Assemble the feature-extraction steps.

    Returns
    -------
    * vec_mode == "only": whatever get_vec_pipe(add_len, tf_params) returns, as is
      (NOT wrapped in a list).
    * otherwise: a list of two (name, transformer) steps — a Converter followed by a
      FeatureUnion of the vectorizer, one MatchPattern per entry of `patterns`,
      and the token-feature pipeline.
    """
    vectorizer = get_vec_pipe(add_len, tf_params)
    if vec_mode == "only":
        return vectorizer

    # Union members in fixed order: vectorizer, regex matchers, token features.
    union_members = [("vec", vectorizer)]
    union_members.extend(get_pattern_pipe(patterns))
    union_members.append(("tok", get_tokens_pipe(tokenizer, features)))

    return [
        ("converter", Converter()),
        ("union", FeatureUnion(union_members)),
    ]

def build_classifier(name, seed=25):
    """Create a classifier by short name and attach its grid-search settings.

    Parameters
    ----------
    name : str
        "logit" -> LogisticRegression(C=1, class_weight="balanced", penalty="l2",
        random_state=seed); "nb" -> MultinomialNB(alpha=0.1).
    seed : int, default 25
        Random state for the "logit" model; not used for "nb".

    Returns
    -------
    The estimator, with three extra attributes set on it:
    ``name`` (the given name), ``grid_s`` and ``grid_b`` (parameter grids whose
    keys are prefixed with "<name>__"; presumably the search grid and a
    single-point baseline grid — confirm against grid_search).

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names. (Previously this fell
        through both branches and failed with UnboundLocalError.)
    """
    if name == "logit":
        model = LogisticRegression(C=1, class_weight="balanced", random_state=seed, penalty="l2")
        model.grid_s = {f'{name}__C': (0.1, 0.2, 0.3, 0.4, 0.5, 1, 5, 10)}
        model.grid_b = {f'{name}__C': [1]}
    elif name == "nb":
        model = MultinomialNB(alpha=0.1)  # class_prior=[0.5, 0.5])
        model.grid_s = {f'{name}__alpha': (0.1, 0.5, 1, 5, 10)}
        model.grid_b = {f'{name}__alpha': [1]}
    else:
        raise ValueError(f"Unknown classifier name {name!r}; expected 'logit' or 'nb'")
    model.name = name
    return model

def get_estimator_pipe(name, model, tf_params, vec_mode="add", patterns=patterns, features=TOKEN_FEATURES):
    """Chain the feature-extraction steps with a final estimator step.

    Parameters
    ----------
    name : str
        Step name for the estimator; also stored as ``pipe.name``.
    model :
        The estimator appended as the last step.
    tf_params, vec_mode, patterns, features :
        Forwarded to build_transform_pipe.

    Returns
    -------
    Pipeline with a ``name`` attribute.
    """
    transform = build_transform_pipe(tf_params, vec_mode=vec_mode, patterns=patterns, features=features)
    if isinstance(transform, list):
        # Copy so the list returned by build_transform_pipe is not mutated.
        steps = list(transform)
    else:
        # vec_mode="only" yields a bare vectorizer pipeline instead of a step list;
        # wrap it as a single step so the estimator can still be appended.
        steps = [("vec", transform)]
    steps.append((name, model))
    pipe = Pipeline(steps)
    pipe.name = name
    return pipe

In [54]:
class TokenFeatures(Transformer):
    """Turn each text into a vector of token-level feature values.

    Every record is tokenized with ``tokenizer``; each entry of ``features`` is
    then applied to the token list, and the results form one row of the output.

    Parameters
    ----------
    tokenizer : callable
        Called with one text, returns its tokens.
    features : iterable of (callable or str), optional
        Feature functions taking the token list. A string is resolved to a
        callable with ``eval`` (kept for backward compatibility with name-based
        configuration). ``None`` means "no features" and yields empty rows.
    """

    def __init__(self, tokenizer=word_tokenize, features=None):
        self.tokenizer = tokenizer
        self.features = features

    def get_params(self, deep=True):
        """Return the constructor arguments.

        Returning an empty dict (as before) made scikit-learn's clone rebuild
        the transformer with default arguments, silently dropping the configured
        tokenizer and features, and hid both from set_params / grid search.
        """
        return {"tokenizer": self.tokenizer, "features": self.features}

    def _resolve_feature(self, feature):
        """Return the callable for one feature specification."""
        if callable(feature):
            return feature
        # NOTE(security): eval executes arbitrary code — only pass trusted,
        # hard-coded feature names here, never user-supplied strings.
        return eval(feature)

    def _get_features(self, tokens):
        """Apply every configured feature to ``tokens`` and return the values as an array."""
        if self.features is None:
            return np.array([])
        return np.array([self._resolve_feature(spec)(tokens) for spec in self.features])

    def _job(self, text):
        """Tokenize one text and compute its feature vector."""
        return self._get_features(self.tokenizer(text))

    def transform(self, X, **kwargs):
        """Return an array with one feature row per record of ``X``."""
        return np.array([self._job(record) for record in X])

In [57]:
# Build the "logit" classifier, wrap it in the full feature pipeline and fit it.
# NOTE(review): the fit uses the complete X, y rather than X_train, y_train from the
# split cell — confirm this is intended (e.g. a final model trained on all data).
clf = build_classifier("logit")
pipe = get_estimator_pipe(clf.name, clf, tf_params)
pipe.fit(X, y)

Pipeline(memory=None,
     steps=[('converter', <src.transformers.Converter object at 0x7f6b701f2ba8>), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('vec', Pipeline(memory=None,
     steps=[('vec', <src.transformers.TfIdfLen object at 0x7f6b701f2f28>)])), ('phone_0', <src.transformers.MatchPattern object at 0x7f...alty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [66]:
# Run only the feature-extraction steps on the first message and display the shape
# of the resulting (densified) feature matrix.
trf = Pipeline(build_transform_pipe(tf_params, vec_mode="add", patterns=patterns))
trf.fit_transform(X.iloc[:1]).toarray().shape

(1, 17)

In [58]:
# Score one hand-written message with the fitted pipeline.
# Assumes the class order of predict_proba is [ham, spam] — TODO confirm via pipe.classes_.
# NOTE(review): a bare string (not a list) is passed; presumably the Converter step handles that.
sms = "привіт заходь до нас у ввечері додому"
ham, spam = pipe.predict_proba(sms)[0]
print(f"Probability ham: {ham*100:0.3f}%\nProbability spam: {spam*100:.3f}%")

Probability ham: 98.774%
Probability spam: 1.226%


In [47]:
# Sanity check: the emoji regex (same text as the "emoji" entry in `patterns`) finds "-_-".
p = r":\)|:\(|-_-|:p|:v|:\*|:o|B-\)|:’\("
re.findall(p, "-_-", re.U)

['-_-']

In [32]:
# Build the (name, MatchPattern) steps with the shared helper instead of repeating
# its loop inline; this also avoids leaking loop variables into the notebook namespace.
pattern_pipes = get_pattern_pipe(patterns)

In [9]:
# Hyper-parameter search (project helper) restricted to the "logit" estimator, using the
# regex patterns defined in this notebook; returns the best estimators and their scores.
best_estimators, best_scores = grid_search(patterns=patterns, estimator_names=["logit"])

Hypertuning model 1 out of 1: logit
Best score on training set (CV): 0.956
Best parameters set:
0.9560 (+/-0.0021) for {'logit__C': 0.1}: [0.95454545 0.95218295 0.9519833  0.96296296 0.95850622]
0.9551 (+/-0.0033) for {'logit__C': 0.2}: [0.95670103 0.94780793 0.9519833  0.96694215 0.95218295]
0.9551 (+/-0.0033) for {'logit__C': 0.3}: [0.95670103 0.94780793 0.9519833  0.96694215 0.95218295]
0.9547 (+/-0.0035) for {'logit__C': 0.4}: [0.95670103 0.94560669 0.9519833  0.96694215 0.95218295]
0.9550 (+/-0.0035) for {'logit__C': 0.5}: [0.95867769 0.9475891  0.9519833  0.96694215 0.95      ]
0.9546 (+/-0.0038) for {'logit__C': 1}: [0.95652174 0.94514768 0.95615866 0.96694215 0.94802495]
0.9520 (+/-0.0044) for {'logit__C': 5}: [0.95435685 0.94067797 0.9519833  0.96680498 0.94605809]
0.9515 (+/-0.0040) for {'logit__C': 10}: [0.95238095 0.94067797 0.9539749  0.96465696 0.94605809]


In [10]:
# Display the best score record returned by grid_search for each estimator.
best_scores

[{'mean': 0.9560346387525567,
  'params': {'logit__C': 0.1},
  'scores': array([0.95454545, 0.95218295, 0.9519833 , 0.96296296, 0.95850622]),
  'std': 0.0041851383498463365}]

In [14]:
# NOTE(review): this cell repeats the previous `best_scores` display and could be removed.
best_scores

[{'mean': 0.9560346387525567,
  'params': {'logit__C': 0.1},
  'scores': array([0.95454545, 0.95218295, 0.9519833 , 0.96296296, 0.95850622]),
  'std': 0.0041851383498463365}]

In [12]:
# Evaluate the best estimator with the project helper. `fnp` is indexed with "fn" and "fp"
# in the next cell (presumably false-negative / false-positive row positions — confirm).
scores, results, conf_matrix, fnp = analyze_model(model=best_estimators[0], log_fold=False)


Overall results
AUC: 0.99 +/- 0.0038
Accuracy: 0.98 +/- 0.0018
F1: 0.96 +/- 0.0047
Precision: 0.96 +/- 0.0031
Recall: 0.95 +/- 0.0112

Averaged confusion matrix
      pred_ham  pred_spam
ham      968.8        9.2
spam      12.0      230.6

Mean metrics
accuracy: 0.983
specificity: 0.962
recall: 0.988
precision: 0.991
f1: 0.989


In [None]:
# Pull both index collections out of the analysis result
# (presumably false negatives and false positives).
fn = fnp["fn"]
fp = fnp["fp"]
# Print every message selected by `fn`, each followed by an empty line.
for missed_text in X.iloc[fn]:
    print(missed_text + "\n")