In [11]:
#%load_ext autoreload

In [12]:
#%autoreload 1

In [13]:
#%aimport src.transformers
#%aimport src.pipeline

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
import numpy as np
import re
import regex
import dill
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from functools import partial
from scipy import sparse

In [3]:
from src.config import data_dir, models_dir
from src.helpers import (calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class, 
                         init_dir, save_model, load_model, print_dict)
from src.transformers import TfIdfLen, ModelTransformer, MatchPattern, Length, Converter, Transformer
from src.pipeline import (grid_search, analyze_model, load_data, build_transform_pipe, TF_PARAMS, PATTERNS,
                          get_vec_pipe, get_pattern_pipe)

In [4]:
data = load_data()

In [5]:
X = data["text"]
y = data["label"]
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


#### Features

In [6]:
GRUBER_URLINTEXT_PAT = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)
                                  (?:[^\s()<>]|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+
                                  (?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>
                                  ?\xab\xbb\u201c\u201d\u2018\u2019]))""", re.X)
WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro
                |tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh
                |bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy
                |cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi
                |gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo
                |jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk
                |ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe
                |pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl
                |sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug
                |uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?
                \([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|
                [^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.]
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post
                |pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|
                bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co
                |cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga
                |gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in
                |io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu
                |lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng
                |ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa
                |sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk
                |tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za
                |zm|zw)\b/?(?!@)))"""

In [31]:
tf_params = {'lowercase': True,
 'analyzer': 'char_wb',
 'stop_words': None,
 'ngram_range': (4, 4),
 'min_df': 0.0,
 'max_df': 1.0,
 'preprocessor': None,
 'max_features': 4000,
 'norm': '',
 'use_idf': 1}
patterns = [(r"[\(\d][\d\s\(\)-]{8,15}\d", {"name": "phone",
                                            "is_len": 0}),
           (r"%|taxi|скидк|цін", {"name": "custom",
                                  "is_len": 0,
                                  "flags": re.I | re.U}),
           (r"[+-<>/^]", {"name": "math_ops", "is_len": 0}),
           (r"[.]", {"name": "dot", "is_len": 0}),
           (WEB_URL_REGEX, {"name": "url", "is_len": 0, "flags": re.X}),
           #(r"\p{Sc}", {"name": "currency", "is_len": 0, "lib": "regex", "flags": 0}),
            (u"[$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6]",
            {"name": "currency", "is_len": 0, "flags": re.U}),
           (r"[!*&#~]", {"name": "special_symbols", "is_len": 0})
           ]

In [30]:
p = u"[$¢£¤¥֏؋৲৳৻૱௹฿៛\u20a0-\u20bd\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6]"
re.findall(p, "$", re.U)

['$']

In [13]:
pattern_pipes = []
for i, (patt, params) in enumerate(patterns):
    kwargs = params.copy()
    name = kwargs.pop("name") + "_" + str(i)
    transformer = MatchPattern(pattern=patt, **kwargs)
    pattern_pipes.append((name, transformer))

In [27]:
vec_pipe = get_vec_pipe(True, tf_params)
chain = [
    ('converter', Converter()),
    ('union', FeatureUnion([
        ('vec', vec_pipe),
        *pattern_pipes
    ]))
]


In [32]:
best_estimators, best_scores = grid_search(patterns=patterns, estimator_names=["logit"])

Hypertuning model 1 out of 1: logit
Best score on training set (CV): 0.955
Best parameters set:
0.9544 (+/-0.0010) for {'logit__C': 0.1}: [0.95670103 0.95218295 0.9519833  0.95473251 0.95652174]
0.9547 (+/-0.0026) for {'logit__C': 0.2}: [0.96082474 0.94780793 0.9519833  0.96066253 0.95238095]
0.9543 (+/-0.0027) for {'logit__C': 0.3}: [0.96082474 0.94780793 0.9519833  0.96066253 0.95020747]
0.9534 (+/-0.0032) for {'logit__C': 0.4}: [0.96082474 0.94560669 0.9519833  0.96066253 0.94802495]
0.9534 (+/-0.0032) for {'logit__C': 0.5}: [0.96082474 0.94560669 0.9519833  0.96066253 0.94802495]
0.9509 (+/-0.0035) for {'logit__C': 1}: [0.95670103 0.94092827 0.95416667 0.95850622 0.94409938]
0.9487 (+/-0.0043) for {'logit__C': 5}: [0.95867769 0.93446089 0.9519833  0.95416667 0.94409938]
0.9474 (+/-0.0040) for {'logit__C': 10}: [0.95454545 0.93446089 0.9519833  0.95416667 0.94190871]
