In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport src.config
%aimport src.helpers

In [4]:
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [10]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
import numpy as np
import re
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier
from functools import partial
from scipy import sparse

In [11]:
from src.config import data_dir
from src.helpers import calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class

#### Process raw SMS data

In [51]:
# Walk the XML backup and keep one dict per <sms> element.
filename = "karim-sms-allow.xml"
source = data_dir / filename
data = []
for event, elem in iterparse(source):
    if elem.tag != "sms":
        continue
    attrs = elem.attrib
    data.append({
        "text": attrs["body"],
        "contact_name": attrs["contact_name"],
        "address": attrs["address"],
        "timestamp": int(attrs["date"]),  # kept as the raw integer found in the file
        "type": attrs["type"],
    })

In [55]:
# Dump the parsed messages to a spreadsheet without the index column
# (presumably the starting point for manual labelling — confirm).
df = pd.DataFrame(data)
df.to_excel(data_dir / "karim-sms-allow.xlsx", index=False)

#### Read labeled data

In [205]:
# Load the "total sms" sheet of the labeled workbook.
labeled_filename = "karim-sms-allow-labeled.xlsx"
labeled = pd.read_excel(data_dir / labeled_filename, sheet_name="total sms")
# Values are divided by 1000 before datetime.fromtimestamp — presumably epoch milliseconds; confirm.
labeled["timestamp"] = (labeled["timestamp"] / 1000).map(datetime.fromtimestamp)
# resp = 0 for this source; the form-responses frame is given resp = 1.
labeled["resp"] = 0

In [206]:
# Second labeled source.
labeled_filename_1 = "tanya-sms-all.xlsx"
labeled_1 = pd.read_excel(data_dir / labeled_filename_1)
# Timestamps are parsed from strings in month-day-year format.
date_format = "%m-%d-%Y %H:%M:%S"
labeled_1["timestamp"] = labeled_1["timestamp"].map(lambda x: datetime.strptime(x, date_format))
# Drop every message whose contact_name equals `exclude`.
exclude = "Karimushka"
labeled_1 = labeled_1.loc[~(labeled_1.contact_name==exclude)]

In [207]:
# Text label -> integer class; applied to the form responses' "label" column.
mapp = {"ham": 0, "spam": 1}

In [208]:
# Form responses: rename the form's columns to the names used by the other sources.
responses_filename = "SMS Data Collection (Responses).xlsx"
responses = pd.read_excel(data_dir / responses_filename)
responses = responses.rename(columns={"SMS text": "text", 
                                      "Is it a spam or ham?": "label",
                                     "Timestamp": "timestamp"})
# resp = 1 marks rows that came from the form.
responses["resp"] = 1
# Map "ham"/"spam" to 0/1; any other value is passed through unchanged.
responses["label"] = responses["label"].map(lambda x: mapp.get(x, x))

In [209]:
# Stack the three sources into one frame with a fresh integer index and persist it.
total = pd.concat([labeled, responses, labeled_1], ignore_index=True)
# NOTE(review): the index is written as a column; reading the file back without index_col
# yields an extra unnamed column — confirm this is intended.
total.to_excel(data_dir / "sms-uk-total.xlsx")

In [211]:
# Check dimensionality and class imbalance
total.shape
# Class shares in percent.
total.label.value_counts(normalize=True).round(5)*100
# Count rows with missing text, then drop them.
total.text.isnull().sum()
total = total.loc[total.text.notnull()]
total.shape

(6107, 8)

0    80.138
1    19.862
Name: label, dtype: float64

3

(6104, 8)

#### Train-test split

In [212]:
# Reload the combined data set from disk and drop rows without message text.
total = pd.read_excel(data_dir / "sms-uk-total.xlsx")
total = total.loc[total.text.notnull()]

In [213]:
# Replace phone-number-like digit runs with a single placeholder token.
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the text untouched.
total["text_rep"] = total["text"].str.replace(
    r"[\(\d][\d\s\(\)-]{8,15}\d", "PHONE_NUMBER", flags=re.I, regex=True
)

In [214]:
# Remove line breaks from the message text.
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, so the character class would never match.
total["text"] = total["text"].str.replace(r"[\n\r]+", "", regex=True)

In [215]:
# Inputs come from the "text" column (not "text_rep"); the target is the "label" column.
X = total["text"]
y = total["label"]
test_size = 0.3
# Stratified split with a fixed seed so the class ratio is kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


#### Build features

In [183]:
def build_features(X_train, X_test, var="text", features=None, vectorizer=None):
    """Assemble dense train/test feature matrices from raw message text.

    Parameters
    ----------
    X_train, X_test : pandas.Series of str
        Raw messages for the two splits.
    var : str
        Unused; kept so existing calls that pass it keep working.
    features : sequence of str
        Feature groups to build. Column blocks are emitted in the order given.
        Recognised names: "tfidf", "length", "patt", "phone"; other names are ignored.
    vectorizer : object with ``fit_transform`` / ``transform``
        Required when "tfidf" is requested; it is fitted on ``X_train``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train, test)`` matrices with one row per message.

    Raises
    ------
    ValueError
        If ``features`` is empty or None, or "tfidf" is requested without a vectorizer.
    """
    if not features:
        raise ValueError("`features` must name at least one feature group")

    # Build the TF-IDF matrices up front so that "length" can rely on them
    # regardless of where "tfidf" appears in `features`.
    use_tfidf = "tfidf" in features
    if use_tfidf:
        if vectorizer is None:
            raise ValueError("a vectorizer is required for the 'tfidf' feature")
        tf_train = vectorizer.fit_transform(X_train).toarray()
        tf_test = vectorizer.transform(X_test).toarray()

    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf":
            f_train.append(tf_train)
            f_test.append(tf_test)
        elif feature == "length":
            if use_tfidf:
                # Number of non-zero TF-IDF entries per message.
                train = (tf_train > 0).sum(axis=1)[:, np.newaxis]
                test = (tf_test > 0).sum(axis=1)[:, np.newaxis]
            else:
                # Fall back to the raw character count.
                train = X_train.map(len).values[:, np.newaxis]
                test = X_test.map(len).values[:, np.newaxis]
            f_train.append(train)
            f_test.append(test)
        elif feature == "patt":
            patt = "%|taxi|скидк|цін"
            train = (X_train.str.contains(patt, regex=True, flags=re.I)
                     .astype(int).values[:, np.newaxis])
            test = (X_test.str.contains(patt, regex=True, flags=re.I)
                    .astype(int).values[:, np.newaxis])
            f_train.append(train)
            f_test.append(test)
        elif feature == "phone":
            # Compile once instead of once per message.
            phone_re = re.compile(r"[\(\d][\d\s\(\)-]{8,15}\d")

            def has_phone(text):
                return phone_re.search(text) is not None

            f_train.append(X_train.map(has_phone).values[:, np.newaxis])
            f_test.append(X_test.map(has_phone).values[:, np.newaxis])
    return np.concatenate(f_train, axis=1), np.concatenate(f_test, axis=1)

In [184]:
# Keyword arguments for TfidfVectorizer: 4-character n-grams built inside word
# boundaries, capped at 4000 features.
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": None,
             "ngram_range": (4, 4),
             "min_df": 0.0,
             "max_df": 1.0,
             "preprocessor": None,
             "max_features": 4000,
             # The original `"l2"*0` evaluated to "" to switch normalisation off;
             # None is the documented way to do that and passes parameter validation.
             "norm": None,
             # A real bool instead of 1, for the same validation reason.
             "use_idf": True
             }

In [27]:
# Remove Top N features
# top = 100
# r = tfidf_train.toarray().sum(axis=1)
# topn_ids = np.argsort(r)[::-1][:top]
# voc = [f for i,f in enumerate(features) if i not in topn_ids]
# tf_params["vocabulary"] = None#voc

In [216]:
vectorizer = TfidfVectorizer(**tf_params)
# Sparse TF-IDF matrices, kept separately from the dense feature matrices built below.
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
# Feature groups to build; column blocks follow this order.
features = [
            "tfidf", 
            "length",
            "phone",
            "patt",
]
# build_features fits the same vectorizer on X_train again internally.
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer, var="text")

In [26]:
# features = vectorizer.get_feature_names()
# dfs = top_feats_by_class(tfidf_train, y_train, features, min_tfidf=0.1, top_n=25)
# plot_tfidf_classfeats_h(dfs)

#### Fit Naive Bayes

In general, it is much worse to misclassify a ham SMS as spam than to let a
spam message pass the filter. It is therefore desirable to be able to bias
the filter towards classifying SMS as ham, yielding higher precision at the expense of recall.

In [29]:
def predict_class(tf, X_test, clf, w=1.5):
    """Classify with a length-dependent threshold on the log-odds.

    Parameters
    ----------
    tf : matrix with ``toarray()``
        Per-message term matrix; the count of its non-zero entries per row is
        used as the message length.
    X_test : array-like
        Features passed to ``clf.predict_proba``.
    clf : fitted classifier exposing ``predict_proba``
    w : float
        The per-row threshold is ``length * log(w)``, so values above 1 bias the
        decision towards class 0.

    Returns
    -------
    y_pred : numpy.ndarray of int
        1 where the log-odds exceed the threshold, else 0.
    ratios : numpy.ndarray
        ``log P(class 1) - log P(class 0)`` per row.
    thresholds : numpy.ndarray
        Threshold applied to each row.
    """
    probas = clf.predict_proba(X_test)
    ratios = np.log(probas[:, 1]) - np.log(probas[:, 0])
    lengths = (tf.toarray() > 0).sum(axis=1)
    thresholds = lengths * np.log(w)
    # Derive the output from `ratios` itself; the previous version sized it
    # from the notebook-level `y_test`, tying the function to one data set.
    y_pred = (ratios > thresholds).astype(int)
    return y_pred, ratios, thresholds

In [1492]:
# NOTE(review): max_features="auto" has been removed from recent scikit-learn releases
# ("sqrt" is the classifier equivalent) — confirm against the installed version.
# This estimator is replaced by the LogisticRegression assigned to `clf` in the next cell.
clf = RandomForestClassifier(min_samples_leaf=5, min_samples_split=15,
                             n_estimators=100, max_depth=20, max_features="auto", 
                             class_weight="balanced")

In [231]:
# L2-penalised logistic regression (C=0.01) with balanced class weights,
# trained on the dense feature matrix from build_features.
clf = LogisticRegression(random_state=25, class_weight="balanced", 
                         C=0.01, penalty="l2")
#clf = MultinomialNB(alpha=0.01)#, class_prior=[0.5, 0.5])
clf.fit(train, y_train)
#pred, ratios, thresholds = predict_class(tfidf_test, test, clf, w=1.2)
pred = clf.predict(test)
# Second predict_proba column is used as the spam score.
proba = clf.predict_proba(test)[:, 1]
output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
                                           print_=True, mode="binary")

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

AUC: 0.997
Recall: 0.970
Precision: 0.957
F1: 0.963
Accuracy: 0.985

Confusion matrix:
      pred_ham  pred_spam
ham       1451         16
spam        11        353

Report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1467
          1       0.96      0.97      0.96       364

avg / total       0.99      0.99      0.99      1831



In [203]:
# NOTE(review): fn_i is assigned in the following cell; on a fresh kernel this cell
# fails unless that cell has been run first.
X_test.iloc[fn_i[:2]]
total.loc[3469]

3956    Ждем на выходных в Кувшине!Барашек на вертеле,...
3469     "OSCHADBANK"  VASHU KARTKU ZABLOKOVANO.  Dlya...
Name: text, dtype: object

address                                                       NaN
contact_name                                                  NaN
label                                                           1
resp                                                            1
service                                                       NaN
text             "OSCHADBANK"  VASHU KARTKU ZABLOKOVANO.  Dlya...
timestamp                              2018-04-12 16:55:09.238000
type                                                          NaN
text_rep         "OSCHADBANK"  VASHU KARTKU ZABLOKOVANO.  Dlya...
Name: 3469, dtype: object

In [233]:
# Positions within the test split of false positives and false negatives.
fp_i = np.where((pred==1) & (y_test==0))[0]
fn_i = np.where((pred==0) & (y_test==1))[0]
# Print every message labelled 1 that was predicted as 0.
for el in X_test.iloc[fn_i].values:
    print(el+"\n")

Ждем на выходных в Кувшине!Барашек на вертеле,ароматная шурпа,шашлык на мангале,хинкали!Детские развлечения с аниматорами!(067)4687258

Ispolnyajte zhelaniya s kartoj Universalnaya ot A-Banka, limit - do 50 tys. grn. My zhdem Vas: Kiev, Akademika Korolyova 8A.

З 19.04.17 змінюється тариф на роумінг в Білорусі: дзвінки 10хв - 45грн, 100МБ- 55грн, 15 вихідних SMS- 20грн. Пакет до кінця доби (за Київським часом), не залежить від оператора в країні. Кількість пакетів необмежена. Деталі: s.lifecell.ua/74 

ФИНАЛЬНАЯ РАСПРОДАЖА⚠️   ⚠️   ⚠️До - 70% на более 1000 ароматов популярной парфюмерии.🌺*** Спешите. До конца акции осталось 3 дня ***✈️    

 "OSCHADBANK"  VASHU KARTKU ZABLOKOVANO.  Dlya razblokyvanya zvernicya do kontakt-centru za nom. 0919535656 goryacay liniya OSCHADBANK   

Горит Кипр! Туры в августе от 680 евро на двоих! Звоните, приходите- Туи Турагентство на Ольжича,9.

MGI запрошує Вас на концерт хору із Арканзасу сьогодні 26.04 о 19:00

Замовляйте вигідний Інтернет-пакет 1ГБ на

#### Fit XGBoost

In [762]:
# Booster settings handed to xgb.train; n_estimators and early_stopping_rounds
# are also read back out of this dict by the training call.
#params['scale_pos_weight'] = sum(y_train==0) / sum(y_train==1)
params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 100,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'seed': 27,
    'n_jobs': -1,
    "eval_metric": ["error", "auc"],
    "early_stopping_rounds": 50,
}

In [763]:
# Wrap the feature matrices and labels in DMatrix objects for the native xgboost API.
dtrain = xgb.DMatrix(train, y_train)
dtest = xgb.DMatrix(test, y_test)
# (DMatrix, name) pairs passed as `evals` to xgb.train.
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [764]:
# n_estimators and early_stopping_rounds are read back out of `params` for xgb.train's own arguments.
# NOTE(review): eval_set contains dtest, so early stopping is monitored on the test split —
# confirm that this is acceptable for the metrics reported afterwards.
model = xgb.train(dtrain=dtrain, num_boost_round=params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-error:0.212165	train-auc:0.817627	eval-error:0.211731	eval-auc:0.80761
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
[50]	train-error:0.188551	train-auc:0.843192	eval-error:0.197425	eval-auc:0.825158
Stopping. Best iteration:
[20]	train-error:0.212165	train-auc:0.842316	eval-error:0.211731	eval-auc:0.828377



In [765]:
pred_proba_xgb = model.predict(dtest)
# Hard labels in the same dtype as the scores: 1 where the score reaches 0.5, else 0.
pred_xgb = (pred_proba_xgb >= 0.5).astype(pred_proba_xgb.dtype)
xgb_metrics = calc_metrics(
    y_test, pred_xgb, pred_proba_xgb,
    mode="binary", labels=["ham", "spam"], print_=True,
)

AUC: 0.825
Recall: 0.345
Precision: 0.554
F1: 0.425
Accuracy: 0.803

Confusion matrix:
      pred_ham  pred_spam
ham        510         41
spam        97         51

Report:
             precision    recall  f1-score   support

          0       0.84      0.93      0.88       551
          1       0.55      0.34      0.43       148

avg / total       0.78      0.80      0.78       699



In [55]:
def unsquash(X):
    """Return ``X`` as a column: (n,) or (1, n) -> (n, 1).

    Inputs with more than one row and more than one dimension are returned unchanged.
    ``X`` must expose ``.shape``.
    """
    if len(X.shape) == 1 or X.shape[0] == 1:
        # reshape(-1, 1) lets numpy infer the row count. The previous
        # (len(X), 1) target was (1, 1) for a (1, n) row and failed for n > 1.
        return np.asarray(X).reshape((-1, 1))
    return X

In [56]:
def squash(X):
    """Drop every length-1 axis, e.g. (n, 1) -> (n,)."""
    as_array = np.asarray(X)
    return as_array.squeeze()

In [141]:
class Squash(Transformer):
    """Pipeline step that applies ``squash`` to its input."""

    def transform(self, X, **kwargs):
        flattened = squash(X)
        return flattened

In [142]:
class Unsquash(Transformer):
    """Pipeline step that applies ``unsquash`` to its input."""

    def transform(self, X, **kwargs):
        as_column = unsquash(X)
        return as_column

In [57]:
class Transformer(TransformerMixin):
    """Base for stateless transformers.

    ``fit`` learns nothing, ``transform`` is the identity, and no parameters
    are reported. Subclasses override what they need.
    """

    def fit(self, X, y=None, **fit_params):
        # Nothing to learn.
        return self

    def transform(self, X, **transform_params):
        # Identity by default.
        return X

    def get_params(self, deep=True):
        # No tunable parameters at this level.
        return {}

In [58]:
class ModelTransformer(TransformerMixin):
    """Expose a model's predictions as a single feature column.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict`` (and ``predict_proba`` when ``probs`` is truthy)
    probs : bool
        If truthy, emit the second ``predict_proba`` column; otherwise emit ``predict``.
    """

    def __init__(self, model, probs=True):
        self.model = model
        self.probs = probs

    def get_params(self, deep=True):
        return {"model": self.model, "probs": self.probs}

    def fit(self, *args, **kwargs):
        # Delegate fitting to the wrapped model; all arguments are forwarded as-is.
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        pred = self.model.predict_proba(X)[:, 1] if self.probs else self.model.predict(X)
        return unsquash(pred)

In [59]:
class Converter(Transformer):
    """Normalise the pipeline input: Series -> its values, single str -> 1-element array."""

    def __init__(self):
        pass

    def transform(self, X, **kwargs):
        # Only these two types need converting; everything else
        # (numpy arrays included) passes through untouched.
        if isinstance(X, pd.Series):
            return X.values
        if isinstance(X, str):
            return np.array([X])
        return X

In [140]:
class Length(Transformer):
    """Emit one length value per row as a column.

    With ``use_tfidf`` truthy, the input is treated as a numeric matrix and the
    length is the count of positive entries per row; otherwise ``len`` is applied
    to each element.
    """

    def __init__(self, use_tfidf=True):
        self.use_tfidf = use_tfidf

    def get_params(self, deep=True):
        return dict(use_tfidf=self.use_tfidf)

    def transform(self, X, **kwargs):
        res = (X > 0).sum(axis=1) if self.use_tfidf else np.vectorize(len)(X)
        return unsquash(res)

In [285]:
class TfIdfLen(Transformer):
    """TF-IDF vectorizer that can append the number of non-zero terms as an extra column.

    Parameters
    ----------
    add_len : bool
        If truthy, one extra column holding the per-row count of positive
        TF-IDF entries is appended to the matrix.
    **tfidf_params
        Keyword arguments forwarded to ``TfidfVectorizer`` at fit time.
    """

    def __init__(self, add_len=True, **tfidf_params):
        self.add_len = add_len
        self.tfidf_params = tfidf_params.copy()

    def get_params(self, deep=True):
        # Return a fresh dict. The previous version handed out (and updated)
        # the internal one, so "add_len" leaked into the vectorizer kwargs.
        output = dict(self.tfidf_params)
        output["add_len"] = self.add_len
        return output

    def set_params(self, **params):
        # "add_len" belongs to this wrapper, everything else to the vectorizer.
        if "add_len" in params:
            self.add_len = params.pop("add_len")
        self.tfidf_params.update(params)
        # Return self so calls can be chained, as estimators conventionally allow.
        return self

    def fit(self, X, y=None):
        # Defensive: tolerate an "add_len" entry placed into tfidf_params directly.
        self.add_len = self.tfidf_params.pop("add_len", self.add_len)
        self.vectorizer = TfidfVectorizer(**self.tfidf_params)
        self.vectorizer.fit(X)
        return self

    def transform(self, X, **kwargs):
        res = self.vectorizer.transform(X)
        if self.add_len:
            lens = (res > 0).sum(axis=1)
            res = sparse.hstack([res, lens]).tocsr()
        return res

In [284]:
# Smoke test: fit and transform a single test message.
a = X_test.iloc[:1]#.values
l = TfIdfLen(add_len=1, **tf_params)
l.fit_transform(a)

<1x46 sparse matrix of type '<class 'numpy.float64'>'
	with 46 stored elements in Compressed Sparse Row format>

In [62]:
class MatchPattern(Transformer):
    """Regex feature: per text, either the number of matches or a 0/1 match flag.

    Parameters
    ----------
    pattern : str
        Regular expression to look for.
    is_len : bool
        If truthy, emit the number of ``findall`` results; otherwise 1 when
        the pattern is found anywhere in the text, else 0.
    flags : int
        ``re`` flags used when compiling the pattern.
    """

    def __init__(self, pattern, is_len, flags=re.U):
        self.pattern = pattern
        self.is_len = is_len
        self.flags = flags

    def get_params(self, deep=True):
        return dict(pattern=self.pattern, is_len=self.is_len, flags=self.flags)

    def transform(self, X, **kwargs):
        # Compile once per call rather than once per text.
        regex = re.compile(self.pattern, self.flags)
        if self.is_len:
            func = lambda text: len(regex.findall(text))
        else:
            func = lambda text: int(regex.search(text) is not None)
        # otypes fixes the output dtype up front, so np.vectorize also accepts
        # an empty input instead of raising.
        rez = np.vectorize(func, otypes=[int])(X)
        return unsquash(rez)

In [63]:
class EnsembleBinaryClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Combine per-model scores (one column per model) into a binary decision.

    Parameters
    ----------
    mode : str
        ``'average'`` thresholds the (weighted) mean score at 0.5; any other
        value selects (weighted) majority voting over the thresholded columns.
    weights : sequence of float, optional
        One weight per model column.
    """

    def __init__(self, mode, weights=None):
        self.mode = mode
        self.weights = weights

    def fit(self, X, y=None):
        # Nothing to learn: the inputs are already model outputs.
        return self

    def predict_proba(self, X):
        ''' Predict (weighted) probabilities '''
        probs = np.average(X, axis=1, weights=self.weights)
        return np.column_stack((1-probs, probs))

    def predict(self, X):
        ''' Predict class labels.

        Returns an (n, 1) array in 'average' mode and an (n,) array otherwise.
        '''
        # `threshold` is keyword-only in current scikit-learn, so name it explicitly.
        if self.mode == 'average':
            return binarize(self.predict_proba(X)[:,[1]], threshold=0.5)
        else:
            res = binarize(X, threshold=0.5)
            return np.apply_along_axis(lambda x: np.bincount(x.astype(int), self.weights).argmax(), axis=1, arr=res)

In [64]:
def build_ensemble(model_list, estimator=None):
    """Wrap each model in a ModelTransformer and join them in a FeatureUnion.

    Returns the FeatureUnion itself when ``estimator`` is falsy; otherwise a
    Pipeline with the union as 'features' followed by ``estimator`` as 'estimator'.
    """
    models = [
        ('model_transform' + str(i), ModelTransformer(model))
        for i, model in enumerate(model_list)
    ]
    union = FeatureUnion(models)
    if estimator:
        return Pipeline([
            ('features', union),
            ('estimator', estimator)
            ])
    return union

In [260]:
def get_vec_pipe(add_len=True, tfidf_params={}):
    """Return a one-step Pipeline ('vec') holding a TfIdfLen built from ``tfidf_params``."""
    return Pipeline([('vec', TfIdfLen(add_len, **tfidf_params))])

In [179]:
def get_pattern_pipe(patterns):
    """Turn (regex, options) pairs into named MatchPattern steps.

    The "name" option plus the pair's position forms the step name; the
    remaining options are forwarded to MatchPattern. The input dicts are not modified.
    """
    def _make_step(position, patt, params):
        kwargs = dict(params)
        step_name = kwargs.pop("name") + "_" + str(position)
        return step_name, MatchPattern(pattern=patt, **kwargs)

    return [_make_step(i, patt, params) for i, (patt, params) in enumerate(patterns)]

In [248]:
def get_len_pipe(use_tfidf=True, vec_pipe=None):
    """Return a Pipeline that ends in a Length step, preceded by ``vec_pipe`` when ``use_tfidf`` is truthy."""
    length_step = ("length", Length(use_tfidf))
    steps = [("vec", vec_pipe), length_step] if use_tfidf else [length_step]
    return Pipeline(steps)

In [181]:
# (regex, options) pairs. get_pattern_pipe uses "name" to label the step and
# forwards the remaining keys to MatchPattern. is_len=0 selects the 0/1 match flag.
PATTERNS = [(r"[\(\d][\d\s\(\)-]{8,15}\d", {"name": "phone",
                                "is_len": 0}),
           (r"%|taxi|скидк|цін", {"name": "custom",
                                  "is_len": 0,
                                  "flags": re.I | re.U})
           ]

In [261]:
def build_transform_pipe(tf_params, add_len=True, vec_mode="add"):
    """Build the feature-extraction part of the model pipeline.

    Parameters
    ----------
    tf_params : dict
        Keyword arguments for the TF-IDF vectorizer.
    add_len : bool
        Forwarded to the vectorizer step.
    vec_mode : str
        ``"only"`` returns just the vectorizer Pipeline. Any other value returns
        a list of steps: a Converter followed by a FeatureUnion of the
        vectorizer and the PATTERNS features.

    Returns
    -------
    Pipeline or list of (name, step) tuples, depending on ``vec_mode``.
    """
    # Pass the options as the single `tfidf_params` argument. Spreading them with
    # ** made get_vec_pipe receive keywords it does not accept.
    vec_pipe = get_vec_pipe(add_len, tfidf_params=tf_params)
    if vec_mode == "only":
        return vec_pipe
    patt_pipe = get_pattern_pipe(PATTERNS)
    chain = [
        ('converter', Converter()),
        ('union', FeatureUnion([
            ('vec', vec_pipe),
            *patt_pipe
        ]))
    ]
    return chain

In [426]:
def build_classifier(name, seed=25):
    """Create a classifier by short name and attach its search grids.

    Parameters
    ----------
    name : str
        ``"logit"`` (logistic regression) or ``"nb"`` (multinomial naive Bayes).
    seed : int
        Random state for the logistic regression.

    Returns
    -------
    The estimator, with three extra attributes: ``grid_s`` (search grid),
    ``grid_b`` (single-point grid) and ``name``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names.
    """
    if name == "logit":
        model = LogisticRegression(C=1, class_weight="balanced", random_state=seed, penalty="l2")
        model.grid_s = {'logit__C' : (0.1, 0.5, 1, 5, 10)}
        model.grid_b = {'logit__C' : [(1)]}
    elif name == "nb":
        model = MultinomialNB(alpha=0.1) #class_prior=[0.5, 0.5])
        model.grid_s = {'nb__alpha' : (0.1, 0.5, 1, 5, 10)}
        model.grid_b = {'nb__alpha' : [(1)]}
    else:
        # Previously an unknown name fell through to an UnboundLocalError on `model`.
        raise ValueError(f"Unknown classifier name: {name!r} (expected 'logit' or 'nb')")
    model.name = name
    return model

In [429]:
def get_estimator_pipe(name, model, tf_params, vec_mode="add"):
    """Build a full Pipeline: feature steps followed by ``model`` under step name ``name``.

    The returned Pipeline also carries ``name`` as a ``.name`` attribute.
    """
    chain = build_transform_pipe(tf_params, vec_mode=vec_mode)
    # With vec_mode="only" build_transform_pipe returns a Pipeline, not a list
    # of steps. Unpack it so the estimator can be appended either way.
    if isinstance(chain, Pipeline):
        chain = list(chain.steps)
    chain.append((name, model))
    pipe = Pipeline(chain)
    pipe.name = name
    return pipe

In [423]:
def get_all_classifiers(names):
    """Build one classifier per entry of ``names``, in order."""
    return list(map(build_classifier, names))

In [427]:
def build_all_pipes(tf_params, vec_mode="add", names=["logit", "nb"]):
    """Return one estimator pipeline per classifier name, in the order of ``names``."""
    pipes = []
    for clf in get_all_classifiers(names):
        pipes.append(get_estimator_pipe(clf.name, clf, tf_params, vec_mode))
    return pipes

In [286]:
clf_log = LogisticRegression(random_state=25, class_weight="balanced",                          
                             C=0.1, penalty="l2")
clf_nb = MultinomialNB(alpha=0.1)#, class_prior=[0.5, 0.5])
# NOTE(review): mode="averag" is not the 'average' string that EnsembleBinaryClassifier.predict
# compares against, so predict would take the voting branch — confirm whether this is a typo.
clf_ensem = EnsembleBinaryClassifier(mode="averag", weights=[5, 5])  

# Feature steps plus the logistic regression as the final 'estimator' step.
chain = build_transform_pipe(tf_params)
chain.append(('estimator', clf_log))
pipe = Pipeline(chain)

In [416]:
# 5-fold stratified cross-validation with shuffling and a fixed seed.
cv_splitter = StratifiedKFold(n_splits=5, random_state=25, shuffle=True)

In [437]:
# One pipeline per default classifier name ("logit" and "nb").
pipes = build_all_pipes(tf_params)

In [451]:
# Vectorizer settings searched for every classifier, on top of the
# classifier's own grid. use_idf candidates are real booleans so they pass
# scikit-learn's parameter validation.
grid_tf = {"union__vec__vec__use_idf": [False, True],
           "union__vec__vec__ngram_range": [(3,3), (4,4), (5,5), (3,5), (3,4)],
           "union__vec__vec__max_features": range(2000, 4500, 500)}
best = []
best_score = []
grid = "grid_s"
for i, pipe in enumerate(pipes):
    print(f"Hypertuning model {i+1} out of {len(pipes)}: {pipe.name}")
    print("================================================================================")

    # Merge into a fresh dict so the grid stored on the classifier is not
    # modified when this cell is re-run.
    this_grid = {**getattr(pipe.steps[-1][1], grid), **grid_tf}
    gs = GridSearchCV(pipe, this_grid, scoring="f1", cv=cv_splitter, n_jobs=-1, verbose=False)
    # NOTE(review): the search runs on the full X, y (train and test together) although
    # the message below says "training set" — confirm whether X_train, y_train was intended.
    model = gs.fit(X, y)

    print(f"Best score on training set (CV): {gs.best_score_:0.3f}" )
    print("Best parameters set:")
    best_parameters = gs.best_estimator_.get_params()
    for param_name in sorted(this_grid.keys()):
        print(f"\t{param_name}: {best_parameters[param_name]}")

    # grid_scores_ no longer exists on GridSearchCV; rebuild the same report
    # (mean, spread, parameters, per-fold scores) from cv_results_.
    results = gs.cv_results_
    for idx, params in enumerate(results["params"]):
        scores = np.array([results[f"split{k}_test_score"][idx] for k in range(gs.n_splits_)])
        mean_score = results["mean_test_score"][idx]
        print(f"{mean_score:0.4f} (+/-{scores.std() / 2}) for {params}: {scores}")

    # Assess and predict (validation error etc.)
#     print "\nPredict: "
#     yp_val = model.predict(X_val)
#     diff = y_val - yp_val
#     print "Auroc val:", auroc(y_val, yp_val)
#     print classification_report(y_val, yp_val)
#     metrics.confusion_matrix

#     best.append(gs.best_estimator_)
#     print "10-fold cross-validation of best instance on whole set..."
#     tr_roc = np.mean(cross_validation.cross_val_score(gs.best_estimator_, X, y, cv=10, scoring='roc_auc', verbose=2))
#     best_score.append(tr_roc)
#     print "Mean score: ",  tr_roc
#     print "================================================================================"

Hypertuning model 1 out of 2: logit
Best score on training set (CV): 0.957
Best parameters set:
	logit__C: 0.5
	union__vec__vec__max_features: 4000
	union__vec__vec__ngram_range: (4, 4)
	union__vec__vec__use_idf: 1
0.9487 (+/-0.004049733402745225) for {'logit__C': 0.1, 'union__vec__vec__max_features': 2000, 'union__vec__vec__ngram_range': (3, 3), 'union__vec__vec__use_idf': 0}: [0.95681063 0.95302013 0.95138889 0.93333333 0.94915254]
0.9441 (+/-0.006426988813357387) for {'logit__C': 0.1, 'union__vec__vec__max_features': 2000, 'union__vec__vec__ngram_range': (3, 3), 'union__vec__vec__use_idf': 1}: [0.95238095 0.9527027  0.93706294 0.9220339  0.95622896]
0.9449 (+/-0.004158389720650883) for {'logit__C': 0.1, 'union__vec__vec__max_features': 2000, 'union__vec__vec__ngram_range': (4, 4), 'union__vec__vec__use_idf': 0}: [0.93559322 0.94197952 0.9375     0.95652174 0.9527027 ]
0.9451 (+/-0.004089185428960916) for {'logit__C': 0.1, 'union__vec__vec__max_features': 2000, 'union__vec__vec__ngra

In [224]:
# sms = "привіт заходь до нас у ввечері додому"
# ham, spam = pipe.predict_proba(sms)[0]
# print(f"Probability ham: {ham*100:0.3f}%\nProbability spam: {spam*100:.3f}%")

In [225]:
# NOTE(review): `pipe` is whatever the name was last bound to by an earlier cell
# (it is reassigned both in the single-pipeline setup and in the grid-search loop) —
# confirm which pipeline is being evaluated here.
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Second predict_proba column is used as the spam score.
proba = pipe.predict_proba(X_test)[:, 1]
output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
                                           print_=True, mode="binary")

Pipeline(memory=None,
     steps=[('converter', <__main__.Converter object at 0x7fb867a477b8>), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('vec', Pipeline(memory=None,
     steps=[('vectorizer', <__main__.TfIdfLen object at 0x7fb867bf2f28>)])), ('phone_0', <__main__.MatchPattern object at 0x7fb867a47780>), ('c...alty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

AUC: 0.995
Recall: 0.950
Precision: 0.981
F1: 0.966
Accuracy: 0.986

Confusion matrix:
      pred_ham  pred_spam
ham        823          4
spam        11        211

Report:
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       827
          1       0.98      0.95      0.97       222

avg / total       0.99      0.99      0.99      1049

