In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport src.config
%aimport src.helpers
%aimport src.transformers

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
import numpy as np
import re
import dill
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from functools import partial
from scipy import sparse

In [22]:
from keras.layers import Dense, LSTM, Dropout, Activation, Input, Embedding, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras import regularizers, Model, Sequential, callbacks, optimizers, activations

Using TensorFlow backend.


In [7]:
from src.config import data_dir, models_dir
from src.helpers import calc_metrics, plot_tfidf_classfeats_h, top_feats_by_class, init_dir, save_model, load_model, print_dict

In [38]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

#### Process raw SMS data

In [7]:
filename = "karim-sms-allow.xml"
source = data_dir / filename
data = []
for event, elem in iterparse(source):
    if elem.tag == "sms":
        #if any(elem.attrib["body"]==r["text"] for r in data):
        #    continue
        record = {}
        record["text"] = elem.attrib["body"]
        record["contact_name"] = elem.attrib["contact_name"]
        record["address"] = elem.attrib["address"]
        record["timestamp"] = int(elem.attrib["date"])
        record["type"] = elem.attrib["type"]
        data.append(record)

In [55]:
df = pd.DataFrame(data)
df.to_excel(data_dir / "karim-sms-allow.xlsx", index=False)

#### Read labeled data

In [8]:
def build_dataset(filenames, file_out, date_format="%m-%d-%Y %H:%M:%S", is_save=1):
    output = []
    for k,v in filenames.items():
        if k == "labeled":
            df = pd.read_excel(data_dir / v, sheet_name="total sms")
            df["timestamp"] = (df["timestamp"] / 1000).map(datetime.fromtimestamp)
            df["resp"] = 0
            df["source"] = "K"
            output.append(df)
        elif k == "labeled_1":
            df = pd.read_excel(data_dir / v)
            df["resp"] = 0
            df["timestamp"] = df["timestamp"].map(lambda x: datetime.strptime(x, date_format))
            exclude = ["Karimushka"]
            df = df.loc[~(df.contact_name.isin(exclude))]
            df["source"] = "T"
            output.append(df)
        else:
            df = pd.read_excel(data_dir / v)
            df = df.rename(columns={"SMS text": "text", 
                                    "Is it a spam or ham?": "label",
                                    "Timestamp": "timestamp"})
            df["resp"] = 1
            df["label"] = df["label"].map(lambda x: LABEL_MAP.get(x, x))
            output.append(df)
    df = pd.concat(output, ignore_index=True)
    if is_save:
        df.to_excel(data_dir / file_out)
    return df

In [10]:
LABEL_MAP = {"ham": 0, "spam": 1}
FILES = {"labeled": "karim-sms-allow-labeled.xlsx",
         "labeled_1": "tanya-sms-all.xlsx",
         "responses": "SMS Data Collection (Responses).xlsx"}
file_out = "sms-uk-total.xlsx"
total = build_dataset(FILES, file_out=file_out)

In [14]:
# Check dimensionality and class imbalance
print(total.shape)
print(total.label.value_counts(normalize=True).round(5)*100)
print(total.text.isnull().sum())
total = total.loc[total.text.notnull()]
print(total.shape)

(6104, 9)
0.0    80.079
1.0    19.921
Name: label, dtype: float64
0
(6104, 9)


In [16]:
total.loc[total.resp==1, "label"].shape

(269,)

#### Train-test split

In [17]:
total = pd.read_excel(data_dir / file_out)
total = total.loc[total.text.notnull()]

In [8]:
#total["text_rep"] = total["text"].str.replace(r"[\(\d][\d\s\(\)-]{8,15}\d", "PHONE_NUMBER", flags=re.I)

In [15]:
total["text"] = total["text"].str.replace(r"[\n\r]+", "")

In [16]:
X = total["text"]
y = total["label"]
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 4272, Num. of test: 1831


In [29]:
total.shape[0] - 249

5855

#### Build features

In [17]:
def build_features(X_train, X_test, var="text", features=None, vectorizer=None):
    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf":
            tf_train = vectorizer.fit_transform(X_train).toarray()
            tf_test = vectorizer.transform(X_test).toarray()
            f_train.append(tf_train)
            f_test.append(tf_test)
        if feature == "length":
            if "tfidf" in features:
                train = (tf_train>0).sum(axis=1)[:, np.newaxis]
                test = (tf_test>0).sum(axis=1)[:, np.newaxis]
            else:
                train = X_train.map(len).values[:, np.newaxis]
                test = X_test.map(len).values[:, np.newaxis]
            f_train.append(train)
            f_test.append(test)
        if feature == "patt":
            patt = "%|taxi|скидк|цін"
            train = (X_train.str.contains(patt, regex=True, flags=re.I)
                     .astype(int).values[:, np.newaxis])
            test = (X_test.str.contains(patt, regex=True, flags=re.I)
                    .astype(int).values[:, np.newaxis])
            f_train.append(train)
            f_test.append(test)
        if feature == "phone":
            patt = r"[\(\d][\d\s\(\)-]{8,15}\d"
            train = X_train.map(lambda x: len(re.findall(patt, x))>0).values[:, np.newaxis]
            test = X_test.map(lambda x: len(re.findall(patt, x))>0).values[:, np.newaxis]
            f_train.append(train)
            f_test.append(test)
    return np.concatenate((f_train), axis=1), np.concatenate((f_test), axis=1)

In [18]:
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": None,
             "ngram_range": (4, 4),
             "min_df": 0.0,
             "max_df": 1.0,
             "preprocessor": None,#Preprocessor(),
             "max_features": 3500,
             "norm": "l2"*0,
             "use_idf": 1
             }

In [27]:
# Remove Top N features
# top = 100
# r = tfidf_train.toarray().sum(axis=1)
# topn_ids = np.argsort(r)[::-1][:top]
# voc = [f for i,f in enumerate(features) if i not in topn_ids]
# tf_params["vocabulary"] = None#voc

In [96]:
vectorizer = TfidfVectorizer(**tf_params)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
features = [
            "tfidf", 
            "length",
            "phone",
            "patt",
]
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer, var="text")

#### Train FNN

In [243]:
alpha = 1e-8
input_tfidf = Input(shape=(train.shape[1],))
x = Dense(100, #activation=activations.tanh,
         kernel_regularizer=regularizers.l2(alpha),
         use_bias=1
         )(input_tfidf)
x = Dropout(0.5)(x)
# x = Dense(50, activation=activations.tanh,
#          kernel_regularizer=regularizers.l2(alpha))(x)
# x = Dropout(0.25)(x)
output = Dense(1, activation="sigmoid",
               use_bias=1,
               kernel_regularizer=regularizers.l2(alpha)
              )(x)

In [244]:
model = Model(inputs=input_tfidf, outputs=output)
model.compile(loss='binary_crossentropy', 
              optimizer=optimizers.RMSprop(lr=0.01), 
              metrics=[f1])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_54 (InputLayer)        (None, 3503)              0         
_________________________________________________________________
dense_111 (Dense)            (None, 100)               350400    
_________________________________________________________________
dropout_54 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_112 (Dense)            (None, 1)                 101       
Total params: 350,501
Trainable params: 350,501
Non-trainable params: 0
_________________________________________________________________
None


In [245]:
weights = class_weight.compute_class_weight('balanced',
                                             np.unique(y_train),
                                             y_train)

In [246]:
model.fit(train, y_train, validation_data=(test, y_test), 
          epochs=10, 
          batch_size=64,
          class_weight=weights,
          verbose=2,
          shuffle=True
         )

Train on 4272 samples, validate on 1831 samples
Epoch 1/10
 - 2s - loss: 0.4711 - f1: 0.8179 - val_loss: 0.3757 - val_f1: 0.8624
Epoch 2/10
 - 1s - loss: 0.1343 - f1: 0.9512 - val_loss: 0.1672 - val_f1: 0.9434
Epoch 3/10
 - 1s - loss: 0.0780 - f1: 0.9725 - val_loss: 0.1774 - val_f1: 0.9431
Epoch 4/10
 - 1s - loss: 0.0574 - f1: 0.9823 - val_loss: 0.2194 - val_f1: 0.9421
Epoch 5/10
 - 1s - loss: 0.0825 - f1: 0.9803 - val_loss: 0.2189 - val_f1: 0.9356
Epoch 6/10
 - 1s - loss: 0.0641 - f1: 0.9871 - val_loss: 0.2840 - val_f1: 0.9213
Epoch 7/10
 - 1s - loss: 0.0482 - f1: 0.9850 - val_loss: 0.2186 - val_f1: 0.9459
Epoch 8/10
 - 1s - loss: 0.0650 - f1: 0.9850 - val_loss: 0.2280 - val_f1: 0.9399
Epoch 9/10
 - 1s - loss: 0.0421 - f1: 0.9905 - val_loss: 0.2466 - val_f1: 0.9415
Epoch 10/10
 - 1s - loss: 0.0337 - f1: 0.9899 - val_loss: 0.2237 - val_f1: 0.9405


<keras.callbacks.History at 0x7ff6a67eecf8>

In [215]:
probas = model.predict(test)
y_pred = np.zeros_like(probas)
y_pred[probas>=0.5] = 1
metrics.f1_score(y_pred=y_pred, y_true=y_test)
metrics.accuracy_score(y_pred, y_test)
metrics.precision_score(y_pred, y_test)
metrics.recall_score(y_pred, y_test)

0.9395218002812941

0.9765155652648826

0.9175824175824175

0.962536023054755

In [26]:
# features = vectorizer.get_feature_names()
# dfs = top_feats_by_class(tfidf_train, y_train, features, min_tfidf=0.1, top_n=25)
# plot_tfidf_classfeats_h(dfs)

#### Fit Naive Bayes

In general it is much worse to misclassify ham
SMS than letting spam pass the filter. So, it is desirable to be able to bias
the filter towards classifying SMS as ham, yielding higher precision at the expense of recall

In [51]:
def predict_class(tf, X_test, clf, w=1.5):
    probas = clf.predict_proba(X_test)
    ratios = np.log(probas[:, 1] ) - np.log(probas[:, 0])
    lengths = (tf.toarray()>0).sum(axis=1).T
    thresholds = lengths * np.log(w)
    y_pred = np.zeros_like(y_test)
    y_pred[ratios>thresholds] = 1
    return y_pred, ratios, thresholds

In [1492]:
clf = RandomForestClassifier(min_samples_leaf=5, min_samples_split=15,
                             n_estimators=100, max_depth=20, max_features="auto", 
                             class_weight="balanced")

In [242]:
clf = LogisticRegression(random_state=25, class_weight="balanced", 
                         C=0.02, penalty="l2")
#clf = MultinomialNB(alpha=0.01)#, class_prior=[0.5, 0.5])
clf.fit(train, y_train)
#pred, ratios, thresholds = predict_class(tfidf_test, test, clf, w=1.2)
pred = clf.predict(test)
proba = clf.predict_proba(test)[:, 1]
output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
                                           print_=True, mode="binary")

LogisticRegression(C=0.02, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=25,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

AUC: 0.997
Recall: 0.967
Precision: 0.962
F1: 0.964
Accuracy: 0.986

Confusion matrix:
      pred_ham  pred_spam
ham       1453         14
spam        12        352

Report:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1467
          1       0.96      0.97      0.96       364

avg / total       0.99      0.99      0.99      1831



In [None]:
X_test.iloc[fn_i[:2]]
total.loc[3469]

In [None]:
fp_i = np.where((pred==1) & (y_test==0))[0]
fn_i = np.where((pred==0) & (y_test==1))[0]
for el in X_test.iloc[fp_i].values:
    print(el+"\n")

#### Build Pipeline

In [7]:
from src.transformers import TfIdfLen, ModelTransformer, MatchPattern, Length, Converter
from src.pipeline import grid_search, analyze_model

In [19]:
a = X_test.iloc[:1]#.values
l = TfIdfLen(add_len=1, **tf_params)
l.fit_transform(a)

<1x88 sparse matrix of type '<class 'numpy.float64'>'
	with 88 stored elements in Compressed Sparse Row format>

In [20]:
grid_tf = {#"union__vec__vec__use_idf": [0, 1],
           #"union__vec__vec__ngram_range": [(3,3), (4,4), (5,5), (3,5), (3,4)],
           #"union__vec__vec__max_features": range(2000, 4500, 500)
          }

In [38]:
best_estimators, best_scores = grid_search(transformer_grid=grid_tf, tf_params=tf_params)

Hypertuning model 1 out of 2: logit
Best score on training set (CV): 0.955
Best parameters set:
0.9552 (+/-0.0022) for {'logit__C': 0.1}: [0.95454545 0.95       0.9519833  0.96296296 0.95634096]
0.9547 (+/-0.0035) for {'logit__C': 0.2}: [0.95670103 0.94560669 0.9519833  0.96694215 0.95218295]
0.9547 (+/-0.0035) for {'logit__C': 0.3}: [0.95670103 0.94560669 0.9519833  0.96694215 0.95218295]
0.9546 (+/-0.0039) for {'logit__C': 0.4}: [0.95867769 0.94339623 0.9519833  0.96694215 0.95218295]
0.9550 (+/-0.0036) for {'logit__C': 0.5}: [0.95867769 0.94537815 0.9519833  0.96694215 0.95218295]
0.9533 (+/-0.0042) for {'logit__C': 1}: [0.95652174 0.94291755 0.9539749  0.96694215 0.94605809]
0.9499 (+/-0.0044) for {'logit__C': 5}: [0.95435685 0.93842887 0.9519833  0.9626556  0.94190871]
0.9507 (+/-0.0050) for {'logit__C': 10}: [0.95238095 0.93842887 0.9539749  0.96680498 0.94190871]
Hypertuning model 2 out of 2: nb
Best score on training set (CV): 0.852
Best parameters set:
0.8518 (+/-0.0102) for {

In [None]:
scores, results, conf_matrix, fnp = analyze_model(model=best_estimators[0], datafile=file_out, log_fold=False)

In [237]:
# sms = "привіт заходь до нас у ввечері додому"
# ham, spam = pipe.predict_proba(sms)[0]
# print(f"Probability ham: {ham*100:0.3f}%\nProbability spam: {spam*100:.3f}%")