In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport src.config
%aimport src.helpers

In [504]:
import numpy as np
from pathlib import Path
from time import time
import json
import pandas as pd
from xml.etree.ElementTree import iterparse
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np
import re

In [5]:
from src.config import data_dir
from src.helpers import calc_metrics

#### Process raw SMS data

In [51]:
# Parse the raw SMS backup and collect one flat record per <sms> element.
filename = "karim-sms-allow.xml"
source = data_dir / filename
data = []
# iterparse hands back each element once it has been fully read (default
# "end" events), so every attribute of an <sms> element is available here.
for _event, elem in iterparse(source):
    if elem.tag != "sms":
        continue
    attrs = elem.attrib
    data.append({
        "text": attrs["body"],
        "contact_name": attrs["contact_name"],
        "address": attrs["address"],
        "timestamp": int(attrs["date"]),
        "type": attrs["type"],
    })
    # The strings were copied into the record above; drop the element's
    # contents so the parsed tree does not grow with the size of the backup.
    elem.clear()

In [55]:
# Tabulate the parsed records and save them as a spreadsheet (index column omitted).
df = pd.DataFrame(data)
xlsx_path = data_dir / "karim-sms-allow.xlsx"
df.to_excel(xlsx_path, index=False)

#### Read labeled data

In [6]:
# Load the hand-labeled messages from the "total sms" sheet.
labeled_filename = "karim-sms-allow-labeled.xlsx"
labeled = (
    pd.read_excel(data_dir / labeled_filename, sheet_name="total sms")
    .assign(
        # The stored value is divided by 1000 before being handed to
        # datetime.fromtimestamp, which converts it using the local timezone.
        timestamp=lambda frame: (frame["timestamp"] / 1000).map(datetime.fromtimestamp),
        # resp = 0 marks rows that come from this file (not from the responses sheet).
        resp=0,
    )
)

In [7]:
# Text label -> numeric class used for the `label` column.
mapp = dict(ham=0, spam=1)

In [8]:
# Load the collected form responses and align their columns with `labeled`.
responses_filename = "SMS Data Collection (Responses).xlsx"
responses = (
    pd.read_excel(data_dir / responses_filename)
    .rename(
        columns={
            "SMS text": "text",
            "Is it a spam or ham?": "label",
            "Timestamp": "timestamp",
        }
    )
    .assign(
        # resp = 1 marks rows that come from the responses sheet.
        resp=1,
        # Translate "ham"/"spam" to 0/1; any other value is passed through unchanged.
        label=lambda frame: frame["label"].map(lambda value: mapp.get(value, value)),
    )
)

In [21]:
# Stack both sources into one frame with a fresh 0..n-1 index and save a copy.
frames = [labeled, responses]
total = pd.concat(frames, ignore_index=True)
total_path = data_dir / "sms-uk-total.xlsx"
total.to_excel(total_path)

In [22]:
# Check dimensionality and class imbalance.
# NOTE(review): the recorded output shows a result for every bare expression,
# so the kernel is presumably configured to display all expressions, not only
# the last one — confirm before relying on a plain Restart & Run All.
# Rows x columns before cleaning.
total.shape
# Share of each label in percent; computed BEFORE the rows without text are dropped.
total.label.value_counts(normalize=True).round(5)*100
# Number of rows with a missing message text.
total.text.isnull().sum()
# Keep only rows that have a message text (rebinds `total`).
total = total.loc[total.text.notnull()]
# Rows x columns after cleaning.
total.shape

(3497, 8)

0    78.81
1    21.19
Name: label, dtype: float64

3

(3494, 8)

#### Train-test split

In [433]:
# Hold out a stratified test set so both parts keep the same label proportions.
X, y = total["text"], total["label"]
test_size = 0.2
split_options = {"test_size": test_size, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 2795, Num. of test: 699


#### Build features

In [516]:
def build_features(X_train, X_test, var="text", features=None, vectorizer=None,
                   pattern="%|taxi|скидк|цін"):
    """Assemble dense train/test feature matrices from raw message texts.

    Parameters
    ----------
    X_train, X_test : pandas.Series
        Message texts. "patt" uses the ``.str`` accessor on them; "tfidf"
        passes them to ``vectorizer`` as they are.
    var : str
        Unused; kept so existing calls that pass it keep working.
    features : str or iterable of str
        Feature groups to build, processed in the given order:

        * ``"tfidf"`` - output of ``vectorizer`` (fit on ``X_train``, applied
          to ``X_test``).
        * ``"len"``   - number of non-zero vectorizer columns per message.
          It is only produced together with ``"tfidf"`` and is always placed
          right after the tfidf columns, wherever it appears in ``features``.
        * ``"patt"``  - 0/1 flag telling whether ``pattern`` occurs in the text
          (case-insensitive).

        Names that are not recognised are ignored.
    vectorizer : object, optional
        Must offer ``fit_transform`` and ``transform``; required for "tfidf".
        Its result may be sparse (anything with ``toarray``) or already dense.
    pattern : str
        Regular expression used by the "patt" feature.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train, test)``, each 2-D with one row per message and the feature
        groups concatenated column-wise.

    Raises
    ------
    TypeError
        If ``features`` is None.
    ValueError
        If "tfidf" is requested without a vectorizer, or if the selection
        yields no feature columns at all.
    """
    if features is None:
        raise TypeError("features must be an iterable of feature names, got None")
    # A bare string would otherwise be iterated character by character.
    features = [features] if isinstance(features, str) else list(features)
    if "tfidf" in features and vectorizer is None:
        raise ValueError("the 'tfidf' feature requires a vectorizer")
    with_len = "len" in features

    def _dense(matrix):
        # Sparse vectorizer output is densified; dense output is used as is.
        return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)

    def _pattern_flag(texts):
        # na=False: a missing text counts as "no match" instead of breaking astype(int).
        matched = texts.str.contains(pattern, regex=True, flags=re.I, na=False)
        return matched.astype(int).values[:, np.newaxis]

    f_train = []
    f_test = []
    for feature in features:
        if feature == "tfidf":
            train = _dense(vectorizer.fit_transform(X_train))
            test = _dense(vectorizer.transform(X_test))
            f_train.append(train)
            f_test.append(test)
            if with_len:
                f_train.append((train > 0).sum(axis=1)[:, np.newaxis])
                f_test.append((test > 0).sum(axis=1)[:, np.newaxis])
        elif feature == "patt":
            f_train.append(_pattern_flag(X_train))
            f_test.append(_pattern_flag(X_test))
    if not f_train:
        raise ValueError(
            "no feature columns were built; request 'tfidf' and/or 'patt' "
            "('len' is only available together with 'tfidf')"
        )
    return np.concatenate(f_train, axis=1), np.concatenate(f_test, axis=1)

In [549]:
# Vectorizer settings: lower-cased character 4-grams taken inside word
# boundaries, capped at 3500 features.
# IDF weighting and row normalisation are both switched off. They are spelled
# with the documented values (False / None) rather than falsy stand-ins
# (0 / empty string), which are not valid parameter values.
tf_params = {
    "lowercase": True,
    "analyzer": "char_wb",
    "stop_words": None,
    "ngram_range": (4, 4),
    "min_df": 0.0,
    "max_df": 1.0,
    "preprocessor": None,
    "max_features": 3500,
    "norm": None,
    "use_idf": False,
}

In [550]:
# Fit the vectorizer on the training texts only, then apply it to the test texts.
# tfidf_test is passed to predict_class further down, which counts its non-zero
# entries per message.
vectorizer = TfidfVectorizer(**tf_params)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
# Feature groups to assemble: the vectorizer output plus the per-message count
# of non-zero vectorizer columns.
features = ["tfidf", "len"]
# build_features calls vectorizer.fit_transform(X_train) itself, so the
# vectorizer is fit a second time here on the same training texts.
# `var` is accepted by build_features but not used by it.
train, test = build_features(X_train, X_test, features=features, vectorizer=vectorizer, var="text")

#### Fit Naive Bayes

In general, it is much worse to misclassify a ham SMS as spam
than to let a spam SMS pass the filter. It is therefore desirable to be able to bias
the filter towards classifying SMS as ham, yielding higher precision at the expense of recall.

In [368]:
def predict_class(tf, X_test, clf, w=1.5):
    """Classify messages with a length-dependent bias towards class 0 (ham).

    A message is assigned class 1 only when

        log P(class 1 | x) - log P(class 0 | x)  >  n * log(w)

    where ``n`` is the number of non-zero entries in that message's row of
    ``tf``. With ``w = 1`` the threshold is 0 for every message; ``w > 1``
    demands stronger evidence for class 1 the more non-zero entries a message
    has.

    Parameters
    ----------
    tf : sparse or dense 2-D matrix
        One row per message of ``X_test``; only its non-zero pattern is used.
    X_test : array-like
        Feature matrix handed to ``clf``.
    clf : fitted classifier
        Must offer ``predict_log_proba`` or ``predict_proba``. Columns 0 and 1
        of the result are read as the two classes (assumes the classifier's
        class order is [0, 1] - verify against ``clf.classes_``).
    w : float
        Per-entry odds factor; must be positive.

    Returns
    -------
    y_pred : numpy.ndarray of int
        0/1 prediction per message.
    ratios : numpy.ndarray
        Log-odds of class 1 versus class 0 per message.
    thresholds : numpy.ndarray
        ``n * log(w)`` per message.

    Raises
    ------
    ValueError
        If ``w`` is not positive.
    """
    if w <= 0:
        raise ValueError("w must be positive")
    if hasattr(clf, "predict_log_proba"):
        # Log-probabilities straight from the model do not underflow to
        # log(0) the way very small probabilities do.
        log_probas = np.asarray(clf.predict_log_proba(X_test))
    else:
        with np.errstate(divide="ignore"):
            log_probas = np.log(np.asarray(clf.predict_proba(X_test)))
    ratios = log_probas[:, 1] - log_probas[:, 0]
    # Count non-zero entries per row without densifying a sparse matrix;
    # asarray/ravel flattens the (n, 1) matrix that sparse sums return.
    lengths = np.asarray((tf > 0).sum(axis=1)).ravel()
    thresholds = lengths * np.log(w)
    # Sized from the predictions themselves instead of a notebook-global y_test.
    y_pred = (ratios > thresholds).astype(int)
    return y_pred, ratios, thresholds

In [569]:
# Fit a multinomial Naive Bayes model (smoothing alpha=0.1) on the assembled
# training features. The trailing comment keeps a commented-out alternative
# that passes class_prior=[0.5, 0.5].
clf = MultinomialNB(alpha=0.1)#, class_prior=[0.5, 0.5])
clf.fit(train, y_train)
# Biased decision rule: tfidf_test supplies the per-message non-zero counts,
# `test` is the feature matrix the classifier scores, w=1.2 sets the bias.
pred, ratios, thresholds = predict_class(tfidf_test, test, clf, w=1.2)
# Commented-out alternative: the classifier's own, unbiased prediction.
#pred = clf.predict(test)
# Probability of class 1 for every test message (column 1 of predict_proba).
proba = clf.predict_proba(test)[:, 1]
# calc_metrics comes from src.helpers; judging by the recorded output it prints
# AUC / recall / precision / F1 / accuracy, a confusion matrix and a report -
# see that module for the exact contract.
output, report, conf_matrix = calc_metrics(y_test, pred, proba, labels=["ham", "spam"], 
                                           print_=True, mode="binary")

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

AUC: 0.979
Recall: 0.919
Precision: 0.938
F1: 0.928
Accuracy: 0.970

Confusion matrix:
      pred_ham  pred_spam
ham        542          9
spam        12        136

Report:
             precision    recall  f1-score   support

          0       0.98      0.98      0.98       551
          1       0.94      0.92      0.93       148

avg / total       0.97      0.97      0.97       699



In [570]:
# Select the test messages predicted 0 (ham) whose true label is 1 (spam):
# pred - y_test is negative only in that case.
cond = (pred - y_test) < 0
# Full text of those messages.
# NOTE(review): the recorded output contains raw message texts, including a
# phone number - consider truncating or redacting before sharing the notebook.
X_test[cond].values
# The classifier's probability of class 1 for the same messages.
proba[cond]
# How far each log-odds ratio fell short of its threshold.
ratios[cond]-thresholds[cond]

array(['Шановний Абоненте! За послугою «Додаткові гроші» буде збільшено кредитний ліміт. З 24.01.18 Ви зможете отримати від 5 до 100 грн. Вартість послуги від 2 грн/7 днів. Підключити послугу *117*1#. Чудова можливість залишатися на зв’язку якщо закінчились кошти! Деталі: s.lifecell.ua/104',
       "Tomorrow's forecast in SOMA South Park, San Francisco is Clear.\nhttps://m.twil.io/kYotCFy",
       'Поздравляем победителей!(trophy)\u2029Ваш номер +380935375947 \u2029получил доступ к магазину c \u2029(Snowflake)ПредНовогодними(Snowflake)\u2029скидками до 75%!\u2029Не пропустите(!)\u2029Только для пользователей Viber(!)\u2029(Christmas_tree)Брендовые часы \u2029(Christmas_tree)Товары для дома\u2029(Christmas_tree)Оригинальные подарки\u2029(Christmas_tree)Натуральная косметика\u2029(Christmas_tree)Продукты для здоровья\u2029\u2029Для просмотра распродажи перейдите по ссылкам ниже.\u2029\u2029Женская распродажа(Phone)\u2029http://mirViber.ru/ukrw\u2029Мужская распродажа(Phone)\u2029http://m

array([1.54820038e-14, 2.58717847e-01, 7.35575600e-06, 3.94158192e-16,
       9.99554636e-01, 9.98679064e-01, 8.15165713e-26, 2.88450204e-01,
       9.77593072e-01, 9.48818567e-01, 1.24428081e-03, 9.99906727e-01])

array([-57.32411604,  -3.05818038, -36.79807335, -55.8897937 ,
        -4.31705044,  -1.57637701, -71.26078639,  -3.45542473,
        -4.79338985, -18.22945981, -15.25706569,  -4.39423203])