In [2]:
import re
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.casual import TweetTokenizer
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import normalize, scale
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import hstack, csr_matrix
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency
from sklearn.pipeline import Pipeline

In [4]:
data_path = "../data/SMSSpamCollection"

# Each line of the corpus has the form "<label>\t<message>".
# Decode explicitly as UTF-8 so that non-ASCII characters (such as the pound
# sign matched later by `money_count`) do not depend on the platform default.
with open(data_path, encoding="utf-8") as f:
    # Split every line once, and only on the first tab, so that a message which
    # itself contains a tab is kept whole. The trailing newline of each line is
    # intentionally left in "text", exactly as before.
    data = pd.DataFrame(
        [{"isSpam": label, "text": text}
         for label, text in (line.split("\t", 1) for line in f)]
    )
# Integer-encode the string labels; the "f1" scoring used below treats label 1
# as the positive class.
data["label"] = LabelEncoder().fit_transform(data["isSpam"])

In [5]:
# Bag-of-words baseline: encoded label as target, unigram counts as features.
y = data["label"]
transformer = CountVectorizer()
X = transformer.fit_transform(data["text"])

In [6]:
# 10-fold cross-validated F1 of a default logistic regression on the count features.
scores = cross_val_score(
    LogisticRegression(),
    X,
    data["label"],
    scoring="f1",
    cv=10,
    n_jobs=-1,
)
print("average score:", np.average(scores))

average score: 0.933348526858


In [7]:
# Hand-written messages used as a quick sanity check of the fitted model.
samples = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

# Train on the whole corpus, then classify the samples with the same vectorizer.
clf = LogisticRegression()
clf.fit(X, y)
results = clf.predict(transformer.transform(samples))
print("answers: ", " ".join(map(str, results)))

answers:  1 1 0 0 0


In [8]:
# Compare bigram-only, trigram-only and 1-to-3-gram count features
# under logistic regression (10-fold CV, F1).
n_gramm_list = [(2, 2), (3, 3), (1, 3)]
y = data["label"]
for n_gramm in n_gramm_list:
    transformer = CountVectorizer(ngram_range=n_gramm)
    X = transformer.fit_transform(data["text"])
    scores = cross_val_score(
        LogisticRegression(), X, y, scoring="f1", cv=10, n_jobs=-1
    )
    print("average score:", np.average(scores))

average score: 0.822422066419
average score: 0.725016155547
average score: 0.925138255865


In [9]:
# Same n-gram comparison as for logistic regression, now with multinomial naive Bayes.
n_gramm_list = [(2, 2), (3, 3), (1, 3)]
y = data["label"]
for n_gramm in n_gramm_list:
    transformer = CountVectorizer(ngram_range=n_gramm)
    X = transformer.fit_transform(data["text"])
    scores = cross_val_score(
        MultinomialNB(), X, y, scoring="f1", cv=10, n_jobs=-1
    )
    print("average score:", np.average(scores))

average score: 0.645455401356
average score: 0.378623430876
average score: 0.887905460889


In [10]:
# TF-IDF weighted unigrams with multinomial naive Bayes (10-fold CV, F1).
y = data["label"]
transformer = TfidfVectorizer()
X = transformer.fit_transform(data["text"])
scores = cross_val_score(
    MultinomialNB(), X, y, scoring="f1", cv=10, n_jobs=-1
)
print("average score:", np.average(scores))

average score: 0.840253457542


# Tf-idf работает хуже, попробуем bm25.

In [11]:
# Adapted from
# https://github.com/alexeygrigorev/avito-duplicates-kaggle/blob/master/bm25.py
class BM25Transformer(BaseEstimator, TransformerMixin):
    """Re-weight a document-term count matrix with the Okapi BM25 scheme.

    Every stored term frequency ``tf`` of a document with length ``dl`` becomes
    ``tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))`` and, when
    ``use_idf`` is set, is additionally multiplied by the term's BM25 idf.

    Parameters
    ----------
    use_idf : boolean, optional (default=True)
        Multiply the re-weighted frequencies by the idf learned in `fit`.
    k1 : float, optional (default=2.0)
        Term-frequency saturation parameter of the formula above.
    b  : float, optional (default=0.75)
        Weight of the document-length normalisation; with ``b=0`` the
        document length has no influence on the result.

    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X, y=None):
        """Learn the idf weights (if enabled) and the average document length.

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features] document-term matrix
        y : ignored
            Accepted only so the transformer follows the scikit-learn
            ``fit(X, y)`` convention used by `Pipeline`.

        Returns
        -------
        self
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            # BM25 idf; note that it is negative for terms that occur in more
            # than half of the documents.
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)

        # Document length = sum of the term counts in each row.
        doc_len = X.sum(axis=1)
        self._average_document_len = np.average(doc_len)

        return self

    def transform(self, X, copy=True):
        """Apply the BM25 weighting learned in `fit` to a document-term matrix.

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features] document-term matrix
        copy : boolean, optional (default=True)
            Forwarded to the CSR conversion of `X`.

        Returns
        -------
        CSR matrix with the same shape and sparsity pattern as `X`.

        Raises
        ------
        NotFittedError
            If `fit` has not been called.
        ValueError
            If ``use_idf`` is set and `X` has a different number of features
            than the matrix seen in `fit`.
        """
        # Validate the fitted state before any fitted attribute is touched.
        check_is_fitted(self, '_average_document_len',
                        msg='average document length is not fitted')

        # `np.float` no longer exists in NumPy; `np.floating` covers the whole
        # float family and `np.float64` is what the old alias resolved to.
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.floating):
            # preserve float family dtype
            X = sp.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape

        if self.use_idf:
            # `msg` must be passed by keyword in current scikit-learn releases.
            check_is_fitted(self, '_idf_diag', msg='idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        doc_len = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]

        # In each row, repeat `doc_len` for `sz` times
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(doc_len), sz)

        # Compute BM25 score only for non-zero elements
        nom = self.k1 + 1
        denom = X.data + self.k1 * (1 - self.b + self.b * rep / self._average_document_len)
        data = X.data * nom / denom

        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            # Scale every column by its idf weight.
            X = X * self._idf_diag

        return X

    def fit_transform(self, X, y=None, copy=True):
        """Fit on `X` and return its BM25-weighted version.

        `y` is accepted and ignored so that `Pipeline`, which calls
        ``fit_transform(X, y)``, no longer passes the labels into `copy`.
        """
        return self.fit(X, y).transform(X, copy=copy)

In [12]:
# Vectorize the corpus once and reuse the counts for both fitting and applying
# BM25 (previously the vectorizer was fitted twice on the same text).
transformer = CountVectorizer(ngram_range=(1, 1))
counts = transformer.fit_transform(data["text"])
# b=0 switches off the document-length normalisation of BM25.
bm25 = BM25Transformer(b=0).fit(counts)
X = bm25.transform(counts)
y = data["label"]
scores = cross_val_score(LogisticRegression(), X, data["label"], cv=10, n_jobs=-1, scoring="f1")
print("average score:", np.average(scores))

average score: 0.943114179038


# Подберем оптимальное C и регуляризацию.

In [13]:
# Unigram counts and target.
transformer = CountVectorizer()
X = transformer.fit_transform(data["text"])
y = data["label"]

# BM25 re-weighting followed by logistic regression, tuned as a single pipeline.
estimators = [("transformer", BM25Transformer()), ("clf", LogisticRegression())]
pipe = Pipeline(estimators)

# Each class is weighted by the share of samples that do NOT belong to it,
# so the rarer class receives the larger weight.
class_weight = {
    label: (len(y) - y[y == label].shape[0]) / len(y)
    for label in (0, 1)
}

params = {
    "clf__C": np.logspace(-1, 4, 20),
    "clf__class_weight": [class_weight],
    "transformer__b": np.linspace(0, 1, 3),
    "transformer__k1": np.linspace(1, 4, 3),
}

# Shuffled, seeded, stratified 10-fold splits handed to the grid search.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv = splitter.split(X, y)
grid = GridSearchCV(pipe, params, scoring="f1", cv=cv, n_jobs=-1, verbose=1)
grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1024 tasks      | elapsed:   11.0s


{'clf__C': 0.18329807108324356, 'clf__class_weight': {0: 0.1340150699677072, 1: 0.8659849300322928}, 'transformer__b': 0.0, 'transformer__k1': 4.0}
0.956666702282


[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:   20.7s finished


# Попробуем убрать стоп-слова.

In [14]:
# Unigram counts with English stop words dropped by the vectorizer.
y = data["label"]
transformer = CountVectorizer(stop_words="english")
X = transformer.fit_transform(data["text"])

estimators = [("transformer", BM25Transformer()), ("clf", LogisticRegression())]
pipe = Pipeline(estimators)

# Each class is weighted by the share of samples that do NOT belong to it.
class_weight = {
    label: (len(y) - y[y == label].shape[0]) / len(y)
    for label in (0, 1)
}

# Only C is searched here; the BM25 parameters are fixed to b=0, k1=4.
params = {
    "clf__C": np.logspace(-1, 4, 20),
    "clf__class_weight": [class_weight],
    "transformer__b": [0],
    "transformer__k1": [4],
}

splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv = splitter.split(X, y)
grid = GridSearchCV(pipe, params, scoring="f1", cv=cv, n_jobs=-1, verbose=1)
grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'clf__C': 0.10000000000000001, 'clf__class_weight': {0: 0.1340150699677072, 1: 0.8659849300322928}, 'transformer__b': 0, 'transformer__k1': 4}
0.951628130087


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.6s finished


# Добавим лемматизацию.

In [18]:
# Tokenizer and lemmatizer shared by `lemmatize_text`; each is built exactly once
# (the tokenizer used to be constructed a second time further down the cell).
tokenizer = TweetTokenizer(reduce_len=True)
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenize `text`, lemmatize every token and join the results with single spaces."""
    return " ".join(lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text))


data["lemma_text"] = data["text"].apply(lemmatize_text)

In [19]:
# Unigram counts of the lemmatized text; shared by both searches below.
transformer = CountVectorizer()
X = transformer.fit_transform(data["lemma_text"])
y = data["label"]

# --- Search 1: plain logistic regression on the counts -----------------------
# NOTE: `class_weight` is not defined in this cell; it is reused from the
# earlier grid-search cells.
params = {"C": np.logspace(-1, 4, 20), "penalty": ["l1", "l2"], "class_weight": [class_weight]}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X, y)
# The grid contains the l1 penalty, which the default solver of current
# scikit-learn releases (lbfgs) rejects. liblinear supports both l1 and l2 and
# was the default solver in older releases.
grid = GridSearchCV(LogisticRegression(solver="liblinear"), params, n_jobs=-1, cv=cv,
                    scoring="f1", verbose=1)
grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)

# --- Search 2: BM25 + logistic regression on the same counts -----------------
# X and y are reused as they are; re-running the vectorizer on the same column
# would reproduce the same matrix.
estimators = [("transformer", BM25Transformer()), ("clf", LogisticRegression())]
pipe = Pipeline(estimators)

params = {"clf__C": np.logspace(-1, 4, 20),
          "clf__class_weight": [class_weight],
          "transformer__b": [0],
          "transformer__k1": [4]}
# A fresh split generator is required: the previous one was consumed by search 1.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X, y)
grid = GridSearchCV(pipe, params, n_jobs=-1, cv=cv, scoring="f1", verbose=1)
grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    3.4s finished


{'class_weight': {0: 0.1340150699677072, 1: 0.8659849300322928}, 'C': 263.66508987303581, 'penalty': 'l2'}
0.949127503225
Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'clf__C': 0.18329807108324356, 'clf__class_weight': {0: 0.1340150699677072, 1: 0.8659849300322928}, 'transformer__b': 0, 'transformer__k1': 4}
0.957399436005


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.4s finished


# Добавим еще признаков.

In [155]:
def url_replce(text):
    """Replace every whitespace-separated word containing "http" or "www" with "url".

    All other words are kept unchanged; the words are re-joined with single spaces.
    """
    replaced = []
    for word in text.split():
        looks_like_url = ("http" in word) or ("www" in word)
        replaced.append("url" if looks_like_url else word)
    return " ".join(replaced)

def url_count(text):
    """Return 1 if `text` contains the substring "http" or "www", otherwise 0.

    Despite its name this is a binary presence flag, not a count of URLs.
    The regular expression that used to be compiled here was never used and
    has been removed.
    """
    return 1 if ("http" in text) or ("www" in text) else 0

def capital_letters_range(text):
    """Return the fraction of characters in `text` that are upper-case.

    An empty string yields 0.0 instead of raising ZeroDivisionError.
    """
    if not text:
        return 0.0
    return sum(1 for char in text if char.isupper()) / len(text)

def num_count(text):
    """Return the number of numeric tokens in `text`.

    A numeric token is a run of digits, optionally followed by a dot and
    further digits (e.g. "0800", "12.5", "7.").
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    num_regexp = re.compile(r"[0-9]+\.?[0-9]*")
    return len(num_regexp.findall(text))

def money_count(text):
    """Return how many runs of consecutive "$" / "£" characters occur in `text`."""
    currency_runs = re.findall("[$£]+", text)
    return len(currency_runs)

def citation_count(text):
    """Return how many runs of consecutive double-quote characters occur in `text`."""
    quote_runs = re.findall('\"+', text)
    return len(quote_runs)

def capital_words_count(text):
    """Return the number of tokens of `text` for which ``str.isupper()`` is true.

    Tokens come from the module-level `tokenizer`.
    """
    return sum(1 for token in tokenizer.tokenize(text) if token.isupper())

def phone_number_count(text):
    """Return the number of numeric tokens in `text` longer than four characters.

    Numeric tokens are runs of digits with an optional dot and further digits;
    long ones are used as a proxy for phone numbers.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    num_regexp = re.compile(r"[0-9]+\.?[0-9]*")
    return sum(1 for number in num_regexp.findall(text) if len(number) > 4)

tokenizer = TweetTokenizer(reduce_len=True)

# URL-masked variant of the lemmatized text.
data["text_url"] = data["lemma_text"].apply(url_replce)

# Hand-crafted features computed from the raw message text.
# The columns are added to `data` in exactly this order.
raw_text_features = [
    ("url_count", url_count),
    ("chars_len", len),
    ("words_len", lambda text: len(tokenizer.tokenize(text))),
    ("capital_letters_range", capital_letters_range),
    ("num_count", num_count),
    ("money_count", money_count),
    ("capital_words_count", capital_words_count),
    ("phone_number_count", phone_number_count),
    ("citation_count", citation_count),
]
for feature_name, extractor in raw_text_features:
    data[feature_name] = data["text"].apply(extractor)

In [157]:
# Unigram counts joined column-wise with the hand-crafted features.
# "chars_len" and "citation_count" are deliberately not part of `columns`.
transformer = CountVectorizer(ngram_range=(1, 1))
X = transformer.fit_transform(data["text"])
columns = ["url_count", "words_len", "capital_letters_range", "num_count",
          "money_count", "capital_words_count", "phone_number_count"]
X_features = csr_matrix(data[columns].values)
X_joined = hstack((X, X_features))
y = data["label"]
# Each class is weighted by the share of samples that do NOT belong to it.
class_weight = {0: (len(y) - y[y == 0].shape[0]) / len(y),
               1: (len(y) - y[y == 1].shape[0]) / len(y)}

# --- Search 1: plain logistic regression on counts + extra features ----------
params = {"C": np.logspace(-1, 4, 20), "penalty": ["l1", "l2"], "class_weight": [class_weight]}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X, y)
# The grid contains the l1 penalty, which the default solver of current
# scikit-learn releases (lbfgs) rejects. liblinear supports both l1 and l2 and
# was the default solver in older releases.
grid = GridSearchCV(LogisticRegression(solver="liblinear"), params, cv=cv, scoring="f1", verbose=1)
grid.fit(X_joined, y)
print(grid.best_params_)
print(grid.best_score_)


# --- Search 2: BM25 + logistic regression on the same joined matrix ----------
# The BM25 step sits inside the pipeline, so it is applied to every column of
# X_joined, including the hand-crafted feature columns.
transformer = CountVectorizer()
X = transformer.fit_transform(data["text"])
y = data["label"]
X_joined = hstack((X, X_features))
estimators = [("transformer", BM25Transformer()), ("clf", LogisticRegression())]
pipe = Pipeline(estimators)

params = {"clf__C": np.logspace(-2, 4, 20),
          "clf__class_weight": [class_weight],
          "transformer__b": [0],
          "transformer__k1": [4]}
# A fresh split generator is required: the previous one was consumed by search 1.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X, y)
grid = GridSearchCV(pipe, params, n_jobs=-1, cv=cv, scoring="f1", verbose=1)
grid.fit(X_joined, y)
print(grid.best_params_)
print(grid.best_score_)

Fitting 10 folds for each of 40 candidates, totalling 400 fits
{'class_weight': {0: 0.1340150699677072, 1: 0.8659849300322928}, 'C': 42.813323987193911, 'penalty': 'l2'}
0.964921457782
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   12.0s finished
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.2s


{'clf__C': 0.088586679041008226, 'clf__class_weight': {0: 0.1340150699677072, 1: 0.8659849300322928}, 'transformer__b': 0, 'transformer__k1': 4}
0.966008523135


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.6s finished


# Теперь попробуем деревья.

In [173]:
# Hand-crafted feature columns appended to the unigram counts.
columns = ["url_count", "words_len", "capital_letters_range", "num_count",
          "money_count", "capital_words_count", "phone_number_count"]

y = data["label"]
transformer = CountVectorizer(ngram_range=(1, 1))
X = transformer.fit_transform(data["text"])
X_features = csr_matrix(data[columns].values)
X_joined = hstack((X, X_features))

# Single-candidate grid: GridSearchCV is used here only to cross-validate
# one fixed XGBoost configuration.
params = {
    "n_estimators": [500],
    "colsample_bytree": [0.4],
    "colsample_bylevel": [0.6],
    "min_child_weight": [1],
    "max_depth": [3],
}

splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv = splitter.split(X, y)
grid = GridSearchCV(XGBClassifier(), params, scoring="f1", cv=cv, verbose=1)
grid.fit(X_joined, y)
print(grid.best_params_)
print(grid.best_score_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   15.9s finished


{'max_depth': 3, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.6, 'min_child_weight': 1, 'n_estimators': 500}
0.964032355569


# Попробуем усреднить результаты предсказаний деревьев и линейного классификатора.

In [169]:
# Number of folds; used both for the splitter and for the score buffer so the
# two can no longer get out of sync.
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
transformer = CountVectorizer()
X = transformer.fit_transform(data["text"])
columns = ["url_count", "words_len", "capital_letters_range", "num_count",
          "money_count", "capital_words_count", "phone_number_count"]
X_features = csr_matrix(data[columns].values)
# CSR format is required for the row indexing inside the loop.
X_joined = hstack((X, X_features), format="csr")

y = data["label"]

cv_score = np.zeros(n_splits)

for number, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_joined_train, X_joined_test = X_joined[train_index, :], X_joined[test_index, :]
    # The split yields positional indices, so select labels by position
    # (`y[...]` would look the values up by index label instead).
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Model 1: BM25 + logistic regression.
    # NOTE: `class_weight` is not defined in this cell; it comes from the
    # earlier grid-search cells.
    estimators = [("transformer", BM25Transformer()), ("clf", LogisticRegression())]
    pipe = Pipeline(estimators)
    pipe.set_params(clf__C=0.0885, clf__class_weight=class_weight, transformer__b=0, transformer__k1=4)
    clf_1 = pipe
    clf_1.fit(X_joined_train, y_train)
    result_1 = clf_1.predict_proba(X_joined_test)

    # Model 2: gradient-boosted trees.
    clf_2 = XGBClassifier(n_estimators=500, colsample_bytree=0.4, colsample_bylevel=0.6, min_child_weight=1,
                         max_depth=3)
    clf_2.fit(X_joined_train, y_train)
    result_2 = clf_2.predict_proba(X_joined_test)

    # Sum the class probabilities of both models and take the more likely class
    # (the argmax of the sum equals the argmax of the average).
    result = np.argmax(result_1 + result_2, axis=1)
    fold_score = f1_score(y_test, result)
    print(fold_score)
    cv_score[number] = fold_score
print("cv score: ", cv_score.mean())

0.929577464789
0.972602739726
0.958333333333
0.958904109589
0.96644295302
0.965517241379
0.931506849315
0.965517241379
0.986486486486
0.942857142857
cv score:  0.957774556187


# Выводы
На коротких зашумленных текстах tf-idf работает не очень, но можно настроить bm25.

Помогают дополнительные признаки, такие как количество телефонных номеров в тексте, url'ов.

Деревья и линейные методы работают почти одинаково неплохо, их усреднение работает еще лучше; неплохо работал и KNN на дополнительных признаках.

Поэтому можно делать ансамбль, но выборка маленькая, и делать это без отложенного контроля не очень хочется.

Лемматизация не особо помогала.

In [126]:
def url_count(text):
    """Return the number of matches of a generic URL pattern in `text`.

    NOTE: this shares its name with the presence-flag `url_count` defined in
    the feature-extraction cell; running this cell rebinds the name.
    """
    url_regexp = re.compile("(https?:\\/\\/)?(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)")
    matches = url_regexp.findall(text)
    return len(matches)


# Quick manual check of the pattern on a bare "www" address.
text = "www.bigfick.com"
print(url_count(text))

1
