In [252]:
# 2017
# Автор: Гусев Илья, 294
# Описание: первое задание по NLP. Spam-detection, bag of words, cv, grid-search.
# Примечание: цифры кроссвалидации округляю до 3 знаков после точки, мне так удобнее, остальное вроде по требованиям. 
# Метрика - 'f1', надеюсь та.

import pandas as pd
import copy
import numpy as np
import re
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from nltk import pos_tag
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack
from nltk.stem.snowball import SnowballStemmer
morph_en = SnowballStemmer("english")

In [229]:
# Считываем данные. Пункт 2 и 3.
train = pd.read_csv("SMSSpamCollection.csv", header=0,names = ["answer", "text"], delimiter="\t", quoting=3)
train.answer = train.answer.map(lambda x: int(x == "spam"))
train.head(5)

Unnamed: 0,answer,text
0,0,Ok lar... Joking wif u oni...
1,1,Free entry in 2 a wkly comp to win FA Cup fina...
2,0,U dun say so early hor... U c already then say...
3,0,"Nah I don't think he goes to usf, he lives aro..."
4,1,FreeMsg Hey there darling it's been 3 week's n...


In [230]:
# Для стемминга (10* пункт).
def stem_sentence(sentence, language):
    """
    Получение предложения, состоящего из основ слов изначального.
    """
    words = text_to_wordlist(sentence)
    for j in range(len(words)):
        if language == 'en':
            words[j] = morph_en.stem(words[j])
    return " ".join(words)

In [218]:
def bow(train_texts, test_texts, language='en', stem=False, tokenizer=None, preprocessor=None,
        use_tfidf=False, max_features=None, bow_ngrams=(1,1), analyzer='word'):
    """
    Обобщение BoW, можцно задавать n-граммы, стеммить, считать tf-idf.
    """
    train = copy.deepcopy(train_texts)
    test = copy.deepcopy(test_texts)
    if stem:
        for i in range(len(train)):
            train[i] = stem_sentence(train[i], language)
        for i in range(len(test)):
            test[i] = stem_sentence(test[i], language)

    if use_tfidf:
        vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=bow_ngrams, tokenizer=tokenizer,
                                     preprocessor=preprocessor, max_features=max_features)
    else:
        vectorizer = CountVectorizer(analyzer=analyzer, ngram_range=bow_ngrams, tokenizer=tokenizer,
                                     preprocessor=preprocessor, max_features=max_features)
    data = train+test
    data = vectorizer.fit_transform(data)
    train_data = data[:len(train)]
    test_data = data[len(train):]
    return train_data, test_data

In [219]:
# Для POS-тегов (10* пункт).
def text_to_wordlist(sentence, cyrillic=False):
    """
    Самописная простейшая токенизация.
    """
    regexp = "[^а-яА-Яёa-zA-Z]"
    if cyrillic:
        regexp = "[^а-яА-Яё]"
    sentence = re.sub(regexp, " ", sentence)
    result = sentence.lower().split()
    return result

def get_sentence_tags(sentence):
    """
    Получить частеречные теги слов в предложении.
    """
    words = text_to_wordlist(sentence)
    tags = []
    if len(words) != 0:
        tags = [i[1] for i in pos_tag(words)]
    return " ".join(tags)

In [258]:
# Собственно основные функции, задающие эксперименты.
def cv(clf, train_data, train_answer, n):
    """
    Кроссвалидация с заданным параметром и форматом вывода.
    """
    cv_scores = cross_val_score(clf, train_data, train_answer, cv=n, scoring='f1')
    return "%0.3f" % cv_scores.mean()

def get_data(n_gram, use_tfidf=False, use_pos_tags=False, use_stemming=False):
    """
    Получение нужных признаков из данных.
    """
    train_data = bow(list(train.text), [], bow_ngrams=n_gram, 
                     use_tfidf=use_tfidf, stem=use_stemming)[0]
    if use_pos_tags:
        pos_train_data = []
        for text in list(train.text):
            pos_train_data.append(get_sentence_tags(text))
        pos_data = bow(pos_train_data, [])[0]
        train_data = hstack([train_data, pos_data])
    return train_data

def experiment(clf, n_grams, use_tfidf=False, use_pos_tags=False, use_stemming=False):
    """
    Одиночный эксперимент с заданным классификатором и параметрами признаков.
    """
    results = []
    for n_gram in n_grams:
        train_data = get_data(n_gram, use_tfidf, use_pos_tags, use_stemming)
        results.append(cv(clf, train_data, list(train.answer), 10))
    print(" ".join(results))

def grid_search(clf, tuned_parameters, use_tfidf=False, use_pos_tags=False, use_stemming=False):
    """
    Поиск по сетке параметров классификатора с заданным классификатором и параметрами признаков.
    """
    clf = GridSearchCV(clf, tuned_parameters, cv=10, scoring='f1')
    clf.fit(get_data((1,1), use_tfidf, use_pos_tags, use_stemming), train.answer)

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

In [232]:
# Пункт 5.
experiment(clf=LogisticRegression(), n_grams=[(1,1)])

0.933


In [233]:
# Пункт 6.
test = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy his book and you will have all materials! Only 99$",
    "Only 99$" 
]
train_data, test_data = bow(list(train.text), test)
clf = LogisticRegression()
clf.fit(train_data, train.answer)
print(" ".join((str(i) for i in clf.predict(test_data))))

1 1 0 0 0


In [234]:
# Пункт 7.
experiment(clf=LogisticRegression(), n_grams=[(2,2), (3,3), (1,3)])

0.822 0.725 0.925


In [235]:
# Пункт 8.
experiment(clf=MultinomialNB(), n_grams=[(2,2), (3,3), (1,3)])

0.645 0.379 0.888


In [236]:
# Пункт 9.
experiment(clf=LogisticRegression(), n_grams=[(1,1)], use_tfidf=True)

0.853


In [178]:
# А вот здесь уже интересно. Добавляем POS-теги - увеличиваем результат почти на 1%.
experiment(clf=LogisticRegression(), n_grams=[(1,1)], use_pos_tags=True)

0.941


In [238]:
# Стемминг даёт прирост в 0,5% по сравнению с изначальной версией.
experiment(clf=LogisticRegression(), n_grams=[(1,1)], use_stemming=True)

0.939


In [239]:
# Их совместное использование даёт ещё +0.4%, то есть почти полтора процента по сравнению с изначальной версией. 
experiment(clf=LogisticRegression(), n_grams=[(1,1)], use_pos_tags=True, use_stemming=True)

0.945


In [248]:
# Поиск по сетке с логрегрессией дал ещё +0.5%.
grid_search(LogisticRegression(), 
            [{'C': [0.01, 0.1, 0.5, 1, 5, 10, 100, 200, 500, 1000, 10000, 15000, 20000, 100000]}], 
            use_pos_tags=True, use_stemming=True)

Best parameters set found on development set:
{'C': 100}

Grid scores on development set:
0.832 (+/-0.075) for {'C': 0.01}
0.912 (+/-0.040) for {'C': 0.1}
0.936 (+/-0.029) for {'C': 0.5}
0.945 (+/-0.029) for {'C': 1}
0.951 (+/-0.038) for {'C': 5}
0.950 (+/-0.037) for {'C': 10}
0.951 (+/-0.031) for {'C': 100}
0.951 (+/-0.032) for {'C': 200}
0.951 (+/-0.034) for {'C': 500}
0.951 (+/-0.034) for {'C': 1000}
0.951 (+/-0.032) for {'C': 10000}
0.951 (+/-0.032) for {'C': 15000}
0.949 (+/-0.032) for {'C': 20000}
0.949 (+/-0.032) for {'C': 100000}


In [None]:
# Меняем классификатор на SVM, делаем поиск по сетке, получаем ещё чуть-чуть. Итого: 95.3%
grid_search(LinearSVC(), 
            [{'tol': [0.0001, 0.01], 
              'C': [0.01, 0.1, 0.5, 1, 5, 10, 100]}], 
            use_pos_tags=True, use_stemming=True, use_tfidf=True)