# **Описание**

In [1]:
from __future__ import division

import base64
import csv
import gzip
import zlib

from collections import namedtuple

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Default progress-logging period: trace/trace_worker log every TRACE_NUM-th item.
TRACE_NUM = 1000
import logging
# NOTE(review): presumably the module is reloaded so that basicConfig takes effect
# again when this cell is re-run in the same kernel - confirm.
reload(logging)
# Timestamped INFO-level output, e.g. "23:33:14 INFO:Complete items 00000 in worker_id 2".
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

def trace(items_num, trace_num=TRACE_NUM):
    """Log a progress message when items_num is a multiple of trace_num."""
    if items_num % trace_num != 0:
        return
    logging.info("Complete items %05d" % items_num)
        
def trace_worker(items_num, worker_id, trace_num=TRACE_NUM):
    """Log a per-worker progress message when items_num is a multiple of trace_num."""
    if items_num % trace_num != 0:
        return
    message = "Complete items %05d in worker_id %d" % (items_num, worker_id)
    logging.info(message)

### Утилиты

#### Декораторы

In [3]:
def to_utf8(text):
    """Return text encoded as UTF-8 when it is a unicode string, otherwise unchanged."""
    return text.encode('utf8') if isinstance(text, unicode) else text

def convert2unicode(f):
    """Decorator: make sure the single text argument of f is a unicode string.

    A non-unicode argument is decoded as UTF-8 before f is called; a unicode
    argument is passed through untouched. The result of f is returned as is.
    """
    import functools

    # wraps() keeps f's name and docstring on the wrapper, which makes
    # decorated functions easier to recognise in tracebacks and in help().
    @functools.wraps(f)
    def tmp(text):
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        return f(text)
    return tmp

def convert2lower(f):
    """Decorator: call f with the lower-cased version of its single text argument."""
    import functools

    # wraps() keeps f's name and docstring on the wrapper.
    @functools.wraps(f)
    def tmp(text):
        return f(text.lower())
    return tmp

#P.S. Декораторы могут усложнять отладку, так что от них вполне можно отказаться и воспользоваться copy-paste

### Извлечение текста из html

#### Извлечение текста при помощи встроенных модулей

In [4]:
from HTMLParser import HTMLParser
import re

###Извлечение текста из title можно вписать сюда

class TextHTMLParser(HTMLParser):
    """Collect page text, <title> text and <a> anchor text from HTML markup.

    Feed the markup with feed(), then read the results with text(), title()
    and link(). Whitespace runs inside a text node are collapsed to a single
    space. In text(), <p> starts a new paragraph (blank line) and <br> /
    <br/> starts a new line. Title and anchor text are also part of text().

    NOTE(review): no tags are filtered out, so the content of e.g.
    <script>/<style> presumably ends up in text() as well - confirm whether
    that is intended.
    """

    # Runs of spaces, tabs and line breaks inside a text node.
    _WHITESPACE_RE = re.compile('[ \t\r\n]+')

    def __init__(self):
        # Explicit base-class call: HTMLParser is an old-style class in
        # Python 2, so super() cannot be used here.
        HTMLParser.__init__(self)
        self._text = []    # chunks of the whole page text
        self._title = []   # chunks seen inside <title>
        self._link = []    # chunks seen inside <a>
        self._in_title = False
        self._in_link = False

    def handle_data(self, data):
        """Store a non-empty text node in every collection it belongs to."""
        text = data.strip()
        if not text:
            return
        chunk = self._WHITESPACE_RE.sub(' ', text) + ' '
        if self._in_title:
            self._title.append(chunk)
        if self._in_link:
            self._link.append(chunk)
        self._text.append(chunk)

    def handle_starttag(self, tag, attrs):
        """Emit layout breaks for <p>/<br> and track entering <title>/<a>."""
        if tag == 'p':
            self._text.append('\n\n')
        elif tag == 'br':
            self._text.append('\n')
        elif tag == 'title':
            self._in_title = True
        elif tag == 'a':
            self._in_link = True

    def handle_endtag(self, tag):
        """Track leaving <title>/<a>."""
        if tag == 'title':
            self._in_title = False
        elif tag == 'a':
            self._in_link = False

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only <br/> produces output."""
        # <br/> is the same element as <br>, so it emits the same single
        # line break that handle_starttag emits for <br>.
        if tag == 'br':
            self._text.append('\n')

    def text(self):
        """Return all collected text, stripped of surrounding whitespace."""
        return ''.join(self._text).strip()

    def title(self):
        """Return the text collected inside <title>, stripped."""
        return ''.join(self._title).strip()

    def link(self):
        """Return the text collected inside <a> elements, stripped."""
        return ''.join(self._link).strip()

@convert2unicode
def html2text_parser(text):
    """Parse HTML markup with TextHTMLParser and return (text, title, link) strings."""
    extractor = TextHTMLParser()
    extractor.feed(text)
    page_text = extractor.text()
    page_title = extractor.title()
    anchor_text = extractor.link()
    return page_text, page_title, anchor_text

#### Извлечение текста при помощи дополнительных библиотек

In [5]:
def html2text_bs(raw_html):
    """Extract the text of an HTML document with BeautifulSoup.

    <script> and <style> elements are removed first; the result is the single
    string returned by soup.get_text().

    NOTE(review): unlike html2text_parser this does not return a
    (text, title, link) tuple, which html2word unpacks and calc_features
    indexes - adapt the result before selecting this function as html2text.
    """
    # Function-level import: bs4 is only needed when this extractor is called.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(raw_html, "html.parser")
    for node in soup(['script', 'style']):
        node.extract()
    return soup.get_text()

def html2text_bs_visible(raw_html):
    """Extract the user-visible text of an HTML document with BeautifulSoup.

    Elements named 'style', 'script', '[document]', 'head' and 'title' are
    removed first; the result is the single string returned by soup.get_text().

    NOTE(review): unlike html2text_parser this does not return a
    (text, title, link) tuple, which html2word unpacks and calc_features
    indexes - adapt the result before selecting this function as html2text.
    """
    # Function-level import: bs4 is only needed when this extractor is called.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(raw_html, "html.parser")
    for node in soup(['style', 'script', '[document]', 'head', 'title']):
        node.extract()
    return soup.get_text()

def html2text_boilerpipe(raw_html):
    """Placeholder for text extraction with boilerpipe; not implemented, returns None.

    boilerpipe is one more library that extracts exactly the user-visible text
    very well, but it is tied to Java.
    """
    # Only the import is performed; no extraction happens yet.
    import boilerpipe
    return None

#### Выбираем какой метод для конвертации html в текст будет основным

In [6]:
# Extractor used by html2word and calc_features. html2word unpacks its result
# into (text, title, link), so the selected function must return such a tuple.
#html2text = html2text_bs
html2text = html2text_parser

#### Методы для токенизации текста

In [7]:
@convert2unicode
@convert2lower
def easy_tokenizer(text):
    """Yield the lower-cased alphanumeric tokens of text as unicode strings.

    A token is a maximal run of characters for which isalnum() is true; every
    other character acts as a separator.

    The decorators run outermost first: the argument is decoded to unicode and
    only then lower-cased. Lower-casing a UTF-8 byte string before decoding
    would leave non-ASCII letters (e.g. Cyrillic capitals) untouched.
    """
    from itertools import groupby

    # groupby splits the text into alternating runs of word / non-word symbols.
    for is_word, symbols in groupby(text, key=lambda symbol: symbol.isalnum()):
        if is_word:
            yield u''.join(symbols)

# Lemma cache filled by pymorphy_tokenizer.
PYMORPHY_CACHE = {}
# Shared pymorphy2.MorphAnalyzer instance, created on first use.
MORPH = None


def get_lemmatizer():
    """Return the shared pymorphy2.MorphAnalyzer, creating it on the first call.

    pymorphy2 is imported inside the function so that installing it is not
    mandatory unless lemmatization is actually used.
    """
    global MORPH
    import pymorphy2
    if MORPH is not None:
        return MORPH
    MORPH = pymorphy2.MorphAnalyzer()
    return MORPH

@convert2unicode
@convert2lower
def pymorphy_tokenizer(text):
    """Yield the pymorphy2 normal form (lemma) of every token of text.

    Tokens come from easy_tokenizer; the normal form of the first parse
    returned by the analyzer is used. Lemmas are memoized in PYMORPHY_CACHE.

    The decorators run outermost first: the argument is decoded to unicode and
    only then lower-cased, so non-ASCII letters in byte strings are handled.
    """
    for word in easy_tokenizer(text):
        # The cache is keyed by the word itself: keying by hash(word) would
        # return a wrong lemma whenever two words share a hash value.
        lemma = PYMORPHY_CACHE.get(word)
        if lemma is None:
            lemma = get_lemmatizer().parse(word)[0].normal_form
            PYMORPHY_CACHE[word] = lemma
        yield lemma

#### Основная функция, которая вызывается для преобразования html в список слов

In [8]:
def html2word(raw_html, to_text=html2text, tokenizer=easy_tokenizer):
    """Convert raw HTML into three token streams: page text, title and anchor text.

    to_text must return a (text, title, link) triple of strings; each of them
    is lower-cased and passed to tokenizer. Returns the three tokenizer
    results as a tuple in the same order.
    """
    body, heading, anchors = to_text(raw_html)
    return tuple(tokenizer(part.lower()) for part in (body, heading, anchors))

#### Расчет финальных метрик

In [9]:
def safe_divide(a, b):
    """Return a / b, or 0.0 when either the numerator or the denominator is zero."""
    if a == 0 or b == 0:
        return 0.0
    return a / b

def calculate_metrics(predictions, threshold):
    """Compute precision, recall and F1 for both classes at a given threshold.

    Parameters
    predictions - iterable of (url_id, mark, url, prediction) tuples, where
                  mark is the true label and prediction is the document score
    threshold   - a document is predicted positive when prediction > threshold

    Returns ((class_prec, class_recall, class_F1),
             (not_class_prec, not_class_recall, not_class_F1)).
    """
    true_positive = false_positive = true_negative = false_negative = 0
    for _url_id, mark, _url, prediction in predictions:
        predicted = prediction > threshold
        is_correct = predicted == mark
        if predicted and is_correct:
            true_positive += 1
        elif predicted:
            false_positive += 1
        elif is_correct:
            true_negative += 1
        else:
            false_negative += 1

    def _scores(hits, wrongly_predicted, missed):
        # precision / recall / F1 of one class from its confusion counts
        precision = safe_divide(hits, hits + wrongly_predicted)
        recall = safe_divide(hits, hits + missed)
        f1 = safe_divide(2 * precision * recall, precision + recall)
        return (precision, recall, f1)

    class_scores = _scores(true_positive, false_positive, false_negative)
    not_class_scores = _scores(true_negative, false_negative, false_positive)
    return (class_scores, not_class_scores)

def arange(start, stop, step):
    """Yield start, start + step, start + 2 * step, ... while the value is <= stop.

    Each value is computed as start + index * step instead of by repeated
    addition, so float rounding errors do not accumulate from one value to
    the next (e.g. arange(-1, 1, 0.1) contains an exact 0.0).

    Raises ValueError (on the first iteration) when step is not positive and
    start <= stop, because such a sequence would never end.
    """
    if step <= 0 and not start > stop:
        raise ValueError("step must be positive, got %r" % (step,))
    index = 0
    cur_value = start
    while cur_value <= stop:
        yield cur_value
        index += 1
        cur_value = start + index * step

def plot_results(docs, min_threshold=-1, max_threshold=1, step=0.1, trace=False):
    """Plot precision/recall/F1 of both classes over a range of thresholds.

    Predictions for docs are computed once; metrics are then evaluated for
    every threshold produced by arange(min_threshold, max_threshold, step) and
    drawn with plot_stats, one figure per class. When trace is true the
    metrics of every threshold are also printed.

    NOTE(review): relies on a module-level `classifier` object providing
    predict_all(docs); it is not defined in the visible part of the file.
    Inside this function the `trace` flag shadows the module-level trace().
    """
    docs_predictions = classifier.predict_all(docs)
    thresholds = []
    class_scores = []
    not_class_scores = []
    for threshold in arange(min_threshold, max_threshold, step):
        metrics = calculate_metrics(docs_predictions, threshold)
        class_result = metrics[0]
        not_class_result = metrics[1]
        thresholds.append(threshold)
        class_scores.append(class_result)
        not_class_scores.append(not_class_result)
        if not trace:
            continue
        print('threshold %s' % threshold)
        print('\tclass_prec %s, class_recall %s, class_F1 %s' % class_result)
        print('\tnot_class_prec %s, not_class_recall %s, not_class_F1 %s' % not_class_result)
        print('\t\tMacroF1Mesure %s' % ((class_result[2] + not_class_result[2])/2))
    plot_stats(thresholds, class_scores, "Class Result")
    plot_stats(thresholds, not_class_scores, "Not class Result")


def plot_stats(x, y, title):
    """Draw precision, recall and F1 curves on one figure and show it.

    x     - values for the horizontal axis
    y     - sequence of (precision, recall, F1) triples, one per x value
    title - figure title
    """
    plt.figure(figsize=(10, 5))

    # (line format, legend label) for positions 0, 1, 2 of every triple in y
    curves = (("r", 'Precision'), ("b", 'Recall'), ("g", 'F1'))
    handles = []
    for position, (line_format, label) in enumerate(curves):
        series = [triple[position] for triple in y]
        line, = plt.plot(x, series, line_format, label=label, linewidth=1)
        handles.append(line)

    plt.grid(True)
    plt.legend(handles=handles)
    plt.title(title)
    plt.show()

In [10]:
def calc_features(url, html_data):
    """Compute the feature list of one document.

    url       - document url (currently not used by any feature)
    html_data - raw HTML of the document (byte string or unicode)

    Returns [words_num, avg_word_len, title_words_num, anchor_words_num,
    compression_level, text], where text is the extracted page text and
    compression_level is the zlib-compressed size divided by the size of the
    UTF-8 encoded document. The averages and ratios are 0.0 for a page without
    words or an empty document instead of raising ZeroDivisionError.
    """
    # Parse the HTML only once: the same (text, title, link) result feeds both
    # the tokenizer (through html2word) and the raw text feature.
    parsed = html2text(html_data)
    words, title, link = map(list, html2word(html_data, to_text=lambda _raw: parsed))

    words_num = len(words)
    avg_word_len = safe_divide(sum(len(word) for word in words), words_num)

    # Both sizes are measured in bytes of the same UTF-8 representation.
    raw_bytes = to_utf8(html_data)
    compression_level = safe_divide(len(zlib.compress(raw_bytes)), len(raw_bytes))

    return [words_num, avg_word_len, len(title), len(link), compression_level, parsed[0]]

In [11]:
# Smoke test: compute the features of a tiny hand-written page and print the
# extracted page text (feature index 5).
test_html_data = u'''
<html>
<title> Заголовок Ololo </title>
спам 1 2 3
<body> <a>Текст ссылки </a> 
<a>Текст ссылки 1 </a> 
</body>
</html>
'''
test_url = 'http://ololo'
test_features = calc_features(test_url, test_html_data)
print test_features[5]

Заголовок Ololo спам 1 2 3 Текст ссылки Текст ссылки 1


In [12]:
from multiprocessing import Process, Queue

# One loaded document: its id, spam label, url and the features computed from its content.
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'features'])

# Number of worker processes; worker i handles the rows whose index % WORKER_NUM == i.
WORKER_NUM = 4

def load_csv(input_file_name, calc_features_f):
    """Lazily load documents from a tab-separated file, extracting features on the fly.

    The page content itself is not kept, to reduce memory consumption, so that
    the code can be run even on the laptops in the classroom.

    input_file_name - path to the data file; opened with gzip when the name
                      ends with 'gz'. The first line is a header and is skipped.
    calc_features_f - callable (url, html_data) -> features

    Every following line holds: document id, class label (0/1), url and the
    base64-encoded page. Yields one DocItem per line.
    """
    opener = gzip.open if input_file_name.endswith('gz') else open
    with opener(input_file_name) as input_file:
        input_file.readline()  # skip the header line

        i = -1  # stays -1 when the file has no data rows
        for i, line in enumerate(input_file):
            trace(i)
            parts = line.strip().split('\t')
            url_id = int(parts[0])
            mark = bool(int(parts[1]))
            url = parts[2]
            html_data = base64.b64decode(parts[3])
            features = calc_features_f(url, html_data)
            yield DocItem(url_id, mark, url, features)

        # Always report the index of the last row; skipped for a file without
        # data rows, where there is nothing to report.
        if i >= 0:
            trace(i, 1)
        
def load_csv_worker(input_file_name, calc_features_f, worker_id, res_queue):
    """Worker body: process this worker's share of the rows and queue the results.

    The whole file is read, but only the rows whose index satisfies
    index % WORKER_NUM == worker_id are parsed (document id, class label, url,
    base64-encoded page) and put on res_queue as DocItem objects.

    A final None is always put on res_queue - also when processing fails - so
    that the consumer counting the finished workers never waits forever. A
    failure is logged and re-raised.
    """
    opener = gzip.open if input_file_name.endswith('gz') else open
    try:
        with opener(input_file_name) as input_file:
            input_file.readline()  # skip the header line

            i = -1  # stays -1 when the file has no data rows
            for i, line in enumerate(input_file):
                trace_worker(i, worker_id)
                if i % WORKER_NUM != worker_id:
                    continue
                parts = line.strip().split('\t')
                url_id = int(parts[0])
                mark = bool(int(parts[1]))
                url = parts[2]
                html_data = base64.b64decode(parts[3])
                features = calc_features_f(url, html_data)
                res_queue.put(DocItem(url_id, mark, url, features))

            # Report the index of the last row; nothing to report for a file
            # without data rows.
            if i >= 0:
                trace_worker(i, worker_id, 1)
    except Exception:
        logging.exception("Worker %d failed", worker_id)
        raise
    finally:
        # End-of-stream marker for this worker.
        res_queue.put(None)
        
def load_csv_multiprocess(input_file_name, calc_features_f):
    """Load documents with WORKER_NUM parallel processes and yield them as they arrive.

    Every process runs load_csv_worker and sends its DocItem objects through a
    shared queue, followed by a None marker. Items are yielded until a marker
    has been received from every worker; the processes are joined afterwards.
    The order of the yielded items is therefore not the order of the file.
    """
    res_queue = Queue()
    processes = []
    for worker_id in xrange(WORKER_NUM):
        worker = Process(target=load_csv_worker,
                         args=(input_file_name, calc_features_f, worker_id, res_queue))
        processes.append(worker)
        worker.start()

    finished_workers = 0
    while finished_workers != WORKER_NUM:
        item = res_queue.get()
        if item is not None:
            yield item
            continue
        finished_workers += 1

    for worker in processes:
        worker.join()

**Обрабатываем входной файл**
<br>
Формат - поля разделенные табуляциями
<br>
0 - идентификатор документа
<br>
1 - метка класса: 0 - не спам, 1 - спам
<br>
2 - урл документа
<br>
3 - документ в кодировке base64

Выходной формат - массив кортежей вида
(doc_id, is_spam, url, features)

In [13]:
%%time

# Labelled training set (gzip-compressed file with tab-separated fields).
TRAIN_DATA_FILE  = 'kaggle_train_data_tab.csv.gz'
# TRAIN_DATA_FILE  = 'kaggle/kaggle_train_data_tab_300.csv.gz'

# Load all training documents, computing their features in parallel worker processes.
train_docs = list(load_csv_multiprocess(TRAIN_DATA_FILE, calc_features))

23:33:14 INFO:Complete items 00000 in worker_id 2
23:33:14 INFO:Complete items 00000 in worker_id 0
23:33:14 INFO:Complete items 00000 in worker_id 3
23:33:14 INFO:Complete items 00000 in worker_id 1
23:33:56 INFO:Complete items 01000 in worker_id 0
23:33:57 INFO:Complete items 01000 in worker_id 2
23:33:58 INFO:Complete items 01000 in worker_id 3
23:34:00 INFO:Complete items 01000 in worker_id 1
23:34:09 INFO:Complete items 02000 in worker_id 0
23:34:09 INFO:Complete items 02000 in worker_id 1
23:34:10 INFO:Complete items 02000 in worker_id 3
23:34:10 INFO:Complete items 02000 in worker_id 2
23:34:19 INFO:Complete items 03000 in worker_id 0
23:34:19 INFO:Complete items 03000 in worker_id 1
23:34:20 INFO:Complete items 03000 in worker_id 3
23:34:22 INFO:Complete items 03000 in worker_id 2
23:34:32 INFO:Complete items 04000 in worker_id 0
23:34:33 INFO:Complete items 04000 in worker_id 1
23:34:34 INFO:Complete items 04000 in worker_id 2
23:34:35 INFO:Complete items 04000 in worker_id 3


CPU times: user 1.2 s, sys: 477 ms, total: 1.68 s
Wall time: 1min 51s


In [14]:
# Target vector: 1 for spam, 0 for not spam, in the order of train_docs.
# NOTE(review): prep_text is built from train_docs without this filter, so the
# two stay aligned only while no document has features set to None.
texts_target = [int(doc.is_spam) for doc in train_docs if doc.features is not None]

In [31]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import xgboost as xgb

In [16]:
# NLTK's Russian stop-word list, extended with a few more frequent words and the dash.
stop_words = stopwords.words('russian')
stop_words.extend([u'что', u'это', u'так', u'вот', u'быть', u'как', u'в', u'—', u'к', u'на'])

In [17]:
# Matches every character that is not a Latin or Russian letter. The letters
# "ё"/"Ё" are listed explicitly because they lie outside the а-я / А-Я ranges
# and words containing them would otherwise be split.
_NON_LETTER_RE = re.compile(u'[^a-zA-Zа-яА-ЯёЁ]')


def prep(text):
    """Return text with every non-letter character replaced by a space."""
    return _NON_LETTER_RE.sub(u' ', text)

In [18]:
# Lower-cased, letters-only page text (feature index 5) of every training document.
prep_text = [prep(doc.features[5].lower()) for doc in train_docs]

In [19]:
# TF-IDF over word 1- to 3-grams; stop words are removed and terms found in
# fewer than 5 documents are ignored.
vectorizer_prep = TfidfVectorizer(ngram_range=(1,3), min_df=5, stop_words=stop_words)

In [20]:
%%time 
# Learn the vocabulary and IDF weights from the training texts.
vectorizer_prep.fit(prep_text)

CPU times: user 2min 23s, sys: 7.81 s, total: 2min 31s
Wall time: 2min 30s


TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=[u'\u0438', u'\u0432', u'\u0432\u043e', u'\u043d\u0435', u'\u0447\u0442\u043e', u'\u043e\u043d', u'\u043d\u0430', u'\u044f', u'\u0441', u'\u0441\u043e', u'\u043a\u0430\u043a', u'\u0430', u'\u0442\u043e', u'\u0432\u0441\u0435', u'\u043e\u043d\u0430', u'\u0442\u0430\u043a', u'\u0435\u0433\u...'\u0431\u044b\u0442\u044c', u'\u043a\u0430\u043a', u'\u0432', u'\u2014', u'\u043a', u'\u043d\u0430'],
        strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
%%time 
# TF-IDF document-term matrix of the training texts.
X_vec_prep = vectorizer_prep.transform(prep_text)

CPU times: user 39.3 s, sys: 677 ms, total: 40 s
Wall time: 39.9 s


In [22]:
# Gradient-boosted trees: 400 estimators of depth 5; n_jobs=-1 uses all available cores.
gb_prep = xgb.XGBClassifier(n_jobs=-1, max_depth=5, n_estimators=400)

# Logistic regression; C is the inverse regularization strength, so C=1000 regularizes weakly.
lr_prep = LogisticRegression(C=1000)

In [23]:
# Train the boosted-tree model on the TF-IDF features (timed).
%time gb_prep.fit(X_vec_prep, texts_target)

CPU times: user 48min 54s, sys: 3min 7s, total: 52min 2s
Wall time: 4min 8s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=400,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [24]:
# Train the logistic regression on the same TF-IDF features (timed).
%time lr_prep.fit(X_vec_prep, texts_target)

CPU times: user 1min 26s, sys: 6.28 s, total: 1min 32s
Wall time: 8.99 s


LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
%%time

# Test set to predict (same file format as the training set).
TEST_DATA_FILE  = 'kaggle_test_data_tab.csv.gz'
# TEST_DATA_FILE  = 'kaggle/kaggle_train_data_tab_300.csv.gz'

# Load all test documents, computing their features in parallel worker processes.
test_docs = list(load_csv_multiprocess(TEST_DATA_FILE, calc_features))

23:42:46 INFO:Complete items 00000 in worker_id 0
23:42:46 INFO:Complete items 00000 in worker_id 1
23:42:46 INFO:Complete items 00000 in worker_id 2
23:42:46 INFO:Complete items 00000 in worker_id 3
23:42:58 INFO:Complete items 01000 in worker_id 2
23:43:01 INFO:Complete items 01000 in worker_id 1
23:43:06 INFO:Complete items 01000 in worker_id 3
23:43:09 INFO:Complete items 01000 in worker_id 0
23:43:14 INFO:Complete items 02000 in worker_id 2
23:43:17 INFO:Complete items 02000 in worker_id 1
23:43:27 INFO:Complete items 02000 in worker_id 0
23:43:31 INFO:Complete items 02000 in worker_id 3
23:43:31 INFO:Complete items 03000 in worker_id 2
23:43:37 INFO:Complete items 03000 in worker_id 1
23:43:44 INFO:Complete items 03000 in worker_id 3
23:43:45 INFO:Complete items 04000 in worker_id 2
23:43:45 INFO:Complete items 03000 in worker_id 0
23:43:52 INFO:Complete items 04000 in worker_id 1
23:43:58 INFO:Complete items 05000 in worker_id 2
23:44:00 INFO:Complete items 04000 in worker_id 0


CPU times: user 6.71 s, sys: 2.41 s, total: 9.12 s
Wall time: 4min 17s


In [26]:
# Lower-cased, letters-only page text (feature index 5) of every test document.
test_prep = [prep(doc.features[5].lower()) for doc in test_docs]

In [27]:
# TF-IDF matrix of the test texts, built with the vectorizer fitted on the training texts.
X_test_prep = vectorizer_prep.transform(test_prep)

In [28]:
# Per-document class probabilities from the boosted trees; the submission code
# reads column 0 as the probability of class 0 (not spam).
gb_prep_proba = gb_prep.predict_proba(X_test_prep)

In [29]:
# Per-document class probabilities from the logistic regression (same column layout).
lr_prep_proba = lr_prep.predict_proba(X_test_prep)

In [32]:
# Write the submission: average the class-0 probabilities of both models and
# predict 0 when the average exceeds 0.5, otherwise 1.
with open('my_submission_prep.csv', 'wb') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id','Prediction'])
    for idx, doc in enumerate(test_docs):
        class0_probas = np.array([gb_prep_proba[idx][0], lr_prep_proba[idx][0]])
        prediction = 0 if class0_probas.mean() > 0.5 else 1
        writer.writerow([doc[0], prediction])