In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
from multiprocessing import Pool, Lock, Value
from time import sleep
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
import base64
import numpy as np
import zlib
from itertools import product
import re

In [2]:
df = pd.read_csv('data/kaggle_train_data_tab_new.csv', sep='\t')

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub('[^а-яёА-ЯЁ\w]+', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [4]:
def extract_features(html_page):
    html = base64.b64decode(html_page)
    soup = BeautifulSoup(html)
    compressibility = len(html) / len(zlib.compress(html))
    anchor_words = 0
    if soup.title:
        title_words_cnt = len(soup.title.text.split())
    else:
        title_words_cnt = 0
    if soup.text:
        text = soup.text
        words_cnt = len(text.split())
        if text.split():
            cl_text = clean_text(text)
            clean_words_cnt = len(cl_text.split())
            mean_word_length = np.mean(list(map(len, text.split())))
            mean_clean_word_length = np.mean(list(map(len, cl_text.split())))
        else: 
            clean_words_cnt = 0
            mean_word_length = 0
            mean_clean_word_length = 0
    else:
        words_cnt = 0
        mean_word_length = 0
        clean_words_cnt = 0
        mean_clean_word_length = 0
    link_cnt = len(soup.find_all('link'))
    img_cnt = len(soup.find_all('img'))
    style_cnt = len(soup.find_all('style'))
    style_length = len(str(soup.find_all('style')))
    script_cnt = len(soup.find_all('script'))
    script_length = len(str(soup.find_all('script')))
    if soup.head:
        head_length = len(soup.head.text)
    else:
        head_length = 0
    anchors = soup.find_all('a')
    if anchors:
        for item in anchors:
            anchor_words += len(item.text.split())
    else:
        anchor_words = 0
    anchor_cnt = len(anchors)
    return [title_words_cnt, words_cnt, clean_words_cnt,
            mean_word_length, mean_clean_word_length, link_cnt, img_cnt,
            style_cnt, style_length, script_cnt, script_length, head_length,
            anchor_cnt, anchor_words, compressibility]

In [5]:
mutex = Lock()
n_processed = Value('i', 0)

def func_wrapper(page):
    res = extract_features(page) 
    with mutex:
        # в этом блоке можно безопасно менять общие объекты для процессов
        global n_processed
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return res

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, df['PageBase64'].values)


4660 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


6790 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


7040 objects are processed...

In [6]:
X_train = np.array(res)
y_train = df['Prediction'].values

In [7]:
# params = product(
#     (400, 600, 800, 1000, 1200), # n_estimators
#     (0.01, 0.1, 0.2, 0.3), # learning_rate
#     (3, 6, 9), # max_depth
# )
# res_score = 0

# for n_estimators, learning_rate, max_depth in list(params):
#     clf = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
#                         max_depth=max_depth, nthread=4)
#     res = cross_validate(clf, X_train, y_train, cv=3, scoring='f1')
#     score = res['test_score'].mean()
#     print(score)
#     print(clf)
#     if score > res_score:
#         res_score = score
#         res_model = clf
        
# res_model, res_score

In [9]:
clf = XGBClassifier(learning_rate=0.01, max_depth=9,
                    n_estimators=1200, nthread=4)
scores = cross_validate(clf, X_train, y_train, cv=3, scoring='f1')
scores['test_score'].mean()

0.9209589510135681

In [11]:
# clf = res_model
clf = XGBClassifier(learning_rate=0.01, max_depth=9,
                    n_estimators=1200, nthread=4)

clf.fit(X_train, y_train)

df_test = pd.read_csv('data/kaggle_test_data_tab_new.csv', sep='\t')

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, df_test['PageBase64'].values)
X_test = np.array(res)
y_pred = clf.predict(X_test)

12610 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


14320 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


19170 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


19300 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


21930 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


23080 objects are processed...

In [12]:
df_pred = pd.DataFrame({
    'Id': df_test['Id'].values,
    'Prediction': y_pred
})
df_pred.to_csv('my_submission.csv', index=False) 
