In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
from multiprocessing import Pool, Lock, Value
from time import sleep
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
import base64
import numpy as np
import zlib
from itertools import product
import re

In [2]:
df = pd.read_csv('data/kaggle_train_data_tab_new.csv', sep='\t')

In [4]:
def extract_features(page_info):
    html_page, url = page_info
    html = base64.b64decode(html_page)
    soup = BeautifulSoup(html)
    compressibility = len(html) / len(zlib.compress(html))
    anchor_words = 0
    url_len = len(url)
    url_dots = url.count(".")
    url_slash = url.count("/")
    if soup.title:
        title_words_cnt = len(soup.title.text.split())
    else:
        title_words_cnt = 0
    if soup.text:
        text = soup.text
        words_cnt = len(text.split())
        if text.split():
            mean_word_length = np.mean(list(map(len, text.split())))
        else: 
            mean_word_length = 0
    else:
        words_cnt = 0
        mean_word_length = 0
    link_cnt = len(soup.find_all('link'))
    img_cnt = len(soup.find_all('img'))
    style_cnt = len(soup.find_all('style'))
    script_cnt = len(soup.find_all('script'))
    if soup.head:
        head_length = len(soup.head.text)
    else:
        head_length = 0
    anchors = soup.find_all('a')
    if anchors:
        for item in anchors:
            anchor_words += len(item.text.split())
    else:
        anchor_words = 0
    anchor_cnt = len(anchors)
    return [url_len, url_dots, url_slash,
            title_words_cnt, words_cnt,
            mean_word_length, link_cnt, img_cnt,
            style_cnt, script_cnt, head_length,
            anchor_cnt, anchor_words, compressibility]

In [5]:
mutex = Lock()
n_processed = Value('i', 0)

def func_wrapper(page):
    res = extract_features(page) 
    with mutex:
        # в этом блоке можно безопасно менять общие объекты для процессов
        global n_processed
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return res

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, zip(df['PageBase64'].values, df['Url'].values))

7040 objects are processed...

In [6]:
X_train = np.array(res)
y_train = df['Prediction'].values

In [7]:
# params = product(
#     (400, 600, 800, 1000, 1200), # n_estimators
#     (0.01, 0.1, 0.2, 0.3), # learning_rate
#     (3, 6, 9), # max_depth
# )
# res_score = 0

# for n_estimators, learning_rate, max_depth in list(params):
#     clf = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
#                         max_depth=max_depth, nthread=4)
#     res = cross_validate(clf, X_train, y_train, cv=3, scoring='f1')
#     score = res['test_score'].mean()
#     print(score)
#     print(clf)
#     if score > res_score:
#         res_score = score
#         res_model = clf
        
# res_model, res_score

In [8]:
clf = XGBClassifier(learning_rate=0.01, max_depth=9,
                    n_estimators=1200, nthread=4)
scores = cross_validate(clf, X_train, y_train, cv=3, scoring='f1')
scores['test_score'].mean()

0.9456146346303794

In [9]:
# clf = res_model
clf = XGBClassifier(learning_rate=0.01, max_depth=9,
                    n_estimators=1200, nthread=4)

clf.fit(X_train, y_train)

df_test = pd.read_csv('data/kaggle_test_data_tab_new.csv', sep='\t')

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, zip(df_test['PageBase64'].values, 
                                     df_test['Url'].values))
X_test = np.array(res)
y_pred = clf.predict(X_test)

23080 objects are processed...

In [10]:
df_pred = pd.DataFrame({
    'Id': df_test['Id'].values,
    'Prediction': y_pred
})
df_pred.to_csv('my_submission.csv', index=False) 
