In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
from multiprocessing import Pool, Lock, Value
from time import sleep
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
import base64
import numpy as np
import zlib

In [3]:
df = pd.read_csv('data/kaggle_train_data_tab_new.csv', sep='\t')

In [4]:
def extract_features(html_page):
    html = base64.b64decode(html_page)
    soup = BeautifulSoup(html)
    compressibility = len(html) / len(zlib.compress(html))
    anchor_words = 0
    if soup.title:
        title_words_cnt = len(soup.title.text.split())
    else:
        title_words_cnt = 0
    if soup.text:
        text_list = soup.text.split()
        words_cnt = len(text_list)
        if text_list:
            mean_word_length = np.mean(list(map(len, text_list)))
        else: 
            mean_word_length = 0
    else:
        words_cnt = 0
        mean_word_length = 0
    link_cnt = len(soup.find_all('link'))
    img_cnt = len(soup.find_all('img'))
    style_cnt = len(soup.find_all('style'))
    style_length = len(str(soup.find_all('style')))
    script_cnt = len(soup.find_all('script'))
    script_length = len(str(soup.find_all('script')))
    if soup.head:
        head_length = len(soup.head.text)
    else:
        head_length = 0
    anchors = soup.find_all('a')
    if anchors:
        for item in anchors:
            anchor_words += len(item.text.split())
    else:
        anchor_words = 0
    anchor_cnt = len(anchors)
    return [title_words_cnt, words_cnt, mean_word_length, link_cnt, img_cnt,
            style_cnt, style_length, script_cnt, script_length, head_length,
            anchor_cnt, anchor_words, compressibility]

In [5]:
mutex = Lock()
n_processed = Value('i', 0)

def func_wrapper(page):
    res = extract_features(page) 
    with mutex:
        # в этом блоке можно безопасно менять общие объекты для процессов
        global n_processed
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return res

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, df['PageBase64'].values)


7040 objects are processed...

In [7]:
X_train = np.array(res)
y_train = df['Prediction'].values

In [8]:
clf = XGBClassifier(n_estimators=1000, n_jobs=4)
scores = cross_validate(clf, X_train, y_train, cv=3, scoring='f1')

In [9]:
scores['test_score'].mean()

0.9152538090839978

In [10]:
clf = XGBClassifier()
clf.fit(X_train, y_train)

df_test = pd.read_csv('data/kaggle_test_data_tab_new.csv', sep='\t')

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, df_test['PageBase64'].values)
X_test = np.array(res)
y_pred = clf.predict(X_test)

23080 objects are processed...

In [11]:
df_pred = pd.DataFrame({
    'Id': df_test['Id'].values,
    'Prediction': y_pred
})
df_pred.to_csv('my_submission.csv', index=False) 
