In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
from multiprocessing import Pool, Lock, Value
from time import sleep
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
import base64
import numpy as np
import zlib

In [2]:
# Load the tab-separated training set; later cells read its
# 'PageBase64' and 'Prediction' columns.
df = pd.read_csv(
    'data/kaggle_train_data_tab_new.csv',
    sep='\t',
)

In [25]:
def extract_features(html_page, parser=None):
    """Turn one base64-encoded HTML page into a flat list of numeric features.

    Parameters
    ----------
    html_page : str or bytes
        Base64-encoded HTML document (decoded with ``base64.b64decode``).
    parser : str, optional
        Parser name forwarded to ``BeautifulSoup`` (for example
        ``'html.parser'``). The default ``None`` keeps the previous behaviour
        of letting BeautifulSoup choose a parser on its own; pass an explicit
        name to get the same parse tree regardless of which parsers are
        installed.

    Returns
    -------
    list
        ``[title_words_cnt, words_cnt, mean_word_length, link_cnt, img_cnt,
        anchor_cnt, anchor_words, compressibility]``:

        * title_words_cnt  - whitespace-separated words in ``<title>`` (0 if absent)
        * words_cnt        - whitespace-separated words in the page text
        * mean_word_length - mean length of those words (0 if there are none)
        * link_cnt         - number of ``<link>`` tags
        * img_cnt          - number of ``<img>`` tags
        * anchor_cnt       - number of ``<a>`` tags
        * anchor_words     - total words inside all ``<a>`` tags
        * compressibility  - raw byte length divided by zlib-compressed length
    """
    html = base64.b64decode(html_page)
    soup = BeautifulSoup(html, parser)

    # zlib output always contains header/checksum bytes, so the divisor is
    # never zero even for an empty document.
    compressibility = len(html) / len(zlib.compress(html))

    # Look the title up once; each `soup.title` access searches the tree.
    title = soup.title
    title_words_cnt = len(title.text.split()) if title else 0

    # `soup.text` walks the whole document, so evaluate it a single time.
    # An empty text yields an empty list, which covers the "no text" case.
    page_words = soup.text.split()
    words_cnt = len(page_words)
    mean_word_length = np.mean([len(word) for word in page_words]) if page_words else 0

    link_cnt = len(soup.find_all('link'))
    img_cnt = len(soup.find_all('img'))

    anchors = soup.find_all('a')
    anchor_cnt = len(anchors)
    # Summing over an empty result naturally gives 0.
    anchor_words = sum(len(anchor.text.split()) for anchor in anchors)

    return [title_words_cnt, words_cnt, mean_word_length, link_cnt, img_cnt,
            anchor_cnt, anchor_words, compressibility]

In [19]:
# Decode and parse a single training page (row index 1) for manual inspection.
sample_page_b64 = df['PageBase64'].values[1]
html = base64.b64decode(sample_page_b64)
soup = BeautifulSoup(html)

In [21]:
# Look at the <style> elements of the sample page
# (original note on this line: "script, css").
soup.find_all('style')

[<style>
 #domain-discovery {
 	background: url('http://www.uz.ru/wp-content/plugins/domain-name-search/img/domain-tool-bg/bg-red.jpg') no-repeat 0 0;
 }
                     </style>,
 <style type="text/css">
 #chatoutput { 
 	border: 1px solid #0066CC; 
 	color: #333333;
 	background: #FFFFFF;
 }
 #chatoutput span { 
 	color: #0066CC;
 }
 #chatoutput li a { 
 	color: #0066CC;
 }
 #chatoutput li span a {
 	border-bottom: 1px dotted #0066CC;
 }
 #chatoutput ul#outputList li {
 	color: #333333;
 	min-height: 32px;
 }
 #lastMessage {
 	border-bottom: 2px dotted #888888;
 }
 #usersOnline {
 	color: #0066CC; 
 }
 tr.bg td {
 	border-bottom: 1px dashed #888888;
 }
 tr.bg:hover td, tr.bg:hover td a {
 	 background: #0066CC;
 	 color: #FFFFFF;
 }
 </style>,
 <style type="text/css">.recentcomments a{display:inline !important;padding:0 !important;margin:0 !important;}</style>]

In [26]:
# State shared with the worker processes:
#   n_processed - integer ('i') counter of pages handled so far
#   mutex       - lock held while the counter is updated and printed
n_processed = Value('i', 0)
mutex = Lock()

def func_wrapper(page):
    """Extract features for one page and advance the shared progress counter.

    Calls ``extract_features(page)``; then, while holding ``mutex``,
    increments ``n_processed`` and rewrites the progress line whenever the
    counter is a multiple of 10. Returns the extracted features unchanged.
    """
    features = extract_features(page)
    with mutex:
        # Inside this block objects shared between processes can be
        # modified safely. No `global` declaration is needed: the counter
        # object is mutated through `.value`, the name is never rebound.
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return features

# Extract features for every training page in parallel (10 worker processes);
# `res` holds one feature list per row, in row order.
with Pool(10) as workers:
    res = workers.map(func_wrapper, df['PageBase64'].values)


7040 objects are processed...

In [27]:
# Targets come from the 'Prediction' column; the design matrix is built
# from the per-page feature lists collected in `res`.
y_train = df['Prediction'].values
X_train = np.asarray(res)

In [28]:
# 3-fold cross-validation of a default XGBoost classifier, scored by F1.
clf = XGBClassifier()
scores = cross_validate(
    estimator=clf,
    X=X_train,
    y=y_train,
    cv=3,
    scoring='f1',
)

In [29]:
# Mean F1 across the cross-validation folds.
np.mean(scores['test_score'])

0.8892594554433643

In [None]:
# Fit the final model on the full training set.
clf = XGBClassifier()
clf.fit(X_train, y_train)

# Load the tab-separated test set; it provides the 'PageBase64' and 'Id' columns.
df_test = pd.read_csv('data/kaggle_test_data_tab_new.csv', sep='\t')

# The progress counter is shared with the training-set extraction; reset it
# before the workers start so the progress line counts test pages from zero
# instead of continuing from the training total.
n_processed.value = 0

# Extract features for every test page in parallel, then predict.
with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, df_test['PageBase64'].values)
X_test = np.array(res)
y_pred = clf.predict(X_test)

In [None]:
# Pair each test Id with its predicted label and write the submission file
# (no index column).
df_pred = pd.DataFrame({'Id': df_test['Id'].values, 'Prediction': y_pred})
df_pred.to_csv('my_submission.csv', index=False)
