In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
from multiprocessing import Pool, Lock, Value
from time import sleep
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
import base64
import numpy as np
import zlib

import re
import pymorphy2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [2]:
nltk.download('stopwords')
stopWords = set(stopwords.words('russian'))
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anastasiabogatenkova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anastasiabogatenkova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv('data/kaggle_train_data_tab_new.csv', sep='\t')

In [13]:
def words_stat(text):
    text = text.lower()
    text = re.sub('[^а-яёА-ЯЁ]+', ' ', text)
    text = re.sub('\s+', ' ', text)
    words = word_tokenize(text)
    wordsFiltered = []
    for w in words:
        if w not in stopWords:
            wordsFiltered.append(w)
    morph = pymorphy2.MorphAnalyzer()
    text = list(map(lambda x: morph.parse(x)[0].normal_form, wordsFiltered))
    l = len(text)
    res = [l]
    if Counter(text).most_common():
        res.append(Counter(text).most_common()[0][1] / l)
    else:
        res.append(0)
    return res

In [14]:
def extract_features(html_page):
    html = base64.b64decode(html_page)
    soup = BeautifulSoup(html)
    compressibility = len(html) / len(zlib.compress(html))
    anchor_words = 0
    if soup.title:
        title_words_cnt = len(soup.title.text.split())
    else:
        title_words_cnt = 0
    if soup.text:
        text = soup.text
        words_cnt = len(text.split())
        mean_word_length = np.mean(list(map(len, text.split())))
        words_features = words_stat(text)
    else:
        words_cnt = 0
        mean_word_length = 0
        words_features = [0, 0]
    link_cnt = len(soup.find_all('link'))
    div_cnt = len(soup.find_all('div'))
    anchors = soup.find_all('a')
    for item in anchors:
        anchor_words += len(item.text.split())
    anchor_cnt = len(anchors)
    
    res = [title_words_cnt, words_cnt, mean_word_length, 
           link_cnt, anchor_cnt, anchor_words, div_cnt, compressibility]
    res += words_features
    return res

In [None]:
html = base64.b64decode(df['PageBase64'].values[1])
soup = BeautifulSoup(html)

In [None]:
words_stat(soup.text)

In [15]:
mutex = Lock()
n_processed = Value('i', 0)

def func_wrapper(page):
    res = extract_features(page) 
    with mutex:
        # в этом блоке можно безопасно менять общие объекты для процессов
        global n_processed
        n_processed.value += 1
        if n_processed.value % 10 == 0:
            print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
    return res

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, df['PageBase64'].values)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


10 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


50 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


80 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


120 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


130 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


190 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


250 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


480 objects are processed...

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


7040 objects are processed...

In [16]:
X_train = np.array(res)
y_train = df['Prediction'].values

In [17]:
clf = XGBClassifier()
scores = cross_validate(clf, X_train, y_train, cv=3, scoring='f1')

In [18]:
scores['test_score'].mean()

0.8965148855904063

In [None]:
clf = XGBClassifier()
clf.fit(X_train, y_train)

df_test = pd.read_csv('data/kaggle_test_data_tab_new.csv', sep='\t')

with Pool(processes=10) as pool:
    res = pool.map(func_wrapper, df_test['PageBase64'].values)
X_test = np.array(res)
y_pred = clf.predict(X_test)

In [None]:
df_pred = pd.DataFrame({
    'Id': df_test['Id'].values,
    'Prediction': y_pred
})
df_pred.to_csv('my_submission.csv', index=False) 
