In [6]:
import datetime, re, os, sys, json, pickle 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#from bs4 import BeautifulSoup as bs

#import requests
#import lxml.html

from scipy import sparse
#import lightgbm as lgb

from functions import dict_gender_to_gendercategory
from functions import dict_age_to_agecategory
from functions import dict_gendercategory_to_gender
from functions import dict_agecategory_to_age

from functions import load_user_json
from functions import url2domain
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter

import pymystem3
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV

In [7]:
#https://bigartm.readthedocs.io/en/stable/installation/linux.html
import artm

In [8]:
import gc
gc.collect()

7

## read data + basic features

In [9]:
%%time

file_path = '../data/share/project01/gender_age_dataset.txt'
df = pd.read_csv(file_path, sep='\t')

CPU times: user 8.28 s, sys: 1.59 s, total: 9.88 s
Wall time: 10.7 s


In [10]:
%%time

df['user_json'] = load_user_json(df['user_json'])
df['domain_list'] = df['user_json'].map(lambda x: [url2domain(visit['url']) for visit in x['visits']])
df['norm_domain_list'] = df['domain_list'].apply(lambda x: [kek[2:-1] for kek in x])

CPU times: user 1min 39s, sys: 3.63 s, total: 1min 43s
Wall time: 1min 46s


In [11]:
%%time

df['gender_cat'] = df['gender'].map(dict_gender_to_gendercategory)
df['age_cat'] = df['age'].map(dict_age_to_agecategory)

df['time_list'] = df['user_json'].map(lambda x: [visit['timestamp'] for visit in x['visits']])

df['visits'] = df['time_list'].apply(len)
df['time_range'] = df['time_list'].apply(np.ptp)
df['visit_rate'] = df['time_range']/df['visits']

df['unique_domains_cnt'] = df['domain_list'].apply(lambda x: len(set(x)))
df['tot_domains_cnt'] = df['domain_list'].apply(lambda x: len(x))
df['avg_visits_per_domain'] = df['tot_domains_cnt']/df['unique_domains_cnt']

CPU times: user 2.89 s, sys: 364 ms, total: 3.25 s
Wall time: 3.42 s


## from json

#### read json

In [12]:
import json

In [13]:
!pwd

/Users/antonina.goryacheva/Desktop/content_bigdata10_proj1_kek/notebooks


In [14]:
%%time
with open('../data/titles-aiohttp.json', 'r') as f:
    results = json.load(f)

CPU times: user 1.56 s, sys: 230 ms, total: 1.79 s
Wall time: 2.29 s


In [15]:
results

{'—é—Ä–∏—Å—Ç-–º–∏—Ö–∞–∏–ª-–±–∞–±–∏–Ω.—Ä—Ñ': {'domain': '—é—Ä–∏—Å—Ç-–º–∏—Ö–∞–∏–ª-–±–∞–±–∏–Ω.—Ä—Ñ',
  'url': 'http://—é—Ä–∏—Å—Ç-–º–∏—Ö–∞–∏–ª-–±–∞–±–∏–Ω.—Ä—Ñ',
  'title': None,
  'keywords': None,
  'error': 'Cannot connect to host —é—Ä–∏—Å—Ç-–º–∏—Ö–∞–∏–ª-–±–∞–±–∏–Ω.—Ä—Ñ:80 ssl:None [nodename nor servname provided, or not known]'},
 '—ç—Ñ–∏—Ä–Ω—ã–µ–º–∞—Å–ª–∞.—Ä—Ñ': {'domain': '—ç—Ñ–∏—Ä–Ω—ã–µ–º–∞—Å–ª–∞.—Ä—Ñ',
  'url': 'http://—ç—Ñ–∏—Ä–Ω—ã–µ–º–∞—Å–ª–∞.—Ä—Ñ',
  'title': '–ò–Ω—Ç–µ—Ä–Ω–µ—Ç –º–∞–≥–∞–∑–∏–Ω —ç—Ñ–∏—Ä–Ω—ã—Ö –º–∞—Å–µ–ª, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ—Ç–∏–∫–∏ –∏ –∞—Ä–æ–º–∞—Ç–µ—Ä–∞–ø–∏–∏ ¬´–¶–ê –ò—Ä–∏—Å¬ª.',
  'keywords': None},
 '—ç—Ä–æ–ª—é–±.—Ä—Ñ': {'domain': '—ç—Ä–æ–ª—é–±.—Ä—Ñ',
  'url': 'http://—ç—Ä–æ–ª—é–±.—Ä—Ñ',
  'title': '–≠—Ä–æ–ª—é–±.—Ä—Ñ - —Å—Ç—Ä–∞—Å—Ç–Ω—ã–µ –∑–Ω–∞–∫–æ–º—Å—Ç–≤–∞ –¥–ª—è –≤–∑—Ä–æ—Å–ª—ã—Ö —Å —Ä–µ–∞–ª—å–Ω—ã–º–∏ —Ñ–æ—Ç–æ. | –°–∞–π—Ç –¥–ª—è —Å–µ—Ä—å—ë–∑–Ω—ã—Ö –æ—Ç–Ω–æ—à–µ–Ω–∏–π, –ª—é–±–≤–∏ –∏ —Ä–æ–º–∞–Ω—Ç–∏–∫–∏ –¥–ª—è –≤—Å—Ç—Ä–µ—á –∏ –ª–∏—á–Ω–æ–π –∂–∏–∑–Ω–∏.',
  'keywords':

In [16]:
len(results)

75608

## –ß–∏—Å—Ç–∏–º –ø–æ–ª—É—á–µ–Ω–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ

##### –ø–æ–ª—É—á–∏–ª–∞ —á–µ—Ä–µ–∑ –≤–æ—Ç —Ç–∞–∫—É—é –∫–æ–º–∞–Ω–¥—É

cat content_bigdata10_proj1_kek/data/titles-aiohttp.json | jq '.[] | .title' | sort | uniq -c | sort -rn | head 

In [17]:
bad_titles = ['403 Forbidden',
'404 - Ladefehler der Webseite',
'404 - Unable to load website',
'Loading...',
'–ì–ª–∞–≤–Ω–∞—è',
'410 - Website deleted',
'404 Not Found',
'‚Äî @–¥–Ω–µ–≤–Ω–∏–∫–∏: –∞—Å–æ—Ü–∏–∞–ª—å–Ω–∞—è —Å–µ—Ç—å',
'404 - Impossible de t√©l√©charger le site',
'–ì–ª–∞–≤–Ω–∞—è —Å—Ç—Ä–∞–Ω–∏—Ü–∞',
'404 - –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å —Å–∞–π—Ç',
'Just a moment...',
'–ü–µ—Ä—Å–æ–Ω–∞–ª—å–Ω—ã–π —Å–∞–π—Ç - –ì–ª–∞–≤–Ω–∞—è —Å—Ç—Ä–∞–Ω–∏—Ü–∞',
'Welcome!',
'502 Bad Gateway',
'503 Service Temporarily Unavailable',
'–≠—Ç–æ—Ç –¥–æ–º–µ–Ω –ø—Ä–∏–ø–∞—Ä–∫–æ–≤–∞–Ω –∫–æ–º–ø–∞–Ω–∏–µ–π Timeweb',
'–î–æ–º–µ–Ω –Ω–µ –ø—Ä–∏–ª–∏–Ω–∫–æ–≤–∞–Ω –Ω–∏ –∫ –æ–¥–Ω–æ–π –∏–∑ –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏–π –Ω–∞ —Å–µ—Ä–≤–µ—Ä–µ!',
'Attention Required! | Cloudflare',
'Konto ist gesperrt',
'Apache HTTP Server Test Page powered by CentOS',
'–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ—Ç–∫—Ä—ã—Ç–∏–∏ —Å—Ç—Ä–∞–Ω–∏—Ü—ã',
'Welcome to nginx!',
'Index of /',
'–°—Ä–æ–∫ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏ –¥–æ–º–µ–Ω–∞ –∑–∞–∫–æ–Ω—á–∏–ª—Å—è. –ö—É–ø–∏—Ç—å –¥–æ–º–µ–Ω –º–æ–∂–Ω–æ —Ç—É—Ç.',
'–†–µ—Å—É—Ä—Å –∑–∞–±–ª–æ–∫–∏—Ä–æ–≤–∞–Ω - Resource is blocked',
'–†–∞–±–æ—Ç–∞ —Å–∞–π—Ç–∞ –≤—Ä–µ–º–µ–Ω–Ω–æ –ø—Ä–∏–æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞',
'–í –±–ª–∏–∂–∞–π—à–∏–µ —Å—É—Ç–∫–∏ —Ä–∞–±–æ—Ç–∞ —Å–∞–π—Ç–∞ –≤–æ—Å—Å—Ç–∞–Ω–æ–≤–∏—Ç—Å—è.',
'–ü–µ—Ä—Å–æ–Ω–∞–ª—å–Ω—ã–π —Å–∞–π—Ç - –ì–ª–∞–≤–Ω–∞—è',
'[.m] masterhost - –ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π —Ö–æ—Å—Ç–∏–Ω–≥ —Å–∞–π—Ç–∞',
'Waiting for the redirectiron...',
'job.ru –ø–µ—Ä–µ–µ—Ö–∞–ª –Ω–∞ hh.ru',
'Account disabled by server administrator',
'–≠—Ç–æ—Ç —Å–∞–π—Ç –∑–∞–±–ª–æ–∫–∏—Ä–æ–≤–∞–Ω',
'–í–µ–¥—É—Ç—Å—è —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–µ —Ä–∞–±–æ—Ç—ã',
'Account Suspended',
'Error 404 (Not Found)!!1',
'404 - Not Found',
'–≠—Ç–æ—Ç –¥–æ–º–µ–Ω –ø—Ä–æ–¥–∞—ë—Ç—Å—è',
'500 Internal Server Error',
'Access Denied',
'–ò—Å—Ç—ë–∫ —Å—Ä–æ–∫ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏ –¥–æ–º–µ–Ω–∞\xa0zubovskaya-banya.ru']

In [18]:
%%time
clean_results = {}
for result in results:
    if (results[result]['title'] not in bad_titles) and \
    (results[result]['title'] is not None or results[result]['keywords'] is not None):
        clean_results[result] = results[result]

CPU times: user 130 ms, sys: 3.76 ms, total: 134 ms
Wall time: 136 ms


In [19]:
len(clean_results)

65328

In [20]:
# titles = []
# keywords = []
# descriptions = []
# for domain in clean_results:
#     titles.append(clean_results[domain].get('title', ''))
#     keywords.append(clean_results[domain].get('keywords', ''))
#     descriptions.append(clean_results[domain].get('description', ''))

In [21]:
#counts_keywords = Counter(keywords)
#top30_keywords =  sorted(list(counts_keywords.items()), key=lambda tup: tup[1], reverse=True)[:30]

#### train texts

In [22]:
%%time

dom_info = []
for domain in clean_results:
    info = str(clean_results[domain].get('title', '')) \
         + str(clean_results[domain].get('keywords', '')) \
         + str(clean_results[domain].get('description', ''))
    dom_info.append([domain, info])

CPU times: user 1.01 s, sys: 309 ms, total: 1.32 s
Wall time: 1.51 s


In [23]:
len(dom_info)

65328

–î–µ–ª–∞—é –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫—É –ø–æ —á–∞—Å—Ç—è–º, –∏–±–æ –ø–æ—Å—Ç–æ—è–Ω–Ω–æ –ª–æ–≤–∏–ª–∞ broken pipe. –í–æ–æ–±—â–µ –≤—Å–µ –≤ –æ–¥–Ω—É —Ñ—É–Ω–∫—Ü–∏—é –∑–∞–ø–∏—Ö–Ω—É—Ç—å –º–æ–∂–Ω–æ.

In [24]:
def text_to_wordlist(text):
    text = re.sub('[^a-zA-Z–∞-—è–ê-–Ø—ë–Å]', ' ', text)
    text = re.sub('none', ' ', text.lower())
    words = text.lower().strip().split()
    return words

In [25]:
def clean(words, stopWords):
    new_words = [word for word in words if (word not in stopWords) and len(word) > 2]
    return new_words

In [26]:
mystem = pymystem3.Mystem()
wordnet_lemmatizer = WordNetLemmatizer()
def clean_v2(words):    
    new_words = [mystem.lemmatize(x)[0] for x in words]
    new_new_words = [wordnet_lemmatizer.lemmatize(x, pos=wordnet.VERB) for x in new_words]
    return new_new_words

In [27]:
domain_info = pd.DataFrame(data=dom_info, columns=['domain', 'info'])

In [28]:
#–±–µ—Ä–µ–º —Ç–æ–ª—å–∫–æ –±—É–∫–≤—ã + split –ø–æ –ø—Ä–æ–±–µ–ª–∞–º
domain_info['norm_info'] = domain_info['info'].apply(lambda x: text_to_wordlist(x))

In [29]:
%%time
#—É–±–∏—Ä–∞–µ–º —Å—Ç–æ–ø-—Å–ª–æ–≤–∞ + —É–±–∏—Ä–∞–µ–º —Å–ª–æ–≤–∞ –¥–ª–∏–Ω–Ω–æ–π <= 2
stopWords = stopwords.words(['russian', 'english'])
domain_info['norm_info_v2'] = domain_info['norm_info'].apply(lambda x: clean(x, stopWords))

CPU times: user 9.37 s, sys: 91 ms, total: 9.46 s
Wall time: 9.9 s


In [30]:
%%time
#–ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—è
domain_info['norm_info_v3'] = domain_info['norm_info_v2'].apply(lambda x: clean_v2(x))

CPU times: user 1min 11s, sys: 22.2 s, total: 1min 33s
Wall time: 3min 25s


In [31]:
%%time
#–±–µ—Ä–µ–º —É–Ω–∏–∫–∞–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –¥–æ–º–µ–Ω–∞
domain_info['norm_info_v4'] = domain_info['norm_info_v3'].apply(lambda x: list(set(x)))

CPU times: user 314 ms, sys: 28.3 ms, total: 342 ms
Wall time: 364 ms


In [32]:
#—É–±–∏—Ä–∞–µ–º –¥–æ–º–µ–Ω—ã –±–µ–∑ –∏–Ω—Ñ—ã
domain_info['is_empty'] = domain_info['norm_info_v4'].apply(lambda x: 0 if x else 1)
domain_info = domain_info[domain_info['is_empty'] == 0]

In [33]:
domain_info.shape

(64285, 7)

In [34]:
domain_info.head()

Unnamed: 0,domain,info,norm_info,norm_info_v2,norm_info_v3,norm_info_v4,is_empty
0,—ç—Ñ–∏—Ä–Ω—ã–µ–º–∞—Å–ª–∞.—Ä—Ñ,"–ò–Ω—Ç–µ—Ä–Ω–µ—Ç –º–∞–≥–∞–∑–∏–Ω —ç—Ñ–∏—Ä–Ω—ã—Ö –º–∞—Å–µ–ª, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ—Ç–∏–∫–∏...","[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —ç—Ñ–∏—Ä–Ω—ã—Ö, –º–∞—Å–µ–ª, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ...","[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —ç—Ñ–∏—Ä–Ω—ã—Ö, –º–∞—Å–µ–ª, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ...","[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —ç—Ñ–∏—Ä–Ω—ã–π, –º–∞—Å–ª–æ, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ...","[—ç—Ñ–∏—Ä–Ω—ã–π, –º–∞—Å–ª–æ, –∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, –∏—Ä–∏—Å, –∞—Ä–æ–º...",0
1,—ç—Ä–æ–ª—é–±.—Ä—Ñ,–≠—Ä–æ–ª—é–±.—Ä—Ñ - —Å—Ç—Ä–∞—Å—Ç–Ω—ã–µ –∑–Ω–∞–∫–æ–º—Å—Ç–≤–∞ –¥–ª—è –≤–∑—Ä–æ—Å–ª—ã—Ö ...,"[—ç—Ä–æ–ª—é–±, —Ä—Ñ, —Å—Ç—Ä–∞—Å—Ç–Ω—ã–µ, –∑–Ω–∞–∫–æ–º—Å—Ç–≤–∞, –¥–ª—è, –≤–∑—Ä–æ—Å...","[—ç—Ä–æ–ª—é–±, —Å—Ç—Ä–∞—Å—Ç–Ω—ã–µ, –∑–Ω–∞–∫–æ–º—Å—Ç–≤–∞, –≤–∑—Ä–æ—Å–ª—ã—Ö, —Ä–µ–∞–ª...","[—ç—Ä–æ–ª—é–±, —Å—Ç—Ä–∞—Å—Ç–Ω—ã–π, –∑–Ω–∞–∫–æ–º—Å—Ç–≤–æ, –≤–∑—Ä–æ—Å–ª—ã–π, —Ä–µ–∞–ª...","[—Å—Ç—Ä–∞—Å—Ç–Ω—ã–π, —Å–∞–π—Ç, —Ñ–æ—Ç–æ, —Ä–æ–º–∞–Ω—Ç–∏–∫, —Ä–µ–∞–ª—å–Ω—ã–π, –∂–∏...",0
2,—à–∞—Ä–∏–∫–∏–æ–ø—Ç–æ–º.—Ä—Ñ,"–§–∏—Ç–æ–ª–∞–º–ø—ã, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫–∏, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä—ã –¥–ª—è —Ä–∞—Å...","[—Ñ–∏—Ç–æ–ª–∞–º–ø—ã, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫–∏, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä—ã, –¥–ª—è, ...","[—Ñ–∏—Ç–æ–ª–∞–º–ø—ã, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫–∏, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä—ã, —Ä–∞—Å—Ç–µ...","[—Ñ–∏—Ç–æ–ª–∞–º–ø–∞, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä, —Ä–∞—Å—Ç–µ–Ω–∏...","[—Ñ–∏—Ç–æ–ª–∞–º–ø–∞, –≤–µ—Å—å, –≥—Ä–æ—É–±–æ–∫—Å, –∞–∫–≤–∞—Ä–∏—É–º, —Ä–∞—Å—Ç–µ–Ω–∏–µ...",0
3,—á–∏–Ω–∞–º–æ–±–∏–ª.—Ä—Ñ,–ö–∞—Ç–∞–ª–æ–≥ –∑–∞–ø—á–∞—Å—Ç–µ–π CHERY GEELY LIFAN Brilliance...,"[–∫–∞—Ç–∞–ª–æ–≥, –∑–∞–ø—á–∞—Å—Ç–µ–π, chery, geely, lifan, bril...","[–∫–∞—Ç–∞–ª–æ–≥, –∑–∞–ø—á–∞—Å—Ç–µ–π, chery, geely, lifan, bril...","[–∫–∞—Ç–∞–ª–æ–≥, –∑–∞–ø—á–∞—Å—Ç—å, chery, geely, lifan, brill...","[—á–µ—Ä—è, –∞–≤—Ç–æ–º–æ–±–∏–ª—å, –∫–æ–º–ø–ª–µ–∫—Ç—É—é—â–∏–π, chery, brill...",0
4,—á–µ–∫–∏—Å–ø–±.—Ä—Ñ,–õ—é–±—ã–µ —á–µ–∫–∏ –°–ü–±! - –ì–ª–∞–≤–Ω–∞—èNone,"[–ª—é–±—ã–µ, —á–µ–∫–∏, —Å–ø–±, –≥–ª–∞–≤–Ω–∞—è]","[–ª—é–±—ã–µ, —á–µ–∫–∏, —Å–ø–±, –≥–ª–∞–≤–Ω–∞—è]","[–ª—é–±–æ–π, —á–µ–∫, —Å–ø–±, –≥–ª–∞–≤–Ω—ã–π]","[–≥–ª–∞–≤–Ω—ã–π, —á–µ–∫, —Å–ø–±, –ª—é–±–æ–π]",0


In [35]:
domain_dict = dict(zip(domain_info.domain, domain_info.norm_info_v4))

In [36]:
#–µ—â–µ –Ω–∞–¥–æ —É–±—Ä–∞—Ç—å –≤—Å—è–∫—É—é –º—É—Ç—å —Ç–∏–ø–∞ "–¥–æ–º–µ–Ω –ø—Ä–æ–¥–∞–µ—Ç—Å—è"

## Bag of words for user

–ö–∞–∂–¥–æ–º—É —é–∑–µ—Ä—É –ø—Ä–∏—Å–æ–µ–¥–∏–Ω—è–µ–º –µ–≥–æ —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤ –ø–æ —Å–ø–∏—Å–∫—É –¥–æ–º–µ–Ω–æ–≤

In [37]:
def bag_of_words_for_user(norm_domain_list, domain_dict):
    bag_of_words = []
    for dom in norm_domain_list:
        try:
            bag_of_words.append(domain_dict[dom])
        except: 
            pass
    return sum(bag_of_words, [])

In [38]:
%%time
df['bag_of_words'] = df['norm_domain_list'].apply(lambda x: bag_of_words_for_user(x, domain_dict))

CPU times: user 2min, sys: 10.9 s, total: 2min 11s
Wall time: 2min 13s


In [39]:
df.head()

Unnamed: 0,gender,age,uid,user_json,domain_list,norm_domain_list,gender_cat,age_cat,time_list,visits,time_range,visit_rate,unique_domains_cnt,tot_domains_cnt,avg_visits_per_domain,bag_of_words
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,{'visits': [{'url': 'http://zebra-zoya.ru/2000...,"[b'zebra-zoya.ru', b'news.yandex.ru', b'sotovi...","[zebra-zoya.ru, news.yandex.ru, sotovik.ru, ne...",0,0,"[1419688144068, 1426666298001, 1426666298000, ...",5,6978153933,1395631000.0,3,5,1.666667,"[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —Ä–µ–±–µ–Ω–æ–∫, –∫—Ä–µ–∞—Ç–∏–≤–Ω—ã–π, –º–Ω–æ–≥–æ..."
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,{'visits': [{'url': 'http://sweetrading.ru/?p=...,"[b'sweetrading.ru', b'sweetrading.ru', b'sweet...","[sweetrading.ru, sweetrading.ru, sweetrading.r...",1,1,"[1419717886224, 1419717884437, 1419717816375, ...",102,2266588550,22221460.0,26,102,3.923077,"[–≤–µ—Å—å, —Ç–æ—Ä–≥–∏, —Ç—Ä–µ–π–¥–µ—Ä, forex—Ç–æ—Ä–≥–∏, –ø–æ–ª–µ–∑–Ω—ã–π, —Ñ..."
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,{'visits': [{'url': 'http://ru.oriflame.com/pr...,"[b'ru.oriflame.com', b'ru.oriflame.com', b'ru....","[ru.oriflame.com, ru.oriflame.com, ru.oriflame...",0,1,"[1418840296062, 1418667832733, 1418667717223, ...",44,8284914026,188293500.0,6,44,7.333333,"[cosmetics, oriflame, cosmetics, oriflame, cos..."
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,{'visits': [{'url': 'http://translate-tattoo.r...,"[b'translate-tattoo.ru', b'nadietah.ru', b'1ob...","[translate-tattoo.ru, nadietah.ru, 1obl.ru, 1o...",0,1,"[1418217864467, 1418124701342, 1417866007812, ...",14,693126229,49509020.0,3,14,4.666667,"[–ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π, –ø–µ—Ä–µ–≤–æ–¥, –∂–∏–≤–æ–π, –ø–µ—Ä–µ–≤–æ–¥—Å–µ—Ä–≤..."
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,{'visits': [{'url': 'https://mail.rambler.ru/#...,"[b'mail.rambler.ru', b'news.rambler.ru', b'mai...","[mail.rambler.ru, news.rambler.ru, mail.ramble...",1,4,"[1427272415001, 1427272415000, 1427271294001, ...",212,613917001,2895835.0,25,212,8.48,"[–Ω–∞–¥–µ–∂–Ω—ã–π, –≤–µ—Å—å, —Å–ø–∞–º, –Ω–µ—Å–∫–æ–ª—å–∫–æ, –ø–æ—á—Ç–æ–≤—ã–π, –±–µ..."


## Get features from Bag Of Words

–ë–µ—Ä–µ–º —Ç–æ–ø 3000 —Å–ª–æ–≤ –∏–∑ –¥–∞—Ç–∞—Ñ—Ä–µ–π–º–∞ —Å —é–∑–µ—Ä–∞–º–∏ df (–¥–∞—Ç–∞—Ñ—Ä–µ–π–º, –≤ –∫–æ—Ç–æ—Ä–æ–º 1 —Å—Ç—Ä–æ–∫–∞ = 1 —é–∑–µ—Ä), —Ç.–∫. –µ—Å–ª–∏ –±—Ä–∞—Ç—å –∏–∑ domain_info (–¥–∞—Ç–∞—Ñ—Ä–µ–º, –≤ –∫–æ—Ç–æ—Ä–æ–º 1 —Å—Ç—Ä–æ–∫–∞ = 1 –¥–æ–º–µ–Ω), —Ç–æ –ø–æ–ª—É—á–∞–µ—Ç—Å—è –≤—Å—è–∫–∏–π —à–ª–∞–∫.

In [40]:
%%time
df['bag_of_words_text'] = df['bag_of_words'].apply(lambda x: str(' '.join(x)).strip())

CPU times: user 4.08 s, sys: 1.24 s, total: 5.33 s
Wall time: 6.45 s


In [41]:
#–º–æ–∂–Ω–æ —Å—é–¥–∞ –µ—â–µ –∫–∞–∫–∏–µ-–Ω–∏–±—É–¥—å —Å–ª–æ–≤–∞ –¥–æ–±–∞–≤–∏—Ç—å, –∫–æ—Ç–æ—Ä—ã–µ –ø–æ–∫–∞–∂—É—Ç—Å—è –ª–∏—à–Ω–∏–º–∏
bad_words = ['–¥–æ–º–µ–Ω', '—É–∫—Ä', '—Ä–µ—Ä', '—Ä–µ—Å', '—Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏—è', '–∏—Å—Ç–µ–∫–∞—Ç—å', 'www', 
             'domain', '–º–Ω–æ–≥–æ–µ', 'net', '–æ–æ–æ', '–Ω–∞—à', '–æ–ø—Ç', 'website', '—Å–∞–π—Ç', 'com',
             '–≤–∞—à', '–∫–æ—Ç–æ—Ä—ã–π', '—ç—Ç–æ', '—Å–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–π', '—Ä–∞–∑–ª–∏—á–Ω—ã–π', '—Ä–∞–∑–Ω—ã–π', '–ª—é–±–æ–π', '—Ö–æ—Ç–µ—Ç—å','–≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å',
             '—Å–æ–±–∏—Ä–∞—Ç—å']

stopWords = stopwords.words(['russian', 'english'])
stopWords.extend(bad_words) 

In [42]:
%%time

#–ø–æ–ª—É—á–∏–º —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥–µ–º –¥–∞–ª–µ–µ —Ä–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞—Ç—å (—á—Ç–æ–±—ã –±–µ–∑ –≤—Å—è–∫–æ–≥–æ –º—É—Å–æ—Ä–∞)
count_vect_user = CountVectorizer(max_features=3000, stop_words=stopWords)
temp_matrix_user = count_vect_user.fit_transform(df['bag_of_words_text'].values) 
                                             
matrix_counts = temp_matrix_user.toarray()  

CPU times: user 1min 18s, sys: 4.88 s, total: 1min 22s
Wall time: 1min 27s


In [43]:
#—Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –±—É–¥–µ–º –≤ bigARTM —É—á–∏—Ç—ã–≤–∞—Ç—å
good_words = [x[0] for x in sorted(count_vect_user.vocabulary_.items(), key=lambda x: x[1])] 

–û–±—É—á–∞–µ–º CountVectorizer –¥–ª—è –¥–∞—Ç–∞—Ñ—Ä–µ–π–º–∞ domain_info, —á—Ç–æ–±—ã –Ω–∞ –≤—Ö–æ–¥ –º–æ–¥–µ–ª–∏ –ø–æ–¥–∞—Ç—å.

In [44]:
%%time
norm_info = domain_info.norm_info_v3.tolist()
domain_info['good_words'] = [list(filter(lambda x: x in good_words, sublist)) for sublist in norm_info]

CPU times: user 1min 6s, sys: 7.87 s, total: 1min 14s
Wall time: 1min 17s


In [45]:
domain_info.head(3)

Unnamed: 0,domain,info,norm_info,norm_info_v2,norm_info_v3,norm_info_v4,is_empty,good_words
0,—ç—Ñ–∏—Ä–Ω—ã–µ–º–∞—Å–ª–∞.—Ä—Ñ,"–ò–Ω—Ç–µ—Ä–Ω–µ—Ç –º–∞–≥–∞–∑–∏–Ω —ç—Ñ–∏—Ä–Ω—ã—Ö –º–∞—Å–µ–ª, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ—Ç–∏–∫–∏...","[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —ç—Ñ–∏—Ä–Ω—ã—Ö, –º–∞—Å–µ–ª, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ...","[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —ç—Ñ–∏—Ä–Ω—ã—Ö, –º–∞—Å–µ–ª, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ...","[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —ç—Ñ–∏—Ä–Ω—ã–π, –º–∞—Å–ª–æ, –∞—Ä–æ–º–∞–∫–æ—Å–º–µ...","[—ç—Ñ–∏—Ä–Ω—ã–π, –º–∞—Å–ª–æ, –∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, –∏—Ä–∏—Å, –∞—Ä–æ–º...",0,"[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, –º–∞—Å–ª–æ]"
1,—ç—Ä–æ–ª—é–±.—Ä—Ñ,–≠—Ä–æ–ª—é–±.—Ä—Ñ - —Å—Ç—Ä–∞—Å—Ç–Ω—ã–µ –∑–Ω–∞–∫–æ–º—Å—Ç–≤–∞ –¥–ª—è –≤–∑—Ä–æ—Å–ª—ã—Ö ...,"[—ç—Ä–æ–ª—é–±, —Ä—Ñ, —Å—Ç—Ä–∞—Å—Ç–Ω—ã–µ, –∑–Ω–∞–∫–æ–º—Å—Ç–≤–∞, –¥–ª—è, –≤–∑—Ä–æ—Å...","[—ç—Ä–æ–ª—é–±, —Å—Ç—Ä–∞—Å—Ç–Ω—ã–µ, –∑–Ω–∞–∫–æ–º—Å—Ç–≤–∞, –≤–∑—Ä–æ—Å–ª—ã—Ö, —Ä–µ–∞–ª...","[—ç—Ä–æ–ª—é–±, —Å—Ç—Ä–∞—Å—Ç–Ω—ã–π, –∑–Ω–∞–∫–æ–º—Å—Ç–≤–æ, –≤–∑—Ä–æ—Å–ª—ã–π, —Ä–µ–∞–ª...","[—Å—Ç—Ä–∞—Å—Ç–Ω—ã–π, —Å–∞–π—Ç, —Ñ–æ—Ç–æ, —Ä–æ–º–∞–Ω—Ç–∏–∫, —Ä–µ–∞–ª—å–Ω—ã–π, –∂–∏...",0,"[–∑–Ω–∞–∫–æ–º—Å—Ç–≤–æ, –≤–∑—Ä–æ—Å–ª—ã–π, —Ä–µ–∞–ª—å–Ω—ã–π, —Ñ–æ—Ç–æ, —Å–µ—Ä—å–µ–∑–Ω..."
2,—à–∞—Ä–∏–∫–∏–æ–ø—Ç–æ–º.—Ä—Ñ,"–§–∏—Ç–æ–ª–∞–º–ø—ã, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫–∏, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä—ã –¥–ª—è —Ä–∞—Å...","[—Ñ–∏—Ç–æ–ª–∞–º–ø—ã, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫–∏, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä—ã, –¥–ª—è, ...","[—Ñ–∏—Ç–æ–ª–∞–º–ø—ã, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫–∏, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä—ã, —Ä–∞—Å—Ç–µ...","[—Ñ–∏—Ç–æ–ª–∞–º–ø–∞, —Ñ–∏—Ç–æ—Å–≤–µ—Ç–∏–ª—å–Ω–∏–∫, –ø—Ä–æ–∂–µ–∫—Ç–æ—Ä, —Ä–∞—Å—Ç–µ–Ω–∏...","[—Ñ–∏—Ç–æ–ª–∞–º–ø–∞, –≤–µ—Å—å, –≥—Ä–æ—É–±–æ–∫—Å, –∞–∫–≤–∞—Ä–∏—É–º, —Ä–∞—Å—Ç–µ–Ω–∏–µ...",0,"[—Ä–∞—Å—Ç–µ–Ω–∏–µ, —Ü–≤–µ—Ç–æ–∫, —Å–∏—Å—Ç–µ–º–∞, –æ—Å–≤–µ—â–µ–Ω–∏–µ, –∞–∫–≤–∞—Ä–∏—É..."


In [46]:
%%time
domain_info['good_words_text'] = domain_info['good_words'].apply(lambda x: str(' '.join(x)).strip())

CPU times: user 122 ms, sys: 26.9 ms, total: 149 ms
Wall time: 154 ms


In [47]:
cv_dom = CountVectorizer()
temp_matrix_dom = cv_dom.fit_transform(domain_info['good_words_text'].values)
n_wd = temp_matrix_dom.T
vocabulary = cv_dom.get_feature_names()

In [48]:
%%time

cv_words = [x[0] for x in sorted(cv_dom.vocabulary_.items(), key=lambda x: x[1])] 
cv_dom = pd.DataFrame(temp_matrix_dom.toarray() , columns=cv_words).head()
cv_dom['id_domain'] = domain_info['domain'] 

CPU times: user 2.56 s, sys: 2.11 s, total: 4.68 s
Wall time: 5.48 s


### BigARTM

https://github.com/bigartm/bigartm/blob/master/README.md

In [49]:
%%time
bv = artm.BatchVectorizer(data_format='bow_n_wd',
                          n_wd=n_wd,
                          vocabulary=vocabulary)

CPU times: user 1min 8s, sys: 880 ms, total: 1min 9s
Wall time: 1min 11s


In [50]:
%%time

# –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–º
T = 30   
# Learn simple LDA model (or you can use advanced artm.ARTM)
model = artm.LDA(num_topics=T, dictionary=bv.dictionary, cache_theta = True)
model.fit_offline(bv, num_collection_passes=20)

CPU times: user 1min 6s, sys: 4.84 s, total: 1min 11s
Wall time: 21.8 s


In [51]:
model.get_top_tokens()

[['–ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π',
  '–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ',
  '–∏–º—è',
  '—Ö–æ—Å—Ç–∏–Ω–≥',
  '—Å–µ—Ä–≤–µ—Ä',
  '—Å–∫–∏–¥–∫–∞',
  '–≤—ã—Å–æ–∫–∏–π',
  '–≥–æ—Ç–æ–≤—ã–π',
  '—Ä–µ—à–µ–Ω–∏–µ',
  '—Ä–µ—Ñ–µ—Ä–∞—Ç'],
 ['—Ö–æ—Ä–æ—à–∏–π',
  '—É–∫—Ä–∞–∏–Ω–∞',
  '–∫–ª—É–±',
  '–∫–∏–Ω–æ',
  '–≥—Ä—É–ø–ø–∞',
  '–Ω–æ–≤–∏–Ω–∫–∞',
  'club',
  '–ª—é–±–∏—Ç–µ–ª—å',
  '—Ö–∞—Ä—å–∫–æ–≤',
  '–∫–∞–º–µ—Ä–∞'],
 ['—Ä–µ–±–µ–Ω–æ–∫',
  '–¥–µ—Ç—Å–∫–∏–π',
  '–∑–¥–æ—Ä–æ–≤—å–µ',
  '–∂–∏–∑–Ω—å',
  '–∫—Ä–∞—Å–æ—Ç–∞',
  '—á–µ–ª–æ–≤–µ–∫',
  '–ø–∏—Ç–∞–Ω–∏–µ',
  '—Ä–∞–∑–≤–∏—Ç–∏–µ',
  '–∂–µ–Ω—Å–∫–∏–π',
  '–¥–∏–µ—Ç–∞'],
 ['—à–∫–æ–ª–∞',
  '—É—Ä–æ–∫',
  '–æ–±—É—á–µ–Ω–∏–µ',
  '—é—Ä–∏–¥–∏—á–µ—Å–∫–∏–π',
  '–∫—É—Ä—Å—ã',
  '—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
  'art',
  '—Å–º—Å',
  '–∫–æ–º–ø–ª–µ–∫—Å',
  '–¥–∏–∑–∞–π–Ω'],
 ['–æ—Ç–¥—ã—Ö',
  '—Ç—É—Ä',
  '–º–æ–¥–∞',
  '–æ—Ç–µ–ª—å',
  'sie',
  '–ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–µ',
  '—Ç—É—Ä–∏–∑–º',
  '–∫—Ä—ã–º',
  '–≥–æ—Å—Ç–∏–Ω–∏—Ü–∞',
  '–ø–æ–∑–¥—Ä–∞–≤–ª–µ–Ω–∏–µ'],
 ['–æ–±–æ—Ä—É–¥–æ–≤–∞–Ω–∏–µ',
  '–æ—Ñ–∏—Ü–∏–∞

In [52]:
#—Å–æ–æ—Ç–≤–µ—Ç—Å–≤–∏–µ —Ç–µ–º –∏ –∫–ª—é—á–µ–≤—ã—Ö —Å–ª–æ–≤ (–¥–ª—è –∏–Ω—Ç–µ—Ä–ø—Ä–µ—Ç–∞—Ü–∏–∏ —Ç–µ–º)
topic_words = dict(zip(model.get_theta().index, model.get_top_tokens()))

In [53]:
#–º–∞—Ç—Ä–∏—Ü–∞, –≤ –∫–æ—Ç–æ—Ä–æ–π –ø–æ —Å—Ç—Ä–æ–∫–∞–º - —Ç–µ–º—ã, –ø–æ —Å—Ç–æ–ª–±—Ü–∞–º - –≤—Ö–æ–¥–Ω—ã–µ —Ç–µ–∫—Å—Ç—ã (—Ç–æ –µ—Å—Ç—å —É –Ω–∞—Å 1 —Å—Ç–æ–ª–±–µ—Ü = 1 –¥–æ–º–µ–Ω)
model.get_theta().head()

Unnamed: 0,19000,19001,19002,19003,19004,19005,19006,19007,19008,19009,...,25990,25991,25992,25993,25994,25995,25996,25997,25998,25999
topic_0,0.000654,0.000551,0.000885,0.306307,0.000323,0.111617,0.421976,0.000449,0.000752,0.003031,...,0.004348,0.001901,0.000426,0.004348,0.000354,0.033333,0.004348,0.007693,0.000753,0.007693
topic_1,0.132333,0.00016,0.000885,0.003031,0.000322,0.00039,0.000518,0.273757,0.000799,0.003031,...,0.004348,0.001892,0.039464,0.439131,0.107181,0.033333,0.004348,0.007693,0.000783,0.007693
topic_2,0.000661,0.000156,0.000885,0.003333,0.217653,0.000366,0.162358,0.000449,0.000812,0.00303,...,0.004348,0.001887,0.712535,0.004348,0.001097,0.033333,0.004348,0.007692,0.000785,0.007693
topic_3,0.000654,0.000147,0.184703,0.003061,0.000333,0.000368,0.25264,0.000451,0.000755,0.003253,...,0.004348,0.001887,0.083464,0.004348,0.000353,0.033333,0.004351,0.007693,0.000766,0.007693
topic_4,0.000654,0.000138,0.000885,0.003031,0.00032,0.000366,0.000554,0.000449,0.000757,0.003031,...,0.004348,0.001887,0.000412,0.004348,0.000353,0.033333,0.004348,0.007692,0.000876,0.007693


In [54]:
theta = model.get_theta().T
theta['domain'] = domain_info['domain'].values
theta.reset_index(drop=True, inplace=True)

In [55]:
theta.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,domain
0,0.000654,0.132333,0.000661,0.000654,0.000654,0.000661,0.066229,0.000654,0.000654,0.000666,...,0.000654,0.146375,0.000663,0.000657,0.001022,0.000665,0.000654,0.133742,0.001131,—ç—Ñ–∏—Ä–Ω—ã–µ–º–∞—Å–ª–∞.—Ä—Ñ
1,0.000551,0.00016,0.000156,0.000147,0.000138,0.000214,0.147771,0.043515,0.000142,0.000159,...,0.000162,0.000265,0.000178,0.000167,0.08554,0.075686,0.000136,0.000152,0.088793,—ç—Ä–æ–ª—é–±.—Ä—Ñ
2,0.000885,0.000885,0.000885,0.184703,0.000885,0.303151,0.000885,0.000885,0.000959,0.000946,...,0.000885,0.178284,0.000885,0.000885,0.000904,0.310609,0.000885,0.000885,0.000885,—à–∞—Ä–∏–∫–∏–æ–ø—Ç–æ–º.—Ä—Ñ
3,0.306307,0.003031,0.003333,0.003061,0.003031,0.003031,0.003031,0.004084,0.003031,0.300533,...,0.003031,0.003031,0.003031,0.003032,0.003031,0.003038,0.003031,0.003031,0.003031,—á–∏–Ω–∞–º–æ–±–∏–ª.—Ä—Ñ
4,0.000323,0.000322,0.217653,0.000333,0.00032,0.00032,0.00032,0.000378,0.000333,0.270602,...,0.00032,0.00044,0.000322,0.000324,0.00032,0.00032,0.00032,0.255499,0.000322,—á–µ–∫–∏—Å–ø–±.—Ä—Ñ


–¢–µ–ø–µ—Ä—å —Å–æ–ø–æ—Å—Ç–∞–≤–∏–º –∫–∞–∂–¥–æ–º—É –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é –µ–≥–æ —Å–ø–∏—Å–æ–∫ –¥–æ–º–µ–Ω–æ–≤.
–ö–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç—ã –ø—Ä–∏ —Ç–µ–º–∞—Ö —Å—É–º–º–∏—Ä—É–µ–º.

In [56]:
topic_cols = list(topic_words.keys())

In [57]:
def user_interest(dom_list, theta):
    topic_vector = [0]*(theta.shape[1]-1)
    for domain in dom_list:
        try:
            topic_vector += theta[theta['domain'] == domain].values[0][:-1]
        except:
            pass
    return topic_vector

In [58]:
df.head()

Unnamed: 0,gender,age,uid,user_json,domain_list,norm_domain_list,gender_cat,age_cat,time_list,visits,time_range,visit_rate,unique_domains_cnt,tot_domains_cnt,avg_visits_per_domain,bag_of_words,bag_of_words_text
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,{'visits': [{'url': 'http://zebra-zoya.ru/2000...,"[b'zebra-zoya.ru', b'news.yandex.ru', b'sotovi...","[zebra-zoya.ru, news.yandex.ru, sotovik.ru, ne...",0,0,"[1419688144068, 1426666298001, 1426666298000, ...",5,6978153933,1395631000.0,3,5,1.666667,"[–∏–Ω—Ç–µ—Ä–Ω–µ—Ç, –º–∞–≥–∞–∑–∏–Ω, —Ä–µ–±–µ–Ω–æ–∫, –∫—Ä–µ–∞—Ç–∏–≤–Ω—ã–π, –º–Ω–æ–≥–æ...",–∏–Ω—Ç–µ—Ä–Ω–µ—Ç –º–∞–≥–∞–∑–∏–Ω —Ä–µ–±–µ–Ω–æ–∫ –∫—Ä–µ–∞—Ç–∏–≤–Ω—ã–π –º–Ω–æ–≥–æ–µ –¥–æ–º...
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,{'visits': [{'url': 'http://sweetrading.ru/?p=...,"[b'sweetrading.ru', b'sweetrading.ru', b'sweet...","[sweetrading.ru, sweetrading.ru, sweetrading.r...",1,1,"[1419717886224, 1419717884437, 1419717816375, ...",102,2266588550,22221460.0,26,102,3.923077,"[–≤–µ—Å—å, —Ç–æ—Ä–≥–∏, —Ç—Ä–µ–π–¥–µ—Ä, forex—Ç–æ—Ä–≥–∏, –ø–æ–ª–µ–∑–Ω—ã–π, —Ñ...",–≤–µ—Å—å —Ç–æ—Ä–≥–∏ —Ç—Ä–µ–π–¥–µ—Ä forex—Ç–æ—Ä–≥–∏ –ø–æ–ª–µ–∑–Ω—ã–π —Ñ–æ—Ä–µ–∫—Å ...
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,{'visits': [{'url': 'http://ru.oriflame.com/pr...,"[b'ru.oriflame.com', b'ru.oriflame.com', b'ru....","[ru.oriflame.com, ru.oriflame.com, ru.oriflame...",0,1,"[1418840296062, 1418667832733, 1418667717223, ...",44,8284914026,188293500.0,6,44,7.333333,"[cosmetics, oriflame, cosmetics, oriflame, cos...",cosmetics oriflame cosmetics oriflame cosmetic...
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,{'visits': [{'url': 'http://translate-tattoo.r...,"[b'translate-tattoo.ru', b'nadietah.ru', b'1ob...","[translate-tattoo.ru, nadietah.ru, 1obl.ru, 1o...",0,1,"[1418217864467, 1418124701342, 1417866007812, ...",14,693126229,49509020.0,3,14,4.666667,"[–ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π, –ø–µ—Ä–µ–≤–æ–¥, –∂–∏–≤–æ–π, –ø–µ—Ä–µ–≤–æ–¥—Å–µ—Ä–≤...",–ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π –ø–µ—Ä–µ–≤–æ–¥ –∂–∏–≤–æ–π –ø–µ—Ä–µ–≤–æ–¥—Å–µ—Ä–≤–∏—Å –ª...
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,{'visits': [{'url': 'https://mail.rambler.ru/#...,"[b'mail.rambler.ru', b'news.rambler.ru', b'mai...","[mail.rambler.ru, news.rambler.ru, mail.ramble...",1,4,"[1427272415001, 1427272415000, 1427271294001, ...",212,613917001,2895835.0,25,212,8.48,"[–Ω–∞–¥–µ–∂–Ω—ã–π, –≤–µ—Å—å, —Å–ø–∞–º, –Ω–µ—Å–∫–æ–ª—å–∫–æ, –ø–æ—á—Ç–æ–≤—ã–π, –±–µ...",–Ω–∞–¥–µ–∂–Ω—ã–π –≤–µ—Å—å —Å–ø–∞–º –Ω–µ—Å–∫–æ–ª—å–∫–æ –ø–æ—á—Ç–æ–≤—ã–π –±–µ—Å–∫–æ–Ω–µ—á...


In [59]:
df_cols = ['uid', 'gender_cat', 'age_cat', 'time_list',
           'visits', 'time_range', 'visit_rate', 'unique_domains_cnt', 
           'tot_domains_cnt', 'avg_visits_per_domain']

In [60]:
%%time

tmp = df.norm_domain_list.apply(pd.Series)\
.merge(df[['uid','norm_domain_list']], left_index = True, right_index = True)\
.drop(['norm_domain_list'], axis = 1)\
.melt(id_vars = ['uid'], value_name = 'domain')\
.drop('variable', axis = 1).dropna().reset_index(drop=True)

CPU times: user 1min 20s, sys: 25.7 s, total: 1min 46s
Wall time: 1min 50s


In [61]:
%%time
tmp2 = tmp.set_index('domain').join(theta.set_index('domain')).fillna(0).groupby(['uid']).sum().reset_index()

CPU times: user 12.5 s, sys: 3.16 s, total: 15.7 s
Wall time: 15.5 s


In [62]:
%%time
df_upd = df.set_index('uid').join(tmp2.set_index('uid')).reset_index()

CPU times: user 323 ms, sys: 466 ms, total: 790 ms
Wall time: 713 ms


In [63]:
#–ø–æ–ª—É—á–∏–ª–∏ —Ñ–∏—á–∏ –∏–∑ BigARTM
df_upd.head(2)

Unnamed: 0,uid,gender,age,user_json,domain_list,norm_domain_list,gender_cat,age_cat,time_list,visits,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,F,18-24,{'visits': [{'url': 'http://zebra-zoya.ru/2000...,"[b'zebra-zoya.ru', b'news.yandex.ru', b'sotovi...","[zebra-zoya.ru, news.yandex.ru, sotovik.ru, ne...",0,0,"[1419688144068, 1426666298001, 1426666298000, ...",5,...,0.720798,0.012963,0.019755,0.012793,0.379969,0.012793,0.012793,0.012793,0.013258,0.012792
1,d502331d-621e-4721-ada2-5d30b2c3801f,M,25-34,{'visits': [{'url': 'http://sweetrading.ru/?p=...,"[b'sweetrading.ru', b'sweetrading.ru', b'sweet...","[sweetrading.ru, sweetrading.ru, sweetrading.r...",1,1,"[1419717886224, 1419717884437, 1419717816375, ...",102,...,0.824418,1.375952,16.978559,0.410539,0.410379,0.662424,6.530401,0.411401,0.410369,10.322586


In [65]:
df_upd.to_pickle('df_features.pkl', compression='bz2')

## Train

In [60]:
new_df_cols = ['visits', 'time_range', 'visit_rate', 'unique_domains_cnt', 
           'tot_domains_cnt', 'avg_visits_per_domain']

In [61]:
features = list(new_df_cols) + list(theta.columns[:-1])
target = ['gender_cat']

In [62]:
def identity_tokenizer(text):
    return text

tfidf_extractor = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features = 3000)    
sparse_tfidf_domain_list = tfidf_extractor.fit_transform(df_upd['domain_list'])

In [63]:
mask_test = df_upd['gender_cat']==-1

In [64]:
tf_idf_matrix = sparse_tfidf_domain_list.todense()[~mask_test]
tf_idf_matrix = pd.DataFrame(tf_idf_matrix)

In [65]:
df_matrix = df_upd[~mask_test][features]

In [66]:
tf_idf_matrix.shape

(36138, 3000)

In [67]:
df_matrix.shape

(36138, 36)

In [68]:
result = df_matrix.join(tf_idf_matrix)

In [69]:
X = result.values
y = df_upd[~mask_test]['gender_cat'].values.ravel()

In [70]:
X.shape, y.shape

((36138, 3036), (36138,))

In [73]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((25296, 3036), (10842, 3036), (25296,), (10842,))

In [193]:
%%time
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_comb = 20

# specify parameters via map
params = {'n_estimators': [200, 300, 400],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [1,3,5,10],
    'min_child_weight': 1,
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 2, 5]}

CPU times: user 63 ¬µs, sys: 151 ¬µs, total: 214 ¬µs
Wall time: 245 ¬µs


In [None]:
%%time
xgb = XGBClassifier(learning_rate=0.1, objective='binary:logistic', random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(xgb, param_distributions=params, 
                                   n_iter=param_comb, scoring='roc_auc', 
                                   cv=skf, verbose=3, random_state=42 )

random_search.fit(X_train, y_train)

## LightGBM

In [74]:
from scipy import sparse
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [75]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [76]:
lgb_train = lgb.Dataset(X, y.ravel())

In [None]:
%%time

res = lgb.cv(params,
             lgb_train,
             num_boost_round=2000,
             nfold=5,
             stratified=True,
             shuffle=True, 
             early_stopping_rounds=100,
             verbose_eval=1)

In [None]:
num_boost = np.argmax(res['auc-mean'])
num_boost, res['auc-mean'][num_boost], res['auc-stdv'][num_boost]