In [6]:
import datetime, re, os, sys, json, pickle 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#from bs4 import BeautifulSoup as bs

#import requests
#import lxml.html

from scipy import sparse
#import lightgbm as lgb

from functions import dict_gender_to_gendercategory
from functions import dict_age_to_agecategory
from functions import dict_gendercategory_to_gender
from functions import dict_agecategory_to_age

from functions import load_user_json
from functions import url2domain
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter

import pymystem3
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV

In [7]:
#https://bigartm.readthedocs.io/en/stable/installation/linux.html
import artm

In [8]:
import gc
gc.collect()

7

## read data + basic features

In [9]:
%%time

file_path = '../data/share/project01/gender_age_dataset.txt'
df = pd.read_csv(file_path, sep='\t')

CPU times: user 8.28 s, sys: 1.59 s, total: 9.88 s
Wall time: 10.7 s


In [10]:
%%time

df['user_json'] = load_user_json(df['user_json'])
df['domain_list'] = df['user_json'].map(lambda x: [url2domain(visit['url']) for visit in x['visits']])
df['norm_domain_list'] = df['domain_list'].apply(lambda x: [kek[2:-1] for kek in x])

CPU times: user 1min 39s, sys: 3.63 s, total: 1min 43s
Wall time: 1min 46s


In [11]:
%%time

df['gender_cat'] = df['gender'].map(dict_gender_to_gendercategory)
df['age_cat'] = df['age'].map(dict_age_to_agecategory)

df['time_list'] = df['user_json'].map(lambda x: [visit['timestamp'] for visit in x['visits']])

df['visits'] = df['time_list'].apply(len)
df['time_range'] = df['time_list'].apply(np.ptp)
df['visit_rate'] = df['time_range']/df['visits']

df['unique_domains_cnt'] = df['domain_list'].apply(lambda x: len(set(x)))
df['tot_domains_cnt'] = df['domain_list'].apply(lambda x: len(x))
df['avg_visits_per_domain'] = df['tot_domains_cnt']/df['unique_domains_cnt']

CPU times: user 2.89 s, sys: 364 ms, total: 3.25 s
Wall time: 3.42 s


## from json

#### read json

In [12]:
import json

In [13]:
!pwd

/Users/antonina.goryacheva/Desktop/content_bigdata10_proj1_kek/notebooks


In [14]:
%%time
with open('../data/titles-aiohttp.json', 'r') as f:
    results = json.load(f)

CPU times: user 1.56 s, sys: 230 ms, total: 1.79 s
Wall time: 2.29 s


In [15]:
results

{'юрист-михаил-бабин.рф': {'domain': 'юрист-михаил-бабин.рф',
  'url': 'http://юрист-михаил-бабин.рф',
  'title': None,
  'keywords': None,
  'error': 'Cannot connect to host юрист-михаил-бабин.рф:80 ssl:None [nodename nor servname provided, or not known]'},
 'эфирныемасла.рф': {'domain': 'эфирныемасла.рф',
  'url': 'http://эфирныемасла.рф',
  'title': 'Интернет магазин эфирных масел, аромакосметики и ароматерапии «ЦА Ирис».',
  'keywords': None},
 'эролюб.рф': {'domain': 'эролюб.рф',
  'url': 'http://эролюб.рф',
  'title': 'Эролюб.рф - страстные знакомства для взрослых с реальными фото. | Сайт для серьёзных отношений, любви и романтики для встреч и личной жизни.',
  'keywords': None},
 'шарикиоптом.рф': {'domain': 'шарикиоптом.рф',
  'url': 'http://шарикиоптом.рф',
  'title': 'Фитолампы, фитосветильники, прожекторы для растений, рассады, цветов. Системы освещения тепличных хозяйств. Лампы для гроубоксов, аквариумов. Низкие цены, курьерская доставка по всей стране.',
  'keywords': None

In [16]:
len(results)

75608

## Чистим полученные данные

##### получила через вот такую команду

cat content_bigdata10_proj1_kek/data/titles-aiohttp.json | jq '.[] | .title' | sort | uniq -c | sort -rn | head 

In [17]:
bad_titles = ['403 Forbidden',
'404 - Ladefehler der Webseite',
'404 - Unable to load website',
'Loading...',
'Главная',
'410 - Website deleted',
'404 Not Found',
'— @дневники: асоциальная сеть',
'404 - Impossible de télécharger le site',
'Главная страница',
'404 - Не удалось загрузить сайт',
'Just a moment...',
'Персональный сайт - Главная страница',
'Welcome!',
'502 Bad Gateway',
'503 Service Temporarily Unavailable',
'Этот домен припаркован компанией Timeweb',
'Домен не прилинкован ни к одной из директорий на сервере!',
'Attention Required! | Cloudflare',
'Konto ist gesperrt',
'Apache HTTP Server Test Page powered by CentOS',
'Ошибка при открытии страницы',
'Welcome to nginx!',
'Index of /',
'Срок регистрации домена закончился. Купить домен можно тут.',
'Ресурс заблокирован - Resource is blocked',
'Работа сайта временно приостановлена',
'В ближайшие сутки работа сайта восстановится.',
'Персональный сайт - Главная',
'[.m] masterhost - профессиональный хостинг сайта',
'Waiting for the redirectiron...',
'job.ru переехал на hh.ru',
'Account disabled by server administrator',
'Этот сайт заблокирован',
'Ведутся технические работы',
'Account Suspended',
'Error 404 (Not Found)!!1',
'404 - Not Found',
'Этот домен продаётся',
'500 Internal Server Error',
'Access Denied',
'Истёк срок регистрации домена\xa0zubovskaya-banya.ru']

In [18]:
%%time
clean_results = {}
for result in results:
    if (results[result]['title'] not in bad_titles) and \
    (results[result]['title'] is not None or results[result]['keywords'] is not None):
        clean_results[result] = results[result]

CPU times: user 130 ms, sys: 3.76 ms, total: 134 ms
Wall time: 136 ms


In [19]:
len(clean_results)

65328

In [20]:
# titles = []
# keywords = []
# descriptions = []
# for domain in clean_results:
#     titles.append(clean_results[domain].get('title', ''))
#     keywords.append(clean_results[domain].get('keywords', ''))
#     descriptions.append(clean_results[domain].get('description', ''))

In [21]:
#counts_keywords = Counter(keywords)
#top30_keywords =  sorted(list(counts_keywords.items()), key=lambda tup: tup[1], reverse=True)[:30]

#### train texts

In [22]:
%%time

dom_info = []
for domain in clean_results:
    info = str(clean_results[domain].get('title', '')) \
         + str(clean_results[domain].get('keywords', '')) \
         + str(clean_results[domain].get('description', ''))
    dom_info.append([domain, info])

CPU times: user 1.01 s, sys: 309 ms, total: 1.32 s
Wall time: 1.51 s


In [23]:
len(dom_info)

65328

Делаю предобработку по частям, ибо постоянно ловила broken pipe. Вообще все в одну функцию запихнуть можно.

In [24]:
def text_to_wordlist(text):
    text = re.sub('[^a-zA-Zа-яА-ЯёЁ]', ' ', text)
    text = re.sub('none', ' ', text.lower())
    words = text.lower().strip().split()
    return words

In [25]:
def clean(words, stopWords):
    new_words = [word for word in words if (word not in stopWords) and len(word) > 2]
    return new_words

In [26]:
mystem = pymystem3.Mystem()
wordnet_lemmatizer = WordNetLemmatizer()
def clean_v2(words):    
    new_words = [mystem.lemmatize(x)[0] for x in words]
    new_new_words = [wordnet_lemmatizer.lemmatize(x, pos=wordnet.VERB) for x in new_words]
    return new_new_words

In [27]:
domain_info = pd.DataFrame(data=dom_info, columns=['domain', 'info'])

In [28]:
#берем только буквы + split по пробелам
domain_info['norm_info'] = domain_info['info'].apply(lambda x: text_to_wordlist(x))

In [29]:
%%time
#убираем стоп-слова + убираем слова длинной <= 2
stopWords = stopwords.words(['russian', 'english'])
domain_info['norm_info_v2'] = domain_info['norm_info'].apply(lambda x: clean(x, stopWords))

CPU times: user 9.37 s, sys: 91 ms, total: 9.46 s
Wall time: 9.9 s


In [30]:
%%time
#лемматизация
domain_info['norm_info_v3'] = domain_info['norm_info_v2'].apply(lambda x: clean_v2(x))

CPU times: user 1min 11s, sys: 22.2 s, total: 1min 33s
Wall time: 3min 25s


In [31]:
%%time
#берем уникальные слова для домена
domain_info['norm_info_v4'] = domain_info['norm_info_v3'].apply(lambda x: list(set(x)))

CPU times: user 314 ms, sys: 28.3 ms, total: 342 ms
Wall time: 364 ms


In [32]:
#убираем домены без инфы
domain_info['is_empty'] = domain_info['norm_info_v4'].apply(lambda x: 0 if x else 1)
domain_info = domain_info[domain_info['is_empty'] == 0]

In [33]:
domain_info.shape

(64285, 7)

In [34]:
domain_info.head()

Unnamed: 0,domain,info,norm_info,norm_info_v2,norm_info_v3,norm_info_v4,is_empty
0,эфирныемасла.рф,"Интернет магазин эфирных масел, аромакосметики...","[интернет, магазин, эфирных, масел, аромакосме...","[интернет, магазин, эфирных, масел, аромакосме...","[интернет, магазин, эфирный, масло, аромакосме...","[эфирный, масло, интернет, магазин, ирис, аром...",0
1,эролюб.рф,Эролюб.рф - страстные знакомства для взрослых ...,"[эролюб, рф, страстные, знакомства, для, взрос...","[эролюб, страстные, знакомства, взрослых, реал...","[эролюб, страстный, знакомство, взрослый, реал...","[страстный, сайт, фото, романтик, реальный, жи...",0
2,шарикиоптом.рф,"Фитолампы, фитосветильники, прожекторы для рас...","[фитолампы, фитосветильники, прожекторы, для, ...","[фитолампы, фитосветильники, прожекторы, расте...","[фитолампа, фитосветильник, прожектор, растени...","[фитолампа, весь, гроубокс, аквариум, растение...",0
3,чинамобил.рф,Каталог запчастей CHERY GEELY LIFAN Brilliance...,"[каталог, запчастей, chery, geely, lifan, bril...","[каталог, запчастей, chery, geely, lifan, bril...","[каталог, запчасть, chery, geely, lifan, brill...","[черя, автомобиль, комплектующий, chery, brill...",0
4,чекиспб.рф,Любые чеки СПб! - ГлавнаяNone,"[любые, чеки, спб, главная]","[любые, чеки, спб, главная]","[любой, чек, спб, главный]","[главный, чек, спб, любой]",0


In [35]:
domain_dict = dict(zip(domain_info.domain, domain_info.norm_info_v4))

In [36]:
#еще надо убрать всякую муть типа "домен продается"

## Bag of words for user

Каждому юзеру присоединяем его список слов по списку доменов

In [37]:
def bag_of_words_for_user(norm_domain_list, domain_dict):
    bag_of_words = []
    for dom in norm_domain_list:
        try:
            bag_of_words.append(domain_dict[dom])
        except: 
            pass
    return sum(bag_of_words, [])

In [38]:
%%time
df['bag_of_words'] = df['norm_domain_list'].apply(lambda x: bag_of_words_for_user(x, domain_dict))

CPU times: user 2min, sys: 10.9 s, total: 2min 11s
Wall time: 2min 13s


In [39]:
df.head()

Unnamed: 0,gender,age,uid,user_json,domain_list,norm_domain_list,gender_cat,age_cat,time_list,visits,time_range,visit_rate,unique_domains_cnt,tot_domains_cnt,avg_visits_per_domain,bag_of_words
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,{'visits': [{'url': 'http://zebra-zoya.ru/2000...,"[b'zebra-zoya.ru', b'news.yandex.ru', b'sotovi...","[zebra-zoya.ru, news.yandex.ru, sotovik.ru, ne...",0,0,"[1419688144068, 1426666298001, 1426666298000, ...",5,6978153933,1395631000.0,3,5,1.666667,"[интернет, магазин, ребенок, креативный, много..."
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,{'visits': [{'url': 'http://sweetrading.ru/?p=...,"[b'sweetrading.ru', b'sweetrading.ru', b'sweet...","[sweetrading.ru, sweetrading.ru, sweetrading.r...",1,1,"[1419717886224, 1419717884437, 1419717816375, ...",102,2266588550,22221460.0,26,102,3.923077,"[весь, торги, трейдер, forexторги, полезный, ф..."
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,{'visits': [{'url': 'http://ru.oriflame.com/pr...,"[b'ru.oriflame.com', b'ru.oriflame.com', b'ru....","[ru.oriflame.com, ru.oriflame.com, ru.oriflame...",0,1,"[1418840296062, 1418667832733, 1418667717223, ...",44,8284914026,188293500.0,6,44,7.333333,"[cosmetics, oriflame, cosmetics, oriflame, cos..."
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,{'visits': [{'url': 'http://translate-tattoo.r...,"[b'translate-tattoo.ru', b'nadietah.ru', b'1ob...","[translate-tattoo.ru, nadietah.ru, 1obl.ru, 1o...",0,1,"[1418217864467, 1418124701342, 1417866007812, ...",14,693126229,49509020.0,3,14,4.666667,"[профессиональный, перевод, живой, переводсерв..."
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,{'visits': [{'url': 'https://mail.rambler.ru/#...,"[b'mail.rambler.ru', b'news.rambler.ru', b'mai...","[mail.rambler.ru, news.rambler.ru, mail.ramble...",1,4,"[1427272415001, 1427272415000, 1427271294001, ...",212,613917001,2895835.0,25,212,8.48,"[надежный, весь, спам, несколько, почтовый, бе..."


## Get features from Bag Of Words

Берем топ 3000 слов из датафрейма с юзерами df (датафрейм, в котором 1 строка = 1 юзер), т.к. если брать из domain_info (датафрем, в котором 1 строка = 1 домен), то получается всякий шлак.

In [40]:
%%time
df['bag_of_words_text'] = df['bag_of_words'].apply(lambda x: str(' '.join(x)).strip())

CPU times: user 4.08 s, sys: 1.24 s, total: 5.33 s
Wall time: 6.45 s


In [41]:
#можно сюда еще какие-нибудь слова добавить, которые покажутся лишними
bad_words = ['домен', 'укр', 'рер', 'рес', 'регистрация', 'истекать', 'www', 
             'domain', 'многое', 'net', 'ооо', 'наш', 'опт', 'website', 'сайт', 'com',
             'ваш', 'который', 'это', 'современный', 'различный', 'разный', 'любой', 'хотеть','возможность',
             'собирать']

stopWords = stopwords.words(['russian', 'english'])
stopWords.extend(bad_words) 

In [42]:
%%time

#получим список слов, которые будем далее рассматривать (чтобы без всякого мусора)
count_vect_user = CountVectorizer(max_features=3000, stop_words=stopWords)
temp_matrix_user = count_vect_user.fit_transform(df['bag_of_words_text'].values) 
                                             
matrix_counts = temp_matrix_user.toarray()  

CPU times: user 1min 18s, sys: 4.88 s, total: 1min 22s
Wall time: 1min 27s


In [43]:
#список слов, которые будем в bigARTM учитывать
good_words = [x[0] for x in sorted(count_vect_user.vocabulary_.items(), key=lambda x: x[1])] 

Обучаем CountVectorizer для датафрейма domain_info, чтобы на вход модели подать.

In [44]:
%%time
norm_info = domain_info.norm_info_v3.tolist()
domain_info['good_words'] = [list(filter(lambda x: x in good_words, sublist)) for sublist in norm_info]

CPU times: user 1min 6s, sys: 7.87 s, total: 1min 14s
Wall time: 1min 17s


In [45]:
domain_info.head(3)

Unnamed: 0,domain,info,norm_info,norm_info_v2,norm_info_v3,norm_info_v4,is_empty,good_words
0,эфирныемасла.рф,"Интернет магазин эфирных масел, аромакосметики...","[интернет, магазин, эфирных, масел, аромакосме...","[интернет, магазин, эфирных, масел, аромакосме...","[интернет, магазин, эфирный, масло, аромакосме...","[эфирный, масло, интернет, магазин, ирис, аром...",0,"[интернет, магазин, масло]"
1,эролюб.рф,Эролюб.рф - страстные знакомства для взрослых ...,"[эролюб, рф, страстные, знакомства, для, взрос...","[эролюб, страстные, знакомства, взрослых, реал...","[эролюб, страстный, знакомство, взрослый, реал...","[страстный, сайт, фото, романтик, реальный, жи...",0,"[знакомство, взрослый, реальный, фото, серьезн..."
2,шарикиоптом.рф,"Фитолампы, фитосветильники, прожекторы для рас...","[фитолампы, фитосветильники, прожекторы, для, ...","[фитолампы, фитосветильники, прожекторы, расте...","[фитолампа, фитосветильник, прожектор, растени...","[фитолампа, весь, гроубокс, аквариум, растение...",0,"[растение, цветок, система, освещение, аквариу..."


In [46]:
%%time
domain_info['good_words_text'] = domain_info['good_words'].apply(lambda x: str(' '.join(x)).strip())

CPU times: user 122 ms, sys: 26.9 ms, total: 149 ms
Wall time: 154 ms


In [47]:
cv_dom = CountVectorizer()
temp_matrix_dom = cv_dom.fit_transform(domain_info['good_words_text'].values)
n_wd = temp_matrix_dom.T
vocabulary = cv_dom.get_feature_names()

In [48]:
%%time

cv_words = [x[0] for x in sorted(cv_dom.vocabulary_.items(), key=lambda x: x[1])] 
cv_dom = pd.DataFrame(temp_matrix_dom.toarray() , columns=cv_words).head()
cv_dom['id_domain'] = domain_info['domain'] 

CPU times: user 2.56 s, sys: 2.11 s, total: 4.68 s
Wall time: 5.48 s


### BigARTM

https://github.com/bigartm/bigartm/blob/master/README.md

In [49]:
%%time
bv = artm.BatchVectorizer(data_format='bow_n_wd',
                          n_wd=n_wd,
                          vocabulary=vocabulary)

CPU times: user 1min 8s, sys: 880 ms, total: 1min 9s
Wall time: 1min 11s


In [50]:
%%time

# количество тем
T = 30   
# Learn simple LDA model (or you can use advanced artm.ARTM)
model = artm.LDA(num_topics=T, dictionary=bv.dictionary, cache_theta = True)
model.fit_offline(bv, num_collection_passes=20)

CPU times: user 1min 6s, sys: 4.84 s, total: 1min 11s
Wall time: 21.8 s


In [51]:
model.get_top_tokens()

[['профессиональный',
  'образование',
  'имя',
  'хостинг',
  'сервер',
  'скидка',
  'высокий',
  'готовый',
  'решение',
  'реферат'],
 ['хороший',
  'украина',
  'клуб',
  'кино',
  'группа',
  'новинка',
  'club',
  'любитель',
  'харьков',
  'камера'],
 ['ребенок',
  'детский',
  'здоровье',
  'жизнь',
  'красота',
  'человек',
  'питание',
  'развитие',
  'женский',
  'диета'],
 ['школа',
  'урок',
  'обучение',
  'юридический',
  'курсы',
  'разработка',
  'art',
  'смс',
  'комплекс',
  'дизайн'],
 ['отдых',
  'тур',
  'мода',
  'отель',
  'sie',
  'путешествие',
  'туризм',
  'крым',
  'гостиница',
  'поздравление'],
 ['оборудование',
  'официальный',
  'техника',
  'спортивный',
  'технология',
  'бытовой',
  'машина',
  'цифровой',
  'электроника',
  'велосипед'],
 ['дом',
  'квартира',
  'дома',
  'блог',
  'строительство',
  'дизайн',
  'реклама',
  'интерьер',
  'часы',
  'дача'],
 ['новость',
  'мир',
  'интересный',
  'событие',
  'спорт',
  'последний',
  'самый',
  '

In [52]:
#соответсвие тем и ключевых слов (для интерпретации тем)
topic_words = dict(zip(model.get_theta().index, model.get_top_tokens()))

In [53]:
#матрица, в которой по строкам - темы, по столбцам - входные тексты (то есть у нас 1 столбец = 1 домен)
model.get_theta().head()

Unnamed: 0,19000,19001,19002,19003,19004,19005,19006,19007,19008,19009,...,25990,25991,25992,25993,25994,25995,25996,25997,25998,25999
topic_0,0.000654,0.000551,0.000885,0.306307,0.000323,0.111617,0.421976,0.000449,0.000752,0.003031,...,0.004348,0.001901,0.000426,0.004348,0.000354,0.033333,0.004348,0.007693,0.000753,0.007693
topic_1,0.132333,0.00016,0.000885,0.003031,0.000322,0.00039,0.000518,0.273757,0.000799,0.003031,...,0.004348,0.001892,0.039464,0.439131,0.107181,0.033333,0.004348,0.007693,0.000783,0.007693
topic_2,0.000661,0.000156,0.000885,0.003333,0.217653,0.000366,0.162358,0.000449,0.000812,0.00303,...,0.004348,0.001887,0.712535,0.004348,0.001097,0.033333,0.004348,0.007692,0.000785,0.007693
topic_3,0.000654,0.000147,0.184703,0.003061,0.000333,0.000368,0.25264,0.000451,0.000755,0.003253,...,0.004348,0.001887,0.083464,0.004348,0.000353,0.033333,0.004351,0.007693,0.000766,0.007693
topic_4,0.000654,0.000138,0.000885,0.003031,0.00032,0.000366,0.000554,0.000449,0.000757,0.003031,...,0.004348,0.001887,0.000412,0.004348,0.000353,0.033333,0.004348,0.007692,0.000876,0.007693


In [54]:
theta = model.get_theta().T
theta['domain'] = domain_info['domain'].values
theta.reset_index(drop=True, inplace=True)

In [55]:
theta.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,domain
0,0.000654,0.132333,0.000661,0.000654,0.000654,0.000661,0.066229,0.000654,0.000654,0.000666,...,0.000654,0.146375,0.000663,0.000657,0.001022,0.000665,0.000654,0.133742,0.001131,эфирныемасла.рф
1,0.000551,0.00016,0.000156,0.000147,0.000138,0.000214,0.147771,0.043515,0.000142,0.000159,...,0.000162,0.000265,0.000178,0.000167,0.08554,0.075686,0.000136,0.000152,0.088793,эролюб.рф
2,0.000885,0.000885,0.000885,0.184703,0.000885,0.303151,0.000885,0.000885,0.000959,0.000946,...,0.000885,0.178284,0.000885,0.000885,0.000904,0.310609,0.000885,0.000885,0.000885,шарикиоптом.рф
3,0.306307,0.003031,0.003333,0.003061,0.003031,0.003031,0.003031,0.004084,0.003031,0.300533,...,0.003031,0.003031,0.003031,0.003032,0.003031,0.003038,0.003031,0.003031,0.003031,чинамобил.рф
4,0.000323,0.000322,0.217653,0.000333,0.00032,0.00032,0.00032,0.000378,0.000333,0.270602,...,0.00032,0.00044,0.000322,0.000324,0.00032,0.00032,0.00032,0.255499,0.000322,чекиспб.рф


Теперь сопоставим каждому пользователю его список доменов.
Коэффициенты при темах суммируем.

In [56]:
topic_cols = list(topic_words.keys())

In [57]:
def user_interest(dom_list, theta):
    topic_vector = [0]*(theta.shape[1]-1)
    for domain in dom_list:
        try:
            topic_vector += theta[theta['domain'] == domain].values[0][:-1]
        except:
            pass
    return topic_vector

In [58]:
df.head()

Unnamed: 0,gender,age,uid,user_json,domain_list,norm_domain_list,gender_cat,age_cat,time_list,visits,time_range,visit_rate,unique_domains_cnt,tot_domains_cnt,avg_visits_per_domain,bag_of_words,bag_of_words_text
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,{'visits': [{'url': 'http://zebra-zoya.ru/2000...,"[b'zebra-zoya.ru', b'news.yandex.ru', b'sotovi...","[zebra-zoya.ru, news.yandex.ru, sotovik.ru, ne...",0,0,"[1419688144068, 1426666298001, 1426666298000, ...",5,6978153933,1395631000.0,3,5,1.666667,"[интернет, магазин, ребенок, креативный, много...",интернет магазин ребенок креативный многое дом...
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,{'visits': [{'url': 'http://sweetrading.ru/?p=...,"[b'sweetrading.ru', b'sweetrading.ru', b'sweet...","[sweetrading.ru, sweetrading.ru, sweetrading.r...",1,1,"[1419717886224, 1419717884437, 1419717816375, ...",102,2266588550,22221460.0,26,102,3.923077,"[весь, торги, трейдер, forexторги, полезный, ф...",весь торги трейдер forexторги полезный форекс ...
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,{'visits': [{'url': 'http://ru.oriflame.com/pr...,"[b'ru.oriflame.com', b'ru.oriflame.com', b'ru....","[ru.oriflame.com, ru.oriflame.com, ru.oriflame...",0,1,"[1418840296062, 1418667832733, 1418667717223, ...",44,8284914026,188293500.0,6,44,7.333333,"[cosmetics, oriflame, cosmetics, oriflame, cos...",cosmetics oriflame cosmetics oriflame cosmetic...
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,{'visits': [{'url': 'http://translate-tattoo.r...,"[b'translate-tattoo.ru', b'nadietah.ru', b'1ob...","[translate-tattoo.ru, nadietah.ru, 1obl.ru, 1o...",0,1,"[1418217864467, 1418124701342, 1417866007812, ...",14,693126229,49509020.0,3,14,4.666667,"[профессиональный, перевод, живой, переводсерв...",профессиональный перевод живой переводсервис л...
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,{'visits': [{'url': 'https://mail.rambler.ru/#...,"[b'mail.rambler.ru', b'news.rambler.ru', b'mai...","[mail.rambler.ru, news.rambler.ru, mail.ramble...",1,4,"[1427272415001, 1427272415000, 1427271294001, ...",212,613917001,2895835.0,25,212,8.48,"[надежный, весь, спам, несколько, почтовый, бе...",надежный весь спам несколько почтовый бесконеч...


In [59]:
df_cols = ['uid', 'gender_cat', 'age_cat', 'time_list',
           'visits', 'time_range', 'visit_rate', 'unique_domains_cnt', 
           'tot_domains_cnt', 'avg_visits_per_domain']

In [60]:
%%time

tmp = df.norm_domain_list.apply(pd.Series)\
.merge(df[['uid','norm_domain_list']], left_index = True, right_index = True)\
.drop(['norm_domain_list'], axis = 1)\
.melt(id_vars = ['uid'], value_name = 'domain')\
.drop('variable', axis = 1).dropna().reset_index(drop=True)

CPU times: user 1min 20s, sys: 25.7 s, total: 1min 46s
Wall time: 1min 50s


In [61]:
%%time
tmp2 = tmp.set_index('domain').join(theta.set_index('domain')).fillna(0).groupby(['uid']).sum().reset_index()

CPU times: user 12.5 s, sys: 3.16 s, total: 15.7 s
Wall time: 15.5 s


In [62]:
%%time
df_upd = df.set_index('uid').join(tmp2.set_index('uid')).reset_index()

CPU times: user 323 ms, sys: 466 ms, total: 790 ms
Wall time: 713 ms


In [63]:
#получили фичи из BigARTM
df_upd.head(2)

Unnamed: 0,uid,gender,age,user_json,domain_list,norm_domain_list,gender_cat,age_cat,time_list,visits,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,F,18-24,{'visits': [{'url': 'http://zebra-zoya.ru/2000...,"[b'zebra-zoya.ru', b'news.yandex.ru', b'sotovi...","[zebra-zoya.ru, news.yandex.ru, sotovik.ru, ne...",0,0,"[1419688144068, 1426666298001, 1426666298000, ...",5,...,0.720798,0.012963,0.019755,0.012793,0.379969,0.012793,0.012793,0.012793,0.013258,0.012792
1,d502331d-621e-4721-ada2-5d30b2c3801f,M,25-34,{'visits': [{'url': 'http://sweetrading.ru/?p=...,"[b'sweetrading.ru', b'sweetrading.ru', b'sweet...","[sweetrading.ru, sweetrading.ru, sweetrading.r...",1,1,"[1419717886224, 1419717884437, 1419717816375, ...",102,...,0.824418,1.375952,16.978559,0.410539,0.410379,0.662424,6.530401,0.411401,0.410369,10.322586


In [65]:
df_upd.to_pickle('df_features.pkl', compression='bz2')

## Train

In [60]:
new_df_cols = ['visits', 'time_range', 'visit_rate', 'unique_domains_cnt', 
           'tot_domains_cnt', 'avg_visits_per_domain']

In [61]:
features = list(new_df_cols) + list(theta.columns[:-1])
target = ['gender_cat']

In [62]:
def identity_tokenizer(text):
    return text

tfidf_extractor = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, max_features = 3000)    
sparse_tfidf_domain_list = tfidf_extractor.fit_transform(df_upd['domain_list'])

In [63]:
mask_test = df_upd['gender_cat']==-1

In [64]:
tf_idf_matrix = sparse_tfidf_domain_list.todense()[~mask_test]
tf_idf_matrix = pd.DataFrame(tf_idf_matrix)

In [65]:
df_matrix = df_upd[~mask_test][features]

In [66]:
tf_idf_matrix.shape

(36138, 3000)

In [67]:
df_matrix.shape

(36138, 36)

In [68]:
result = df_matrix.join(tf_idf_matrix)

In [69]:
X = result.values
y = df_upd[~mask_test]['gender_cat'].values.ravel()

In [70]:
X.shape, y.shape

((36138, 3036), (36138,))

In [73]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((25296, 3036), (10842, 3036), (25296,), (10842,))

In [193]:
%%time
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_comb = 20

# specify parameters via map
params = {'n_estimators': [200, 300, 400],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [1,3,5,10],
    'min_child_weight': 1,
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 2, 5]}

CPU times: user 63 µs, sys: 151 µs, total: 214 µs
Wall time: 245 µs


In [None]:
%%time
xgb = XGBClassifier(learning_rate=0.1, objective='binary:logistic', random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(xgb, param_distributions=params, 
                                   n_iter=param_comb, scoring='roc_auc', 
                                   cv=skf, verbose=3, random_state=42 )

random_search.fit(X_train, y_train)

## LightGBM

In [74]:
from scipy import sparse
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [75]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [76]:
lgb_train = lgb.Dataset(X, y.ravel())

In [None]:
%%time

res = lgb.cv(params,
             lgb_train,
             num_boost_round=2000,
             nfold=5,
             stratified=True,
             shuffle=True, 
             early_stopping_rounds=100,
             verbose_eval=1)

In [None]:
num_boost = np.argmax(res['auc-mean'])
num_boost, res['auc-mean'][num_boost], res['auc-stdv'][num_boost]