In [1]:
import numpy as np
import pandas as pd
import os, sys

import json

import urllib.parse
from urllib.parse import unquote
from urllib.parse import urlparse

# Version IV

## BagOfWord and XGBoost

### Загружаем данные

In [2]:
#Параметры вычислений
file_path = '/data/share/project01/gender_age_dataset.txt'
file_limit = None

In [3]:
%%time
#Процедура. Фильтрует домен из url
def toDomain( url ):
    if url.startswith('http://http') : url = url[7:]
    if url.startswith('http://&referrer=') : url = url[17:]
        
    parsed_url = urlparse( urllib.parse.unquote( url ).strip() )
    if parsed_url.scheme not in ['http','https']: return None

    url = parsed_url.netloc.strip()

    if url.startswith('www.') : url = url[4:]

    dpoint = url.rfind(':')     
    if dpoint != -1 : url = url[:dpoint]    

    dpoint = url.find('&')     
    if dpoint != -1 : url = url[:dpoint]    

    dpoint = url.rfind('@')     
    if dpoint != -1 : url = url[dpoint+1:]    
       
    return url if url.rfind('.') != -1 else None

#Процедура разбирает JSON и возвращет домен и timestamp
def workupDomain( szDomain ):
    theCollection = [str(toDomain ( value['url']) ) + ';'   for value in json.loads( szDomain )['visits']]
    return  str('').join ( theCollection  ).replace('None;', '').replace('-', '').replace('.', '').replace(';', ' ')

# Загружаем файл
theUserCorpus = pd.read_csv(file_path, sep='\t', nrows=file_limit  )
theUserCorpus.head()

#Перебираем элементы, сохраняя из данных тока домен
theUserCorpus['domain'] = theUserCorpus['user_json'].apply( workupDomain )
theUserCorpus.drop(['user_json'], axis=1, inplace=True)

#формируем мега признак
theUserCorpus['target'] = theUserCorpus.gender+theUserCorpus.age
theUserCorpus.drop(['gender', 'age'], axis=1, inplace=True )
theUserCorpus.set_index(['uid'], inplace=True)

#Создаем карту групп признаков: где чего лежит
theTargetName = theUserCorpus.target.unique()
if len((np.where(theTargetName == '--'))[0] ) == 0 : theTargetName = np.append(theTargetName, ['--']) #Заплатке на частичную выборку
theTargetMap = pd.DataFrame( {'code':range(1, len(theTargetName)+1) }, index = theTargetName )
theTargetMap.code.loc['--'] = 0
theTargetMap.sort_values('code', inplace=True)

#Генерируем номера групп согластно карте признаков
theUserCorpus['targetID'] = theUserCorpus['target'].apply( lambda x:  theTargetMap.code.loc[x] )
theUserCorpus.drop(['target'], axis=1, inplace=True )
theUserCorpus.sort_values(by=['targetID'], inplace=True)

#Рассчитываем положения их смещения в общем массиве
theTargetMap['len'] = theTargetMap['code'].apply( lambda type:  len(theUserCorpus[theUserCorpus.targetID == type]) )
theTargetMap['begin'] = [theTargetMap[theTargetMap.code < type ]['len'].sum() \
                        if type > 0 else 0 \
                        for type in range(0, 11) ]
theTargetMap['end'] = [theTargetMap[theTargetMap.code <= type ]['len'].sum() \
                        if type > 0 else int(theTargetMap[theTargetMap.code == type ]['len']) \
                        for type in range(0, 11) ]

#Эта тупая тварь не сохраняет в файл индексы. Ставим заплатку
theUserCorpus.reset_index(inplace=True)
theUserCorpus.to_csv('~/project/user_corpus.csv', sep=',', index=False)
theTargetMap.to_csv('~/project/target_map.csv', sep=',', index=False)
theUserCorpus.set_index(['uid'], inplace=True)

CPU times: user 1min 7s, sys: 2.65 s, total: 1min 10s
Wall time: 1min 10s


### Любуемся результатом

In [4]:
theTargetMap

Unnamed: 0,code,len,begin,end
--,0,5000,0,5000
F18-24,1,2886,5000,7886
M25-34,2,8666,7886,16552
F25-34,3,6791,16552,23343
M>=55,4,784,23343,24127
F45-54,5,2597,24127,26724
F35-44,6,4271,26724,30995
M35-44,7,5089,30995,36084
F>=55,8,895,36084,36979
M18-24,9,2012,36979,38991


In [5]:
theUserCorpus.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41138 entries, fe1e01f3-5877-4a34-a300-3cfffc2f48e1 to f16a67ec-5122-4f67-8546-415b22982009
Data columns (total 2 columns):
domain      41138 non-null object
targetID    41138 non-null int64
dtypes: int64(1), object(1)
memory usage: 964.2+ KB


### Запихиваем данные в мешок

In [8]:
%%time
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

#Процедура токенизации
def tokenise( text ):
    words = [word.lower() for word in word_tokenize(text)]
    return words

#Обучаем векторизатор и генерируем Bag of Words
theCorpus = list(theUserCorpus['domain'])
print ('Size of theCorpus is ', len(theCorpus) )

theVectorizer = CountVectorizer(tokenizer=tokenise)
theBagOfWords = theVectorizer.fit_transform(theCorpus).toarray()
print('Size of theBagOfWords is ', theBagOfWords.shape)

Size of theCorpus is  41138
Size of theBagOfWords is  (41138, 116637)
CPU times: user 1min 2s, sys: 2.92 s, total: 1min 4s
Wall time: 1min 5s


### Выделяем диапазон для исследований

In [27]:
partSize = 1000
if partSize is None :
    fullData = theBagOfWords[:41138]
    fullTarget = (theUserCorpus['targetID'].values)[:41138]
else :
    fullData = theBagOfWords[:partSize]
    fullTarget = (theUserCorpus['targetID'].values)[:partSize]
    for type in range(1, 11):
        end = theTargetMap[theTargetMap.code == type].len[0]
        end = partSize if end >= partSize else end
        begin = theTargetMap[theTargetMap.code == type].begin[0]
        fullData = np.concatenate((fullData, theBagOfWords[begin:begin+partSize]) )
        fullTarget = np.concatenate((fullTarget, (theUserCorpus['targetID'].values)[begin:begin+partSize]))    

fullData.shape, fullTarget.shape 

((11000, 116637), (11000,))

### Загоняем в xgboost

In [28]:
%%time
from sklearn.model_selection import train_test_split
#Делим мастер данные на массивы
masterData, researchmentData, masterTarget, researchmentTarget = \
    train_test_split(fullData, fullTarget, test_size= 0.33, random_state=33 )

CPU times: user 3.43 s, sys: 10.5 s, total: 14 s
Wall time: 14.1 s


In [29]:
%%time
import xgboost

# У кого xgboost длиннее тот и прав
model = xgboost.XGBRegressor(n_estimators = 100, nthread= -1, max_depth = 6, objective = 'multi:softmax', num_class = 11 ) #
model.fit(masterData, masterTarget)
print( model.score(masterData, masterTarget) )

0.04222300908057908
CPU times: user 14h 42min 7s, sys: 7min 48s, total: 14h 49min 55s
Wall time: 31min 57s


In [30]:

researchmentLabel = model.predict(researchmentData)
researchmentPredict = model.predict(theBagOfWords[:partSize])

In [31]:
from sklearn.metrics import classification_report
report = classification_report(researchmentTarget, researchmentLabel, target_names=list(theTargetMap.index)) #['...', '...']
print(report)

              precision    recall  f1-score   support

          --       0.14      0.05      0.08       349
      F18-24       0.21      0.17      0.19       339
      M25-34       0.23      0.12      0.16       377
      F25-34       0.18      0.08      0.11       328
       M>=55       0.22      0.15      0.18       238
      F45-54       0.13      0.52      0.21       385
      F35-44       0.15      0.10      0.12       307
      M35-44       0.17      0.10      0.13       308
       F>=55       0.30      0.17      0.21       317
      M18-24       0.23      0.29      0.25       334
      M45-54       0.21      0.13      0.16       348

    accuracy                           0.18      3630
   macro avg       0.20      0.17      0.16      3630
weighted avg       0.20      0.18      0.16      3630



In [32]:
researchmentPredict

array([ 0.,  8.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  0.,  5.,  5.,  5.,
        5.,  5.,  0.,  7.,  0.,  5.,  0.,  0.,  0.,  4.,  5.,  0.,  9.,
        5.,  5.,  7.,  0.,  5.,  0.,  7.,  4.,  5.,  0.,  1.,  3.,  0.,
        5.,  5.,  5.,  0.,  7.,  0.,  5.,  5.,  6.,  5.,  5.,  1.,  0.,
        2.,  5.,  0.,  5.,  8.,  5.,  5.,  0.,  2.,  5.,  4.,  2.,  5.,
        2.,  5.,  7.,  0.,  5.,  8.,  0.,  9.,  5.,  0.,  0.,  5.,  8.,
        2.,  0.,  9.,  5.,  0.,  4.,  8.,  5.,  5.,  5.,  5.,  5.,  5.,
        9.,  0.,  3.,  5.,  5.,  3.,  9.,  9.,  8.,  5.,  8.,  1.,  5.,
        5.,  5.,  2.,  5.,  8.,  0.,  0.,  0.,  5.,  0.,  5.,  5.,  9.,
        9.,  1.,  5.,  0.,  0.,  5.,  9.,  5.,  5.,  6.,  0.,  1.,  7.,
        3.,  0.,  5.,  0.,  5.,  9.,  2.,  0.,  8.,  9.,  5.,  7.,  0.,
        1.,  5.,  9.,  5.,  5.,  5.,  0.,  9.,  2.,  5.,  6.,  5.,  5.,
        0.,  0.,  0.,  5.,  0.,  9.,  5.,  5.,  0.,  9., 10.,  5.,  0.,
        5.,  5.,  6.,  5.,  4.,  0., 10., 10.,  0.,  3.,  3., 10