In [1]:
import numpy as np
import pandas as pd
import os, sys

import json

import urllib.parse
from urllib.parse import unquote
from urllib.parse import urlparse


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import *

from sklearn.grid_search import GridSearchCV



In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [3]:
from IPython.display import Audio
# Sound used by the (currently commented-out) Audio(...) notification cell further down.
# NOTE(review): absolute, user-specific path; it will not exist on another machine.
sound_file = "/data/home/mikhail.burmistrov/A-Tone-His_Self-1266414414.wav"

# Version V

## XGBoost only (note: the cells below fit scikit-learn's GradientBoostingClassifier, not XGBClassifier)

### Загружаем данные

In [4]:
# Run parameters.
# file_path: CSV that is read into theFetch in the next cell.
# file_limit: passed to read_csv as nrows; None reads the whole file.
file_path = '~/Labs/prj/fetch.csv'
file_limit = None

In [5]:
# Load the input file (comma-separated, which is read_csv's default separator);
# file_limit caps the number of rows read, None means all of them.
theFetch = pd.read_csv(file_path, nrows=file_limit)

In [6]:
# Build the combined target: gender and age concatenated into a single label
# (e.g. 'F18-24'), then drop the two source columns and index the rows by uid.
theFetch = (
    theFetch
    .assign(target=theFetch['gender'] + theFetch['age'])
    .drop(columns=['gender', 'age'])
    .set_index('uid')
)

In [7]:
# Target map: one row per distinct target label, in sorted order;
# `code` is the integer id (0, 1, 2, ...) assigned to that label.
theTargetName = sorted(theFetch['target'].unique())
theTargetMap = pd.DataFrame(
    {'code': range(len(theTargetName))},
    index=theTargetName,
)

In [8]:
# Replace each textual target by its integer code from theTargetMap, order the rows
# by that code and fill missing feature values with 0.
theFetch = (
    theFetch
    .assign(targetID=theFetch['target'].map(theTargetMap['code']))
    .drop(columns=['target'])
    .sort_values(by=['targetID'])
    .fillna(0)
)
# Rename the feature columns to P1..Pn; the last column stays 'targetID'.
theFetch.columns = ['P{}'.format(i) for i in range(1, theFetch.shape[1])] + ['targetID']

In [9]:
# Per-class row counts and the half-open [begin, end) row range that each class
# occupies in theFetch (whose rows were sorted by targetID in the previous cell).
theTargetMap['len'] = theTargetMap['code'].apply(
    lambda code: int((theFetch['targetID'] == code).sum())
)
# Codes ascend with the row order of theTargetMap, so an exclusive running total of
# `len` is each class's first row, and begin + len is its one-past-last row.
# This works for any number of classes; the class count used to be hard-coded as
# range(0, 11), which fails with a length mismatch whenever it is not exactly 11.
theTargetMap['begin'] = theTargetMap['len'].cumsum() - theTargetMap['len']
theTargetMap['end'] = theTargetMap['begin'] + theTargetMap['len']

In [10]:
# Move the class code into the index next to uid, so that only the P* feature
# columns remain as data.
theFetch = theFetch.reset_index().set_index(['uid', 'targetID'])

# Standardise every feature column: subtract its mean, divide by its sample std.
featureMean = theFetch.mean()
featureStd = theFetch.std()
# A constant column has std == 0 and dividing by it would turn the whole column into
# NaN; dividing by 1 instead leaves such a column at 0 after centring.
featureStd = featureStd.replace(0, 1)
theFetch = (theFetch - featureMean) / featureStd

In [11]:
# Persist the standardised features. reset_index() returns a copy in which the
# (uid, targetID) index is ordinary columns, so it is written even with index=False
# and theFetch itself keeps its index.
theFetch.reset_index().to_csv('~/project/xgb_fetch.csv', sep=',', index=False)
# Persist the target map (label -> code, len, begin, end). This line used to write
# theFetch a second time, so the map was never actually saved.
theTargetMap.to_csv('~/project/xgb_fetch_map.csv', sep=',', index=True, index_label='target')

### Любуемся результатом

In [12]:
# Show each target label with its code, row count and [begin, end) row range.
theTargetMap

Unnamed: 0,code,len,begin,end
--,0,5000,0,5000
F18-24,1,2886,5000,7886
F25-34,2,6791,7886,14677
F35-44,3,4271,14677,18948
F45-54,4,2597,18948,21545
F>=55,5,895,21545,22440
M18-24,6,2012,22440,24452
M25-34,7,8666,24452,33118
M35-44,8,5089,33118,38207
M45-54,9,2147,38207,40354


In [13]:
# Summary statistics of the standardised features (each column should show mean ~0 and std 1).
theFetch.describe()

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,...,P76,P77,P78,P79,P80,P81,P82,P83,P84,P85
count,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,...,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0,41138.0
mean,8.158214e-15,4.0889510000000005e-17,-4.123065e-15,-3.864502e-15,-8.411444e-15,6.979859e-16,2.396811e-15,-7.712418e-16,5.278886e-15,-8.761442e-16,...,1.617893e-15,2.20334e-15,-8.951069000000001e-17,2.848808e-15,-6.524835e-15,3.755825e-15,-1.301012e-14,-2.018962e-15,-7.651581e-16,-4.950041e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.03854878,-0.01393146,-0.02262425,-0.02273501,-0.0256947,-0.005708429,-0.06966028,-0.01029442,-0.02051984,-0.06120557,...,-0.06908749,-0.008539837,-0.04866369,-0.05082998,-0.04405108,-0.008256803,-0.04755806,-0.02619758,-0.01246266,-0.008051403
25%,-0.03854878,-0.01393146,-0.02262425,-0.02273501,-0.0256947,-0.005708429,-0.06966028,-0.01029442,-0.02051984,-0.06120557,...,-0.06908749,-0.008539837,-0.04866369,-0.05082998,-0.04405108,-0.008256803,-0.04755806,-0.02619758,-0.01246266,-0.008051403
50%,-0.03854878,-0.01393146,-0.02262425,-0.02273501,-0.0256947,-0.005708429,-0.06966028,-0.01029442,-0.02051984,-0.06120557,...,-0.06908749,-0.008539837,-0.04866369,-0.05082998,-0.04405108,-0.008256803,-0.04755806,-0.02619758,-0.01246266,-0.008051403
75%,-0.03854878,-0.01393146,-0.02262425,-0.02273501,-0.0256947,-0.005708429,-0.06966028,-0.01029442,-0.02051984,-0.06120557,...,-0.06908749,-0.008539837,-0.04866369,-0.05082998,-0.04405108,-0.008256803,-0.04755806,-0.02619758,-0.01246266,-0.008051403
max,72.15634,184.1297,144.2205,130.6503,108.6789,201.9283,62.22783,155.8118,111.5405,66.76245,...,77.12934,156.1301,170.4394,71.38234,122.9949,157.6949,117.5812,196.2614,176.7769,165.6013


### Выделяем диапазон для исследований

In [14]:
# Rows whose target label is '--' (code 0 in the map displayed above; presumably users
# with unknown gender/age, TODO confirm) are the ones to predict; every other row is
# labelled data. Selecting by class code replaces the hard-coded 5000 / 41138 row
# positions, which were only valid for this exact input file.
targetIDs = np.array(theFetch.index.get_level_values('targetID'))
unlabeledCode = theTargetMap['code'].get('--', -1)  # -1 matches no row when '--' is absent
isUnlabeled = targetIDs == unlabeledCode
predictData = theFetch.values[isUnlabeled]
fullData = theFetch.values[~isUnlabeled]
fullTarget = targetIDs[~isUnlabeled]
fullData.shape, fullTarget.shape

((36138, 85), (36138,))

### Загоняем в GradientBoostingClassifier

In [15]:
GradientBoostingClassifier%%time
from sklearn.model_selection import train_test_split
#Делим мастер данные на массивы
masterData, researchmentData, masterTarget, researchmentTarget = \
    train_test_split(fullData, fullTarget, test_size= 0.33, random_state=33)

CPU times: user 72 ms, sys: 4 ms, total: 76 ms
Wall time: 74.5 ms


In [16]:
%%time
from sklearn.model_selection import RandomizedSearchCV


# После перебора всех параметров, был оставлен GradientBoostingClassifier с параметрами по-умолчанию. 
# RandomizedSearchCV дает те параметры которые указаны ниже, но они дают максимально лучший результат 0,27(на LVL-2). 
# Параметры по-умолчанию дают результат 0,32 на fetch.csv, 0.31 на fetch2S, fetch3S
# в конечном итоге возможно нужно будет прогнать все  файлы фичей, но пока что эта лучшая конфигурация. 

# model= GradientBoostingClassifier(subsample=0.95,n_estimators= 10,
#  min_samples_split=0.1,
#  min_samples_leaf= 0.2090909090909091,
#  max_features= 'sqrt',
#  max_depth= 8,
#  loss= 'deviance',
#  learning_rate= 0.2,
#  criterion='mae')

model= GradientBoostingClassifier()

#model= RandomizedSearchCV(GradientBoostingClassifier(), parameters, cv=5, n_jobs=-1)
model.fit(masterData, masterTarget)

CPU times: user 1min 27s, sys: 20 ms, total: 1min 27s
Wall time: 1min 27s


In [17]:
# Running this cell plays a notification sound once the model has finished training.
#Audio(filename=sound_file, autoplay=True)

In [18]:
# Predicted class code for every row of the held-out split.
researchmentLabel = model.predict(researchmentData)
# Per-class probabilities for the same rows; their row-wise maximum is used below
# to keep only the more confident predictions.
researchmentPredict = model.predict_proba(researchmentData)

In [19]:
from sklearn.metrics import classification_report
# Labelled classes are every row of theTargetMap except the first one (the class that
# was split off as predictData). Passing their codes as `labels` pins each name to its
# code, so the report stays correct even if a class is absent from this split; with
# target_names alone, a missing class makes the inferred label count differ from the
# number of names.
labelledClasses = theTargetMap.iloc[1:]
report = classification_report(
    researchmentTarget,
    researchmentLabel,
    labels=list(labelledClasses['code']),
    target_names=list(labelledClasses.index),
)
print(report)

              precision    recall  f1-score   support

      F18-24       0.26      0.05      0.08       986
      F25-34       0.27      0.21      0.24      2195
      F35-44       0.30      0.10      0.15      1408
      F45-54       0.13      0.01      0.02       797
       F>=55       0.12      0.00      0.01       305
      M18-24       0.17      0.01      0.01       663
      M25-34       0.27      0.80      0.40      2897
      M35-44       0.29      0.11      0.16      1699
      M45-54       0.15      0.02      0.03       707
       M>=55       0.05      0.00      0.01       269

    accuracy                           0.27     11926
   macro avg       0.20      0.13      0.11     11926
weighted avg       0.24      0.27      0.19     11926



In [20]:
# Positions of held-out rows whose highest class probability exceeds a fixed cut-off.
# NOTE(review): the cut-off is a hard-coded constant of unknown origin, and good_i is
# recomputed from the data in the next cell.
rowTopProba = researchmentPredict.max(axis=1)
good_i = np.flatnonzero(rowTopProba > 0.238501980473300).tolist()
print(len(good_i))

6301


In [21]:
%%time

len_predict=int(len(researchmentPredict)/2)
full_pogreh=[i.max() for i in researchmentPredict]
full_pogreh.sort()
pogreh= max(full_pogreh[len_predict-1:len_predict])


good_i=[n for n,i in enumerate (researchmentPredict) if i.max()>=pogreh]

print(len(good_i),len(researchmentPredict))

5964 11926
CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 69.8 ms


# Загоняем в GradientBoostingClassifier LVL-2

In [22]:
# Restrict the held-out features and true codes to the confident rows listed in good_i.
researchmentData_2, researchmentTarget_2 = (
    researchmentData[good_i],
    researchmentTarget[good_i],
)

In [23]:
# Predict again on the confident subset only; the model is the one fitted above and
# is not retrained here.
researchmentLabel_2 = model.predict(researchmentData_2)

In [24]:
# Report for the confident subset. Labelled classes are every row of theTargetMap
# except the first one; passing their codes as `labels` pins each name to its code,
# so the report stays correct even if a class is absent from this smaller subset.
labelledClasses = theTargetMap.iloc[1:]
report = classification_report(
    researchmentTarget_2,
    researchmentLabel_2,
    labels=list(labelledClasses['code']),
    target_names=list(labelledClasses.index),
)
print(report)

              precision    recall  f1-score   support

      F18-24       0.32      0.05      0.08       420
      F25-34       0.28      0.32      0.30       937
      F35-44       0.33      0.17      0.22       636
      F45-54       0.08      0.01      0.02       352
       F>=55       0.17      0.01      0.02       120
      M18-24       0.19      0.01      0.03       361
      M25-34       0.33      0.78      0.47      1662
      M35-44       0.33      0.16      0.21       963
      M45-54       0.12      0.02      0.03       378
       M>=55       0.06      0.01      0.01       135

    accuracy                           0.32      5964
   macro avg       0.22      0.15      0.14      5964
weighted avg       0.28      0.32      0.25      5964

