In [1]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, f1_score, classification_report
%matplotlib inline
# from matplotlib import pyplot as plt
# import seaborn as sns

from sklearn.linear_model import SGDClassifier
from sklearn.externals import joblib

from sklearn.preprocessing import OneHotEncoder
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

In [2]:
FEATURES = [
    #'browserId', 
    #'ip',
    #from
    'method',
    'operation',
    #'referrer',
    'requestType',
    #'timestamp',
    #to
    #url
    #userAgent
    #userId
    #hour
    #'target',
    'DeviceClass',
    'DeviceName',
    'DeviceBrand',
    'DeviceCpu',
    'DeviceCpuBits',
    'OperatingSystemClass',
    'OperatingSystemName',
    'OperatingSystemVersion',
    'OperatingSystemNameVersion',
    'OperatingSystemVersionBuild',
    'LayoutEngineClass',
    'LayoutEngineName',
    'LayoutEngineVersion',
    'LayoutEngineVersionMajor',
    'LayoutEngineNameVersion',
    'LayoutEngineNameVersionMajor',
    'AgentClass',
    'AgentName',
    'AgentVersion',
    'AgentVersionMajor',
    'AgentNameVersion',
    'AgentNameVersionMajor'
]

### Подготовим данные

In [3]:
SAMPLE_FRAC = 1

DATA_DIR = '../../data/batches/'
MODEL_DIR = '../../model/'
GRAPH_DIR = '../../graph/'

http = pq.read_table(os.path.join(DATA_DIR, 'http-20180217_1718_parsedUAA_target.parquet'),
                     columns=FEATURES + ['target']).to_pandas()
http = http.sample(frac=SAMPLE_FRAC)
http.shape

(228428, 26)

In [4]:
y = http['target']
X = http.drop('target', axis=1)

In [5]:
X_train, X_holdout, y_train, y_holdout = train_test_split(http, y, test_size=0.3, random_state=17)

In [26]:
http.head()

Unnamed: 0,method,operation,requestType,DeviceClass,DeviceName,DeviceBrand,DeviceCpu,DeviceCpuBits,OperatingSystemClass,OperatingSystemName,...,LayoutEngineVersionMajor,LayoutEngineNameVersion,LayoutEngineNameVersionMajor,AgentClass,AgentName,AgentVersion,AgentVersionMajor,AgentNameVersion,AgentNameVersionMajor,target
0,POST,AnonymFeed,REQ,Desktop,Desktop,Unknown,32,Desktop,Windows NT,Windows 7,...,Blink 63,Browser,Opera,50.0.2762.67,50,Opera 50.0.2762.67,Opera 50,,,0
1,POST,videoStatNew,REQ,Desktop,Desktop,Unknown,Intel x86_64,64,Desktop,Windows NT,...,Trident 7.0,Trident 7,Browser,Internet Explorer,11.0,11,Internet Explorer 11.0,Internet Explorer 11,,0
2,POST,AnonymLogin,REQ,Desktop,Linux Desktop,Unknown,Intel x86_64,64,Desktop,Linux,...,Blink 58.0,Blink 58,Browser,Chrome,58.0.3029.81,58,Chrome 58.0.3029.81,Chrome 58,,0
3,,CSP,CSP,Desktop,Desktop,Unknown,Intel x86_64,64,Desktop,Windows NT,...,Blink 64.0,Blink 64,Browser,Chrome,64.0.3282.167,64,Chrome 64.0.3282.167,Chrome 64,,0
4,POST,videoStatNew,REQ,Desktop,Desktop,Unknown,Intel x86_64,64,Desktop,Windows NT,...,Blink 64.0,Blink 64,Browser,Chrome,64.0.3282.167,64,Chrome 64.0.3282.167,Chrome 64,,0


### Основные transformers и estimators

In [6]:
# class HashingTrick(TransformerMixin):
    
#     def __init__(self, hashing_trick_mods = default_dict):   
#         self.hashing_trick_mods = hashing_trick_mods
#         #self._HASH_SPARSE = 1000
#         #self._hash_trick_mod = {}
# #         for col in X.columns:
# #             self._hash_trick_mod[col] = len(X[col].unique()) * self._HASH_SPARSE # для полностью уникальных значений(browserId) получается слишком много
            
# #     def _hashing_trick(self, X):
# #         for col in features:
# #             lX[col] = X[col].apply(lambda x: hash(col + '=' + str(x)) % self._hash_trick_mod[col])
# #         return lX

# #     def fit(self, X):
# #         pass

#     def fit(self, *_):
#         return self
        
#     def transform(self, X, *_):
#         #return self._hashing_trick(self, X)
#         pass
    
#     def fit_transform(self, X):
#         #return self.transform(X)
        
#         lX = pd.DataFrame()
#         for col in features:
#             lX[col] = X[col].apply(lambda x: hash(col + '=' + str(x)) % self._hash_trick_mod[col])
#         return lX
    
# #     def get_params(self, **kwargs):
# #         return {"hash_trick_mod": self._hash_trick_mod}
    
# #     def set_params(self, **kwargs):
# #         """Set the parameters of this estimator."""

In [7]:
class HashingTrick(TransformerMixin):
    
    _default_hashing_trick_modulars = {
        'DeviceClass': 10000,
        'DeviceName': 10000,
        'DeviceBrand': 10000,
        'DeviceCpu': 10000,
        'DeviceCpuBits': 10000,
        'OperatingSystemClass': 10000,
        'OperatingSystemName': 10000,
        'OperatingSystemVersion': 10000,
        'OperatingSystemNameVersion': 10000,
        'OperatingSystemVersionBuild': 10000,
        'LayoutEngineClass': 10000,
        'LayoutEngineName': 10000,
        'LayoutEngineVersion': 10000,
        'LayoutEngineVersionMajor': 10000,
        'LayoutEngineNameVersion': 10000,
        'LayoutEngineNameVersionMajor': 10000,
        'AgentClass': 10000,
        'AgentName': 10000,
        'AgentVersion': 10000,
        'AgentVersionMajor': 10000,
        'AgentNameVersion': 10000,
        'AgentNameVersionMajor': 10000,
        'requestType': 10000,
        'operation': 10000,
        'method': 1000
    }
    
    def __init__(self, hashing_trick_modulars = _default_hashing_trick_modulars):
        self.hashing_trick_modulars = hashing_trick_modulars
    
    def set_params(self, **kwargs):
        """Set the parameters of this estimator."""
    
    def get_params(self, **kwargs):
        return {"hashing_trick_modulars": self.hashing_trick_modulars}
        
    def _hashing_trick(self, x, n):
        return hash(x) % n

    def _column_hashing_trick(self, col_name):
        self.http[col_name] = self.http[col_name].apply(self._hashing_trick, args=(self.hashing_trick_modulars[col_name],))
  
    def fit_transform(self, X, *_):
        return self.transform(X)
        
    def transform(self, X, *_):
        self.http = X
        for feature in FEATURES:
            self._column_hashing_trick(feature)

        return self.http
    
    def fit(self, *_):
        return self

In [8]:
ht = HashingTrick()
ohe = OneHotEncoder(sparse=True, handle_unknown='ignore')
clf = SGDClassifier(random_state=17, loss='log')

pipeline = Pipeline(steps=[('hashing_trick', ht),
                          ('one_hot_encoder', ohe),
                          ('model', clf)])

### Подберем гиперпараметры для классификатора

In [9]:
clf.get_params().keys()

dict_keys(['alpha', 'average', 'class_weight', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'verbose', 'warm_start'])

In [11]:
%%time
ALPHA_RANGE = np.fromfunction(lambda i: 1e-15 * pow(10, i), (15,), dtype=float)
MAX_ITER_RANGE = np.fromfunction(lambda i: 10 * (i + 1), (20,), dtype=float)


params = {'model__alpha': ALPHA_RANGE, 'model__max_iter': MAX_ITER_RANGE}

search = RandomizedSearchCV(pipeline, params, cv=5, n_jobs=1, verbose=True, scoring='f1', n_iter=10)

search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)
print(classification_report(y_holdout, search.predict(X_holdout), target_names = ['users', 'bots']))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 11.9min finished


{'model__max_iter': 130.0, 'model__alpha': 1e-09} 1.0
             precision    recall  f1-score   support

      users       1.00      1.00      1.00     67498
       bots       1.00      1.00      1.00      1031

avg / total       1.00      1.00      1.00     68529

CPU times: user 12min 10s, sys: 17.2 s, total: 12min 27s
Wall time: 12min 9s


#### Нужно отдельно создать пацанов

In [12]:
ht = HashingTrick().fit(X)
ohe = OneHotEncoder(sparse=True, handle_unknown='ignore').fit(ht.transform(X))
clf = SGDClassifier(random_state = 17, 
                    max_iter = search.best_params_['model__max_iter'], 
                    alpha = search.best_params_['model__alpha'],
                    loss = 'log')

### Сохраним модель

In [13]:
joblib.dump(ht, os.path.join(MODEL_DIR, 'http_ua_ht.pkl'))
joblib.dump(ohe, os.path.join(MODEL_DIR, 'http_ua_ohe.pkl'))
joblib.dump(clf, os.path.join(MODEL_DIR, 'http_ua_clf.pkl'))

['../../model/http_ua_clf.pkl']

## Обучим по кусочкам

In [14]:
ht = joblib.load(os.path.join(MODEL_DIR, 'http_ua_ht.pkl'))
ohe = joblib.load(os.path.join(MODEL_DIR, 'http_ua_ohe.pkl'))
clf = joblib.load(os.path.join(MODEL_DIR, 'http_ua_clf.pkl'))

In [15]:
train_batches = pq.read_table(DATA_DIR + 'http-20180217_1718_parsedUAA_batches_train.parquet', 
                        columns=FEATURES + ['target']).to_batches(50_000)

In [16]:
first = True
i = 0
for batch in train_batches:
    print('iteration #',i)
    i += 1
    http = batch.to_pandas()
    X = http[FEATURES]
    y = http['target']
    X = ht.transform(X)
    X = ohe.transform(X)
    if (first):
        clf.fit(X, y) # Проблема, когда 'y' состоит только из пользователей
        first = False
    else:
        clf.partial_fit(X, y)

iteration # 0
iteration # 1
iteration # 2
iteration # 3


In [17]:
joblib.dump(clf, MODEL_DIR + 'http_ua_clf_fitted_on_batches.pkl')

['../../model/http_ua_clf_fitted_on_batches.pkl']

### Прогнозы для большой выборки

In [18]:
ht = joblib.load(os.path.join(MODEL_DIR, 'http_ua_ht.pkl'))
ohe = joblib.load(os.path.join(MODEL_DIR, 'http_ua_ohe.pkl'))
clf = joblib.load(os.path.join(MODEL_DIR, 'http_ua_clf_fitted_on_batches.pkl'))

In [19]:
# todo: поменять на другой df
predict_batches = pq.read_table(DATA_DIR + 'http-20180217_1718_parsedUAA_batches_test.parquet', 
                        columns=FEATURES + ['target']).to_batches(100_000)

In [20]:
predicted_writer = pq.ParquetWriter(DATA_DIR + 'http-20180217_1718_parsedUA_predicted.parquet', 
                                    pa.Table.from_pandas(pd.DataFrame({'predict_proba': []}, dtype=np.float64)).schema)

In [21]:
for batch in predict_batches:
    http = batch.to_pandas()
    X = http[FEATURES]
    X = ht.transform(X)
    X = ohe.transform(X)
    predicted = clf.predict_proba(X)
    table = pa.Table.from_pandas(pd.DataFrame({'predict_proba': predicted[:,1]}, dtype=np.float64))
    predicted_writer.write_table(table)

In [22]:
predicted_writer.close()

#### Теперь сравним с тем что должно было получиться

In [23]:
predicted_proba = pq.read_table(os.path.join(DATA_DIR, 'http-20180217_1718_parsedUA_predicted.parquet'),
                                columns=['predict_proba']).to_pandas()

real = pq.read_table(os.path.join(DATA_DIR, 'http-20180217_1718_parsedUAA_batches_test.parquet'),
                                columns=['target']).to_pandas()

In [24]:
bound = 0.5
predicted = (predicted_proba.values.ravel() > bound).astype(np.int32)

In [25]:
print(classification_report(real, predicted, target_names = ['users', 'bots']))

             precision    recall  f1-score   support

      users       0.99      1.00      1.00     67470
       bots       0.72      0.65      0.68      1059

avg / total       0.99      0.99      0.99     68529

