In [1]:
import numpy as np

DATA_DIR = '/home/nsuprotivniy/Documents/Работа/OKru/Antispam/data/'
GRAPH_PATH = '/home/nsuprotivniy/Documents/Работа/OKru/Antispam/graph/'
MODEL_DIR = '/home/nsuprotivniy/Documents/Работа/OKru/Antispam/model/'
SAMPLE_SIZE = 1000 # -1 если использовать все даннные
BATCH_SIZE = 1_000_000
DEPTH_RANGE = range(1, 20, 2)
KNN_NEIGHBORS_RANGE = range(1, 20, 2)
SVM_MAX_ITER_RANGE = range(10, 100, 10)
SVM_ALPHA_RANGE = np.fromfunction(lambda i: 0.0000001 * pow(10, i), (7,), dtype=float)

FEATURES = [
    'userAgentIsBot',
    'userAgentIsMobile',
    'userAgentIsTablet',
    'userAgentIsTouchCapable',
    'userAgentIsPC',
    'userAgentOSFamily',
    'userAgentOSVersion0',
    'userAgentOSVersion1',
    'userAgentOSVersion2',
    'userAgentBrowserFamily',
    'userAgentBrowserVersion0',
    'userAgentBrowserVersion1',
    'userAgentBrowserVersion2',
    'userAgentDeviceFamily',
    'userAgentDeviceBrand',
    'userAgentDeviceModel',
    'from',
    'to',
    'url',
    'requestType',
    'operation'
]

In [2]:
!pip install pyarrow
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import seaborn as sns
import user_agents
%matplotlib inline
from matplotlib import pyplot as plt
pd.set_option("display.precision", 2)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.tree import export_graphviz
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from sklearn.model_selection import RandomizedSearchCV

import os

import warnings
warnings.filterwarnings('ignore')

[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
http = pq.read_table(DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA.parquet', columns=FEATURES ).to_pandas().head(SAMPLE_SIZE)
target = pq.read_table(DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA.parquet', columns=['isBot'] ).to_pandas().head(SAMPLE_SIZE)

## Подготока моделей

#### Hashing Trick - единственный нормальный подход https://habr.com/company/ods/blog/326418/

In [4]:
class HashingTrick(TransformerMixin):
    
    _default_hashing_trick_modulars = {
        'userAgentOSFamily': 10000,
        'userAgentDeviceFamily': 10000,
        'userAgentDeviceBrand': 10000,
        'userAgentDeviceModel': 10000,
        'userAgentBrowserFamily': 10000,
        'from': 10000,
        'to': 10000,
        'url': 10000000,
        'requestType': 10000,
        'operation': 10000
    }
    
    def __init__(self, hashing_trick_modulars = _default_hashing_trick_modulars):
        self.hashing_trick_modulars = hashing_trick_modulars
    
    def set_params(self, **kwargs):
        """Set the parameters of this estimator."""
    
    def get_params(self, **kwargs):
        return {"hashing_trick_modulars": self.hashing_trick_modulars}
        
    def _hashing_trick(self, x, n):
        return hash(x) % n

    def _column_hashing_trick(self, col_name):
        self.http[col_name] = self.http[col_name].apply(self._hashing_trick, args=(self.hashing_trick_modulars[col_name],))
    
    def _to_numeric(self, col_name):
        self.http[col_name] = pd.to_numeric(self.http[col_name], 'coerce').fillna(0).astype(int)
    
    def fit_transform(self, X, *_):
        return self.transform(X)
        
    def transform(self, X, *_):
        self.http = X
        self._column_hashing_trick('userAgentOSFamily')
        self._column_hashing_trick('userAgentDeviceFamily')
        self._column_hashing_trick('userAgentDeviceBrand')
        self._column_hashing_trick('userAgentDeviceModel')
        self._column_hashing_trick('userAgentBrowserFamily')
        self._column_hashing_trick('from')
        self._column_hashing_trick('to')
        self._column_hashing_trick('url')
        self._column_hashing_trick('requestType')
        self._column_hashing_trick('operation')
        
        self._to_numeric('userAgentOSVersion0')
        self._to_numeric('userAgentOSVersion1')
        self._to_numeric('userAgentOSVersion2')
        self._to_numeric('userAgentBrowserVersion0')
        self._to_numeric('userAgentBrowserVersion1')
        self._to_numeric('userAgentBrowserVersion2')
        return self.http
    
    def fit(self, *_):
        return self

In [5]:
ht = HashingTrick()
ohe = OneHotEncoder(handle_unknown='ignore')
clf = SGDClassifier(random_state=17, max_iter=1000, alpha=0.0000001, loss='log')
pipeline = Pipeline(
    [('ht', ht)] +
    [('ohe', ohe)]+
    [('sgd', clf)]
)

## Поиск гиперпаратметров
#### Используется RandomizedSearchCV, так полный GridSearchCV работает долго

In [6]:
X_train, X_holdout, y_train, y_holdout = train_test_split(http, target, test_size=0.1, random_state=17)

In [7]:
params = {'sgd__alpha': (0.1, 0.01, 0.001, 0.00001, 0.0000001),
              'sgd__max_iter': (5, 10, 50, 80, 500, 1000)}
search = RandomizedSearchCV(pipeline, params, cv=5, n_jobs=-1, verbose=True, scoring='f1', n_iter = 10)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)
print(classification_report(y_holdout, search.predict(X_holdout), target_names = ['users', 'bots']))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   43.4s finished


{'sgd__max_iter': 80, 'sgd__alpha': 0.001} 0.897702478678218
             precision    recall  f1-score   support

      users       0.90      0.87      0.89        54
       bots       0.85      0.89      0.87        46

avg / total       0.88      0.88      0.88       100



## Создание моделей по полученным из search гиперпараметрам

In [8]:
clf = SGDClassifier(random_state = 17, 
                    max_iter = search.best_params_['sgd__max_iter'], 
                    alpha = search.best_params_['sgd__alpha'],
                    loss = 'log')
ht = HashingTrick().fit(http)
ohe = OneHotEncoder(handle_unknown='ignore').fit(ht.transform(http))

## Сохранение моделей

In [9]:
joblib.dump(ht, MODEL_DIR + 'http_ua_from_parquet_onehotencoder_hasging_trick.pkl') 
joblib.dump(ohe, MODEL_DIR + 'http_ua_from_parquet_onehotencoder_ohe.pkl')
joblib.dump(clf, MODEL_DIR + 'http_ua_from_parquet_onehotencoder_sgd.pkl')

['/home/nsuprotivniy/Documents/Работа/OKru/Antispam/model/http_ua_from_parquet_onehotencoder_sgd.pkl']

## Обучение на большой выборке 

In [10]:
ht = joblib.load(MODEL_DIR + 'http_ua_from_parquet_onehotencoder_hasging_trick.pkl') 
ohe = joblib.load(MODEL_DIR + 'http_ua_from_parquet_onehotencoder_ohe.pkl')
clf = joblib.load(MODEL_DIR + 'http_ua_from_parquet_onehotencoder_sgd.pkl')

In [11]:
train_batches = pq.read_table(DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA.parquet', 
                        columns=FEATURES + ['isBot']).to_batches(BATCH_SIZE)

In [12]:
first = True
for batch in train_batches:
    http = batch.to_pandas()
    X = http[FEATURES]
    y = http['isBot']
    X = ht.transform(X)
    X = ohe.transform(X)
    if (first):
        clf.fit(X, y) # Проблема, когда 'y' состоит только из пользователей
        first = False
    else:
        clf.partial_fit(X, y)

In [13]:
joblib.dump(clf, MODEL_DIR + 'http_ua_from_parquet_onehotencoder_sgd_fitted.pkl')

['/home/nsuprotivniy/Documents/Работа/OKru/Antispam/model/http_ua_from_parquet_onehotencoder_sgd_fitted.pkl']

## Классификация для большой выборки

In [14]:
ht = joblib.load(MODEL_DIR + 'http_ua_from_parquet_onehotencoder_hasging_trick.pkl') 
ohe = joblib.load(MODEL_DIR + 'http_ua_from_parquet_onehotencoder_ohe.pkl')
clf = joblib.load(MODEL_DIR + 'http_ua_from_parquet_onehotencoder_sgd_fitted.pkl')

In [15]:
predict_batches = pq.read_table(DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA.parquet', 
                        columns=FEATURES + ['isBot']).to_batches(BATCH_SIZE)

In [16]:
predicted_writer = pq.ParquetWriter(DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA_predicted.parquet', 
                                    pa.Table.from_pandas(pd.DataFrame({'predict_proba': []}, dtype=np.float64)).schema)

In [17]:
for batch in predict_batches:
    http = batch.to_pandas()
    X = http[FEATURES]
    X = ht.transform(X)
    X = ohe.transform(X)
    predicted = clf.predict_proba(X)
    table = pa.Table.from_pandas(pd.DataFrame({'predict_proba': predicted[:,1]}, dtype=np.float64))
    predicted_writer.write_table(table)

In [18]:
predicted_writer.close()

## Тестирование для большой выборки

In [19]:
predicted_proba = pq.read_table(DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA_predicted.parquet',
                                columns=['predict_proba']).to_pandas()

real = pq.read_table(DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA.parquet',
                                columns=['isBot']).to_pandas()

In [20]:
bound = 0.5
predicted = (predicted_proba.values.ravel() > bound).astype(np.int32)

In [21]:
print(classification_report(real, predicted, target_names = ['users', 'bots']))

             precision    recall  f1-score   support

      users       0.98      1.00      0.99   1010370
       bots       0.88      0.02      0.03     20751

avg / total       0.98      0.98      0.97   1031121

