In [1]:
import os

# Directory that holds the login parquet files. Set the ANTISPAM_DATA_DIR
# environment variable to point the notebook at another machine's copy; the
# fallback is the original hard-coded location.
# os.path.join(..., '') guarantees a trailing separator, because file paths are
# built from this value with plain string concatenation.
DATA_DIR = os.path.join(
    os.environ.get('ANTISPAM_DATA_DIR',
                   '/home/nsuprotivniy/Documents/Работа/OKru/Antispam/data/'),
    '')

In [2]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
pd.set_option("display.precision", 2)
from sklearn import preprocessing
import datetime
pd.options.mode.chained_assignment = None
from sklearn.metrics import f1_score

In [3]:
# Only the user id and the login time are read from each parquet file.
columns = ['USER_ID', 'TIMESTAMP']

# Bots first, regular users second: same files and same read order as before.
botsLogins, usersLogins = (
    pq.read_table(DATA_DIR + parquet_name, columns=columns).to_pandas()
    for parquet_name in ('botsLogins.parquet', 'usersLogins.parquet')
)

In [4]:
# Label every login row with its source: 1 = bot account, 0 = regular user.
botsLogins['isBot'] = 1
usersLogins['isBot'] = 0

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# builds the same stacked frame (bots first, original row labels kept) and
# works on every pandas version.
logins = pd.concat([botsLogins, usersLogins])

In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin

class TSClassifier(BaseEstimator, ClassifierMixin):
    """Bot classifier built on per-user login counts inside time buckets.

    The time span seen during ``fit`` is cut into ``parts`` equal periods and
    every period into ``short_parts`` equal sub-periods. A login is assigned to
    the bucket ``period_index * short_parts + sub_period_index`` (TS_GROUP).
    For every bucket a bot threshold and a user threshold are learned; at
    prediction time each (user, bucket) login count votes for the class whose
    threshold is closer, and a user is flagged as a bot when the votes sum to a
    positive number.

    Parameters
    ----------
    quantile : float
        Quantile used when deriving the per-bucket thresholds.
    parts : int
        Number of equal periods the training time span is divided into.
    short_parts : int
        Number of sub-periods inside each period.

    Input frames must expose a ``TIMESTAMP`` column and ``USER_ID`` either as a
    column or as a named index level.
    """

    def __init__(self, quantile=0.2, parts=4, short_parts=168):
        self.quantile = quantile
        self.parts = parts
        self.short_parts = short_parts

    def fit(self, X, y=None):
        """Learn the per-bucket bot/user thresholds.

        Sets ``min_ts``, ``max_ts``, ``period``, ``short_period`` and
        ``quantile_table`` (indexed by TS_GROUP, columns ``size_bots`` and
        ``size_users``; buckets missing for one class are filled with 0).
        ``X`` is left unmodified.

        Raises
        ------
        ValueError
            If ``y`` is missing or all timestamps in ``X`` are identical.
        """
        if y is None:
            raise ValueError("TSClassifier.fit requires y (1 = bot, 0 = regular user)")

        self.max_ts = X['TIMESTAMP'].max()
        self.min_ts = X['TIMESTAMP'].min()
        self.period = (self.max_ts - self.min_ts) / self.parts
        if not self.period:
            # A zero-length span would make every bucket computation divide by zero.
            raise ValueError("TIMESTAMP values must span a non-zero range")
        self.short_period = self.period / self.short_parts

        # Work on a copy so the caller's frame does not silently gain a column.
        bucketed = X.assign(TS_GROUP=self._group_ts(X['TIMESTAMP']))

        bots_quantile = self._bucket_quantile(bucketed[y == 1])
        users_quantile = self._bucket_quantile(bucketed[y == 0])

        # to_frame(name) names the column explicitly. pd.DataFrame(series,
        # columns=[...]) returns an all-NaN column when the Series already has
        # a different name, which fillna(0) would then hide.
        self.quantile_table = bots_quantile.to_frame('size_bots')\
            .join(users_quantile.to_frame('size_users'), how='outer')\
            .fillna(0)

        return self

    def _bucket_quantile(self, logins):
        """Return the per-TS_GROUP threshold Series for one class of logins."""
        logins_per_user = logins.groupby(['TS_GROUP', 'USER_ID']).size()
        # NOTE(review): the quantile is taken over the value_counts()
        # frequencies (how many users share a given login count), not over the
        # login counts themselves, while predict() compares the result with raw
        # login counts. Kept as in the original; confirm this is intended.
        return logins_per_user\
            .groupby(level='TS_GROUP').value_counts()\
            .groupby(level='TS_GROUP').quantile(self.quantile)

    def _group_ts(self, ts):
        """Map a timestamp (scalar or Series) to its integer TS_GROUP bucket.

        Both indices are truncated toward zero, matching ``int()``. A timestamp
        equal to ``max_ts`` lands in bucket ``parts * short_parts``.
        """
        offset = ts - self.min_ts
        period_index = np.trunc(offset / self.period)
        sub_period_index = np.trunc((offset % self.period) / self.short_period)
        return (period_index * self.short_parts + sub_period_index).astype(np.int64)

    def predict(self, X, y=None):
        """Return a boolean Series indexed by USER_ID; True means "bot".

        ``X`` is left unmodified. Buckets unseen during ``fit`` get both
        thresholds set to 0 and therefore cast a neutral vote.
        """
        bucketed = X.assign(TS_GROUP=self._group_ts(X['TIMESTAMP']))

        test_table = bucketed.groupby(['USER_ID', 'TS_GROUP']).size().to_frame('size')\
            .join(self.quantile_table, how='left').fillna(0)

        distance_to_users = (test_table['size'] - test_table['size_users']).abs()
        distance_to_bots = (test_table['size'] - test_table['size_bots']).abs()
        # +1 when the count is closer to the bot threshold, -1 when closer to
        # the user threshold, 0 on a tie.
        test_table['TEST'] = np.sign(distance_to_users - distance_to_bots)

        votes = test_table['TEST'].groupby(level='USER_ID').sum()
        return votes > 0

    def score(self, X, y=None):
        """Return the per-user F1 score of ``predict(X)`` against ``y``.

        ``y`` must carry USER_ID as a named index level; the first label of
        each user is taken as that user's true class.

        Raises
        ------
        ValueError
            If ``y`` is missing.
        """
        if y is None:
            raise ValueError("TSClassifier.score requires y (1 = bot, 0 = regular user)")

        y_true = y.groupby(level='USER_ID').first()
        # Align predictions to the true labels by USER_ID instead of relying on
        # two independent groupbys producing the same order, and use 0/1 labels
        # so both arguments of f1_score share one label space.
        y_pred = self.predict(X).reindex(y_true.index, fill_value=False).astype(int)
        return f1_score(y_true, y_pred)

In [6]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
# Index every login by (USER_ID, original row label) so USER_ID is available as
# an index level for the per-user groupbys in TSClassifier.
logins_indexed = logins.set_index(['USER_ID', logins.index.values])

# Keep the target out of the feature frame: X previously carried 'isBot', which
# would leak the label to any estimator that looks at all columns.
X = logins_indexed[['TIMESTAMP']]
y = logins_indexed['isBot']

# NOTE(review): this splits individual login rows, so one user's logins can land
# in both the training and the holdout set, although scoring is done per user.
# Consider a group-aware split on USER_ID.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=17)

In [None]:
# Keep pandas' chained-assignment warning switched off while the search runs.
pd.options.mode.chained_assignment = None

# Search space: 8 * 9 * 10 = 720 hyperparameter combinations.
params = {
    'parts': list(range(1, 9)),
    'short_parts': [1, 10, 20, 30, 40, 50, 100, 150, 200],
    'quantile': [0.001, 0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5],
}

# Exhaustive search on two worker processes, scored with TSClassifier.score.
grid = GridSearchCV(estimator=TSClassifier(), param_grid=params, n_jobs=2, verbose=True)
grid.fit(X_train, y_train)
grid.best_params_, grid.best_score_

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/home/nsuprotivniy/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/nsuprotivniy/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/nsuprotivniy/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/nsuprotivniy/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
KeyboardInterrupt
Traceback (most recent call last):
  File "/home/nsuprotivniy/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/nsuprotivniy/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs

In [11]:
# Hyperparameters are fixed by hand here. To reuse the grid-search winners
# instead, read 'parts', 'short_parts' and 'quantile' from grid.best_params_.
parts = 7
short_parts = 1
quantile = 0.2

# Keyword arguments make the mapping to the constructor explicit.
clf = TSClassifier(quantile=quantile, parts=parts, short_parts=short_parts)
clf.fit(X_train, y_train)

# Per-user F1 on the holdout rows.
clf.score(X_holdout, y_holdout)

0.048071061569276326