In [1]:
# Filesystem locations (absolute, machine-specific paths).
DATA_DIR = '/home/nsuprotivniy/Documents/Работа/OKru/Antispam/data/'
GRAPH_PATH = '/home/nsuprotivniy/Documents/Работа/OKru/Antispam/graph/'

# Total number of rows to load (the loading cell takes half per class).
# NOTE(review): -1 presumably means "use everything" -- confirm.
SAMPLE_SIZE = -1

# Sliding-window length and step handed to time_preprocessing.
# NOTE(review): presumably milliseconds, like the TIMESTAMP values -- confirm.
window = 60_000
shift = 10_000

# Columns kept as model inputs by feature_preprocessing (absent ones are zero-filled).
features = [
    'API_SESSION',
    'API_TOKEN',
    'AUTH_CODE',
    'EMAIL',
    'JSESSION_ID',
    'LOGIN',
    'MESSENGER',
    'PHONE',
    'SOCIAL_API_TOKEN',
    'TWO_FACTOR',
    'USER_ID_TYPE',
    'USER_ID_NO_PASSWORD',
]

# Span of TIMESTAMP values covered by one partial_fit batch in the training loop.
batch_window = 43_200_000

In [13]:
!pip install pyarrow
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import seaborn as sns
import user_agents
%matplotlib inline
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.tree import export_graphviz
from sklearn import svm
from sklearn.linear_model import SGDClassifier

from random import *

[33mYou are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
# Restrict the parquet reads to the columns the rest of the notebook uses.
columns = ['USER_ID', 'TIMESTAMP', 'LOGIN_CREDENTIAL_TYPE']
botsLogins = pq.read_table(DATA_DIR + 'botsLogins.parquet', columns=columns).to_pandas()
usersLogins = pq.read_table(DATA_DIR + 'usersLogins.parquet', columns=columns).to_pandas()

# Only truncate when a non-negative sample size is configured. With the
# SAMPLE_SIZE = -1 setting, `SAMPLE_SIZE >> 1` is still -1, and
# DataFrame.head(-1) returns everything EXCEPT the last row, so an
# unconditional head() would silently drop one row from each table.
if SAMPLE_SIZE >= 0:
    rows_per_class = SAMPLE_SIZE >> 1  # half of the sample for each class
    botsLogins = botsLogins.head(rows_per_class)
    usersLogins = usersLogins.head(rows_per_class)

In [4]:
# Tag every row with its class label before the two tables are stacked.
botsLogins['isBot'] = 1
usersLogins['isBot'] = 0
# pd.concat keeps the original row labels, exactly like the former
# botsLogins.append(usersLogins); DataFrame.append itself was deprecated in
# pandas 1.4 and removed in pandas 2.0.
logins = pd.concat([botsLogins, usersLogins])

In [5]:
def time_preprocessing(table, window, shift, ts_min):
    """Count logins per credential type inside overlapping time windows.

    For every offset ``i`` in ``range(int(window / shift))`` each row is
    assigned the window id
    ``((timestamp - ts_min + i * shift) // window) * nshift + i`` and the rows
    are counted per (window id, USER_ID, isBot, LOGIN_CREDENTIAL_TYPE).

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain the columns ``USER_ID``, ``isBot`` and
        ``LOGIN_CREDENTIAL_TYPE``; its index holds the numeric timestamps.
        The frame is NOT modified.
    window : number
        Window length, in the same unit as the index.
    shift : number
        Step between consecutive window offsets, in the same unit.
    ts_min : number
        Origin subtracted from every timestamp. Window ids equal the original
        ``int(x / window)`` formulation for timestamps >= ts_min.

    Returns
    -------
    pandas.DataFrame
        One row per (TS_PART, USER_ID, isBot) and offset, with one count
        column per credential type. The credential type ``"USER_ID"`` is
        exposed as ``"USER_ID_TYPE"`` so it does not clash with the
        ``USER_ID`` key column. The key columns hold string labels (see the
        ``rename(index=str)`` call). An empty frame is returned when
        ``table`` has no rows or ``window < shift``.
    """
    nshift = int(window / shift)
    if table.shape[0] == 0 or nshift <= 0:
        return pd.DataFrame()

    # Offsets from the origin, computed once for all shifts (vectorised
    # replacement for the per-row apply).
    elapsed = table.index.values - ts_min
    group_keys = ['TS_PART', 'USER_ID', 'isBot', 'LOGIN_CREDENTIAL_TYPE']
    source = table[['USER_ID', 'isBot', 'LOGIN_CREDENTIAL_TYPE']]

    parts = []
    for i in range(nshift):
        ts_part = (elapsed + i * shift) // window * nshift + i
        # assign() works on a copy, so the caller's frame never gains a
        # helper column. The level is named "TS_PART" directly: the old
        # "TS_PART_".format(i) had no placeholder and the later column rename
        # could not touch an index level, leaving the output column
        # misnamed "TS_PART_".
        counts = (
            source.assign(TS_PART=ts_part)
            .groupby(group_keys)
            .size()
            .unstack(fill_value=0)
        )
        # index=str stringifies the key labels (kept for compatibility with
        # existing consumers); the column rename must happen before
        # reset_index to avoid two columns called "USER_ID".
        counts = counts.rename(index=str, columns={"USER_ID": "USER_ID_TYPE"})
        parts.append(counts.reset_index())

    # Single concat instead of growing a frame with append() in the loop.
    return pd.concat(parts)

def feature_preprocessing(window_count, features):
    """Split a window-count frame into a feature matrix and a label vector.

    Parameters
    ----------
    window_count : pandas.DataFrame
        Frame holding an ``isBot`` column plus any number of count columns.
    features : list
        Column names, in order, that the feature matrix must contain.
        Names absent from ``window_count`` become all-zero columns.

    Returns
    -------
    tuple
        ``(X, y)``: the frame restricted to ``features`` and the ``isBot``
        column cast to int64.
    """
    labels = window_count['isBot'].astype('int64')
    design_matrix = window_count.reindex(columns=features, fill_value=0)
    return design_matrix, labels


In [6]:
# Time range of the whole data set, taken while TIMESTAMP is still a column.
ts_min, ts_max = logins['TIMESTAMP'].min(), logins['TIMESTAMP'].max()
# Order the rows chronologically and make TIMESTAMP the index, so later cells
# can slice by time with .loc and read the timestamp from the row label.
logins = logins.sort_values(by=['TIMESTAMP']).set_index('TIMESTAMP')

In [7]:
# Chronological split by row position: the first 70% of the time-sorted rows
# are used for training, the rest for testing.
# The boundary is stored as n_train_rows rather than `train_test_split`, so
# the sklearn function imported under that name is no longer overwritten.
n_train_rows = int(logins.shape[0] * 0.7)
train_logins = logins.iloc[:n_train_rows]
test_logins = logins.iloc[n_train_rows:]

In [8]:
pd.options.mode.chained_assignment = None
clf = SGDClassifier(random_state=17, max_iter=1000)
for i in range(ts_min, ts_max - batch_window, batch_window):
    batch = train_logins.loc[i:i + batch_window]
    if (batch.shape[0] == 0):
        continue
    print("Batch", i, i + batch_window)
    window_count = time_preprocessing(batch, window, shift, ts_min)
    X_batch, y_batch = feature_preprocessing(window_count, features)
    %%time clf.partial_fit(X_batch, y_batch, classes=[0,1])

Batch 1517432401933 1517475601933
CPU times: user 167 ms, sys: 56 ms, total: 223 ms
Wall time: 223 ms
Batch 1517475601933 1517518801933
CPU times: user 320 ms, sys: 68 ms, total: 388 ms
Wall time: 388 ms
Batch 1517518801933 1517562001933
CPU times: user 155 ms, sys: 68 ms, total: 223 ms
Wall time: 223 ms
Batch 1517562001933 1517605201933
CPU times: user 282 ms, sys: 108 ms, total: 390 ms
Wall time: 390 ms
Batch 1517605201933 1517648401933
CPU times: user 181 ms, sys: 24.1 ms, total: 205 ms
Wall time: 205 ms
Batch 1517648401933 1517691601933
CPU times: user 262 ms, sys: 39.9 ms, total: 302 ms
Wall time: 302 ms
Batch 1517691601933 1517734801933
CPU times: user 151 ms, sys: 68.1 ms, total: 219 ms
Wall time: 219 ms
Batch 1517734801933 1517778001933
CPU times: user 345 ms, sys: 80 ms, total: 425 ms
Wall time: 425 ms
Batch 1517778001933 1517821201933
CPU times: user 182 ms, sys: 31.9 ms, total: 214 ms
Wall time: 214 ms
Batch 1517821201933 1517864401933
CPU times: user 318 ms, sys: 84.1 ms, t

In [9]:
# Run the held-out logins through the same two preprocessing steps that were
# applied to every training batch.
window_count = time_preprocessing(
    table=test_logins,
    window=window,
    shift=shift,
    ts_min=ts_min,
)
X_test, y_test = feature_preprocessing(window_count=window_count, features=features)

In [10]:
# F1 on the held-out windows (f1_score defaults to the positive label 1, i.e. isBot == 1).
test_predictions = clf.predict(X_test)
f1_score(y_test, test_predictions)

0.13064507556295732

In [11]:
# cv_scores, holdout_scores = [], []
# split = np.arange(0.1, 0.8, 0.1)

# for i in split:
#     X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=i, random_state=17)
#     clf = clf = svm.SVC(random_state=17)
#     cv_scores.append(np.mean(cross_val_score(clf, X_train, y_train, cv=2, scoring='f1')))
#     clf.fit(X_train, y_train)
#     holdout_scores.append(f1_score(y_holdout, clf.predict(X_holdout)))

# plt.plot(split * X.shape[0], cv_scores, label='CV')
# plt.plot(split * X.shape[0], holdout_scores, label='holdout')
# plt.title('Forest logins time window test split')
# plt.savefig(GRAPH_PATH + 'forest_logins_time_window_testsplit.png', dpi=900)
# plt.legend();

In [19]:
# Per-class precision / recall / F1 for the held-out windows.
report_text = classification_report(y_test, clf.predict(X_test))
print(report_text)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00  14387307
          1       0.26      0.09      0.13     42318

avg / total       1.00      1.00      1.00  14429625

