In [None]:
# Directory holding the input parquet files; the parsed output is written
# there as well.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATA_DIR = '/home/nsuprotivniy/Documents/Работа/OKru/Antispam/data/'
# Total number of rows to load; the value is halved and applied to the bot
# and the user dataset separately. -1 is meant to select all the data.
SAMPLE_SIZE = -1

In [None]:
!pip install pyarrow
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import seaborn as sns
import user_agents
%matplotlib inline
from matplotlib import pyplot as plt
pd.set_option("display.precision", 2)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.tree import export_graphviz

In [None]:
# Load the bot and user request dumps and label them (isBot: 1 = bot, 0 = user).
#
# SAMPLE_SIZE is split evenly between the two classes. A negative SAMPLE_SIZE
# means "use everything" and therefore must never reach DataFrame.head():
# head(-1) returns all rows *except the last one*.
rows_per_class = SAMPLE_SIZE // 2 if SAMPLE_SIZE >= 0 else None

botsHTTPRequests = pq.read_table(DATA_DIR + 'botsHTTPRequests-20180217_1718.parquet').to_pandas()
usersHTTPRequests = pq.read_table(DATA_DIR + 'usersHTTPRequests-20180217_1718.parquet').to_pandas()
if rows_per_class is not None:
    botsHTTPRequests = botsHTTPRequests.head(rows_per_class)
    usersHTTPRequests = usersHTTPRequests.head(rows_per_class)

# assign() returns a new frame, so no column is written into a head() slice.
botsHTTPRequests = botsHTTPRequests.assign(isBot=1)
usersHTTPRequests = usersHTTPRequests.assign(isBot=0)

# pd.concat replaces DataFrame.append (removed in pandas 2.0).
# ignore_index=True gives every stacked row a unique label, so later
# index-aligned operations cannot mix up a bot row with a user row.
http = pd.concat([botsHTTPRequests, usersHTTPRequests], ignore_index=True)

In [None]:
def parse_user_agent(x):
    """Parse a raw User-Agent string into a flat record of derived fields.

    Parameters
    ----------
    x : str
        Raw User-Agent header value, handed to ``user_agents.parse``.

    Returns
    -------
    pandas.Series
        One entry per derived field, keyed ``userAgent*``: five boolean
        device flags followed by the OS, browser and device attributes read
        from the parsed object.
    """
    agent = user_agents.parse(x)
    os_info, browser, device = agent.os, agent.browser, agent.device

    # Order matters: it defines the column order of the expanded frame.
    fields = [
        ('userAgentIsBot', agent.is_bot),
        ('userAgentIsMobile', agent.is_mobile),
        ('userAgentIsTablet', agent.is_tablet),
        ('userAgentIsTouchCapable', agent.is_touch_capable),
        ('userAgentIsPC', agent.is_pc),
        ('userAgentOSFamily', os_info.family),
        ('userAgentOSVersion', os_info.version),
        ('userAgentBrowserFamily', browser.family),
        ('userAgentBrowserVersion', browser.version),
        ('userAgentDeviceFamily', device.family),
        ('userAgentDeviceBrand', device.brand),
        ('userAgentDeviceModel', device.model),
    ]
    return pd.Series(dict(fields))

# Parse every distinct User-Agent string once and attach the derived columns
# to each request that carries it.
#
# The lookup is keyed by the User-Agent value, not by row label: row labels
# can repeat once the bot and user frames are stacked, and an index-on-index
# merge would then pair each row with every row sharing its label (duplicated
# rows, bot requests receiving a user's parsed fields and vice versa).
# Requests whose userAgent is null find no match and are dropped (inner join).
distinct_user_agents = http['userAgent'].dropna().unique()
parsed_user_agents = pd.DataFrame(
    [parse_user_agent(raw) for raw in distinct_user_agents],
    index=distinct_user_agents,
)
http = http.join(parsed_user_agents, on='userAgent', how='inner')

In [None]:
# Normalise missing values and dtypes of the request and User-Agent columns.
#
# Missing values are filled BEFORE casting: astype('str') turns NaN into the
# literal string 'nan' and astype('bool') turns NaN into True, so a fillna()
# placed after the cast never has anything left to fill.

def _version_parts(version, width=3):
    """Return the first ``width`` components of a version as strings.

    Parameters
    ----------
    version : tuple, list, str or missing
        A sequence of version components, a dotted version string, or a
        missing value (None / NaN).
        # presumably a tuple such as (60, 0, 3112) - confirm against user_agents
    width : int
        Number of components to return.

    Returns
    -------
    list of str
        Exactly ``width`` items; absent components are padded with '0'.
    """
    if isinstance(version, str):
        # Dotted strings are accepted so the cell can be re-run on its own output.
        parts = [part for part in version.split('.') if part]
    elif isinstance(version, (tuple, list)):
        parts = [str(part) for part in version]
    else:
        # None / NaN: no version information available.
        parts = []
    parts = parts[:width]
    return parts + ['0'] * (width - len(parts))


text_columns = [
    'url',
    'from',
    'to',
    'requestType',
    'operation',
    'userAgentDeviceBrand',
    'userAgentDeviceFamily',
    'userAgentDeviceModel',
    'userAgentOSFamily',
]
for column in text_columns:
    http[column] = http[column].fillna('None').astype('str')

for column in ('userAgentBrowserVersion', 'userAgentOSVersion'):
    # Every row yields exactly three components, so the three target columns
    # can always be filled regardless of how long each version is.
    version_parts = http[column].apply(_version_parts)
    for position in range(3):
        http[column + str(position)] = version_parts.apply(lambda parts, i=position: parts[i])
    # The original column is still selected as a model feature further down,
    # so it is kept - as a dotted string, which can be label-encoded and
    # stored in parquet.
    http[column] = version_parts.apply(lambda parts: '.'.join(parts))

flag_columns = [
    'userAgentIsBot',
    'userAgentIsMobile',
    'userAgentIsPC',
    'userAgentIsTablet',
    'userAgentIsTouchCapable',
]
for column in flag_columns:
    http[column] = http[column].fillna(False).astype('bool')


In [None]:
# Persist the frame with the parsed User-Agent columns next to the input data.
parsed_table = pa.Table.from_pandas(http)
parsed_path = DATA_DIR + 'botsHTTPRequests-20180217_1718_parsedUA.parquet'
pq.write_table(parsed_table, parsed_path)

In [None]:
def encode(col):
    """Label-encode one column.

    A fresh ``LabelEncoder`` is fitted on ``col`` and then used to transform
    that same ``col``; the transformed values are returned.
    """
    return LabelEncoder().fit(col).transform(col)
    
# Columns fed to the models; each one is label-encoded independently.
feature_columns = [
    # device flags derived from the User-Agent
    'userAgentIsBot',
    'userAgentIsMobile',
    'userAgentIsTablet',
    'userAgentIsTouchCapable',
    'userAgentIsPC',
    # OS / browser / device attributes derived from the User-Agent
    'userAgentOSFamily',
    'userAgentOSVersion',
    'userAgentBrowserFamily',
    'userAgentBrowserVersion',
    'userAgentDeviceFamily',
    'userAgentDeviceBrand',
    'userAgentDeviceModel',
    # request attributes
    'from',
    'to',
    'url',
    'requestType',
    'operation',
]
X = http[feature_columns].apply(encode, axis=0)

# Target: the isBot label column.
y = http['isBot']

In [None]:
# Hold out 30% of the rows for the final evaluation; seeded for repeatability.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

# Base estimators tuned by the grid searches in the following cells.
tree = DecisionTreeClassifier(random_state=17)
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=17)
# kNN is wrapped in a pipeline so the features are standardised first.
knn_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])

In [None]:
# 5-fold grid search over the decision tree's depth and the number of
# features considered per split, scored by F1.
n_features = X_train.shape[1]
tree_params = {
    'max_depth': range(1, 50),
    # Derived from the data so the search covers 1..n_features inclusive
    # (i.e. also "consider every feature"); a hard-coded upper bound goes
    # stale as soon as the feature list changes.
    'max_features': range(1, n_features + 1),
}
tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1, verbose=True, scoring='f1')
tree_grid.fit(X_train, y_train)
tree_grid.best_params_, tree_grid.best_score_

In [None]:
# F1 of the tuned decision tree on the held-out rows.
f1_score(y_true=y_holdout, y_pred=tree_grid.predict(X_holdout))

In [None]:
# Validation curve for the decision tree: mean 5-fold CV F1 and hold-out F1
# for every max_depth in 1..49.
max_depth = range(1, 50)
cv_scores = []
holdout_scores = []

for depth in max_depth:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=17)

    fold_scores = cross_val_score(tree, X_train, y_train, cv=5, scoring='f1')
    cv_scores.append(np.mean(fold_scores))

    tree.fit(X_train, y_train)
    holdout_pred = tree.predict(X_holdout)
    holdout_scores.append(f1_score(y_holdout, holdout_pred))

for curve, curve_label in ((cv_scores, 'CV'), (holdout_scores, 'holdout')):
    plt.plot(max_depth, curve, label=curve_label)
plt.title('Easy task. Tree fails')
plt.legend();

In [None]:
# 5-fold grid search over odd neighbour counts 1..19 for the scaled kNN
# pipeline, scored by F1.
knn_params = dict(knn__n_neighbors=range(1, 20, 2))
knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
knn_grid.fit(X_train, y_train)
knn_grid.best_params_, knn_grid.best_score_

In [None]:
# F1 of the tuned kNN pipeline on the held-out rows.
f1_score(y_true=y_holdout, y_pred=knn_grid.predict(X_holdout))

In [None]:
# Validation curve for kNN: mean 5-fold CV F1 and hold-out F1 for every odd
# neighbour count in 1..19.
cv_scores, holdout_scores = [], []
n_neighb = range(1, 20, 2)

for k in n_neighb:
    # Same preprocessing as the grid-searched knn_pipe (standardise, then
    # kNN), so the curve describes the model that was actually tuned rather
    # than a kNN on raw, unscaled label codes.
    knn = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=k, n_jobs=-1)),
    ])
    cv_scores.append(np.mean(cross_val_score(knn, X_train, y_train, cv=5, scoring='f1')))
    knn.fit(X_train, y_train)
    holdout_scores.append(f1_score(y_holdout, knn.predict(X_holdout)))

plt.plot(n_neighb, cv_scores, label='CV')
plt.plot(n_neighb, holdout_scores, label='holdout')
plt.title('Easy task. kNN fails')
plt.legend();

In [None]:
# 5-fold grid search over the random forest's depth and per-split feature
# count, scored by F1.
forest_params = dict(max_depth=range(1, 50), max_features=range(1, 18))
forest_grid = GridSearchCV(
    estimator=forest,
    param_grid=forest_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
forest_grid.fit(X_train, y_train)
forest_grid.best_params_, forest_grid.best_score_

In [None]:
# F1 of the tuned random forest on the held-out rows.
f1_score(y_true=y_holdout, y_pred=forest_grid.predict(X_holdout))

In [None]:
# Validation curve for the random forest over max_depth 1..49, with
# max_features fixed at the value chosen by the grid search.
max_depth = range(1, 50)
cv_scores = []
holdout_scores = []
best_max_features = forest_grid.best_params_['max_features']

for depth in max_depth:
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=depth,
        max_features=best_max_features,
        n_jobs=-1,
        random_state=17,
    )

    fold_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring='f1')
    cv_scores.append(np.mean(fold_scores))

    forest.fit(X_train, y_train)
    holdout_pred = forest.predict(X_holdout)
    holdout_scores.append(f1_score(y_holdout, holdout_pred))

for curve, curve_label in ((cv_scores, 'CV'), (holdout_scores, 'holdout')):
    plt.plot(max_depth, curve, label=curve_label)
plt.title('Easy task. Forest fails')
plt.legend();

In [None]:
# Validation curve for the random forest over max_features 1..17, with
# max_depth fixed at the value chosen by the grid search.
max_features = range(1, 18)
cv_scores = []
holdout_scores = []
best_max_depth = forest_grid.best_params_['max_depth']

for n_split_features in max_features:
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=best_max_depth,
        max_features=n_split_features,
        n_jobs=-1,
        random_state=17,
    )

    fold_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring='f1')
    cv_scores.append(np.mean(fold_scores))

    forest.fit(X_train, y_train)
    holdout_pred = forest.predict(X_holdout)
    holdout_scores.append(f1_score(y_holdout, holdout_pred))

for curve, curve_label in ((cv_scores, 'CV'), (holdout_scores, 'holdout')):
    plt.plot(max_features, curve, label=curve_label)
plt.title('Easy task. Forest fails')
plt.legend();