In [1]:
from clickhouse_driver import Client
from gensim.models import Word2Vec, Doc2Vec
from urllib import parse
import numpy as np
import pandas as pd
import re
import time
import gensim
from auto_profiling_utils import *
from auto_profiling_model import *
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec

def check_cs(index):
    try:
        client = Client('192.168.0.42', port='9001', send_receive_timeout=int(600000), settings={'max_threads': int(10)})
        client.connection.force_connect()
        if client.connection.connected:
            return client
        else:
            return check_cs(index + 1)
    except:
        return check_cs(index + 1)

def execute_ch(sql, param=None, with_column_types=True):
    client = check_cs(0)
    if client == None:
        sys.exit(1)
    
    result = client.execute(sql, params=param, with_column_types=with_column_types)

    client.disconnect()
    return result

def normal_query(start_date, end_date, limit, interval):
    sql = """
    select
        toStartOfInterval(logtime, INTERVAL {interval}) as lgtime, src_ip, dst_ip,
        arrayStringConcat(groupUniqArray(replaceRegexpAll(replaceRegexpAll(replace(decodeURLComponent(http_host), '/..', ' pathsearcherdetected '), '[\-%./!@#$?,;:&*)(+=0-9_]', ' '), '(\\\\b\\\\w{1}\\\\b)', ' ')), ' ') as host,
        arrayStringConcat(groupUniqArray(replaceRegexpAll(replaceRegexpAll(replace(decodeURLComponent(http_agent), '/..', ' pathsearcherdetected '), '[\-%./!@#$?,;:&*)(+=0-9_]', ' '), '(\\\\b\\\\w{1}\\\\b)', ' ')), ' ') as agent,
        arrayStringConcat(groupUniqArray(replaceRegexpAll(replaceRegexpAll(replace(decodeURLComponent(http_query), '/..', ' pathsearcherdetected '), '[\-%./!@#$?,;:&*)(+=0-9_]', ' '), '(\\\\b\\\\w{1}\\\\b)', ' ')), ' ') as query,
        'normal' as label
    
    from dti.dti_sh_demo_log
    where (logtime >= '{start_date}' and logtime < '{end_date}')
    and hash == 'normal'
    group by lgtime, src_ip, dst_ip
    limit {limit}
    """.replace('{interval}', interval).replace('{start_date}', start_date).replace('{end_date}', end_date).replace('{limit}',limit).replace('{interval}',interval)

    return sql

def attack_query(attack, start_date, end_date, limit, interval):
    sql = """
    select
        toStartOfInterval(logtime, INTERVAL {interval}) as lgtime, src_ip, dst_ip,
        arrayStringConcat(groupUniqArray(replaceRegexpAll(replaceRegexpAll(replace(decodeURLComponent(http_host), '/..', ' pathsearcherdetected '), '[\-%./!@#$?,;:&*)(+=0-9_]', ' '), '(\\\\b\\\\w{1}\\\\b)', ' ')), ' ') as host,
        arrayStringConcat(groupUniqArray(replaceRegexpAll(replaceRegexpAll(replace(decodeURLComponent(http_agent), '/..', ' pathsearcherdetected '), '[\-%./!@#$?,;:&*)(+=0-9_]', ' '), '(\\\\b\\\\w{1}\\\\b)', ' ')), ' ') as agent,
        arrayStringConcat(groupUniqArray(replaceRegexpAll(replaceRegexpAll(replace(decodeURLComponent(http_query), '/..', ' pathsearcherdetected '), '[\-%./!@#$?,;:&*)(+=0-9_]', ' '), '(\\\\b\\\\w{1}\\\\b)', ' ')), ' ') as query,
        '{attack}' as label
    
    from dti.dti_sh_demo_log
    where (logtime >= '{start_date}' and logtime < '{end_date}')
    and hash == '{attack}'
    group by lgtime, src_ip, dst_ip
    limit {limit}
    """.replace('{interval}', interval).replace('{start_date}', start_date).replace('{end_date}', end_date).replace('{limit}',limit).replace('{interval}',interval).replace('{attack}',attack)
    
    return sql

def create_w2v(version, data):
    if version == 'host':
        length = host_len
        vec = host_vec
    elif version == 'agent':
        length = agent_len
        vec = agent_vec
    elif version == 'path':
        length = path_len
        vec = path_vec
    elif version == 'query':
        length = query_len
        vec = query_vec        
    model = Word2Vec.load("w2v_model/{}.model".format(version))
    temp_list = []
    for i in range(length):
        if i < len(data):
            try:
                temp_list.append(model.wv[data[i]].tolist())
            except:
                temp_list.append([0 for i in range(vec)])
        else:
            temp_list.append([0 for i in range(vec)])
    return temp_list

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
config = {
    "common": {
        "model_name": "cnn_model",
        "model_path": "w2v_model",
        "scaler":"minmaxscaler",
        "vec":"vectorization",
        "encoder":"onehotencoder"
    },
    "train": {
        "data_load": 0,
        "crontab": "*/30 * * * *",
        "now_delta": "minutes=0",
        "prev_delta": "days=30",
        "max_depth":100, ## decision tree depth
        "optimizer_help": ['Adam', 'SGD'],
        "optimizer": 'Adam',
        "learning_rate": 0.0001,
        "batch_size": 32,
        "epochs": 500,
        "result_table": "result"
    },
    "predict": {
        "crontab": "*/1 * * * *",
        "now_delta": "minutes=0",
        "prev_delta": "days=6",
        "batch_size": 8
    }
}

## Data Load

In [3]:
start_time = time.time()

data,meta = execute_ch(normal_query('2021-06-01 00:00:00', '2021-07-01 00:00:00', '10000', '30 minute'))
feats = [m[0] for m in meta]
normal_data = pd.DataFrame(data = data, columns = feats)

attack_data = pd.DataFrame()
for i in ['XSS', 'BEACONING', 'SQL_INJECTION', 'CREDENTIAL']:
    data,meta = execute_ch(attack_query(i, '2021-06-01 00:00:00', '2021-07-01 00:00:00', '10000', '30 minute'))
    feats = [m[0] for m in meta]
    temp_df = pd.DataFrame(data = data, columns = feats)
    attack_data = pd.concat([attack_data, temp_df])
    
data = pd.concat([normal_data, attack_data])
data.reset_index(drop = True, inplace = True)

time.time() - start_time

1.2228684425354004

In [4]:
data['label'].value_counts()

normal           10000
XSS               7926
BEACONING         5349
SQL_INJECTION     4461
CREDENTIAL        4404
Name: label, dtype: int64

## Data Split

In [5]:
# Train/Test data split
y = data[['label']]
x = data.drop('label', axis = 1)

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=1004)

train_x.reset_index(drop = True, inplace = True)
test_x.reset_index(drop = True, inplace = True)
train_y.reset_index(drop = True, inplace = True)
test_y.reset_index(drop = True, inplace = True)

train_y = pd.get_dummies(train_y['label'])
test_y = pd.get_dummies(test_y['label'])

print(f"No. of training examples: {train_x.shape[0]}")
print(f"No. of testing examples: {test_x.shape[0]}")

save_test_x = test_x.copy()

No. of training examples: 22498
No. of testing examples: 9642


## Data Toknization

In [6]:
feat_list = ['host', 'agent', 'query']

host_vec = 10
agent_vec = 10
query_vec = 10

host_len = 20
agent_len = 20
query_len = 20

In [7]:
## Train data toknizing
for i in feat_list:
    train_x[i] = train_x[i].str.lower().str.split()
    
## Train data toknizing
for i in feat_list:
    test_x[i] = test_x[i].str.lower().str.split()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


## Word length

In [8]:
# data['host_len'], data['agent_len'], data['query_len'] = np.NaN, np.NaN, np.NaN
# data['host_len'] = data['host'].apply(lambda x : len(x))
# data['agent_len'] = data['agent'].apply(lambda x : len(x))
# data['query_len'] = data['query'].apply(lambda x : len(x))

# print(min(data['host_len']), max(data['host_len']), data['host_len'].mean())
# print(min(data['agent_len']), max(data['agent_len']), data['agent_len'].mean())
# print(min(data['query_len']), max(data['query_len']), data['query_len'].mean())

# import matplotlib.pyplot as plt

# plt.figure(figsize= (15,2))
# plt.hist(data['host_len'], bins = 5)
# plt.show()
# plt.figure(figsize= (15,2))
# plt.hist(data['agent_len'], bins = 5)
# plt.show()
# plt.figure(figsize= (15,2))
# plt.hist(data['query_len'], bins = 5)
# plt.show()

## Word2Vec

In [9]:
## Train data Vectorizing
for i in feat_list:
    model = Word2Vec(sentences=train_x[i], vector_size=host_vec, window=5, min_count=1, workers=-1, sg = 0)
    model.save("w2v_model/{}.model".format(i))
    train_x[i] = train_x[i].apply(lambda x : x[0:globals()['{}_len'.format(i)]])
    train_x['{}_vec'.format(i)] = train_x[i].apply(lambda x : create_w2v(i, x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [10]:
## Test data Vectorizing
for i in feat_list:
    model = Word2Vec.load("w2v_model/{}.model".format(i))
    test_x[i] = test_x[i].apply(lambda x : x[0:globals()['{}_len'.format(i)]])
    test_x['{}_vec'.format(i)] = test_x[i].apply(lambda x : create_w2v(i, x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

S

## Model Fitting

In [11]:
train_x_vec = train_x[['{}_vec'.format(i) for i in feat_list]].copy()
train_x_vec = np.array(np.array(train_x_vec).tolist())

config["x_data_shape"] = train_x_vec.shape
config["y_data_shape"] = train_y.shape

model = AttackClassification(version='1209', mode='train', config=config)
model.optimize_nn(train_x_vec, train_y)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 00023: early stopping


('MODEL HAS BEEN SAVED TO /home/ctilab/workspace/sophia/NLP/w2v_model/1209',
 <tensorflow.python.keras.callbacks.History at 0x7f4924747890>)

In [12]:
true, pred = model.validation(train_x_vec, train_y.values)

CONFUSION MATRIX
[[3628   71   52    1   19]
 [  73 2980   20    0    3]
 [ 137   43 2847    0   98]
 [   3    7    0 5514    2]
 [   9    1   25    1 6964]]
ACCURACY SCORE : 0.974886656591697


In [13]:
test_x_vec = test_x[['{}_vec'.format(i) for i in feat_list]].copy()
test_x_vec = np.array(np.array(test_x_vec).tolist())

true, pred = model.validation(test_x_vec, test_y.values)

CONFUSION MATRIX
[[1471   52   38    5   12]
 [  50 1243   27    1    7]
 [  75   44 1157    1   59]
 [   4    4    0 2391    1]
 [   3    1    7    0 2989]]
ACCURACY SCORE : 0.9594482472516076
