In [19]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

from auto_profiling_model import *
from auto_profiling_utils import *

# Kept after the star imports so this explicit binding is the one in effect.
from sklearn.metrics import confusion_matrix

In [20]:
# Run configuration handed to every DataPreprocessing / model helper below.
# Three sections: settings shared by all modes, training-only settings and
# prediction-only settings.
config = dict(
    # Names under which the fitted artefacts are stored/loaded
    # (presumably resolved by the helper modules -- TODO confirm).
    common=dict(
        model_name="cnn_model",
        model_path="auto_profiling_model",
        scaler="minmaxscaler",
        vec="vectorization",
        encoder="onehotencoder",
    ),
    train=dict(
        data_load=0,
        crontab="*/30 * * * *",
        now_delta="minutes=0",
        prev_delta="days=30",
        max_depth=100,  # decision tree depth
        optimizer_help=["Adam", "SGD"],  # presumably the accepted optimizer names
        optimizer="Adam",
        learning_rate=0.0001,
        batch_size=32,
        epochs=500,
        result_table="result",
    ),
    predict=dict(
        crontab="*/1 * * * *",
        now_delta="minutes=0",
        prev_delta="days=6",
        batch_size=8,
    ),
)

In [21]:
# Working directory; later joined with config['common']['model_path'] to
# locate saved model versions.
pwd = os.getcwd()

## TRAIN VERSION SETTING
# Version tag is the current hour, formatted YYYYMMDD_HH.
# NOTE(review): the +9h shift looks like a UTC -> KST conversion and so assumes
# the host clock runs in UTC -- confirm.
start = (datetime.now() + timedelta(hours=9)).replace(microsecond=0)
train_version = f"{start:%Y%m%d_%H}"

## Data load

In [22]:
# Pull up to 10,000 rows per class and stack them into one frame.
# Each class is fetched with its own query so the row cap applies per class.
label_frames = []

for label in ['normal', 'SQL_INJECTION', 'XSS', 'BEACONING', 'CREDENTIAL']:
    print(label)
    # execute_ch returns (rows, column metadata); the first item of each
    # metadata entry is used as the column name.
    rows, meta = execute_ch("select * from dti.dti_sh_demo_log WHERE hash = '{}' limit 10000".format(label))
    feats = [m[0] for m in meta]
    label_frames.append(pd.DataFrame(rows, columns=feats))

# ignore_index gives the combined frame a unique 0..N-1 index instead of five
# overlapping per-class ranges.
data = pd.concat(label_frames, ignore_index=True)

# 'hash' holds the class label; everything else is a feature.
data_y = data[['hash']]
data_x = data.drop('hash', axis=1)

# Single text field built from host, user agent and query, separated by spaces.
# Plain arrays are concatenated positionally; a missing value in any of the
# three columns raises here rather than propagating silently.
data_x['all'] = (
    data_x['http_host'].values + ' '
    + data_x['http_agent'].values + ' '
    + data_x['http_query'].values
)

normal
SQL_INJECTION
XSS
BEACONING
CREDENTIAL


In [23]:
# Train/Test data split
x, y = data_x, data_y

# 70/30 split with a fixed seed; each part gets a fresh 0..n-1 index so that
# positions and index labels coincide from here on.
train_x, test_x, train_y, test_y = (
    part.reset_index(drop=True)
    for part in train_test_split(x, y, test_size=0.3, random_state=1004)
)

# Untouched copy of the test labels, taken before any later transformation of test_y.
valid_y = test_y.copy()

print(f"No. of training examples: {train_x.shape[0]}")
print(f"No. of testing examples: {test_x.shape[0]}")

# Untouched copy of the test features, taken before test_x is preprocessed.
save_test_x = test_x.copy()

No. of training examples: 22695
No. of testing examples: 9727


## Train data preprocessing

In [24]:
# Preprocessor bound to this run's version tag, in training mode.
train_prep = DataPreprocessing(version=train_version, mode='train', config=config)

# The calls below overwrite their inputs, so this cell is not safe to re-run
# without re-running the split cell first.
# Presumably: vectorize the combined 'all' text column, scale the features and
# encode the labels, matching the "vec"/"scaler"/"encoder" names in
# config['common'] -- TODO confirm against DataPreprocessing.
train_x = train_prep.vec_module(train_x, col_list = 'all')
train_x = train_prep.scale_module(train_x)
train_y = train_prep.encoder_module(train_y)

In [25]:
# Store the encoded label names (iterating train_y yields them) under the key
# 'train_label'; the prediction section loads this list to map predicted class
# indices back to names.
train_prep.save_model(list(train_y), 'train_label')

# AI model fit

* Decision Tree

In [26]:
# First stage: one multi-class decision tree over all labels.
model = DecisionTreeClassification(version=train_version, mode='train', config=config)
model.fit_decision_tree(train_x, train_y)
# Evaluated on the data it was just fitted on, so the printed confusion matrix
# and accuracy describe fit, not generalisation.
true, pred = model.validation(train_x, train_y)

CONFUSION MATRIX
[[3792    0    0    0    0]
 [   0 3078    0    0    0]
 [   0    0 3140    0    0]
 [   0    0    0 5678    0]
 [   0    0    0    0 7007]]
ACCURACY SCORE : 1.0


* CNN model

In [27]:
# Second stage: one binary CNN per attack class, each trained on the normal
# rows plus the rows of that attack.
ai_history = {}  # attack name -> second value returned by optimize_nn

# Seeded so the per-class shuffles are repeatable between runs (same seed as
# the train/test split). This does not seed the network itself.
shuffle_rng = np.random.RandomState(1004)

for att_name in list(train_y):
    if att_name == 'normal':
        continue  # 'normal' is the negative class of every binary model

    print("\n ******** {} MODEL FITTING START ********".format(att_name))

    # Rows labelled normal, followed by rows labelled with this attack,
    # reduced to the two relevant label columns.
    normal_y = train_y[train_y['normal'] == 1]
    attack_y = train_y[train_y[att_name] == 1]
    temp_y = pd.concat([normal_y, attack_y])[['normal', att_name]].copy()

    # Index labels are used as row positions here.
    # NOTE(review): this relies on train_y carrying a default 0..n-1 index
    # aligned with train_x -- confirm encoder_module preserves it.
    temp_x = train_x.iloc[temp_y.index]

    # Shuffle features and labels with the same permutation.
    random_idx = shuffle_rng.permutation(len(temp_x))
    temp_x = temp_x.iloc[random_idx]
    temp_y = temp_y.iloc[random_idx]

    # Features become (rows, 1, n_features, 1); labels become (rows, 2).
    cnn_train_x = np.array(temp_x).reshape(temp_x.shape[0], 1, temp_x.shape[1], 1)
    cnn_train_y = np.array(temp_y).reshape(temp_y.shape[0], -1)

    # The model helper reads shapes and attack name from the shared config.
    config["x_data_shape"] = cnn_train_x.shape
    config["y_data_shape"] = cnn_train_y.shape
    config["att_name"] = att_name
    print(cnn_train_x.shape)

    model = AttackClassification(version=train_version, mode='train', config=config)
    _, ai_history[att_name] = model.optimize_nn(cnn_train_x, cnn_train_y)

    # Validation on the training rows themselves (fit check only).
    true, pred = model.validation(cnn_train_x, cnn_train_y)
    print("{} MODEL FITTING FINISH".format(att_name))


 ******** BEACONING MODEL FITTING START ********
(10799, 1, 10391, 1)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 00004: early stopping
CONFUSION MATRIX
[[7007    0]
 [   0 3792]]
ACCURACY SCORE : 1.0
BEACONING MODEL FITTING FINISH

 ******** CREDENTIAL MODEL FITTING START ********
(10085, 1, 10391, 1)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 00004: early stopping
CONFUSION MATRIX
[[7007    0]
 [   0 3078]]
ACCURACY SCORE : 1.0
CREDENTIAL MODEL FITTING FINISH

 ******** SQL_INJECTION MODEL FITTING START ********
(10147, 1, 10391, 1)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 00006: early stopping
CONFUSION MATRIX
[[7007    0]
 [   0 3140]]
ACCURACY SCORE : 1.0
SQL_INJECTION MODEL FITTING FINISH

 ******** XSS MODEL FITTING START ********
(12685, 1, 10391, 1)
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 00006: early stopping
CONFUSION MATRIX
[[7007    0]
 [   0 5678]]
ACCURACY SCORE : 1.0
XSS MOD

## PREDICT VERSION SETTING

In [28]:
# Find the most recent model version directory, walking back one hour at a
# time from the current hour.
# NOTE(review): the +9h shift looks like a UTC -> KST conversion and so assumes
# the host clock runs in UTC -- confirm.
start = datetime.now().replace(microsecond=0) + timedelta(hours=9)

MAX_LOOKBACK_HOURS = 500

# Candidates are start, start-1h, ..., start-500h, and every one of them is
# checked before giving up.
for hours_back in range(MAX_LOOKBACK_HOURS + 1):
    pred_version = (start - timedelta(hours=hours_back)).strftime("%Y%m%d_%H")
    if os.path.exists('{}/{}/{}'.format(pwd, config['common']['model_path'], pred_version)):
        break
else:
    # Nothing found: pred_version keeps the oldest candidate, so make that
    # visible instead of silently tagging results with a version that has no
    # model directory.
    print("WARNING: no model version directory found within {} hours; "
          "pred_version={} does not exist".format(MAX_LOOKBACK_HOURS, pred_version))

## Test data preprocessing

In [29]:
# Preprocessor in test mode, bound to the version trained above.
# NOTE(review): this uses train_version, not the pred_version computed in the
# previous cell -- confirm that is intended.
test_prep = DataPreprocessing(version=train_version, mode='test', config=config)

# Same three steps, in the same order, as for the training data; inputs are
# overwritten, so re-running this cell needs a fresh split first.
# Presumably test mode reuses the transformers fitted during training -- TODO confirm.
test_x = test_prep.vec_module(test_x, col_list = 'all')
test_x = test_prep.scale_module(test_x)
test_y = test_prep.encoder_module(test_y)

# AI model prediction

* Decision Tree

In [30]:
# Evaluate the decision tree on the held-out rows. No fit call here; the
# recorded output shows a dt_model.pickle path, which suggests test mode loads
# the saved tree -- TODO confirm.
model = DecisionTreeClassification(version=train_version, mode='test', config=config)
# Prints a confusion matrix and accuracy; `pred` is consumed by the next cell.
true, pred = model.validation(test_x, test_y)

/home/ctilab/valid_model/auto_profiling_model/auto_profiling_model/20211214_14/dt_model.pickle
CONFUSION MATRIX
[[1573   16   32    0    0]
 [   6 1308   44    0    0]
 [  29   13 1316    0    0]
 [   0    0    0 2397    0]
 [   0    0    0    0 2993]]
ACCURACY SCORE : 0.9856070730955073


In [31]:
# Label names saved at training time, in the order used by the encoder.
train_label = test_prep.load_model('train_label')
# Translate each predicted class index into its label name. Expects `pred` to
# still hold the decision-tree predictions from the previous cell.
dt_pred = [train_label[i] for i in pred]

* CNN model

In [32]:
# Second stage: rows the decision tree flagged as attack X are re-checked by
# the binary CNN for X. A row keeps the attack label only if the CNN predicts
# class 1; everything else ends up as 'NORMAL'.
assert len(dt_pred) == len(save_test_x), "dt_pred must have one entry per test row"

dt_pred_arr = np.asarray(dt_pred, dtype=object)

# Labels are collected positionally in an object array and written to the
# frame once, so the column never mixes floats and strings.
ai_labels = np.full(len(save_test_x), 'NORMAL', dtype=object)

# Sorted so the models run in the same order on every execution.
for att_name in sorted(set(dt_pred)):
    if att_name == 'normal':
        continue  # already 'NORMAL' by default

    print("\n ******** {} MODEL PREDICTION START ********".format(att_name))

    # Row positions the decision tree assigned to this attack.
    y_index = np.flatnonzero(dt_pred_arr == att_name)
    temp_x = test_x.iloc[y_index]
    temp_y = test_y.iloc[y_index][['normal', att_name]].copy()

    # Same layout as in training: (rows, 1, n_features, 1) and (rows, 2).
    cnn_test_x = np.array(temp_x).reshape(temp_x.shape[0], 1, temp_x.shape[1], 1)
    cnn_test_y = np.array(temp_y).reshape(temp_y.shape[0], -1)
    config["x_data_shape"] = cnn_test_x.shape
    config["y_data_shape"] = cnn_test_y.shape
    config["att_name"] = att_name

    model = AttackClassification(version=train_version, mode='predict', config=config)
    # Bound to their own names so the decision-tree `pred` stays intact.
    cnn_true, cnn_pred = model.validation(cnn_test_x, cnn_test_y)

    # Class 1 is the attack column of ['normal', att_name].
    # Assumes cnn_pred holds one class index per input row -- TODO confirm.
    confirmed = np.asarray(cnn_pred).reshape(-1) == 1
    ai_labels[y_index[confirmed]] = att_name
    print("{} MODEL PREDICTION FINISH".format(att_name))

save_test_x['ai_label_pred'] = ai_labels
save_test_x['version'] = pred_version


 ******** SQL_INJECTION MODEL PREDICTION START ********
CONFUSION MATRIX
[[   0   76]
 [   0 1316]]
ACCURACY SCORE : 0.9454022988505747
SQL_INJECTION MODEL PREDICTION FINISH

 ******** BEACONING MODEL PREDICTION START ********
CONFUSION MATRIX
[[   0   35]
 [   0 1573]]
ACCURACY SCORE : 0.9782338308457711
BEACONING MODEL PREDICTION FINISH

 ******** XSS MODEL PREDICTION START ********
CONFUSION MATRIX
[[2397]]
ACCURACY SCORE : 1.0
XSS MODEL PREDICTION FINISH

 ******** CREDENTIAL MODEL PREDICTION START ********
CONFUSION MATRIX
[[   4   25]
 [   0 1308]]
ACCURACY SCORE : 0.981301421091997
CREDENTIAL MODEL PREDICTION FINISH


In [33]:
# Ground truth spells the negative class 'normal' while predictions use
# 'NORMAL'; align the spelling before comparing.
valid_y['hash'] = np.where(valid_y['hash'] == 'normal', 'NORMAL', valid_y['hash'])

# Rows are true labels, columns are predicted labels, both in sorted label order.
# confusion_matrix is imported in the notebook's import cell.
confusion_matrix(valid_y, save_test_x['ai_label_pred'])

array([[1573,   12,    4,   32,    0],
       [   6, 1308,    0,   44,    0],
       [   0,    0, 2993,    0,    0],
       [  29,   13,    0, 1316,    0],
       [   0,    0,    0,    0, 2397]])

In [35]:
# Overall accuracy of the two-stage (decision tree + CNN) labels against the
# held-out ground truth.
accuracy_score(valid_y,save_test_x['ai_label_pred'])


0.9856070730955073

In [34]:
# execute_ch("INSERT INTO dti.kisa_auto_profiling_result VALUES", save_test_x.to_dict('records'))