In [1]:
#library import
import numpy as np
import tensorflow as tf

from keras import backend as K
from keras import models, layers, optimizers
from keras import utils
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Add, concatenate, Input
from keras.layers.convolutional import Conv2D, Conv1D, MaxPooling2D, MaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

import pandas as pd
import sys
import os
import cv2, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
def set_seed(seed):
    np.random.seed(seed)
    tf.set_random_seed(seed)
    print('[*]Set seed: {}'.format(seed))

In [3]:
'''
model 평가함수 작성
recall + precision 을 이용하여 f1 score 계산
model Callback 함수에서 사용
'''
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
 

In [4]:
'''
데이터 수, batch_size를 이용하여 step 계산
'''
def get_steps(num_samples, batch_size):
    if (num_samples % batch_size) > 0:
        return (num_samples // batch_size) + 1
    else:
        return num_samples // batch_size

In [5]:
'''
Epoch 종료 시 마다 수행
'''
def get_callback(model_path, lr, mode='val_f1_m', patient=10, warmup_epoch=5, min_lr=0.00001):
    
    mode = 'acc' if mode is 'acc' else 'val_f1_m'
    direction = 'max' if mode=='val_f1_m' else 'min'
    
    callbacks = [
        #조기 종료
        EarlyStopping(monitor='val_f1_m',
                      patience=patient,
                      mode=direction,
                      verbose=1),
        #모델 저장
        ModelCheckpoint(filepath=model_path,
                        monitor='val_f1_m',
                        verbose=1,
                        save_best_only=True,
                        mode=direction),
        #lr decay
        ReduceLROnPlateau(monitor = 'val_f1_m',
                          factor = 0.5,
                          patience = patient / 4,
                          min_lr=min_lr,
                          verbose=1,
                          mode=direction,
                          warmup_epoch=warmup_epoch)
    ]
    return callbacks

In [6]:
#read file and return binary data
'''
read file -> binary values
'''
def getBinaryData(filename):
	binaryValues = []
	file = open(filename, "rb")
	data = file.read(1)
	while data != b"":
		try:
			binaryValues.append(ord(data))

		except TypeError:
			pass

		data = file.read(1)

	return binaryValues

In [7]:
#byte frequency
def balanceByte(filename, table):
    
    fn = filename.split("/")[-1]
    zero = float(table[0])/4096.0
    low = float(sum(table[1:31]))/4096.0
    ascii = float(sum(table[32:127]))/4096.0
    high = float(sum(table[128:254]))/4096.0
    ff = float(table[255])/4096.0
    return [zero, low, ascii, high, ff]

def get_frequency(filename):
    table = [0] * 256
    data = open(filename, 'rb')
    buff = data.read(2 ** 20)
    while buff:
        for c in buff:
            table[c] += 1
        buff = data.read(2 ** 20)
    data.close()
    table.extend(balanceByte(filename, table))
    
    return table

In [8]:
def make_path(path_dir):
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
        print('[*]Make dir: {}'.format(path_dir))

In [9]:
def make_data(raw_file, data_dir):
    if os.path.exists(data_dir):
        return
    os.makedirs(data_dir)
    print('[*]Make dir: {}'.format(data_dir))
    chunk_size = 4096
    idx = 0
    print('[*]Make bin file')
    with open(raw_file, "rb") as f:
        chunk = f.read(chunk_size)
        while chunk:
            with open(os.path.join(data_dir, str(idx)), "wb") as chunk_file:
                chunk_file.write(chunk)
            idx += 1
            chunk = f.read(chunk_size)
            print('>>{0:<10}'.format(idx), end='\r', flush=True)

In [29]:
def add_frequency(df, data_dir):
    #add frequency col
    byte_col = []
    for col in range(0, 256):
        df[col]=None
        byte_col.append(col)
    file_info = ['0x00', 'low', 'ascii', 'high','0xff']
    for col in file_info:
        df[col] = None
        byte_col.append(col)
    rs = []
    for idx in range(0, len(df)):
        table = get_frequency(os.path.join(data_dir, df.iloc[idx]['file']))
        series = dict(zip(byte_col, table))
        rs.append(series)
    df.loc[:, byte_col] = pd.DataFrame(rs)

In [35]:
def add_bindata(df, data_dir):
    df['data'] = df['file'].map(lambda x: getBinaryData(os.path.join(data_dir, x)))

In [12]:
def add_type_big(df):
    df['type_big'] = df['type'].str.split('-').map(lambda x: x[0])

In [36]:
'''
add binary data
standard scaling
'''
def add_col(df, data_dir):
    df = df.copy()
    df['file'] = df['file'].astype(str)
    #add data column
    add_frequency(df, data_dir)
    add_bindata(df, data_dir)
    add_type_big(df)
    print('[*]Add col')
    return df

In [14]:
def apply_scaler(df, scaler):
    df = df.copy()
    #standard scaler
    for col in df.columns:
        if df[col].dtype != 'object':
            scaler.fit(df[[col]])
            df[col] = scaler.transform(df[[col]])
    print('[*]Apply Scaler')
    return df

In [15]:
def CNN1D(filter_num,filter_size, dim, num_classes, 
                  activation='relu', maxpool_size=2):
    #cnn input
    cnn_input = Input(shape=(dim, 1))
    #cnn layer
    cnn_layer = Conv1D(kernel_size=filter_size,
                         filters=filter_num)(cnn_input)
    cnn_layer = BatchNormalization()(cnn_layer)
    cnn_layer = Activation(activation)(cnn_layer)
    cnn_layer = MaxPooling1D(pool_size=(maxpool_size))(cnn_layer)
    
    cnn_layer = Conv1D(kernel_size=filter_size,
                         filters=filter_num)(cnn_layer)
    cnn_layer = BatchNormalization()(cnn_layer)
    cnn_layer = Activation(activation)(cnn_layer)
    cnn_layer = MaxPooling1D(pool_size=(maxpool_size))(cnn_layer)
    
    cnn_layer = Conv1D(kernel_size=filter_size,
                         filters=filter_num)(cnn_layer)
    cnn_layer = BatchNormalization()(cnn_layer)
    cnn_layer = Activation(activation)(cnn_layer)
    cnn_layer = MaxPooling1D(pool_size=(maxpool_size))(cnn_layer)
    cnn_layer = Flatten()(cnn_layer)
    
    #byte input
    byte_input = Input(shape=(261,1), name='byte_input')
    #byte layer
    byte_layer = Flatten()(byte_input)
    
    merge_layer = concatenate([cnn_layer, byte_layer])
    
    merge_layer = Dense(256)(merge_layer)
    merge_layer = BatchNormalization()(merge_layer)
    merge_layer = Activation(activation)(merge_layer)
    
    merge_layer = Dense(64)(merge_layer)
    merge_layer = BatchNormalization()(merge_layer)
    merge_layer = Activation(activation)(merge_layer)
    
    merge_layer = Dense(num_classes)(merge_layer)
    merge_layer = Activation('softmax')(merge_layer)
    final_model = Model(inputs=[cnn_input, byte_input], outputs=merge_layer)
    
    return final_model


In [16]:
'''
data unbalanced -> remove
'''
def remove_oversample(df, target):
    df = df.copy()
    min_val = df[target].value_counts().min()
    drop_indexes = []
    vals = df['type_big'].unique()
    for val in vals:
        indexes = df[df['type_big']==val].index
        if len(indexes) > min_val:
            df = df.drop(indexes[min_val:])
            drop_indexes.extend(indexes[min_val:])
        print('[*]Drop {}: {}->{}'.format(target, len(indexes), min_val))
    return df, drop_indexes

In [17]:
'''
target -> label encoding
'''
def add_label(df, target):
    df = df.copy()
    target_label = '{}_label'.format(target)
    le = preprocessing.LabelEncoder()
    le.fit(df[target])
    df[target_label] = le.transform(df[target])
    print('[*]Add col: {}'.format(target_label))
    return df, target_label

In [18]:
'''
split x, y by kfold index
'''
def get_train_data(df, cnn_col, byte_cols, target, num_class, index):
    array_dim = len(index)
    byte_dim = len(byte_cols)
    data_dim =len(df['data'][0])
    X_train = np.array([df[cnn_col].iloc[index]]).reshape((array_dim, data_dim, -1))
    X_train_byte = np.array([df[byte_cols].iloc[index,:].values]).reshape((array_dim, len(byte_cols), -1))
    
    Y_train = np.array([df[target].iloc[index]])
    Y_train = utils.to_categorical(Y_train, num_classes=num_class).reshape((array_dim, num_class))
    return X_train, X_train_byte, Y_train

In [19]:
def train(df, opt, skf, train_fold_step, byte_cols, cnn_col, target, model_dir):
    if cnn_col not in df.columns:
        print('[*]{} is not in colummns'.format(cnn_col))
    for col in byte_cols:
        if col not in df.columns:
            print('[*]{} is not in colummns'.format(col))
            
    histories = []
        
    for idx1, filter_num in enumerate(params['test_filter_nums']):
        for idx2, filter_size in enumerate(params['test_filter_sizes']):
            print('==========================filter_num: {} / filter_size: {}=========================='.format(filter_num, filter_size))
            #set model name
            #save best model of kfold models
            model_name = params['model_name_format'].format(filter_num, filter_size)  # save model path
            model_path = os.path.join(model_dir, model_name)
            
            for fold_step, (train_index, valid_index) in enumerate(skf.split(df['type_big'], df['type_big'])):
                #num class
                num_class = len(df[target].unique())
                #cnn dim
                cnn_dim = len(df[cnn_col][0])
                #set train
                X_train, X_train_byte, Y_train = get_train_data(df, cnn_col, byte_cols, target, num_class, train_index)
                #set test
                X_test, X_test_byte, Y_test = get_train_data(df, cnn_col, byte_cols, target, num_class, valid_index)
                #set model
                model = CNN1D(filter_num=filter_num, filter_size=filter_size, dim=cnn_dim, num_classes=num_class)
                model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy', f1_m])
                
                #train
                history = model.fit([X_train,X_train_byte], Y_train, 
                                    batch_size=params['batch_size'],
                                    epochs=params['epochs'],
                                    validation_data=([X_test, X_test_byte], Y_test),
                                    shuffle=True,
                                    callbacks=get_callback(model_path, params['lr']),
                                    verbose=1)
                histories.append(history)

                #for train one fold. optional.
                if fold_step == train_fold_step-1:
                    break
    return histories

In [20]:
#get history dictionary to dataframe
def histories_to_df(histories, target_col):
    history_df = pd.DataFrame()
    for idx1, filter_num in enumerate(params['test_filter_nums']):
        for idx2, filter_size in enumerate(params['test_filter_sizes']):
            history = histories[idx1*len(params['test_filter_sizes'])+idx2].history
            max_val_idx = history[target_col].index(max(history[target_col]))
            temp = {}
            for key in history.keys():
                temp[key] = history[key][max_val_idx]
            history_df['num_{}_size_{}'.format(filter_num, filter_size)] = pd.Series(temp)
    return history_df.T

In [21]:
#get model
def get_model_with_weight(select_filter_num, select_filter_size, model_dir, cnn_dim, num_class):
    model = CNN1D(filter_num=select_filter_num, filter_size=select_filter_size, dim=cnn_dim, num_classes=num_class)
    model_path = params['model_name_format'].format(select_filter_num, select_filter_size)
    weight_path = os.path.join(model_dir, model_path)
    model.load_weights(weight_path)
    return model

In [22]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    #classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    #fig.tight_layout()
    return ax

In [23]:
def plot_cm(model, df):
    X_test = X_test_byte = Y_test = None
    #get test index from k fold
    for fold_step, (train_index, valid_index) in enumerate(skf.split(df['type_big_label'], df['type_big_label'])):
        X_test, X_test_byte, Y_test = get_train_data(df,
                                                 cnn_col='data',
                                                 byte_cols=df.columns[4:-2],
                                                 target=target_label, 
                                                 num_class=len(df[target_label].unique()),
                                                 index=valid_index)
        break
    #get y_true
    y_true = Y_test
    #get y_pred
    y_pred = model.predict([X_test, X_test_byte])
    # get class name
    classes = df[['type_big', 'type_big_label']].head(20)
    classes = classes.groupby(['type_big', 'type_big_label'],as_index=False).size()
    classes = classes.sort_values().index.levels[0].values
    plot_confusion_matrix(y_true.argmax(axis=1), y_pred.argmax(axis=1), classes, title='CM')
    plt.show()

In [24]:
SEED = 2
set_seed(SEED)

[*]Set seed: 2


In [31]:
#input
input_dir = 'input'
#model save
model_dir = 'models'
#raw & meta csv
raw_file = os.path.join(input_dir, 'raw_data.raw')
csv_file = os.path.join(input_dir, 'part1.csv')
#binary data folder
bin_data_dir = os.path.join(input_dir, 'data_bin')
df_fix_path = os.path.join(input_dir, 'data_fix.csv')

In [26]:
if not os.path.exists(raw_file):
    print("[*]raw file isn's exists({})".format(raw_file))
if not os.path.exists(csv_file):
    print("[*]csv file isn's exists({})".format(csv_file))      

In [27]:
make_path(model_dir)
make_data(raw_file, bin_data_dir)

In [37]:
target = 'type_big'#target or target_big
df_fix = None
if os.path.exists(df_fix_path):
    df_fix = pd.read_csv(df_dix_path)
else:
    df = pd.read_csv('input/part1.csv')
    df = df.drop(['offset'], axis=1)
    df.columns = ['file', 'type']
    df_fix = add_col(df, bin_data_dir)
    df_fix = apply_scaler(df_fix, scaler=StandardScaler())
    df_fix, _ = remove_oversample(df_fix, target=target)
    df_fix, target_label = add_label(df_fix, target)
    df_fix.to_csv(df_fix_path, index=False)

[*]Add col
[*]Apply Scaler
[*]Drop type_big: 8596->5279
[*]Drop type_big: 6764->5279
[*]Drop type_big: 5761->5279
[*]Drop type_big: 5279->5279
[*]Add col: type_big_label


In [38]:
df_fix.head(1)

Unnamed: 0,file,type,0,1,2,3,4,5,6,7,...,254,255,0x00,low,ascii,high,0xff,data,type_big,type_big_label
0,0,enc-rsa,16,10,19,13,22,21,21,16,...,17,7,0.00390625,0.11499,0.36084,0.507568,0.00170898,"[99, 106, 215, 74, 148, 32, 137, 4, 92, 11, 36...",enc,1


In [None]:
params = {'fold_splits': 10,
         'batch_size': 64,
         'lr': 0.00001,
         'epochs': 2,
         'test_filter_nums': [4, 8, 16],
         'test_filter_sizes': [3, 4],
         'model_name_format': 'model_conv1d_baseline_num_{}_size_{}'}
opt = optimizers.Nadam(lr=params['lr'])  # optimizers
skf = StratifiedKFold(n_splits=params['fold_splits'], random_state=SEED)  # StaratifiedKFold -> y label 밸런스 유지하며 k fold 분할

## Train

In [None]:
histories = train(df=df_fix,
                  opt=opt, skf=skf, train_fold_step=params['fold_splits'],
                  byte_cols=df_fix.columns[4:-2],
                  cnn_col='data',
                  target=target_label,
                  model_dir=model_dir)

In [None]:
history_df = histories_to_df(histories, target_col = 'val_f1_m')
history_df

## Test

In [None]:
select_filter_num = 8
select_filter_size = 3

In [None]:
#get model
model = CNN1D(filter_num=select_filter_num, filter_size=select_filter_size, dim=cnn_dim, num_classes=num_class)
#load weight
model_path = params['model_name_format'].format(select_filter_num, select_filter_size)
weight_path = os.path.join(model_dir, model_path)
model.load_weights(weight_path)

In [None]:
#plot confusion matrix
#get test index from train data with kfold
plot_cm(test_model, df_fix)