In [1]:
# Silence every Python warning for the rest of the session.
import warnings
warnings.simplefilter('ignore')

# Numerical / tabular stack plus notebook display options.
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # no cap on the number of displayed columns
pd.set_option('display.max_rows', 100)  # display at most 100 rows
from tqdm.auto import tqdm  # progress bars around the per-column loops

# scikit-learn pieces used below: label encoding, TF-IDF + SVD text features,
# and the IsolationForest anomaly model.
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import IsolationForest

In [2]:
# Read the training log and substitute the '__NaN__' sentinel string for
# every missing value.
train = pd.read_csv('data/A-log-train.csv').fillna('__NaN__')
train

Unnamed: 0,Time,Level,Logger,Message,allowed,ugi,ip,cmd,src,dst,perm,proto,db,tbl,newtbl
0,"2022-07-01 06:59:30,944",INFO,org.apache.hadoop.hdfs.server.namenode.FSNames...,allowed=true\tugi=root (auth:SIMPLE)\tip=/127....,True,root (auth:SIMPLE),/127.0.0.1,mkdirs,/dir1,__NaN__,root:supergroup:rwxr-xr-x,rpc,__NaN__,__NaN__,__NaN__
1,"2022-07-01 06:59:31,943",INFO,org.apache.hadoop.hdfs.server.namenode.FSNames...,allowed=true\tugi=root (auth:SIMPLE)\tip=/127....,True,root (auth:SIMPLE),/127.0.0.1,getfileinfo,/dir1,__NaN__,__NaN__,rpc,__NaN__,__NaN__,__NaN__
2,"2022-07-01 06:59:31,964",INFO,org.apache.hadoop.hdfs.server.namenode.FSNames...,allowed=true\tugi=root (auth:SIMPLE)\tip=/127....,True,root (auth:SIMPLE),/127.0.0.1,getfileinfo,/dir1/testfile1.txt,__NaN__,__NaN__,rpc,__NaN__,__NaN__,__NaN__
3,"2022-07-01 06:59:31,968",INFO,org.apache.hadoop.hdfs.server.namenode.FSNames...,allowed=true\tugi=root (auth:SIMPLE)\tip=/127....,True,root (auth:SIMPLE),/127.0.0.1,getfileinfo,/dir1/testfile1.txt._COPYING_,__NaN__,__NaN__,rpc,__NaN__,__NaN__,__NaN__
4,"2022-07-01 06:59:31,979",INFO,org.apache.hadoop.hdfs.server.namenode.FSNames...,allowed=true\tugi=root (auth:SIMPLE)\tip=/127....,True,root (auth:SIMPLE),/127.0.0.1,create,/dir1/testfile1.txt._COPYING_,__NaN__,root:supergroup:rw-r--r--,rpc,__NaN__,__NaN__,__NaN__
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5024161,"2022-07-25T11:24:13,893",INFO,HiveMetaStore.audit,ugi=root\tip=unknown-ip-addr\tcmd=get_table : ...,__NaN__,root,unknown-ip-addr,get_table,__NaN__,__NaN__,__NaN__,__NaN__,db1,worker\t,__NaN__
5024162,"2022-07-25T11:24:13,934",INFO,HiveMetaStore.audit,ugi=root\tip=unknown-ip-addr\tcmd=get_table : ...,__NaN__,root,unknown-ip-addr,get_table,__NaN__,__NaN__,__NaN__,__NaN__,db1,worker\t,__NaN__
5024163,"2022-07-25T11:24:13,961",INFO,HiveMetaStore.audit,ugi=root\tip=unknown-ip-addr\tcmd=get_table : ...,__NaN__,root,unknown-ip-addr,get_table,__NaN__,__NaN__,__NaN__,__NaN__,db1,worker\t,__NaN__
5024164,"2022-07-25T11:24:36,596",INFO,HiveMetaStore.audit,ugi=root\tip=unknown-ip-addr\tcmd=Cleaning up ...,__NaN__,root,unknown-ip-addr,Cleaning up thread local RawStore...\t,__NaN__,__NaN__,__NaN__,__NaN__,__NaN__,__NaN__,__NaN__


In [3]:
# Not sure whether 'INFO' and 'INFO ' (with a trailing space) mean the same
# thing, so the 'Level' column is discarded altogether.

train.drop(columns=['Level'], inplace=True)

In [4]:
# Step 1: cast every column except 'Time' to str.
# NOTE(review): presumably this gives LabelEncoder a single value type per
# column (e.g. booleans mixed with the '__NaN__' filler) -- confirm.
non_time_cols = [name for name in train.columns if name != 'Time']
for col in tqdm(non_time_cols):
    train[col] = train[col].astype(str)

# Step 2: replace each categorical column with integer codes. The raw
# 'Message' text is skipped here and stays a string column.
categorical_cols = [name for name in train.columns if name not in ('Time', 'Message')]
for col in tqdm(categorical_cols):
    lbe = LabelEncoder()
    train[col] = lbe.fit_transform(train[col])

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [5]:
# Frequency features: for each encoded column, attach the number of rows
# (counted via the 'Time' column) that share the row's value.
count_source_cols = [
    'Logger', 'allowed', 'ugi', 'ip', 'cmd', 'src',
    'dst', 'perm', 'proto', 'db', 'tbl', 'newtbl',
]
for col in tqdm(count_source_cols):
    rows_per_value = train.groupby(col)['Time'].transform('count')
    train[f'{col}_count'] = rows_per_value

  0%|          | 0/12 [00:00<?, ?it/s]

In [6]:
%%time

def add_tfidf_feats(df, col, n_components=16, random_state=0):
    """Append TF-IDF + truncated-SVD features of a text column to ``df``.

    The values of ``df[col]`` are vectorised with ``TfidfVectorizer(min_df=1)``,
    reduced to ``n_components`` dimensions with ``TruncatedSVD``, and each
    reduced dimension is stored as a new column named ``f'{col}_tfidf_{i}'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    col : str
        Name of the text column to vectorise.
    n_components : int, default 16
        Number of SVD components, i.e. number of feature columns added.
    random_state : int or None, default 0
        Seed passed to ``TruncatedSVD`` so the generated features are
        reproducible between runs (the IsolationForest fitted later in this
        notebook is seeded as well). Pass ``None`` for an unseeded
        decomposition.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new feature columns added.
    """
    text = df[col].tolist()

    # fit_transform learns the vocabulary and builds the matrix in one pass,
    # so the corpus is tokenised once rather than once for fit() and again
    # for transform().
    X = TfidfVectorizer(min_df=1).fit_transform(text)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(X)
    X_svd = svd.transform(X)

    for i in range(n_components):
        df[f'{col}_tfidf_{i}'] = X_svd[:, i]
    return df


# Two SVD components of the TF-IDF representation of 'Message' are added as
# the columns 'Message_tfidf_0' and 'Message_tfidf_1'.
train = add_tfidf_feats(df=train, col='Message', n_components=2)

CPU times: user 2min 56s, sys: 32.3 s, total: 3min 28s
Wall time: 2min 32s


In [7]:
# Model inputs: every column except the timestamp and the raw message text.
non_feature_cols = {'Time', 'Message'}
use_cols = [name for name in train.columns if name not in non_feature_cols]
use_cols

['Logger',
 'allowed',
 'ugi',
 'ip',
 'cmd',
 'src',
 'dst',
 'perm',
 'proto',
 'db',
 'tbl',
 'newtbl',
 'Logger_count',
 'allowed_count',
 'ugi_count',
 'ip_count',
 'cmd_count',
 'src_count',
 'dst_count',
 'perm_count',
 'proto_count',
 'db_count',
 'tbl_count',
 'newtbl_count',
 'Message_tfidf_0',
 'Message_tfidf_1']

In [8]:
%%time

# Feature matrix for the anomaly detector, in the column order of use_cols.
X = train[use_cols].to_numpy()
# Seeded so repeated runs build the same forest.
clf = IsolationForest(random_state=0)
clf.fit(X)

CPU times: user 46.1 s, sys: 17.3 s, total: 1min 3s
Wall time: 1min 3s


In [9]:
%%time

# Score every row with the fitted forest; X is the same matrix the model was
# fitted on. The absolute value of these scores is thresholded further down.
pred = clf.score_samples(X)

CPU times: user 1min 42s, sys: 15.4 s, total: 1min 58s
Wall time: 1min 58s


In [10]:
# Load the example submission; its 'label' column is overwritten with the
# model's decisions in a later cell.
sub = pd.read_csv('data/submit_example.csv')
sub

Unnamed: 0,Time,label
0,"2022-07-01 06:59:30,944",normal
1,"2022-07-01 06:59:31,943",normal
2,"2022-07-01 06:59:31,964",normal
3,"2022-07-01 06:59:31,968",normal
4,"2022-07-01 06:59:31,979",normal
...,...,...
5024161,"2022-07-25T11:24:13,893",normal
5024162,"2022-07-25T11:24:13,934",normal
5024163,"2022-07-25T11:24:13,961",normal
5024164,"2022-07-25T11:24:36,596",normal


In [11]:
# Rows whose absolute score exceeds this cut-off are treated as attacks.
threshold = 0.6

sub['pred'] = np.abs(pred)
# Fraction of rows above the cut-off, kept as a plain Python float.
is_flagged = sub['pred'] > threshold
int(is_flagged.sum()) / len(sub)

0.11206815220675431

In [12]:
# Label a row 'attack' when its score exceeds the threshold, otherwise
# 'normal'. np.where evaluates the comparison for the whole column at once
# instead of invoking a Python lambda once per row.
sub['label'] = np.where(sub['pred'] > threshold, 'attack', 'normal')
# Drop the helper score column so only the label is kept alongside the
# columns loaded from the example submission.
sub.drop(columns=['pred'], inplace=True)
sub

Unnamed: 0,Time,label
0,"2022-07-01 06:59:30,944",attack
1,"2022-07-01 06:59:31,943",attack
2,"2022-07-01 06:59:31,964",attack
3,"2022-07-01 06:59:31,968",attack
4,"2022-07-01 06:59:31,979",attack
...,...,...
5024161,"2022-07-25T11:24:13,893",normal
5024162,"2022-07-25T11:24:13,934",normal
5024163,"2022-07-25T11:24:13,961",normal
5024164,"2022-07-25T11:24:36,596",normal


In [13]:
# Write the labelled frame to baseline.csv, omitting the DataFrame index.
sub.to_csv('baseline.csv', index=False)