In [102]:
ES_URL = "http://192.168.122.3:9200"
ES_INDEX = "logs-endpoint-winevent-sysmon-*"
COLUMNS = ["process_path", "event_id"]
DROP_NA_COLUMNS = COLUMNS

In [103]:
# append loglizer as it is not delivered as a python module

import sys

sys.path.append("dependencies/loglizer")

In [104]:
# imports
import pandas as pd
from elasticsearch import Elasticsearch  
from elasticsearch_dsl import Search
import functools
import csv
from os.path import isfile as isfile
from loglizer import dataloader, preprocessing
import hashlib

def get_data(elast_url, index, columns, limit=-1):  
        
        def save_to_csv(elast_url, index, columns, file_name):
            
            print("saving to csv as file did not exist")
            es = Elasticsearch(elast_url,timeout=600)
            s = Search(using=es, index=index).query().source(fields=columns)            
            
            with open(file_name, mode='w') as es_fd:
                writer = csv.DictWriter(es_fd, fieldnames=columns)
                writer.writeheader()
                for hit in s.scan():          
            
                    # handles nested objects in response because of multilevel keys (i.e. agent.hostname) 
                    def rgetattr(obj, attr):
                        def _getattr(obj, attr):
                            try:
                                return getattr(obj, attr)
                            # Not all rows have the attributes we need, so return None
                            except AttributeError: 
                                return None
                        return functools.reduce(_getattr, [obj] + attr.split('.'))
                    
                    hit_dict = {column: rgetattr(hit, column) for column in columns}                    
                    writer.writerow(hit_dict)
                    
                    
        def read_from_csv(csv_file):

            data = pd.read_csv(csv_file)
            return data

        file_name_clear = ("{}{}{}{}{}"
            .format(
                len(elast_url),
                elast_url,
                len(index),
                index,
                len(columns),
                ".".join(columns)))
                   
        file_name = (str(hashlib.sha1(file_name_clear.encode("UTF-8")).hexdigest()[:10]) + ".csv")
            
        print("filename: {}".format(file_name))
        
        if not isfile(file_name):
            save_to_csv(elast_url,index,columns,file_name)   
        
        data_frame = read_from_csv(file_name)
        
        if len(DROP_NA_COLUMNS) > 0:
            data_frame.dropna(subset=DROP_NA_COLUMNS,how="any")
        
        data_frame = data_frame[:limit]
        
        return data_frame[columns]
    

In [105]:
import logging
import warnings
import numpy as np

def train(*params):
    
    # setup logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)
    
    np.random.seed(40)
    
    elast_url = ES_URL
    index = ES_INDEX
    
    data = get_data(elast_url, index, COLUMNS, 1000)
    return data
    

In [106]:
data = train()

x_train_pd=data.sample(frac=0.8,random_state=200) #random state is a seed value
x_test_pd=data.drop(x_train_pd.index)

filename: 90b32b8a13.csv


In [107]:
from loglizer.models import LogClustering
from loglizer import preprocessing

anomaly_threshold = 0.3 #
max_dist = 0.3

In [108]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

pipeline_cluster =  (Pipeline(steps=[
            ('numpy_transformer', FunctionTransformer(lambda x: x.to_numpy())),
            ('feature_extractor', preprocessing.FeatureExtractor()),
            ('model', LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold))]))

pipeline_cluster.fit(x_train_pd[:100])

Train data shape: 100-by-25

Starting offline clustering...
Processed 100 instances.
Found 24 clusters offline.



Pipeline(steps=[('numpy_transformer',
                 FunctionTransformer(func=<function <lambda> at 0x7f347101d8b0>)),
                ('feature_extractor',
                 <loglizer.preprocessing.FeatureExtractor object at 0x7f3471001970>),
                ('model',
                 <loglizer.models.LogClustering.LogClustering object at 0x7f3471001880>)])

In [109]:
from sklearn.metrics import accuracy_score
m = pipeline_cluster.predict(x_test_pd[:200])


accuracy_score(m, np.zeros(x_test_pd.shape[0]))

Test data shape: 200-by-25



0.955

In [110]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from loglizer.models import IsolationForest

pipeline_iforest =  (Pipeline(steps=[
            ('numpy_transformer', FunctionTransformer(lambda x: x.to_numpy())),
            ('feature_extractor', preprocessing.FeatureExtractor()),
            ('model', IsolationForest(contamination=0.03))]))
pipeline_iforest.fit(x_train_pd)

Train data shape: 800-by-40



  warn(


Pipeline(steps=[('numpy_transformer',
                 FunctionTransformer(func=<function <lambda> at 0x7f347101d5e0>)),
                ('feature_extractor',
                 <loglizer.preprocessing.FeatureExtractor object at 0x7f3471001070>),
                ('model', IsolationForest())])

In [111]:
from sklearn.metrics import accuracy_score
m = pipeline_cluster.predict(x_test_pd)


accuracy_score(m, np.zeros(x_test_pd.shape[0]))

Test data shape: 200-by-25



0.955

In [112]:
anomaly_pos = [i for i, el in enumerate(m) if el == 1]
x_test_pd.iloc[anomaly_pos]


Unnamed: 0,process_path,event_id
2,c:\users\gazelle01\appdata\local\microsoft\one...,1
567,system,13
594,c:\programdata\microsoft\windows defender\plat...,13
708,c:\windows\explorer.exe,13
743,c:\programdata\microsoft\windows defender\plat...,13
816,system,13
894,c:\windows\system32\sihost.exe,12
941,c:\windows\system32\sihost.exe,12
997,c:\windows\system32\svchost.exe,11
