# MLflow Training Tutorial

This `train.ipynb` Jupyter notebook is an example of using elastalert together with MLflow.

> This is the Jupyter notebook version of the `train.py` example

In [29]:
from sklearn.svm import OneClassSVM

import os

# Elasticsearch endpoint the training data is read from. Set the ES_URL
# environment variable to point the notebook at another cluster without
# editing this cell.
ES_URL = os.environ.get("ES_URL", "http://192.168.122.3:9200")
# Index (pattern) to scan; overridable through the ES_INDEX environment variable.
ES_INDEX = os.environ.get("ES_INDEX", "logs-endpoint-winevent-sysmon-*")
# Dotted field paths of each document that make up the training data.
COLUMNS = ["agent.hostname", "event.code"]
# Estimator class (not an instance); build_pipeline() calls it with no arguments.
MODEL = OneClassSVM


In [30]:
# imports
import pandas as pd
from elasticsearch import Elasticsearch  
from elasticsearch_dsl import Search
import functools
import csv
from os.path import isfile as isfile

def get_data(elast_url, index, columns):
    """Return the requested fields of every document in an index as a DataFrame.

    The first call for a given (elast_url, index, columns) combination scans
    the whole index and stores the rows in a CSV file in the current working
    directory; later calls read that file instead of querying Elasticsearch.
    The cache is never invalidated automatically - delete the CSV file to
    force a fresh download.

    Parameters
    ----------
    elast_url : str
        URL of the Elasticsearch node.
    index : str
        Index name or pattern to scan.
    columns : sequence of str
        Field names to fetch. Dotted names (e.g. "agent.hostname") address
        nested objects in the hit.

    Returns
    -------
    pandas.DataFrame
        One row per hit, one column per entry of `columns`. A field that is
        absent from a hit is stored as an empty CSV cell, which pandas reads
        back as a missing value.
    """
    # Function-scope imports: these modules are not part of the notebook's
    # import cell.
    import hashlib
    import json
    import os

    columns = list(columns)

    def rgetattr(obj, attr):
        """Follow a dotted attribute path; return None if any part is missing."""
        for name in attr.split('.'):
            obj = getattr(obj, name, None)
            if obj is None:
                return None
        return obj

    def save_to_csv(elast_url, index, columns, file_name):
        """Scan the index and write the selected fields to `file_name`."""
        es = Elasticsearch(elast_url, timeout=600)
        s = Search(using=es, index=index).query().source(fields=columns)

        # Write to a side file and rename it only after the scan completed,
        # so an interrupted download can never be mistaken for a valid cache.
        partial_name = file_name + ".part"
        try:
            # newline='' is what the csv module expects from its file object;
            # UTF-8 matches the default encoding of pandas.read_csv.
            with open(partial_name, mode='w', newline='', encoding='utf-8') as es_fd:
                writer = csv.DictWriter(es_fd, fieldnames=columns)
                writer.writeheader()
                for hit in s.scan():
                    writer.writerow(
                        {column: rgetattr(hit, column) for column in columns}
                    )
            os.replace(partial_name, file_name)
        except BaseException:
            if os.path.exists(partial_name):
                os.remove(partial_name)
            raise

    def read_from_csv(csv_file):
        """Load a cached CSV file into a DataFrame."""
        return pd.read_csv(csv_file)

    # The cache key must cover every input (including the column names) and
    # must be identical in every Python process. The built-in hash() of a str
    # is salted per process, so a cryptographic digest is used instead.
    cache_key = json.dumps([elast_url, index, columns])
    file_name = hashlib.sha256(cache_key.encode("utf-8")).hexdigest() + ".csv"

    if not isfile(file_name):
        save_to_csv(elast_url, index, columns, file_name)

    return read_from_csv(file_name)
    

In [18]:
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

def build_pipeline(data, *params):
    """Assemble the (unfitted) preprocessing + model pipeline for `data`.

    Columns of dtype int64/float64 are median-imputed and standardised.
    Columns of dtype object are filled with the constant 'missing' and
    one-hot encoded, with unknown categories ignored. The final step, named
    'svc', is a `MODEL` instance created with no arguments.

    Any extra positional `params` are accepted but not used.
    """
    np.random.seed(40)

    # Split the columns by dtype; each group gets its own sub-pipeline.
    num_cols = data.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = data.select_dtypes(include=['object']).columns

    num_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    cat_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    column_prep = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=num_steps), num_cols),
        ('cat', Pipeline(steps=cat_steps), cat_cols),
    ])

    # Preprocessing first, estimator last.
    return Pipeline([
        ('preprocessor', column_prep),
        ('svc', MODEL()),
    ])

    

In [31]:
from collections import Counter
def log_output(pipe, data):
    """Log the pipeline, its final estimator's parameters and prediction counts.

    Through the mlflow logging API this records, in order: the pipeline as a
    model named "model", the parameter dict of the last pipeline step under
    the single key "model_param", and one metric "pred_<label>" per distinct
    value predicted for `data`, holding how often that value occurred.
    """
    mlflow.sklearn.log_model(pipe, "model")

    # The last (name, estimator) pair of the pipeline is the model itself.
    _, estimator = pipe.steps[-1]
    mlflow.log_param("model_param", estimator.get_params())

    label_counts = Counter(pipe.predict(data))
    for label in label_counts:
        mlflow.log_metric("pred_{}".format(label), label_counts[label])
        

In [32]:
import logging
import warnings

def train(*params):
    """Fetch the training data, fit the pipeline and log the run to MLflow.

    Reads the columns listed in `COLUMNS` from `ES_INDEX` on `ES_URL` via
    get_data(), builds the pipeline with build_pipeline(), fits it on the
    whole data set inside an MLflow run and records the outcome with
    log_output().

    Any positional `params` are accepted but not used.

    Returns
    -------
    The fitted pipeline.
    """
    # Only warnings and above are shown, and Python warnings are silenced
    # process-wide, to keep the notebook output short.
    logging.basicConfig(level=logging.WARN)
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    data = get_data(ES_URL, ES_INDEX, columns=COLUMNS)
    # DataFrame.drop returns a new frame, so the result has to be assigned.
    # errors="ignore" keeps this a no-op when the column was never fetched.
    data = data.drop(columns=["@timestamp"], errors="ignore")

    with mlflow.start_run():
        pipe = build_pipeline(data)
        pipe.fit(data)
        log_output(pipe, data)

    return pipe

In [33]:
# Run the whole workflow once and keep the fitted pipeline that train() returns.
pipe = train()

In [34]:
# Show the fitted pipeline's repr as the cell output.
pipe

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['event.code'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                        