# Script for training and scoring inference methods

In [23]:
%%writefile trainingScript.py
import argparse, os
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report 
from sklearn.externals import joblib
import pandas as pd

# Returns model object from file
def model_fn(model_dir):
    """Load and return the object stored in ``model.joblib`` under ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        Whatever object ``joblib.load`` deserialises from that file.
    """
    model_path = os.path.join(model_dir, 'model.joblib')
    return joblib.load(model_path)

# Label spellings that denote a normal (non-outlier) connection. The original
# code only matched the bytes literal; labels read back from a CSV are str, so
# the plain and repr-style string spellings are accepted as well.
# NOTE(review): presumably the CSV was written from bytes labels - confirm
# which spelling the data files actually contain.
NORMAL_LABELS = frozenset([b'normal.', 'normal.', "b'normal.'"])

# Accepted command-line spellings for boolean flags. The empty string keeps
# the meaning it had under ``type=bool`` (False).
TRUE_STRINGS = ('true', 't', 'yes', 'y', '1')
FALSE_STRINGS = ('false', 'f', 'no', 'n', '0', '')


def parse_contamination(value):
    """Convert the ``--contamination`` argument to ``'auto'`` or a float.

    Args:
        value: Raw command-line string.

    Returns:
        The string ``'auto'`` unchanged, otherwise ``float(value)``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is neither ``'auto'`` nor a
            valid float literal.
    """
    if value == 'auto':
        return value
    try:
        return float(value)
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "contamination must be 'auto' or a float, got {!r}".format(value)
        ) from err


def str_to_bool(value):
    """Convert a command-line string such as ``'False'`` to a real bool.

    ``type=bool`` cannot be used with argparse because ``bool('False')`` is
    ``True`` (any non-empty string is truthy).

    Args:
        value: Raw command-line string, or an actual bool.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in TRUE_STRINGS:
        return True
    if text in FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value)
    )


def encode_labels(labels):
    """Map raw labels to the values IsolationForest predicts.

    Args:
        labels: Iterable of raw labels.

    Returns:
        A list with 1 for every label in ``NORMAL_LABELS`` and -1 for every
        other label.
    """
    return [1 if label in NORMAL_LABELS else -1 for label in labels]


def align_columns(frame, reference_columns):
    """Give ``frame`` exactly the columns in ``reference_columns``, in order.

    Columns missing from ``frame`` are added and filled with 0; columns that
    are not in ``reference_columns`` are dropped.

    Args:
        frame: DataFrame to align.
        reference_columns: Column labels to conform to.

    Returns:
        A new DataFrame with the same rows as ``frame``.
    """
    return frame.reindex(columns=reference_columns, fill_value=0)


# Main guard for entry point safety purposes
if __name__ == '__main__':

    # Use argument parsers for all parameters
    parser = argparse.ArgumentParser()

    # Model parameters (for now we only use estimator count, contamination and bootstrapping parameters)
    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--contamination', type=parse_contamination, default='auto')
    parser.add_argument('--bootstrap', type=str_to_bool, default=False)

    # Data, model and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='train.csv')
    parser.add_argument('--test-file', type=str, default='test.csv')
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))

    # Unknown arguments are tolerated rather than treated as errors
    args, _ = parser.parse_known_args()

    # Load train/test data into dataframes
    Xtrain = pd.read_csv(os.path.join(args.train, args.train_file))
    Xtest = pd.read_csv(os.path.join(args.test, args.test_file))

    # Split the label column off the features. Isolation forest predicts -1 for
    # outliers and 1 for non-outliers, so normal connections become 1 and every
    # other label becomes -1.
    Ytrain = encode_labels(Xtrain.pop('labels'))
    Ytest = encode_labels(Xtest.pop('labels'))

    # One hot encode service feature
    Xtrain = pd.get_dummies(Xtrain, prefix='service', columns=['service'])
    Xtest = pd.get_dummies(Xtest, prefix='service', columns=['service'])

    # Train and test can contain different service values, so their one-hot
    # columns differ. Conform the test features to the training layout: same
    # columns, same order, with missing indicators set to 0.
    Xtest = align_columns(Xtest, Xtrain.columns)

    # Define model and train model
    model = IsolationForest(n_estimators=args.n_estimators, contamination=args.contamination,
                            bootstrap=args.bootstrap, verbose=1, random_state=42, behaviour='new')
    # IsolationForest is unsupervised and ignores y; the labels are passed only
    # to keep the original call unchanged.
    model.fit(Xtrain, Ytrain)

    # Dump model into file
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print('model persisted at ' + path)

    # Predict labels, then calculate precision, accuracy and recall for model
    predLabels = model.predict(Xtest)
    report = classification_report(Ytest, predLabels)
    print(report)

Overwriting trainingScript.py
