# Import libraries and set/get AWS properties

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import boto3, re, sys, math, json, os, sagemaker, urllib.request, argparse
from sagemaker import get_execution_role
from sklearn.datasets import fetch_kddcup99
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Resolve the SageMaker context used by the rest of the notebook: the SDK
# session, the IAM execution role, and the region of the current boto3 session
session = sagemaker.Session()
role = get_execution_role()
region = boto3.session.Session().region_name

# S3 destination for the dataset: bucket name, key prefix inside the bucket,
# and a boto3 S3 resource handle
bucketName = "tdcx-test"
prefix = "sagemaker"
s3 = boto3.resource("s3")

# ETL on dataset

In [2]:
# Fetch the KDD Cup dataset through scikit-learn's dataset loader.
# The 'SF' subset keeps only connections whose logged_in attribute is positive,
# which lets us treat abnormal connections as intrusion attacks.
# Roughly 0.3% of the connections in this subset are expected to be abnormal.
rawData = fetch_kddcup99(subset='SF')


def _to_isolation_forest_labels(labels):
    """Map b'normal.' to 1 (non-outlier) and every other label to -1 (outlier).

    Isolation forest reports outliers as -1 and non-outliers as 1, so the raw
    labels are recoded to that convention.
    """
    return [-1 if label != b'normal.' else 1 for label in labels]


# 70/30 train/test split with a fixed seed so the split is reproducible
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    rawData.data,
    rawData.target,
    test_size=0.3,
    random_state=42,
)

# Recode the targets to the 1 / -1 convention
Ytrain = _to_isolation_forest_labels(Ytrain)
Ytest = _to_isolation_forest_labels(Ytest)

# Build named-column frames and append the labels as a 'labels' column, since
# features and labels are stored together in the S3 bucket
trainX = pd.DataFrame(Xtrain, columns=rawData.feature_names).assign(labels=Ytrain)
testX = pd.DataFrame(Xtest, columns=rawData.feature_names).assign(labels=Ytest)

# Setup training session on Sagemaker

In [5]:
# Write both splits to local CSV files (header row kept, row index dropped)
for frame, filename in ((trainX, 'train.csv'), (testX, 'test.csv')):
    frame.to_csv(filename, index=False, header=True)

# Alternative input definition for Sagemaker training jobs (currently unused):
# trainPath = sagemaker.inputs.TrainingInput(s3_data = 's3://{}/{}/train'.format(bucketName, prefix), content_type = 'csv')

# Upload the CSV files to the S3 bucket under the shared key prefix; the values
# returned by upload_data are kept as the train/test channel inputs
trainPath, testPath = (
    session.upload_data(path=filename, bucket=bucketName, key_prefix=prefix)
    for filename in ('train.csv', 'test.csv')
)

# Create estimator

In [None]:
# Create Scikit-learn estimator from Sagemaker's SDK
# NOTE(review): train_instance_count / train_instance_type are the SDK v1
# argument names (renamed to instance_count / instance_type in v2) -- confirm
# which SDK version this notebook targets.
# NOTE(review): 'ml.t2.medium' may not be an accepted *training* instance type;
# verify against the SageMaker list of supported training instances.
estimator = SKLearn(
    entry_point = 'trainingScript.py',
    role = role,
    train_instance_count = 1,
    train_instance_type = 'ml.t2.medium',
    framework_version = '0.20.0',
    base_job_name = 'rf-scikit',
    hyperparameters = {'n-estimators': 150}
)

# Fit estimator from S3 bucket data.
# Block until the training job finishes (wait = True): deploy() below needs the
# trained model artifact, which does not exist while the job is still running.
estimator.fit({'train': trainPath, 'test': testPath}, wait = True)

# Deploy estimator to endpoint and get predictor
predictor = estimator.deploy(instance_type = 'ml.t2.medium', initial_instance_count = 1)

# Get response
# NOTE(review): `data` is not defined anywhere in the visible notebook; it must
# be set to the inference payload (in the format the entry-point script
# expects) before this line is run.
response = predictor.predict(data)

In [None]:
# test