# Import libraries and set/get AWS properties

In [2]:
import pandas as pd
import boto3, re, sys, math, json, os, sagemaker, urllib.request, argparse
from sagemaker import get_execution_role
from sklearn.datasets import fetch_kddcup99
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split

# Plain configuration values: S3 bucket that stores the data and the key prefix used inside it
bucketName = "tdcx-test"
prefix = "sagemaker"

# IAM execution role and SageMaker session for this notebook, plus the current AWS region
role = get_execution_role()
session = sagemaker.Session()
region = boto3.session.Session().region_name

# Low-level SageMaker API client (used to look up training-job details)
mboto3 = boto3.client("sagemaker")

# High-level boto3 resource handle for S3
s3 = boto3.resource("s3")

# ETL on dataset

In [3]:
# Fetch the KDD Cup '99 data through scikit-learn's dataset loader.
# The 'SF' subset keeps only records where logged_in is positive, so abnormal
# connections (roughly 0.3% of the rows) can be treated as intrusion attacks.
rawData = fetch_kddcup99(subset="SF")

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    rawData.data,
    rawData.target,
    test_size=0.3,
    random_state=42,
)

# Wrap the feature arrays in DataFrames with named columns and append the target
# as a trailing 'labels' column, ready to be written out and stored in S3.
# NOTE(review): the loader appears to return labels as byte strings — confirm the
# training script copes with how they are serialized to CSV.
trainX = pd.DataFrame(Xtrain, columns=rawData.feature_names).assign(labels=Ytrain)
testX = pd.DataFrame(Xtest, columns=rawData.feature_names).assign(labels=Ytest)

# Set up training session on SageMaker

In [4]:
# Local file names for the two splits, in (train, test) order
csvNames = ("train.csv", "test.csv")

# Write each split to disk with a header row and without the DataFrame index
for frame, csvName in zip((trainX, testX), csvNames):
    frame.to_csv(csvName, index=False, header=True)

# Upload both files to the S3 bucket under the shared key prefix and keep the returned paths
trainPath, testPath = (
    session.upload_data(path=csvName, bucket=bucketName, key_prefix=prefix)
    for csvName in csvNames
)

# Create estimator

In [59]:
# Create the scikit-learn estimator from the SageMaker SDK; it runs trainingScript.py
# as a training job on a single ml.m4.xlarge instance.
# `instance_count` / `instance_type` are the SDK v2 names for the former
# `train_instance_count` / `train_instance_type` arguments, which are deprecated
# and trigger the "See: .../v2.html for details" warnings on every run.
estimator = SKLearn(
    entry_point = 'trainingScript.py',
    role = role,
    instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    framework_version = '0.20.0',
    base_job_name = 'rf-scikit',
    hyperparameters = {'n-estimators': 150}
)

# Start the training job on the uploaded S3 data without blocking the cell
estimator.fit({'train': trainPath, 'test': testPath}, wait = False)

# Block until the job finishes; the string 'None' (not the None object) asks the SDK
# to wait without streaming the container logs, so only status lines are shown
estimator.latest_training_job.wait(logs = 'None')

# Look up the S3 location of the trained model artifact for the finished job
artifact = mboto3.describe_training_job(TrainingJobName = estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

# Deploy the trained model to a real-time endpoint backed by one ml.t2.medium instance
predictor = estimator.deploy(instance_type = 'ml.t2.medium', initial_instance_count = 1)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: rf-scikit-2023-02-12-19-18-32-495



2023-02-12 19:18:33 Starting - Starting the training job....
2023-02-12 19:18:56 Starting - Preparing the instances for training............
2023-02-12 19:20:00 Downloading - Downloading input data....
2023-02-12 19:20:26 Training - Downloading the training image.....
2023-02-12 19:20:56 Training - Training image download completed. Training in progress.....
2023-02-12 19:21:22 Uploading - Uploading generated training model.
2023-02-12 19:21:32 Completed - Training job completed

INFO:sagemaker:Creating model with name: rf-scikit-2023-02-12-19-21-35-010





INFO:sagemaker:Creating endpoint-config with name rf-scikit-2023-02-12-19-21-35-010
INFO:sagemaker:Creating endpoint with name rf-scikit-2023-02-12-19-21-35-010


-------!