# Predicting whether an order should be sent to a technical approver

If SageMaker or AWS now behaves differently from what the code in this notebook expects, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-2/v-5/67 for updates.

## Part 1: Load and examine the data

In [1]:
# Location of the raw order data in S3. The notebook reads it from
# s3://<data_bucket>/<subfolder>/<dataset>.
dataset = 'orders_with_predicted_value.csv'  # CSV file with the orders
subfolder = 'ch02'                           # key prefix inside the bucket
data_bucket = 'mlforbusiness'                # S3 bucket name

In [2]:
import pandas as pd
from time import sleep

import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split

# Authenticated (non-anonymous) S3 filesystem handle used to read and write
# the data files.
s3 = s3fs.S3FileSystem(anon=False)
# Execution role that is handed to the SageMaker estimator for training.
role = sagemaker.get_execution_role()

In [3]:
# Load the raw orders straight from S3 and preview the first rows.
raw_data_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(raw_data_uri)
df.head()

Unnamed: 0,tech_approval_required,requester_id,role,product,quantity,price,total
0,0,E2300,tech,Desk,1,664,664
1,0,E2300,tech,Keyboard,9,649,5841
2,0,E2374,non-tech,Keyboard,1,821,821
3,1,E2374,non-tech,Desktop Computer,24,655,15720
4,0,E2327,non-tech,Desk,1,758,758


In [4]:
# Report the dataset size and the class balance of the target column.
# The target is selected by name rather than by position, so the counts
# cannot silently describe a different column if the CSV's column order
# ever changes.
print(f'Number of rows in dataset: {len(df)}')
print(df['tech_approval_required'].value_counts())

Number of rows in dataset: 1000
0    807
1    193
Name: tech_approval_required, dtype: int64


## Part 2: Get the data into the right shape

In [5]:
# One-hot encode the text columns (requester_id, role, product); numeric
# columns pass through unchanged.
# dtype is pinned to uint8 so the indicator columns are 0/1 integers on every
# pandas version: pandas >= 2.0 defaults to bool, which would be written to
# the training CSV files as True/False instead of 0/1.
encoded_data = pd.get_dummies(df, dtype='uint8')
encoded_data.head()

Unnamed: 0,tech_approval_required,quantity,price,total,requester_id_E2300,requester_id_E2301,requester_id_E2302,requester_id_E2303,requester_id_E2304,requester_id_E2306,...,requester_id_E2400,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,1,664,664,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,0,9,649,5841,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,1,821,821,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,1,24,655,15720,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,1,758,758,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [6]:
# Absolute correlation of every encoded column with the target column.
target_corr = encoded_data.corr()['tech_approval_required'].abs()
# Keep only the columns whose absolute correlation exceeds 0.1; `columns`
# holds their names for the column selection in the next cell.
corrs = target_corr[target_corr > .1]
columns = corrs.index
corrs

tech_approval_required      1.000000
role_non-tech               0.122454
role_tech                   0.122454
product_Chair               0.134168
product_Cleaning            0.191539
product_Desk                0.292137
product_Desktop Computer    0.752144
product_Keyboard            0.242224
product_Laptop Computer     0.516693
product_Mouse               0.190708
Name: tech_approval_required, dtype: float64

In [7]:
# Restrict the data to the correlated columns. The target column is part of
# `columns` (its correlation with itself is 1), so it is kept as well.
encoded_data = encoded_data.loc[:, columns]
encoded_data.head()

Unnamed: 0,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,0,1,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,1,0,0
3,1,1,0,0,0,0,1,0,0,0
4,0,1,0,0,0,1,0,0,0,0


## Part 3: Create training, validation and test data sets

In [8]:
# Hold out 30% of the rows, then split that hold-out again so that roughly a
# third of it becomes the test set and the rest the validation set.
train_df, val_and_test_data = train_test_split(encoded_data, test_size=0.3, random_state=0)
val_df, test_df = train_test_split(val_and_test_data, test_size=0.333, random_state=0)

# Serialise each split to CSV bytes. Training and validation files carry no
# header row; the test file keeps its header because it is read back with
# pandas later in the notebook.
train_data, val_data, test_data = (
    frame.to_csv(None, header=keep_header, index=False).encode()
    for frame, keep_header in ((train_df, False), (val_df, False), (test_df, True))
)

# Upload the three files to the processed/ folder in S3.
uploads = {
    f'{data_bucket}/{subfolder}/processed/train.csv': train_data,
    f'{data_bucket}/{subfolder}/processed/val.csv': val_data,
    f'{data_bucket}/{subfolder}/processed/test.csv': test_data,
}
for s3_path, payload in uploads.items():
    with s3.open(s3_path, 'wb') as f:
        f.write(payload)

# Input channels that point the training job at the uploaded CSV files.
train_input = sagemaker.s3_input(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/train.csv',
    content_type='csv',
)
val_input = sagemaker.s3_input(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/val.csv',
    content_type='csv',
)

## Part 4: Train the model

In [11]:
# SageMaker session; reused later in the notebook to delete the endpoint.
sess = sagemaker.Session()

# Look up the XGBoost container image for the boto3 session's region.
region = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region, 'xgboost', 'latest')

# One ml.m4.xlarge training instance; output is written under output/ in S3.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=f's3://{data_bucket}/{subfolder}/output',
    sagemaker_session=sess,
)

# Hyperparameters passed to the XGBoost container.
hyperparameters = {
    'max_depth': 5,
    'subsample': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 100,
    'early_stopping_rounds': 10,
}
estimator.set_hyperparameters(**hyperparameters)

# Train on the 'train' channel with the 'validation' channel alongside it.
estimator.fit({'train': train_input, 'validation': val_input})

2019-08-17 03:44:27 Starting - Starting the training job...
2019-08-17 03:44:29 Starting - Launching requested ML instances......
2019-08-17 03:45:57 Starting - Preparing the instances for training.........
2019-08-17 03:47:01 Downloading - Downloading input data...
2019-08-17 03:47:57 Training - Training image download completed. Training in progress..
[31mArguments: train[0m
[31m[2019-08-17:03:47:58:INFO] Running standalone xgboost training.[0m
[31m[2019-08-17:03:47:58:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8606.25mb[0m
[31m[2019-08-17:03:47:58:INFO] Determined delimiter of CSV input is ','[0m
[31m[03:47:58] S3DistributionType set as FullyReplicated[0m
[31m[03:47:58] 700x9 matrix with 6300 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-08-17:03:47:58:INFO] Determined delimiter of CSV input is ','[0m
[31m[03:47:58] S3DistributionType set as FullyReplicated[0m
[31m[03

## Part 5: Host the model

In [10]:
# Name of the endpoint the model is deployed to in the next cell.
endpoint_name = 'order-approval'

# Best-effort clean-up: remove any endpoint already using this name so the
# deployment below can reuse it.
try:
    sess.delete_endpoint(endpoint_name)
except Exception:
    # Presumably no endpoint with this name exists yet; nothing to clean up.
    # `Exception` (not a bare `except`) lets KeyboardInterrupt and SystemExit
    # propagate, so the notebook can still be interrupted.
    pass
else:
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    # Pause for 30 seconds before the new endpoint is created.
    sleep(30)

In [11]:
# Deploy the trained model to an endpoint named `endpoint_name`, served by a
# single ml.m4.xlarge instance.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name=endpoint_name,
)

---------------------------------------------------------------------------------------------------------------!

In [12]:
from sagemaker.predictor import csv_serializer, json_serializer

# Requests go to the endpoint as CSV text. No deserializer is installed, so
# predict() hands back the raw response, which the caller decodes itself.
predictor.deserializer = None
predictor.serializer = csv_serializer
predictor.content_type = 'text/csv'

## Part 6: Test the model

In [13]:
def get_prediction(row):
    """Return the endpoint's prediction for one test row as a rounded integer.

    The first element of ``row`` (the target column) is dropped and the
    remaining values are sent to the endpoint. The response bytes are decoded
    as UTF-8, parsed as a float and rounded to the nearest integer.
    """
    features = row[1:]
    response = predictor.predict(features)
    score = float(response.decode('utf-8'))
    return round(score)

# Read the held-out test set (written with a header row) back from S3.
with s3.open(f'{data_bucket}/{subfolder}/processed/test.csv') as f:
    test_data = pd.read_csv(f)

# Score every row before the prediction column exists, so each row passed to
# get_prediction contains only the target and the feature columns.
cols = list(test_data.columns)
predictions = test_data.apply(get_prediction, axis=1)

# Put the prediction first, followed by the original columns, and preview.
test_data = test_data.assign(prediction=predictions)[['prediction', *cols]]
test_data.head(10)

Unnamed: 0,prediction,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,0,1,0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,0
3,0,0,1,0,0,0,1,0,0,0,0
4,0,0,1,0,0,0,1,0,0,0,0
5,0,0,1,0,0,1,0,0,0,0,0
6,0,0,1,0,0,0,1,0,0,0,0
7,0,0,1,0,0,0,1,0,0,0,0
8,0,0,1,0,0,1,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,1


In [14]:
# Accuracy: fraction of test rows where the prediction matches the target.
test_data['prediction'].eq(test_data['tech_approval_required']).mean()

0.99

## Remove the Endpoint (optional)
Running this cell removes the endpoint. Comment out this cell if you want the endpoint to still exist after "Run All".

In [None]:
# Delete the endpoint named `endpoint_name` that was deployed earlier in this notebook.
sess.delete_endpoint(endpoint_name)