# Predicting whether an order should be sent to a technical approver (v2)

For updates on how SageMaker or AWS behavior differs from the notebook code, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-2/v-5/67

## Part 1: Load and examine the data

In [1]:
# Location of the input data on S3.
# Name of the bucket that stores the data.
data_bucket = "jwtest1"
# Sub-folder inside the bucket.
subfolder = "ch02"
# File name of the dataset to use.
dataset = "orders_with_predicted_value.csv"

In [2]:
import sys
import pandas as pd
from time import sleep

import boto3  #파이썬으로 AWS 에 접속하는 라이브러리
import sagemaker # sagemaker 라이브러리
import s3fs # S3 버킷 안의 파일을 다루기 위한 모듈
from sklearn.model_selection import train_test_split


In [3]:
# Look up the SageMaker execution role (retrieved here, not created).
role = sagemaker.get_execution_role()
# Open an s3fs file-system handle for S3; anon=False asks for non-anonymous access.
s3 = s3fs.S3FileSystem(anon=False)

In [4]:
# Load the orders CSV from S3 into a DataFrame and preview the first rows.
# The location is assembled from the bucket, sub-folder and file name set earlier.
source_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(source_uri)
df.head()

Unnamed: 0,tech_approval_required,requester_id,role,product,quantity,price,total
0,0,E2300,tech,Desk,1,664,664
1,0,E2300,tech,Keyboard,9,649,5841
2,0,E2374,non-tech,Keyboard,1,821,821
3,1,E2374,non-tech,Desktop Computer,24,655,15720
4,0,E2327,non-tech,Desk,1,758,758


In [5]:
# Report the dataset size and the class balance of the label.
# The label is selected by name rather than by position, so the report stays
# correct even if the column order of the CSV changes.
print(f'Number of rows in dataset: {df.shape[0]}')
print(df['tech_approval_required'].value_counts())

Number of rows in dataset: 1000
0    807
1    193
Name: tech_approval_required, dtype: int64


## Part 2: Get the data into the right shape

In [9]:
# One-hot encode the categorical columns.
encoded_data = pd.get_dummies(df)
encoded_data.head()

Unnamed: 0,tech_approval_required,quantity,price,total,requester_id_E2300,requester_id_E2301,requester_id_E2302,requester_id_E2303,requester_id_E2304,requester_id_E2306,...,requester_id_E2400,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,1,664,664,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,0,9,649,5841,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,1,821,821,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,1,24,655,15720,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,1,758,758,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [10]:
# Columns that correlate with the target are the useful model inputs.
# Take the absolute correlation of every encoded column with the target ...
abs_corr = encoded_data.corr()['tech_approval_required'].abs()
# ... and keep the labels whose |corr| exceeds 0.1. The target itself
# (|corr| = 1) passes this test, so it stays in `columns` as well.
is_relevant = abs_corr > .1
columns = abs_corr[is_relevant].index
# Show the correlations of the retained columns only.
corrs = abs_corr.loc[columns]
corrs

tech_approval_required      1.000000
role_non-tech               0.122454
role_tech                   0.122454
product_Chair               0.134168
product_Cleaning            0.191539
product_Desk                0.292137
product_Desktop Computer    0.752144
product_Keyboard            0.242224
product_Laptop Computer     0.516693
product_Mouse               0.190708
Name: tech_approval_required, dtype: float64

In [11]:
# Keep only the columns selected above and preview the result.
encoded_data = encoded_data[columns]
encoded_data.head()

Unnamed: 0,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,0,1,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,1,0,0
3,1,1,0,0,0,0,1,0,0,0
4,0,1,0,0,0,1,0,0,0,0


## Part 3: Create training, validation and test data sets

In [12]:
# Split: 70% for training, the remaining 30% held out.
train_df, val_and_test_data = train_test_split(
    encoded_data, test_size=0.3, random_state=2021)
# Of the held-out 30%, 33.3% becomes the test set and the rest the validation set.
val_df, test_df = train_test_split(
    val_and_test_data, test_size=0.333, random_state=2021)

# Serialise each split to CSV text and encode it to bytes for the binary
# S3 writes below. Train and validation are written without a header row
# (presumably because the training container expects header-less CSV);
# the test file keeps its header.
train_data = train_df.to_csv(header=False, index=False).encode()
val_data = val_df.to_csv(header=False, index=False).encode()
test_data = test_df.to_csv(header=True, index=False).encode()

# Upload the three CSV payloads to S3.
uploads = [
    (f'{data_bucket}/{subfolder}/processed/train.csv', train_data),
    (f'{data_bucket}/{subfolder}/processed/val.csv', val_data),
    (f'{data_bucket}/{subfolder}/processed/test.csv', test_data),
]
for s3_path, payload in uploads:
    with s3.open(s3_path, 'wb') as f:
        f.write(payload)

# Describe the uploaded train / validation CSVs as SageMaker training inputs.
train_input = sagemaker.TrainingInput(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/train.csv',
    content_type='csv')
val_input = sagemaker.TrainingInput(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/val.csv',
    content_type='csv')

## Part 4: Train the model

In [13]:
# SageMaker session object, reused by the estimator and by the endpoint cells.
sess = sagemaker.Session()

# Resolve the XGBoost container image for the current region.
region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve('xgboost', region, 'latest')

# Build the estimator that will run the training job.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    # Where the training output is stored.
    output_path=f's3://{data_bucket}/{subfolder}/output',
    sagemaker_session=sess,
)

# Hyperparameters handed to the training job.
hyperparameters = {
    'max_depth': 5,
    'subsample': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 100,
    'early_stopping_rounds': 10,
}
estimator.set_hyperparameters(**hyperparameters)

# Launch training with the train and validation channels.
channels = {'train': train_input, 'validation': val_input}
estimator.fit(channels)

2021-01-31 13:00:50 Starting - Starting the training job...
2021-01-31 13:01:15 Starting - Launching requested ML instancesProfilerReport-1612098050: InProgress
......
2021-01-31 13:02:15 Starting - Preparing the instances for training......
2021-01-31 13:03:16 Downloading - Downloading input data...
2021-01-31 13:03:49 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2021-01-31:13:03:50:INFO] Running standalone xgboost training.[0m
[34m[2021-01-31:13:03:50:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8422.69mb[0m
[34m[2021-01-31:13:03:50:INFO] Determined delimiter of CSV input is ','[0m
[34m[13:03:50] S3DistributionType set as FullyReplicated[0m
[34m[13:03:50] 700x9 matrix with 6300 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-01-31:13:03:50:INFO] Determined delimiter of CSV input is ','[0m
[34m[13:03:50] S3DistributionType s

## Part 5: Host the model

In [14]:
# Host the trained model on a separate, inference-only endpoint.
from sagemaker.serializers import CSVSerializer

endpoint_name = 'order-approval'

# Delete a previous endpoint with the same name, if any. Only errors returned
# by the SageMaker service are treated as "nothing to delete"; anything else
# (including an interrupt from the keyboard) is allowed to propagate instead of
# being silently swallowed.
try:
    sess.delete_endpoint(endpoint_name)
except sess.sagemaker_client.exceptions.ClientError:
    # The service rejected the delete -- presumably because no endpoint with
    # this name exists -- so there is nothing to clean up.
    pass
else:
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    # Pause before re-creating an endpoint under the same name.
    sleep(30)

# Create the endpoint.
predictor = estimator.deploy(initial_instance_count=1,
               instance_type='ml.m4.xlarge',
               endpoint_name=endpoint_name)
# Serialise request payloads to CSV strings before they are sent to the endpoint.
predictor.serializer = CSVSerializer()

-------------!

## Part 6: Test the model

In [15]:
def get_prediction(row):
    """Return the endpoint's prediction for one test row, rounded to an int.

    The first element of ``row`` is skipped because it holds the target;
    the remaining values are sent to the endpoint as the features.
    """
    features = row.iloc[1:]
    raw_response = predictor.predict(features)
    score = float(raw_response.decode('utf-8'))
    return round(score)


# Read the test set back from S3.
test_path = f'{data_bucket}/{subfolder}/processed/test.csv'
with s3.open(test_path) as f:
    test_data = pd.read_csv(f)

# Predict every test row, then put the prediction column first so it sits
# next to the actual label in the preview.
cols = list(test_data.columns)
test_data['prediction'] = test_data.apply(get_prediction, axis=1)
test_data = test_data[['prediction', *cols]]
test_data.head(10)

Unnamed: 0,prediction,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,0,1,0,1,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,0
2,0,0,1,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,1,0,0,0,0
5,1,1,1,0,0,0,0,1,0,0,0
6,1,1,1,0,0,0,0,1,0,0,0
7,0,0,1,0,0,0,0,0,1,0,0
8,0,0,1,0,0,0,1,0,0,0,0
9,0,0,1,0,0,0,1,0,0,0,0


In [16]:
# Accuracy: the share of test rows whose prediction equals the actual label.
is_correct = test_data['prediction'].eq(test_data['tech_approval_required'])
is_correct.mean()

0.99

## Remove the Endpoint (optional)
Running this cell deletes the endpoint. Comment it out if you want the endpoint to still exist after "Run All".

In [17]:
# Delete the hosted endpoint named by `endpoint_name`.
sess.delete_endpoint(endpoint_name)