In [1]:
import sys
import pandas as pd
from time import sleep
from sklearn.model_selection import train_test_split

In [None]:
# Load the orders dataset from the notebook's working directory and preview it.
orders_csv_path = './orders_with_predicted_value.csv'
df = pd.read_csv(orders_csv_path)
df.head()

In [None]:
# Report the dataset size, then the frequency of each value in the first column.
row_count = df.shape[0]
print(f'Number of rows in dataset: {row_count}')

first_column = df.columns[0]
print(df[first_column].value_counts())

## Part 2: Get the data into the right shape

In [None]:
# One-hot encode the categorical (object/category) columns of df.
# The dummy dtype is pinned to 'uint8' -- the default before pandas 2.0 --
# because newer pandas emits bool dummies, which to_csv renders as
# True/False instead of the numeric 1/0 values written to the training,
# validation and test CSV files later in this notebook.
encoded_data = pd.get_dummies(df, dtype='uint8')
encoded_data.head()

In [None]:
# Absolute correlation of every encoded column with the target column.
target_corr = encoded_data.corr()['tech_approval_required'].abs()

# Keep the names of the columns whose absolute correlation exceeds 0.1;
# `columns` is reused by the next cell to trim the dataset.
is_relevant = target_corr > .1
columns = target_corr[is_relevant].index
corrs = target_corr.filter(columns)
corrs

In [None]:
# Restrict the encoded dataset to the columns selected in the previous cell.
encoded_data = encoded_data.loc[:, columns]
encoded_data.head()

## Part 3: Create training, validation and test data sets

In [None]:
# Hold out 30% of the rows, then split that hold-out again so roughly one
# third of it becomes the test set and the rest the validation set.
train_df, val_and_test_data = train_test_split(encoded_data, test_size=0.3, random_state=0)
val_df, test_df = train_test_split(val_and_test_data, test_size=0.333, random_state=0)


def _csv_bytes(frame, with_header):
    """Render ``frame`` as CSV text without the index and return it encoded as bytes."""
    return frame.to_csv(None, header=with_header, index=False).encode()


# Train and validation files are written without a header row;
# the test file keeps its header row.
train_data = _csv_bytes(train_df, False)
val_data = _csv_bytes(val_df, False)
test_data = _csv_bytes(test_df, True)

train_key = f'{data_bucket}/{subfolder}/processed/train.csv'
val_key = f'{data_bucket}/{subfolder}/processed/val.csv'
test_key = f'{data_bucket}/{subfolder}/processed/test.csv'

with s3.open(train_key, 'wb') as f:
    f.write(train_data)

with s3.open(val_key, 'wb') as f:
    f.write(val_data)

with s3.open(test_key, 'wb') as f:
    f.write(test_data)

# Training channels pointing at the CSV files uploaded above.
train_input = sagemaker.TrainingInput(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/train.csv',
    content_type='csv',
)
val_input = sagemaker.TrainingInput(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/val.csv',
    content_type='csv',
)

## Part 4: Train the model

In [None]:
# SageMaker session; also used by later cells to manage the endpoint.
sess = sagemaker.Session()

# Look up the XGBoost container image for the region of the default boto3 session.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)

# Single-instance training job; model artifacts go under the "output" prefix.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{data_bucket}/{subfolder}/output',
    sagemaker_session=sess,
)

# Hyperparameters passed to the XGBoost container.
hyperparameters = {
    'max_depth': 5,
    'subsample': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 100,
    'early_stopping_rounds': 10,
}
estimator.set_hyperparameters(**hyperparameters)

# Start training with the train and validation channels defined earlier.
channels = {'train': train_input, 'validation': val_input}
estimator.fit(channels)

## Part 5: Host the model

In [None]:
# Name of the endpoint that will host the model; reused by the cells below.
endpoint_name = 'order-approval'

# Best-effort removal of an endpoint left over from a previous run.
# Only Exception subclasses are caught, so KeyboardInterrupt / SystemExit
# (for example, interrupting the kernel during the wait) are no longer swallowed.
try:
    sess.delete_endpoint(endpoint_name)
except Exception as err:
    # Deletion failed -- presumably because no such endpoint exists yet.
    # This step stays best-effort, so report it and carry on.
    print(f'No existing endpoint was deleted ({type(err).__name__}).')
else:
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    # Pause before the next cell creates an endpoint with the same name.
    sleep(30)

In [None]:
# Deploy the trained estimator to a single-instance endpoint named endpoint_name.
predictor = estimator.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
from sagemaker.serializers import CSVSerializer

# Have the predictor serialize request payloads with a CSVSerializer.
csv_serializer = CSVSerializer()
predictor.serializer = csv_serializer

## Part 6: Test the model

In [None]:
def get_prediction(row):
    """Return the endpoint's prediction for one row, rounded to an integer.

    Everything after the first element of ``row`` is sent to the global
    ``predictor``; the first element is skipped (presumably the label
    column -- confirm against the layout of the test file). The response
    is decoded as UTF-8, parsed as a float and rounded with ``round``.
    """
    features = row[1:]
    raw_response = predictor.predict(features)
    score = float(raw_response.decode('utf-8'))
    return round(score)

# Read back the test file (written with a header row) from S3.
test_key = f'{data_bucket}/{subfolder}/processed/test.csv'
with s3.open(test_key) as test_file:
    test_data = pd.read_csv(test_file)

# Remember the original column order, get a prediction for every row,
# then move the new 'prediction' column to the front for easy comparison.
cols = test_data.columns.tolist()
test_data['prediction'] = test_data.apply(get_prediction, axis=1)
test_data = test_data[['prediction', *cols]]
test_data.head(10)

In [None]:
# Fraction of test rows where the prediction equals the actual label.
is_correct = test_data['prediction'].eq(test_data['tech_approval_required'])
is_correct.mean()

## Remove the Endpoint (optional)
Running this cell deletes the endpoint. Comment out this cell if you want the endpoint to still exist after "Run All".

In [None]:
# Delete the endpoint named by endpoint_name through the SageMaker session.
sess.delete_endpoint(endpoint_name)