# Instacart Market Basket Analysis

## Using XGBoost in SageMaker

## Step 1: Preprocess the data

In [1]:
# read the data into dataframe
import pandas as pd

bucket='my_bucket_name'

data_key = 'output_small/data_small.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

data = pd.read_csv(data_location)

In [2]:
# load order product train dataframe
order_product_train_key = 'data/order_products/order_products__train.csv.gz'
order_product_train_location = 's3://{}/{}'.format(bucket, order_product_train_key)

order_product_train = pd.read_csv(order_product_train_location)

In [3]:
# have a look at the data
order_product_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [4]:
# load orders dataframe
orders_key = 'data/orders/orders.csv'
orders_location = 's3://{}/{}'.format(bucket, orders_key)

orders = pd.read_csv(orders_location)
orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [5]:
# attach user_id to order_product_train
order_product_train = order_product_train.merge(orders[['user_id', 'order_id']])
order_product_train

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
0,1,49302,1,1,112108
1,1,11109,2,1,112108
2,1,10246,3,0,112108
3,1,49683,4,0,112108
4,1,43633,5,1,112108
...,...,...,...,...,...
1384612,3421063,14233,3,1,169679
1384613,3421063,35548,4,1,169679
1384614,3421070,35951,1,1,139822
1384615,3421070,16953,2,1,139822


In [6]:
# attach eval_set to data
data = data.merge(orders[orders.eval_set != 'prior'][['user_id','eval_set']])
data

Unnamed: 0,product_id,up_orders,user_mean_days_since_prior,user_period,user_distinct_products,prod_second_orders,prod_reorders,user_reorder_ratio,user_total_products,up_average_cart_position,up_first_order,user_orders,up_last_order,prod_orders,prod_first_orders,user_id,eval_set
0,19508,6,5.969574,2943,200,2869,6681,0.602434,497,7.833333,11,61,29,14905,8224,144185,test
1,42307,1,5.969574,2943,200,349,1549,0.602434,497,3.000000,55,61,55,2188,639,144185,test
2,35883,1,5.969574,2943,200,245,692,0.602434,497,8.000000,52,61,52,1377,685,144185,test
3,13539,1,5.969574,2943,200,15,23,0.602434,497,12.000000,50,61,50,108,85,144185,test
4,27966,3,5.969574,2943,200,19341,105409,0.602434,497,7.000000,53,61,61,137057,31648,144185,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1330790,16741,1,5.381579,409,27,18,57,0.684211,79,5.000000,13,14,13,116,59,63137,train
1330791,24852,1,12.000000,132,9,55166,398609,0.272727,12,2.000000,4,5,4,472565,73956,15227,train
1330792,34969,1,21.818182,240,13,9548,34812,0.454545,18,5.000000,1,3,1,54409,19597,34223,test
1330793,778,1,13.210526,251,18,107,385,0.421053,26,3.000000,3,4,3,707,322,144251,train


In [6]:
# attach target variable: reordered
data = data.merge(order_product_train[['user_id', 'product_id', 'reordered']], how = 'left')
data

Unnamed: 0,product_id,up_orders,user_mean_days_since_prior,user_period,user_distinct_products,prod_second_orders,prod_reorders,user_reorder_ratio,user_total_products,up_average_cart_position,up_first_order,user_orders,up_last_order,prod_orders,prod_first_orders,user_id,reordered
0,19508,6,5.969574,2943,200,2869,6681,0.602434,497,7.833333,11,61,29,14905,8224,144185,
1,21681,1,6.675781,3418,209,186,358,0.595703,514,10.000000,11,57,11,1599,1241,83415,
2,31487,1,10.865079,1369,67,545,1702,0.515873,132,8.000000,5,27,5,3049,1347,39777,
3,32610,1,8.377778,754,76,31,78,0.344444,107,8.000000,9,9,9,198,120,184494,
4,18171,2,12.500000,2400,70,561,1650,0.687500,202,7.500000,3,24,19,3128,1478,132468,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1330790,778,1,13.210526,251,18,107,385,0.421053,26,3.000000,3,4,3,707,322,144251,
1330791,8571,2,9.216387,4387,75,2014,11282,0.886555,497,18.500000,2,20,12,14399,3117,144159,
1330792,23405,2,7.000000,56,10,1065,1805,0.375000,13,1.500000,2,3,3,7952,6147,74381,
1330793,21137,38,4.477180,9908,265,38131,205845,0.894261,2244,9.105263,1,80,58,264683,58838,145686,


In [7]:
# a few more feature engineering, refer to the R code
data['prod_reorder_probability'] = data.prod_second_orders / data.prod_first_orders
data['prod_reorder_times'] = 1 + data.prod_reorders / data.prod_first_orders
data['prod_reorder_ratio'] = data.prod_reorders / data.prod_orders
data.drop(['prod_reorders', 'prod_first_orders', 'prod_second_orders'], axis=1, inplace=True)
data['user_average_basket'] = data.user_total_products / data.user_orders
data['up_order_rate'] = data.up_orders / data.user_orders
data['up_orders_since_last_order'] = data.user_orders - data.up_last_order
data['up_order_rate_since_first_order'] = data.up_orders / (data.user_orders - data.up_first_order + 1)

In [8]:
data

Unnamed: 0,product_id,up_orders,user_mean_days_since_prior,user_period,user_distinct_products,user_reorder_ratio,user_total_products,up_average_cart_position,up_first_order,user_orders,...,prod_orders,user_id,reordered,prod_reorder_probability,prod_reorder_times,prod_reorder_ratio,user_average_basket,up_order_rate,up_orders_since_last_order,up_order_rate_since_first_order
0,19508,6,5.969574,2943,200,0.602434,497,7.833333,11,61,...,14905,144185,,0.348857,1.812378,0.448239,8.147541,0.098361,32,0.117647
1,21681,1,6.675781,3418,209,0.595703,514,10.000000,11,57,...,1599,83415,,0.149879,1.288477,0.223890,9.017544,0.017544,46,0.021277
2,31487,1,10.865079,1369,67,0.515873,132,8.000000,5,27,...,3049,39777,,0.404603,2.263549,0.558216,4.888889,0.037037,22,0.043478
3,32610,1,8.377778,754,76,0.344444,107,8.000000,9,9,...,198,184494,,0.258333,1.650000,0.393939,11.888889,0.111111,0,1.000000
4,18171,2,12.500000,2400,70,0.687500,202,7.500000,3,24,...,3128,132468,,0.379567,2.116373,0.527494,8.416667,0.083333,5,0.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1330790,778,1,13.210526,251,18,0.421053,26,3.000000,3,4,...,707,144251,,0.332298,2.195652,0.544554,6.500000,0.250000,1,0.500000
1330791,8571,2,9.216387,4387,75,0.886555,497,18.500000,2,20,...,14399,144159,,0.646134,4.619506,0.783527,24.850000,0.100000,8,0.105263
1330792,23405,2,7.000000,56,10,0.375000,13,1.500000,2,3,...,7952,74381,,0.173255,1.293639,0.226987,4.333333,0.666667,0,1.000000
1330793,21137,38,4.477180,9908,265,0.894261,2244,9.105263,1,80,...,264683,145686,,0.648068,4.498504,0.777704,28.050000,0.475000,22,0.475000


In [10]:
# split into training and test set, test set does not have target variable
train = data[data.eval_set == 'train'].copy()
test = data[data.eval_set == 'test'].copy()
train

Unnamed: 0,product_id,up_orders,user_mean_days_since_prior,user_period,user_distinct_products,user_reorder_ratio,user_total_products,up_average_cart_position,up_first_order,user_orders,...,user_id,eval_set,reordered,prod_reorder_probability,prod_reorder_times,prod_reorder_ratio,user_average_basket,up_order_rate,up_orders_since_last_order,up_order_rate_since_first_order
14,21681,1,6.675781,3418,209,0.595703,514,10.00,11,57,...,83415,train,,0.149879,1.288477,0.223890,9.017544,0.017544,46,0.021277
15,13984,3,6.675781,3418,209,0.595703,514,5.00,10,57,...,83415,train,,0.400536,2.206638,0.546822,9.017544,0.052632,17,0.062500
16,22395,1,6.675781,3418,209,0.595703,514,5.00,15,57,...,83415,train,,0.340970,1.825202,0.452115,9.017544,0.017544,42,0.023256
17,3252,1,6.675781,3418,209,0.595703,514,4.00,48,57,...,83415,train,,0.233766,1.448052,0.309417,9.017544,0.017544,9,0.100000
18,31506,4,6.675781,3418,209,0.595703,514,3.25,16,57,...,83415,train,,0.377514,1.910764,0.476649,9.017544,0.070175,25,0.095238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1330789,9434,1,21.027027,778,25,0.432432,41,4.00,4,10,...,144232,train,,0.413609,2.047401,0.511576,4.100000,0.100000,6,0.142857
1330790,16741,1,5.381579,409,27,0.684211,79,5.00,13,14,...,63137,train,,0.305085,1.966102,0.491379,5.642857,0.071429,1,0.500000
1330791,24852,1,12.000000,132,9,0.272727,12,2.00,4,5,...,15227,train,1.0,0.745930,6.389813,0.843501,2.400000,0.200000,1,0.500000
1330793,778,1,13.210526,251,18,0.421053,26,3.00,3,4,...,144251,train,,0.332298,2.195652,0.544554,6.500000,0.250000,1,0.500000


In [11]:
# id field won't be used in model, thus make a backup of them and remove from dataframe
test_id = test[['product_id','user_id', 'eval_set']]
test.drop(['product_id','user_id', 'eval_set', 'reordered'], axis=1, inplace=True)
test

Unnamed: 0,up_orders,user_mean_days_since_prior,user_period,user_distinct_products,user_reorder_ratio,user_total_products,up_average_cart_position,up_first_order,user_orders,up_last_order,prod_orders,prod_reorder_probability,prod_reorder_times,prod_reorder_ratio,user_average_basket,up_order_rate,up_orders_since_last_order,up_order_rate_since_first_order
0,6,5.969574,2943,200,0.602434,497,7.833333,11,61,29,14905,0.348857,1.812378,0.448239,8.147541,0.098361,32,0.117647
1,1,5.969574,2943,200,0.602434,497,3.000000,55,61,55,2188,0.546166,3.424100,0.707952,8.147541,0.016393,6,0.142857
2,1,5.969574,2943,200,0.602434,497,8.000000,52,61,52,1377,0.357664,2.010219,0.502542,8.147541,0.016393,9,0.100000
3,1,5.969574,2943,200,0.602434,497,12.000000,50,61,50,108,0.176471,1.270588,0.212963,8.147541,0.016393,11,0.083333
4,3,5.969574,2943,200,0.602434,497,7.000000,53,61,61,137057,0.611129,4.330669,0.769089,8.147541,0.049180,0,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1330780,9,26.212121,1730,30,0.681818,75,3.444444,1,9,9,56087,0.623136,5.326907,0.812274,8.333333,1.000000,0,1.000000
1330783,1,26.760000,669,22,0.160000,26,2.000000,4,8,4,5874,0.407210,2.406391,0.584440,3.250000,0.125000,4,0.200000
1330784,1,10.250000,82,11,0.000000,11,5.000000,2,3,2,17716,0.381535,1.917524,0.478494,3.666667,0.333333,1,0.500000
1330786,2,8.578947,489,49,0.245614,63,4.500000,1,7,4,4774,0.556540,3.252044,0.692501,9.000000,0.285714,3,0.285714


In [12]:
# convert target variable to 1/0 for training dataframe
train['reordered'] = train['reordered'].fillna(0)
train['reordered'] = train.reordered.astype(int)

In [13]:
# drop id columns as they won't be used in model
train.drop(['eval_set', 'user_id', 'product_id'], axis=1, inplace=True)

In [14]:
# this is the target variable dataframe
train_y = train[['reordered']]
# this is the dataframe without target variable
train_X = train.drop(['reordered'], axis = 1)

## Step 2: Classification

In [15]:
import pandas as pd

val_X = train_X[:20000]
train_X = train_X[20000:]

val_y = train_y[:20000]
train_y = train_y[20000:]

In [16]:
# First we make sure that the local directory in which we'd like to store the training and validation csv files exists.
import os
data_dir = 'data/xgboost'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [17]:
# First, save the test data to test.csv in the data_dir directory without label.
pd.DataFrame(test).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

# Then we save the training and validation set into local disk as csv files
pd.concat([val_y, val_X], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([train_y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [18]:
# To save a bit of memory we can set text_X, train_X, val_X, train_y and val_y to None.

train_X = val_X = train_y = val_y = None

### Uploading Training / Validation files to S3

In [19]:
import sagemaker

session = sagemaker.Session() # Store the current SageMaker session

# S3 prefix (which folder will we use)
prefix = 'imba-xgboost'

test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

### Creating a tuned XGBoost model

In [20]:
from sagemaker import get_execution_role

role = get_execution_role()

In [21]:
import sagemaker
container = sagemaker.image_uris.retrieve('xgboost', session.boto_region_name, 'latest')

In [22]:
xgb = sagemaker.estimator.Estimator(container, 
                                    role,                                    
                                    train_instance_count=1,                  
                                    train_instance_type='ml.m4.xlarge',      
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

xgb.set_hyperparameters(max_depth=5,
                        eta=0.1,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Create the hyperparameter tuner

In [23]:
# First, make sure to import the relevant objects used to construct the tuner
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner


# create the tuner object:

xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, 
                                               objective_metric_name = 'validation:rmse', 
                                               objective_type = 'Minimize', 
                                               max_jobs = 4, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

### Fit the hyperparameter tuner

In [24]:
s3_input_train = sagemaker.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.TrainingInput(s3_data=val_location, content_type='csv')
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

...................................................................................................!


### Testing the model

In [25]:
# attach the model:

xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())


2021-12-29 11:33:39 Starting - Preparing the instances for training
2021-12-29 11:33:39 Downloading - Downloading input data
2021-12-29 11:33:39 Training - Training image download completed. Training in progress.
2021-12-29 11:33:39 Uploading - Uploading generated training model
2021-12-29 11:33:39 Completed - Training job completed


Now that we have an estimator object attached to the correct training job, we can proceed as we normally would and create a transformer object.

In [26]:
xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [27]:
# Start the transform job. Make sure to specify the content type and the split type of the test data.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

...................................[34mArguments: serve[0m
[34m[2021-12-29 11:40:08 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-12-29 11:40:08 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-12-29 11:40:08 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-12-29 11:40:08 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-12-29 11:40:08 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-12-29:11:40:08:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-12-29:11:40:08:INFO] Model loaded successfully for worker : 22[0m
[34m[2021-12-29 11:40:08 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-12-29:11:40:08:INFO] Model loaded successfully for worker : 23[0m
[34m[2021-12-29 11:40:08 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[202

In [28]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-ap-southeast-2-111611805220/xgboost-2021-12-29-11-34-24-859/test.csv.out to data/xgboost/test.csv.out


In [29]:
prediction=pd.read_csv('data/xgboost/test.csv.out', header=None, names=["prob"])
test_id = test_id.reset_index().drop(['index','eval_set'],axis = 1)
test = test.reset_index().drop(['index'],axis = 1)
pd.concat([test_id,test],axis=1).to_csv('data/xgboost/test_final.csv', index = False)
!aws s3 cp 'data/xgboost/test_final.csv' 's3://{bucket}/model_output/test_final.csv'

upload: data/xgboost/test_final.csv to s3://imba-johnny/model_output/test_final.csv


## Step 3: Putting our model to work

In [30]:
import boto3

runtime = boto3.Session().client('sagemaker-runtime')

In [31]:
# now we deploy the model as an endpoint
xgb_predictor = xgb_attached.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

-----!

In [32]:
# make a note of the model endpoint been created
xgb_predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'xgboost-2021-12-29-11-41-22-248'

In [35]:
# let's invoke the endpoint and see if it works
response = runtime.invoke_endpoint(EndpointName = xgb_predictor.endpoint, 
                                       ContentType = 'text/csv',                     
                                       Body = '1,6.67578125,3418,209,0.595703125,514,10.0,11,57,11,1599,0.1498791297340854,1.2884770346494763,0.22388993120700437,9.017543859649123,0.017543859649122806,46,0.02127659574468085')

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [36]:
# here is the response: the probablility of a user buy the product
response['Body'].read().decode('utf-8')

'0.008685070089995861'

### Delete the endpoint

In [None]:
xgb_predictor.delete_endpoint()