In [1]:
import boto3
import sagemaker
import pandas as pd

In [2]:
# SageMaker session used for every S3 / training call in this notebook
sagemaker_session = sagemaker.Session()

# default S3 bucket associated with that session
bucket = sagemaker_session.default_bucket()

# IAM execution role handed to the training job
role = sagemaker.get_execution_role()

# echo the bucket name as the cell output
bucket

'sagemaker-us-east-1-762275852029'

In [15]:
# local directory that holds the saved feature files
data_dir = 'data'

# S3 key prefix (acts like a folder name) the files are uploaded under
prefix = 'data'

# upload the directory contents to S3 and keep the returned S3 location
data_path = sagemaker_session.upload_data(
    path=data_dir,
    bucket=bucket,
    key_prefix=prefix,
)

In [16]:
# print the key of every object in the bucket to verify the upload
s3_bucket = boto3.resource('s3').Bucket(bucket)
for s3_object in s3_bucket.objects.all():
    print(s3_object.key)

data/train.csv
data/valid.csv
grape-dataset.zip
sagemaker-scikit-learn-2020-05-15-12-30-21-710/source/sourcedir.tar.gz


In [17]:
# show the training script (source/train.py) with syntax highlighting via the pygmentize CLI
!pygmentize source/train.py

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m print_function

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn.externals[39;49;00m [34mimport[39;49;00m joblib

[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
[34mfrom[39;49;00m [04m[36msklearn.ensemble[39;49;00m [34mimport[39;49;00m RandomForestClassifier

[34mdef[39;49;00m [32mmodel_fn[39;49;00m(model_dir):
    [33m"""Load model from the model_dir. This is the same model that is saved[39;49;00m
[33m    in the main if statement.[39;49;00m
[33m    """[39;49;00m
    [34mprint[39;49;00m([33m"[39;49;00m[33mLoading model.[39;49;00m[33m"[39;49;00m)
    
    [37m# load using joblib[39;49;00m
    model = joblib.load(os.path.join(model_dir, [33m"[39;49;00m[33mmodel.joblib[39;4

In [18]:
# Build the scikit-learn estimator that runs source/train.py as a SageMaker training job.
from sagemaker.sklearn.estimator import SKLearn

# Write job output to its own S3 prefix. `data_path` is also what fit() receives
# as the "train" channel, so using it as the output location would place model
# artifacts inside the training-data folder and feed them back in on a re-run.
output_path = 's3://{}/{}'.format(bucket, 'model-output')

estimator = SKLearn(
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.m4.xlarge',
                    entry_point="train.py",
                    source_dir="source",
                    output_path=output_path,
                    sagemaker_session=sagemaker_session,
                    hyperparameters={
                        # NOTE(review): this hands train.py the S3 URI as its
                        # "data-dir" argument -- confirm the script expects an
                        # S3 location here rather than the local channel directory.
                        "data-dir": data_path,
                        "n_estimators": 10,
                        "random_state": 54
                    }
            )

In [19]:
%%time

# launch the training job, feeding the uploaded S3 data in as the "train" channel
estimator.fit(inputs={"train": data_path})

2020-05-15 12:41:39 Starting - Starting the training job...
2020-05-15 12:41:42 Starting - Launching requested ML instances......
2020-05-15 12:42:58 Starting - Preparing the instances for training......
2020-05-15 12:43:52 Downloading - Downloading input data...
2020-05-15 12:44:25 Training - Downloading the training image..[34m2020-05-15 12:44:46,191 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-05-15 12:44:46,194 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-05-15 12:44:46,206 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-05-15 12:44:49,520 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-05-15 12:44:49,520 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-05-15 12:44:49,520 sagemaker-containers INFO     Generating MANIFEST.in[0m
[34m2020-05-15 12:44:49,520 

Training seconds: 72
Billable seconds: 72
CPU times: user 446 ms, sys: 26.4 ms, total: 472 ms
Wall time: 3min 42s


In [20]:
%%time

# stand up a single-instance endpoint for the trained model and keep its predictor
predictor = estimator.deploy(
    instance_type="ml.t2.medium",
    initial_instance_count=1,
)

-----------------!CPU times: user 284 ms, sys: 6.66 ms, total: 291 ms
Wall time: 8min 32s


In [None]:
# Evaluate the deployed model on the held-out validation data (valid.csv)

In [21]:
import os

# load the locally stored validation CSV; the file has no header row
valid_csv = os.path.join(data_dir, "valid.csv")
test_data = pd.read_csv(valid_csv, header=None, names=None)

# column 0 carries the class label, every remaining column is a feature
test_x = test_data.iloc[:, 1:]
test_y = test_data.iloc[:, 0]

In [22]:
# Compute the model's accuracy on the validation set

In [23]:
# Step 1: send the feature columns to the endpoint to get predicted class labels
test_y_preds = predictor.predict(data=test_x)

In [24]:
from sklearn.metrics import accuracy_score

# Step 2: share of rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=test_y, y_pred=test_y_preds)
print(accuracy)

# optional eyeball check: predicted labels followed by the true labels
for heading, labels in (
    ('\nPredicted class labels: ', test_y_preds),
    ('\nTrue class labels: ', test_y.values),
):
    print(heading)
    print(labels)

0.9473684210526315

Predicted class labels: 
[1 1 1 ... 2 2 2]

True class labels: 
[1 1 1 ... 2 2 2]


In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
# rows are true classes, columns are predicted classes
confusion_matrix(y_true=test_y, y_pred=test_y_preds)

array([[418,   0,   0,   5],
       [  3, 443,   2,  32],
       [  0,   0, 424,   6],
       [  6,  27,  14, 425]])

In [None]:
# Clean up the AWS resources created by this notebook

In [None]:
# Tear down the endpoint created by estimator.deploy() once evaluation is done,
# so the hosting instance does not keep running after the notebook finishes.
predictor.delete_endpoint()

In [None]:
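# Optional: empty the S3 bucket.
# WARNING: objects.all().delete() removes EVERY object in the bucket, not only the
# files this notebook uploaded (the listing above also shows e.g. grape-dataset.zip),
# and it does not delete the bucket itself. Uncomment only if that is intended.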
#bucket_to_delete = boto3.resource('s3').Bucket(bucket)
#bucket_to_delete.objects.all().delete()