In [1]:
import sagemaker

In [5]:
import boto3
import pickle
import pandas as pd
from io import StringIO
import io

!pip install gensim
from gensim.models.word2vec import Word2Vec
from sklearn.cluster import KMeans;
from sklearn.neighbors import KDTree
import numpy as np

Collecting gensim
  Downloading gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 7.9 MB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-2.1.0.tar.gz (116 kB)
[K     |████████████████████████████████| 116 kB 60.4 MB/s eta 0:00:01
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Created wheel for smart-open: filename=smart_open-2.1.0-py3-none-any.whl size=110317 sha256=53fb2eeb9e5fd992d668fcc7c01329110986fd71f61d8eab082994230ab84e37
  Stored in directory: /home/ec2-user/.cache/pip/wheels/a4/9b/d5/85705a7ab783cd6f7bd718f01d3b1396272f30044e3c36401a
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-2.1.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [6]:
from sklearn.datasets import load_iris

For this example I'll be using a canonical dataset, since I don't have access to the original data. 


In [7]:
iris_data = load_iris()
iris_data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [8]:
iris_data_arr = iris_data.data
iris_df = pd.DataFrame(iris_data_arr, columns = iris_data.feature_names)


In [9]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Dumping and loading a model

In [10]:
from sklearn.cluster import KMeans
from sagemaker.sklearn.estimator import SKLearn


In [11]:
kmeans = KMeans(3, random_state=12345)
kmeans.fit(iris_df)

KMeans(n_clusters=3, random_state=12345)

Using the popular python library `joblib`, we can store and load our sklearn based models conveniently and quickly. 

In [47]:
from joblib import dump, load
dump(kmeans, 'model/model.joblib') 

['model/model.joblib']

In [48]:
# for loading the model
clusterer = load('model/model.joblib')

In [49]:
# save a model as a tar.gz file then upload
!tar -czvf model.tar.gz model

model/
model/model.joblib


In [50]:
kmeans.cluster_centers_

array([[6.85      , 3.07368421, 5.74210526, 2.07105263],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [5.9016129 , 2.7483871 , 4.39354839, 1.43387097]])

In [51]:
clusterer.cluster_centers_

array([[6.85      , 3.07368421, 5.74210526, 2.07105263],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [5.9016129 , 2.7483871 , 4.39354839, 1.43387097]])

## Saving to s3

In [52]:
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# create an S3 bucket
bucket = sagemaker_session.default_bucket()

In [53]:
bucket_name = bucket
print(bucket_name)

sagemaker-eu-west-2-363162872357


In [82]:
key = 'model/model.joblib'

In [83]:
s3 = boto3.resource('s3')
s3.meta.client.upload_file(key, bucket_name, 'model/model.tar.gz')

One thing I always do is go to my AWS console and double-check that it really is there and it looks as you expect it to look. This has the double effect of letting me know I did something right and reinforcing my confidence in my AWS skill. 

For the sake of coherency and good practice, let's upload the data there too. 

In [57]:
iris_df.to_csv('demo_model/iris_data_test.csv', columns = iris_df.columns)

In [58]:
# should be the name of directory you created to save your features data
data_dir = 'demo_model/iris_data_test.csv'

# set prefix, a descriptive name for a directory  
prefix = 'data'

# upload all data to S3
s3_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket_name, key_prefix=prefix)

In [94]:
# also, here's code to use if you want to programmatically check that your data has been uploaded successfully
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'
print('Test passed!')

data/iris_data_test.csv
model.joblib
model.tar.gz
model/
model/model.tar.gz
Test passed!


Another important thing is that the model will be saved via joblib (we specified this at the end of the `__main__` function in the training script. 

## DEPLOYING an already trained model

You can use an already pretrained model in AWS using SKLearn**Model** rather than just SKLearn. 

I'd previously saved the model artefacts as a joblib file, however the default files expected tend to be tar.gz files so AWS will have no problem recognising that kind of file. 

Two cells down I've provided the URL of my model's parameters and then I've instantiated the enpoint. 
When doing so, I am effectively telling AWS to get the artefacts from S3 and the instructions on how to use them from my `predict_demo.py` file (located in the source_dir). 

Now when this endpoint is deployed, it will always know to follow the instructions in the predict file, even when accessed via Lambda function. 

In [60]:
from sagemaker.sklearn import SKLearnModel

In [95]:
model_location

's3://sagemaker-eu-west-2-363162872357/model.joblib'

In [101]:
import os

In [120]:
# loc = 's3.eu-west-2.amazonaws.com/'
boto3.resource('s3').Bucket(bucket_name).download_file('model/', 'model.tgz')

In [123]:
model_test = os.system('tar -zxvf model.tgz')

In [124]:
model_test

512

In [114]:
bucket_name

'sagemaker-eu-west-2-363162872357'

In [127]:
model_location

's3://sagemaker-eu-west-2-363162872357/model/'

In [129]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [137]:
s3.download_file(bucket_name, 'model/model.tar.gz', 'model/model.tar.gz')

In [144]:
from predict_demo import model_fn

model_fn('model')

ModuleNotFoundError: No module named 'sagemaker_containers'

In [140]:
model = SKLearnModel(model_data=model_location, # pointing to the model artefacts - our learned weights and coeffs
                     role = role, # using the specified ARN
                     framework_version='0.4.0', 
                     entry_point='predict_demo.py', # which file to go to to find the respective functions
                     source_dir='demo_model/train',) # directory to access
#                      predictor_cls=StringPredictor) 

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [141]:
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

ClientError: An error occurred (ValidationException) when calling the CreateModel operation: Requested image 764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-scikit-learn:0.4.0-cpu-py3 not found.

In [None]:
# when using deploy AWS will look to find our model using `model_fn`; the entry point is the same as it was for when we trained 
# the estimator
from sagemaker.predictor import RealTimePredictor
#Uncomment the 3 lines below when we'll be using a string based predictor. Atm this kmeans model is just being used with the iris set

# class StringPredictor(RealTimePredictor):
#     def __init__(self, endpoint_name, sagemaker_session):
#         super(StringPredictor, self).__init__(endpoint_name, sagemaker_session, content_type='text/plain')

# OPTIONAL:  Training and saving an AWS style estimator
Although not necessary for this project, it's still handy to know, especially if you have to train a model on a large amount of data or require GPUs. 

For deployment purposes we need the AWS SKLearn Estimator object. This acts as a dockerised container that allows AWS to interact with our sklearn model. I'll be training the AWS model below. I've written a training script in demo_model/train/train.py. 

In [None]:
%%time
# Train your estimator on S3 training data

output_path = 's3://{}/{}'.format(bucket_name, prefix)

estimator = SKLearn(entry_point='train.py',     # name of the training script AWS should access
                    source_dir = 'demo_model/train', # dir with training script
                    role=role,   # the role we stated higher up in the nb
                    train_instance_count=1,
                    framework_version="0.23-1",  # no need to change this
                    train_instance_type='ml.m4.xlarge' ,  # no need to change this, unless you're using pytorch models OR want to include GP
                    output_path = output_path,  # no need to change this, unless you want a different output location for the file
                    sagemaker_session = sagemaker_session,
                    hyperparameters = {
                                       'n_clusters':4,
                                        }
                   )

estimator.fit({'train': s3_data})

# END THE SESSION; DELETE THE BUCKET AND ENDPOINT
Uncomment the cell below

In [76]:
# sagemaker_session.delete_endpoint()
bucket_to_delete = boto3.resource('s3').Bucket(bucket)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'D59AC4FB3E6F3E9D',
   'HostId': '6O5YKYIIcYLNVsRsurhlvoG41Hd+A7ISEVMrUbXSX+Y0aG2uR/B/dDEyBtQJoHLTv+HTI2Rj3WY=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '6O5YKYIIcYLNVsRsurhlvoG41Hd+A7ISEVMrUbXSX+Y0aG2uR/B/dDEyBtQJoHLTv+HTI2Rj3WY=',
    'x-amz-request-id': 'D59AC4FB3E6F3E9D',
    'date': 'Tue, 25 Aug 2020 11:07:25 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'model.joblib'}, {'Key': 'data/iris_data_test.csv'}]}]