In [None]:
#data preparation
#move data to an S3 bucket
#create the model
#train the model
#deploy the model

# data preparation

In [2]:
import urllib.request
urllib.request.urlretrieve('https://archive.ics.uci.edu/static/public/53/iris.zip',
                           'data.zip')

('data.zip', <http.client.HTTPMessage at 0x7f6ef4a57550>)

In [3]:
!mkdir data
!unzip data.zip -d data/

Archive:  data.zip
  inflating: data/Index              
  inflating: data/bezdekIris.data    
  inflating: data/iris.data          
  inflating: data/iris.names         


In [4]:
import pandas as pd

# read the data

data=pd.read_csv('data/iris.data' , header=None)

# convert to numerical values

data[4] = data[4].replace('Iris-setosa',0)
data[4] = data[4].replace('Iris-virginica',1)
data[4] = data[4].replace('Iris-versicolor',2)
data[4] = data[4].infer_objects(copy=False)

print(data)

# shuffle
data = data.sample(frac=1).reset_index(drop=True)

# change the label col index
data = data[[4,0,1,2,3]]

print(data)

#split (train,val sets)
# 80% train data
# 20% validation data

train_data = data[:120]
val_data = data[120:]


       0    1    2    3  4
0    5.1  3.5  1.4  0.2  0
1    4.9  3.0  1.4  0.2  0
2    4.7  3.2  1.3  0.2  0
3    4.6  3.1  1.5  0.2  0
4    5.0  3.6  1.4  0.2  0
..   ...  ...  ...  ... ..
145  6.7  3.0  5.2  2.3  1
146  6.3  2.5  5.0  1.9  1
147  6.5  3.0  5.2  2.0  1
148  6.2  3.4  5.4  2.3  1
149  5.9  3.0  5.1  1.8  1

[150 rows x 5 columns]
     4    0    1    2    3
0    2  5.6  3.0  4.5  1.5
1    2  5.5  2.6  4.4  1.2
2    0  4.3  3.0  1.1  0.1
3    1  7.1  3.0  5.9  2.1
4    0  5.4  3.9  1.3  0.4
..  ..  ...  ...  ...  ...
145  0  5.1  3.4  1.5  0.2
146  2  6.6  3.0  4.4  1.4
147  2  5.7  2.9  4.2  1.3
148  0  5.7  3.8  1.7  0.3
149  2  5.0  2.0  3.5  1.0

[150 rows x 5 columns]


  data[4] = data[4].replace('Iris-versicolor',2)


# move data to a S3 bucket

In [5]:
import boto3

bucket_name = 'myawsbucket-ml-deploy'

train_data.to_csv('data.csv',header = False, index = False)
key = 'data/train/data'
url = 's3://{}/{}/'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

val_data.to_csv('data.csv',header = False, index = False)
key = 'data/val/data'
url = 's3://{}/{}/'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')


# create a model

In [8]:

import sagemaker 
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker import get_execution_role

key = 'model/xgb_model'
s3_output_location = url= 's3://{}/{}'.format(bucket_name, key)

xgb_model=sagemaker.estimator.Estimator( 
    get_image_uri(boto3.Session().region_name, 'xgboost'),
    get_execution_role(), 
    train_instance_count= 1,
    train_instance_type='ml.m4.xlarge',
    train_volume_size=5,
    output_path = s3_output_location,
    sagemaker_session=sagemaker.Session()
)

xgb_model.set_hyperparameters(max_depth=5,
                              eta=0.2,
                              gamma=4,
                              min_child_weight=6,
                              silent=0,
                              objective='multi:softmax',
                              num_class=3,
                              num_round=10)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


# train model

In [9]:
train_data = 's3://{}/{}'.format(bucket_name,'data/train')
val_data = 's3://{}/{}'.format(bucket_name,'data/val')

train_channel = sagemaker.session.s3_input(train_data,content_type='text/csv')
val_channel = sagemaker.session.s3_input(val_data,content_type='text/csv')

#dict data_channels with two keys
data_channels = {'train':train_channel,'validation':val_channel}

xgb_model.fit(inputs = data_channels)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: xgboost-2024-07-02-08-15-28-577


2024-07-02 08:15:28 Starting - Starting the training job...
2024-07-02 08:15:43 Starting - Preparing the instances for training...
2024-07-02 08:16:10 Downloading - Downloading input data...
2024-07-02 08:16:45 Downloading - Downloading the training image......
2024-07-02 08:17:51 Training - Training image download completed. Training in progress.
2024-07-02 08:17:51 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-07-02:08:17:45:INFO] Running standalone xgboost training.[0m
[34m[2024-07-02:08:17:45:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8463.96mb[0m
[34m[2024-07-02:08:17:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[08:17:45] S3DistributionType set as FullyReplicated[0m
[34m[08:17:45] 120x4 matrix with 480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-07-02:08:17:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[08:17:

# deploy model

In [10]:
xgb_predictor = xgb_model.deploy(initial_instance_count = 1,
                                 instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-07-02-08-22-07-295
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-07-02-08-22-07-295
INFO:sagemaker:Creating endpoint with name xgboost-2024-07-02-08-22-07-295


-------!