In [1]:
import urllib.request

# Download the UCI Iris archive into the working directory as data.zip.
# The call stays the last expression of the cell so that its return value
# (local path plus response headers) is still shown as the cell output.
iris_archive_url = 'https://archive.ics.uci.edu/static/public/53/iris.zip'
urllib.request.urlretrieve(iris_archive_url, 'data.zip')

('data.zip', <http.client.HTTPMessage at 0x7f202459ad40>)

In [2]:
# Create a directory for extracted data
!mkdir data
!unzip data.zip -d data/

Archive:  data.zip
  inflating: data/Index              
  inflating: data/bezdekIris.data    
  inflating: data/iris.data          
  inflating: data/iris.names         


In [3]:
#read data
import pandas as pd
data = pd.read_csv('data/iris.data', header=None)

# Encode the species names in column 4 as integer class ids.
# A single explicit mapping replaces three chained .replace() calls and
# produces a numeric column directly instead of relying on pandas'
# implicit downcasting of an object column.
species_to_class = {
    'Iris-setosa': 0,
    'Iris-virginica': 1,
    'Iris-versicolor': 2,
}
data[4] = data[4].map(species_to_class)
# .map() turns unknown labels into NaN; fail loudly here rather than
# writing a broken label column to the training CSV.
assert data[4].notna().all(), 'unexpected species label in data/iris.data'
data[4] = data[4].astype(int)

# Shuffle the rows. The fixed seed makes the train/validation split
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Move the label to the first column; the four feature columns keep their
# original order. (The training log reports label_column=0.)
data = data[[4, 0, 1, 2, 3]]

# Split the shuffled rows: the first 80% for training, the remaining 20%
# for validation. With the 150-row file this yields 120 / 30 rows.
TRAIN_FRACTION = 0.8
n_train = round(len(data) * TRAIN_FRACTION)

train_data = data.iloc[:n_train]
val_data = data.iloc[n_train:]


In [4]:
import boto3

bucket_name = 'sagemaker-build-and-deploy-sagemaker'

# Write each split to the local scratch file data.csv (no header, no index)
# and upload it to its own S3 key. The training split goes first, so after
# the loop `key` and `url` hold the validation values and data.csv contains
# the validation rows.
for frame, key in ((train_data, 'data/train/data'),
                   (val_data, 'data/val/data')):
    frame.to_csv('data.csv', header=False, index=False)
    url = 's3://{}/{}'.format(bucket_name, key)
    s3_object = boto3.Session().resource('s3').Bucket(bucket_name).Object(key)
    s3_object.upload_file('data.csv')

In [8]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

# S3 location passed to the estimator as its output path.
key = 'model/xgb_model'
url = 's3://{}/{}'.format(bucket_name, key)
s3_output_location = url

# Look up the 'xgboost' image URI for the session's region and the role
# returned by get_execution_role() before building the estimator.
xgb_image = get_image_uri(boto3.Session().region_name, 'xgboost')
execution_role = get_execution_role()

xgb_model = sagemaker.estimator.Estimator(
    xgb_image,
    execution_role,
    train_instance_count=1,
    train_instance_type='ml.m5.xlarge',
    train_volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
)

# Hyperparameters for a three-class softmax objective with 10 rounds.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'silent': 0,
    'objective': 'multi:softmax',
    'num_class': 3,
    'num_round': 10,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)


In [9]:
# S3 prefixes under which the training and validation CSVs were uploaded.
# They get their own names so the train_data / val_data DataFrames are not
# overwritten with strings; rebinding them would make the upload cell fail
# (str has no .to_csv) if it were re-run after this cell.
train_s3_uri = 's3://{}/{}'.format(bucket_name, 'data/train')
val_s3_uri = 's3://{}/{}'.format(bucket_name, 'data/val')

# Both channels are declared as CSV input.
train_channel = sagemaker.session.s3_input(train_s3_uri, content_type='text/csv')
val_channel = sagemaker.session.s3_input(val_s3_uri, content_type='text/csv')

data_channels = {'train': train_channel, 'validation': val_channel}

# Start the training job with the two channels.
xgb_model.fit(inputs=data_channels)

2025-01-05 04:50:45 Starting - Starting the training job...
2025-01-05 04:50:59 Starting - Preparing the instances for training...
2025-01-05 04:51:44 Downloading - Downloading the training image......
2025-01-05 04:52:40 Training - Training image download completed. Training in progress.
2025-01-05 04:52:40 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2025-01-05:04:52:32:INFO] Running standalone xgboost training.[0m
[34m[2025-01-05:04:52:32:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8151.42mb[0m
[34m[2025-01-05:04:52:32:INFO] Determined delimiter of CSV input is ','[0m
[34m[04:52:32] S3DistributionType set as FullyReplicated[0m
[34m[04:52:32] 120x4 matrix with 480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2025-01-05:04:52:32:INFO] Determined delimiter of CSV input is ','[0m
[34m[04:52:32] S3DistributionType set as FullyReplicated[0m
[34m[04:5

In [10]:
# Deploy the trained estimator on a single ml.m5.xlarge instance and keep
# the object returned by deploy() for later use.
deploy_kwargs = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m5.xlarge',
}
xgb_predictor = xgb_model.deploy(**deploy_kwargs)

------!