<h2> Data Preparation</h2>

In [5]:
import urllib.request

urllib.request.urlretrieve('https://archive.ics.uci.edu/static/public/53/iris.zip','data.zip')

('data.zip', <http.client.HTTPMessage at 0x7f7e36835270>)

In [6]:
!mkdir data

In [7]:
!unzip data.zip -d data/

Archive:  data.zip
  inflating: data/Index              
  inflating: data/bezdekIris.data    
  inflating: data/iris.data          
  inflating: data/iris.names         


In [10]:
import pandas as pd 

data = pd.read_csv('data/iris.data',header=None)

print(data)

       0    1    2    3               4
0    5.1  3.5  1.4  0.2     Iris-setosa
1    4.9  3.0  1.4  0.2     Iris-setosa
2    4.7  3.2  1.3  0.2     Iris-setosa
3    4.6  3.1  1.5  0.2     Iris-setosa
4    5.0  3.6  1.4  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
145  6.7  3.0  5.2  2.3  Iris-virginica
146  6.3  2.5  5.0  1.9  Iris-virginica
147  6.5  3.0  5.2  2.0  Iris-virginica
148  6.2  3.4  5.4  2.3  Iris-virginica
149  5.9  3.0  5.1  1.8  Iris-virginica

[150 rows x 5 columns]


In [11]:
# Converting the class labels into numerical values
data[4]=data[4].replace('Iris-setosa',0)
data[4]=data[4].replace('Iris-virginica',1)
data[4]=data[4].replace('Iris-versicolor',2)
print(data)


       0    1    2    3  4
0    5.1  3.5  1.4  0.2  0
1    4.9  3.0  1.4  0.2  0
2    4.7  3.2  1.3  0.2  0
3    4.6  3.1  1.5  0.2  0
4    5.0  3.6  1.4  0.2  0
..   ...  ...  ...  ... ..
145  6.7  3.0  5.2  2.3  1
146  6.3  2.5  5.0  1.9  1
147  6.5  3.0  5.2  2.0  1
148  6.2  3.4  5.4  2.3  1
149  5.9  3.0  5.1  1.8  1

[150 rows x 5 columns]


In [14]:
#Shuffling the full data and reseting the index as new 
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,0,1,2,3,4
0,5.4,3.4,1.7,0.2,0
1,5.1,3.8,1.6,0.2,0
2,6.1,2.8,4.0,1.3,2
3,5.4,3.4,1.5,0.4,0
4,5.9,3.2,4.8,1.8,2


In [16]:
# Moving the label column to the first column
data=data[[4,0,1,2,3]]
data.head()

Unnamed: 0,4,0,1,2,3
0,0,5.4,3.4,1.7,0.2
1,0,5.1,3.8,1.6,0.2
2,2,6.1,2.8,4.0,1.3
3,0,5.4,3.4,1.5,0.4
4,2,5.9,3.2,4.8,1.8


In [18]:
data.describe()

Unnamed: 0,4,0,1,2,3
count,150.0,150.0,150.0,150.0,150.0
mean,1.0,5.843333,3.054,3.758667,1.198667
std,0.819232,0.828066,0.433594,1.76442,0.763161
min,0.0,4.3,2.0,1.0,0.1
25%,0.0,5.1,2.8,1.6,0.3
50%,1.0,5.8,3.0,4.35,1.3
75%,2.0,6.4,3.3,5.1,1.8
max,2.0,7.9,4.4,6.9,2.5


In [19]:
# Spliting the data  (train , test)

train_data = data[:120]
val_data = data[120:]


<h2> Move Data into S3 bucket </h2

In [20]:
import boto3

bucket_name = 'sagemaker-iris-plant-classification'
train_data.to_csv('data.csv',header=False,index=False)
key='data/train/data'
url='s3://{}/{}'.format(bucket_name,key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')


val_data.to_csv('data.csv',header=False,index=False)
key='data/val/data'
url='s3://{}/{}'.format(bucket_name,key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

<h2>Create Model</h2>

In [23]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

key='model/xgb_model'

s3_output_location = url='s3://{}/{}'.format(bucket_name,key)

xgb_model = sagemaker.estimator.Estimator(
    get_image_uri(boto3.Session().region_name,'xgboost'), # Retrives the docker image of the xgboost
    get_execution_role(), # Used to get the IAM user role to run or access the AWS resources
    train_instance_count=1, # Defines the number of instances required for training 
    train_instance_type='ml.m4.xlarge', # Defines the type of instance used for training
    train_volume_size=5, # Defines the storage space need to be allocated while training the model
    output=s3_output_location, # Defines the location where the model is need to be saved
    sagemaker_session=sagemaker.Session() # Creates an session for the sagemaker to access the resources
)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


<h2>Train the Model</h2> 

In [30]:
xgb_model.set_hyperparameters(max_depth=5, #depth of the decission tree 
                              eta=0.2, # Learning rate of the model 
                              gamma=4, # The minimum error rate that should be reduced to make a split on the node
                              min_child_weight=6, # Define the number of data points should be in a branch 
                              silent=0, # Helps to print the logs while training 
                              objective='multi:softmax', # Used for the multiclass classification
                              num_class=3, # Defines the number of classes in the dataset 
                              num_round=10) # Defines the number of epochs

In [31]:
bucket_name = 'sagemaker-iris-plant-classification'
train_data = 's3://{}/{}'.format(bucket_name , 'data/train')
val_data = 's3://{}/{}'.format(bucket_name , 'data/val')

train_channel = sagemaker.session.s3_input(train_data,content_type='text/csv')
val_channel = sagemaker.session.s3_input(val_data,content_type='text/csv')

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [32]:
data_channels = {'train' : train_channel , 'validation' : val_channel}
xgb_model.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-2024-11-10-09-38-39-142


2024-11-10 09:38:40 Starting - Starting the training job...
2024-11-10 09:38:54 Starting - Preparing the instances for training...
2024-11-10 09:39:26 Downloading - Downloading input data...
2024-11-10 09:40:01 Downloading - Downloading the training image.....[34mArguments: train[0m
[34m[2024-11-10:09:40:58:INFO] Running standalone xgboost training.[0m
[34m[2024-11-10:09:40:58:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8446.32mb[0m
[34m[2024-11-10:09:40:58:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:40:58] S3DistributionType set as FullyReplicated[0m
[34m[09:40:58] 120x4 matrix with 480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-11-10:09:40:58:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:40:58] S3DistributionType set as FullyReplicated[0m
[34m[09:40:58] 30x4 matrix with 120 entries loaded from /opt/ml/input/data/validation?format=csv&label_co

<h2> Deploy the Model </h2>

In [33]:
xgb_predictor = xgb_model.deploy(
                                 initial_instance_count=1, # No of instances to be deployed 
                                 instance_type='ml.m4.xlarge' # Type of the instance to be deployed
                                 )

INFO:sagemaker:Creating model with name: xgboost-2024-11-10-09-45-24-951
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-11-10-09-45-24-951
INFO:sagemaker:Creating endpoint with name xgboost-2024-11-10-09-45-24-951


------!