In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/hamideh/.config/sagemaker/config.yaml


In [2]:
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'mobiledatasagemaker'
print("using bucket" + bucket)

using bucketmobiledatasagemaker


In [3]:
df = pd.read_csv("mob_price_classification_train.csv")

In [4]:
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [6]:
#Find the percentage of Values are missing
df.isnull().mean() * 100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [7]:
features = list(df.columns)
label = features.pop(-1)

In [8]:
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi']

In [9]:
df.shape

(2000, 21)

In [10]:
x = df[features]  #Independent features
y = df[label]     #dependent features

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.15, random_state=0)

In [12]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

In [13]:
trainX.to_csv("train-V-1.csv", index = False)
testX.to_csv("test-V-1.csv", index = False)

In [14]:
#send data to s3. SageMaker will take training data from s3
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"
trainpath = sess.upload_data(path = "train-V-1.csv", bucket= bucket, key_prefix=sk_prefix)
testpath = sess.upload_data(path='test-V-1.csv', bucket = bucket, key_prefix=sk_prefix)

In [15]:
trainpath

's3://mobiledatasagemaker/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv'

In [16]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

# function for loading the model
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ == "__main__":

    # Whenever you train your model in sagemaker, it requires some by-default arguments
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))


    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print('Label column is: ', label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
   
    print("Training RandomForest Model....")
    print()
    model = RandomForestClassifier(n_estimators = args.n_estimators, random_state = args.random_state)
    model.fit(X_train, y_train)
    print()


    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()


    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)


Overwriting script.py


In [17]:
role = "arn:aws:iam::126408900611:role/sagemakerRole"

In [18]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
                      entry_point= "script.py",
                      role = role,
                      instance_count = 1,
                      instance_type="ml.m5.xlarge",
                      framework_version= FRAMEWORK_VERSION,
                      base_job_name = "RF-custom-sklearn",
                      hyperparameters={
                          "n_estimators": 100,
                          "random_state": 0,
                      },
                      use_spot_instances = True,
                      max_wait = 7200,
                      max_run = 3600
)

In [19]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-06-06-14-05-15-947


2024-06-06 14:05:19 Starting - Starting the training job...
2024-06-06 14:05:33 Starting - Preparing the instances for training...
2024-06-06 14:05:55 Downloading - Downloading input data...
2024-06-06 14:06:51 Training - Training image download completed. Training in progress.
2024-06-06 14:06:51 Uploading - Uploading generated training model.2024-06-06 14:06:46,466 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-06-06 14:06:46,469 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-06 14:06:46,510 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-06-06 14:06:46,691 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-06 14:06:46,703 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-06 14:06:46,714 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-06 14:0

* Retrieving Training Job Name Immediately After Running the Job

When you run a training job using the SageMaker Python SDK, the `Estimator` object contains a reference to the latest training job. You can access the job name like this:

In [20]:
# Assume 'estimator' is your Estimator object
training_job_name = sklearn_estimator.latest_training_job.name
print("The training job name is:", training_job_name)


The training job name is: RF-custom-sklearn-2024-06-06-14-05-15-947


* Retrieving Past Training Job Names Using Boto3

If you need to find the names of training jobs not directly tied to your current session, or if you want to list multiple jobs, you can use Boto3 to interact with the SageMaker service.

In [21]:
import boto3

# Create a SageMaker client
sagemaker_client = boto3.client('sagemaker')

# List training jobs. You can use pagination if there are many jobs.
response = sagemaker_client.list_training_jobs(
    MaxResults=10,  # You can specify the number of results to return
    SortBy='CreationTime',
    SortOrder='Descending'
)

# Loop through the training jobs and print their names
for job in response['TrainingJobSummaries']:
    print("Training Job Name:", job['TrainingJobName'])


Training Job Name: RF-custom-sklearn-2024-06-06-14-05-15-947
Training Job Name: RF-custom-sklearn-2024-06-05-22-26-07-035
Training Job Name: RF-custom-sklearn-2024-06-04-19-33-21-930
Training Job Name: RF-custom-sklearn-2024-06-04-19-17-16-363
Training Job Name: RF-custom-sklearn-2024-06-04-19-09-45-853


* Accessing Logs Programmatically Using Boto3

If you prefer to access logs programmatically, you can use the AWS SDK for Python (Boto3). Here’s a simple script to help you get started:

In [22]:
import boto3

# Create a CloudWatch Logs client
logs_client = boto3.client('logs')

# Define your log group and log stream names
log_group_name = '/aws/sagemaker/TrainingJobs'
log_stream_name = 'RF-custom-sklearn-2024-06-04-19-33-21-930/algo-1-1717529642'  

# Retrieve log events
response = logs_client.get_log_events(
    logGroupName=log_group_name,
    logStreamName=log_stream_name,
    startFromHead=True
)

# Print the log events
for event in response['events']:
    print(event['message'])


2024-06-04 19:34:51,978 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-06-04 19:34:51,981 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-04 19:34:52,027 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-06-04 19:34:52,191 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-04 19:34:52,204 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-04 19:34:52,216 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-06-04 19:34:52,225 sagemaker-training-toolkit INFO     Invoking user script
Training Env:
{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_sklearn_container.training:main",


In [23]:
import boto3
sts = boto3.client('sts')
print(sts.get_caller_identity())


{'UserId': 'AIDAR23UOYABU6ERTAOQP', 'Account': '126408900611', 'Arn': 'arn:aws:iam::126408900611:user/adm_admin_big_data', 'ResponseMetadata': {'RequestId': 'a16bd7e8-a5e7-41a2-b135-0dee371a1ebf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'a16bd7e8-a5e7-41a2-b135-0dee371a1ebf', 'content-type': 'text/xml', 'content-length': '415', 'date': 'Thu, 06 Jun 2024 14:08:09 GMT'}, 'RetryAttempts': 0}}


In [25]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job( TrainingJobName=sklearn_estimator.latest_training_job.name)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-06-06 14:07:03 Starting - Preparing the instances for training
2024-06-06 14:07:03 Downloading - Downloading the training image
2024-06-06 14:07:03 Training - Training image download completed. Training in progress.
2024-06-06 14:07:03 Uploading - Uploading generated training model
2024-06-06 14:07:03 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-2-126408900611/RF-custom-sklearn-2024-06-06-14-05-15-947/output/model.tar.gz


In [27]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
     name = model_name,
     model_data = artifact,
     role = role,
     entry_point= "script.py",
     framework_version=FRAMEWORK_VERSION,
)

In [28]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x78127f437490>

In [29]:
model_name

'Custom-sklearn-model-2024-06-06-14-34-54'

In [31]:
#endpoint deployment
endpoint_name = "Custom-sklearn-model-" +  strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
       initial_instance_count=1,
       instance_type="ml.m4.xlarge",
       endpoint_name = endpoint_name,
)

EndpointName=Custom-sklearn-model-2024-06-06-14-39-50


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-06-06-14-34-54
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-06-06-14-39-50
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-06-06-14-39-50


------!

In [None]:
testX[features][0:2].values.tolist()

In [33]:
print(predictor.predict(testX[features][0:2].values.tolist()))

[3 0]


In [None]:
sm_boto3.delete_endpoint(EndpointName = endpoint_name)