In [2]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

In [26]:
# S3 bucket that receives the train/test CSV uploads further down.
bucket = "mobbucketsagemaker9999"

# High-level SageMaker SDK session; its underlying boto session tells us the region.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Low-level SageMaker client, used later for describe/delete API calls.
sm_boto3 = boto3.client("sagemaker")

In [16]:
# Read the raw dataset from the working directory and preview the first rows.
raw_csv_path = "data_mobile_price_range.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [17]:
df.shape

(2000, 21)

In [18]:
df['price_range'].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [19]:
# The last column is the label; every column before it is a feature.
*features, label = df.columns
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi']

In [20]:
# Feature matrix and target vector selected from the full frame.
X, y = df[features], df[label]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [22]:
# Print the shape of each split: both feature frames first, then both label series.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1700, 20)
(300, 20)
(1700,)
(300,)


In [23]:
# Re-attach the label to each split so every exported CSV holds the features
# followed by the target column.
#
# `pd.DataFrame(X_train)` does not copy its input, so assigning a new column to
# the result writes through an alias of the split produced by
# `train_test_split` (risking mutation of X_train / X_test or a
# SettingWithCopyWarning). `assign` returns a brand-new frame with the label
# appended as the last column and leaves the feature frames untouched.
trainX = X_train.assign(**{label: y_train})

testX = X_test.assign(**{label: y_test})

In [24]:
# Write both splits to local CSV files, leaving the DataFrame index out.
for frame, csv_name in ((trainX, "train-v1.csv"), (testX, "test-v1.csv")):
    frame.to_csv(csv_name, index=False)

In [27]:
bucket

'mobbucketsagemaker9999'

In [28]:
# Upload both CSVs under the same bucket/prefix and keep the returned S3 URIs,
# which are passed to the training job as the "train" and "test" inputs.
prefix_pth = "sagemaker/mobile_price_prediction_sklearncontainer"
upload_target = {"bucket": bucket, "key_prefix": prefix_pth}

trainpath = sess.upload_data(path="train-v1.csv", **upload_target)
testpath = sess.upload_data(path="test-v1.csv", **upload_target)

print("train path ", trainpath)
print("test path ", testpath)

train path  s3://mobbucketsagemaker9999/sagemaker/mobile_price_prediction_sklearncontainer/train-v1.csv
test path  s3://mobbucketsagemaker9999/sagemaker/mobile_price_prediction_sklearncontainer/test-v1.csv


In [64]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import boto3
import joblib
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load the trained classifier for serving.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file written
            by the training section of this script.

    Returns:
        The object deserialized from ``<model_dir>/model.joblib``.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

if __name__ == "__main__":
    # Training entry point: everything below runs only when this file is
    # executed as a script, not when it is imported to obtain `model_fn`.
    print("[INFO] exracting arguments")
    arg_parser = argparse.ArgumentParser()
    # (flag, type, default): two model hyperparameters, then the model/data
    # directories (defaulting to the SM_* environment variables) and the CSV
    # file names expected inside the train/test directories.
    option_specs = (
        ("--n_estimators", int, 100),
        ("--random_state", int, 0),
        ("--model-dir", str, os.environ.get("SM_MODEL_DIR")),
        ("--train", str, os.environ.get("SM_CHANNEL_TRAIN")),
        ("--test", str, os.environ.get("SM_CHANNEL_TEST")),
        ("--train-file", str, "train-v1.csv"),
        ("--test-file", str, "test-v1.csv"),
    )
    for flag, value_type, fallback in option_specs:
        arg_parser.add_argument(flag, type=value_type, default=fallback)

    # Unrecognised command-line arguments are tolerated and discarded.
    options, _ = arg_parser.parse_known_args()

    print("sklearn version: ", sklearn.__version__)
    print("joblib version: ", joblib.__version__)

    print("[INFO] Reading Data")
    print()
    train_frame = pd.read_csv(os.path.join(options.train, options.train_file))
    test_frame = pd.read_csv(os.path.join(options.test, options.test_file))

    # The last CSV column is the target; every column before it is a feature.
    # The column layout is taken from the training file and applied to both.
    column_names = train_frame.columns.tolist()
    label = column_names[-1]
    features = column_names[:-1]
    X_train, y_train = train_frame[features], train_frame[label]
    X_test, y_test = test_frame[features], test_frame[label]

    print("Column Order: ")
    print(features)
    print()

    print("Label column is: ", label)
    print()

    print("Data shape: ")
    print()
    print("----- SHAPE OF TRAINING DATA (85%) ------")
    print(X_train.shape)
    print(y_train.shape)
    print()

    print("------ SHAPE OF TESTING DATA(15%) ------")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("[INFO] Training Random Forest Model")
    print()
    forest = RandomForestClassifier(
        n_estimators=options.n_estimators,
        random_state=options.random_state,
        verbose=2,
        n_jobs=1,
    )
    forest.fit(X_train, y_train)

    print()

    # The file name must stay "model.joblib": model_fn loads exactly that name.
    model_path = os.path.join(options.model_dir, "model.joblib")
    joblib.dump(forest, model_path)
    print("model saved at " + model_path)

    # Evaluate the fitted model on the held-out test file.
    predictions = forest.predict(X_test)
    test_acc = accuracy_score(y_test, predictions)
    test_rep = classification_report(y_test, predictions)

    print()
    print(" ---- Metrics Results For Testing Data -----")
    print()
    print("Total Rows are ", X_test.shape[0])
    print("[TESTING] Model accuracy is ", test_acc)
    print("[TESTING] Testing Report:")
    print(test_rep)

Overwriting script.py


In [65]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the SageMaker scikit-learn container used for this job.
FRAMEWORK_VERSION = "0.23-1"

# Estimator that runs script.py as the training entry point on one ml.m5.large
# instance, passing the two hyperparameters through to the script's argparse.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::600627354775:role/sagemakeraccess",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    # The SDK keyword is `use_spot_instances` (plural). The previous spelling,
    # `use_spot_instance`, was swallowed as an unknown extra keyword, so the
    # job ran on on-demand capacity.
    use_spot_instances=True,
    # Upper bound, in seconds, on the training run itself.
    max_run=7200,
    # Managed spot training requires `max_wait`, and it must be >= `max_run`.
    # Raise it to leave extra time for waiting on spot capacity.
    max_wait=7200,
)

In [66]:
sklearn_estimator.fit({"train":trainpath, "test":testpath}, wait=True)

2025-01-20 06:41:26 Starting - Starting the training job...
2025-01-20 06:41:41 Starting - Preparing the instances for training...
2025-01-20 06:42:30 Downloading - Downloading the training image...
2025-01-20 06:43:10 Training - Training image download completed. Training in progress..2025-01-20 06:43:14,442 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-01-20 06:43:14,446 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-01-20 06:43:14,490 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-01-20 06:43:14,643 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-01-20 06:43:14,655 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-01-20 06:43:14,668 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-01-20 06:43:14,677 sagemaker-training-toolkit INFO     Invoking user 

In [67]:
# Block until the most recent training job has finished, then look the job up
# by name to read the S3 location of the model archive it produced.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]


2025-01-20 06:43:34 Starting - Preparing the instances for training
2025-01-20 06:43:34 Downloading - Downloading the training image
2025-01-20 06:43:34 Training - Training image download completed. Training in progress.
2025-01-20 06:43:34 Uploading - Uploading generated training model
2025-01-20 06:43:34 Completed - Training job completed


In [45]:
artifact

's3://sagemaker-us-east-1-600627354775/RF-custom-sklearn-2025-01-19-15-07-12-029/output/model.tar.gz'

In [68]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Model name = fixed prefix immediately followed by the current UTC timestamp.
model_name = f"Custom-Sklearn-model{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Wrap the trained artifact for hosting; script.py is supplied again as the
# entry point because it defines `model_fn`.
model = SKLearnModel(
    name=model_name,
    role="arn:aws:iam::600627354775:role/sagemakeraccess",
    model_data=artifact,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [58]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x7f93dad0a2f0>

In [69]:
# Endpoint name = fixed prefix immediately followed by the current UTC timestamp.
endpoint_name = f"Custom-Sklearn-model{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(f"EndpointName={endpoint_name}")

# Deploy the model behind a single ml.m4.xlarge instance and keep the
# returned predictor for invoking the endpoint.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

EndpointName=Custom-Sklearn-model2025-01-20-06-44-10


-----!

In [55]:
# Ask SageMaker for its endpoint list and print the raw API response.
sm_client = boto3.client(service_name='sagemaker')
endpoints = sm_client.list_endpoints()
print(endpoints)

{'Endpoints': [{'EndpointName': 'Custom-SkLeanr-model2025-01-20-05-40-09', 'EndpointArn': 'arn:aws:sagemaker:us-east-1:600627354775:endpoint/Custom-SkLeanr-model2025-01-20-05-40-09', 'CreationTime': datetime.datetime(2025, 1, 20, 10, 40, 47, 306000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2025, 1, 20, 10, 44, 9, 343000, tzinfo=tzlocal()), 'EndpointStatus': 'InService'}], 'ResponseMetadata': {'RequestId': '8ab5ea44-9d00-46b5-9145-6037ed8e562b', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '8ab5ea44-9d00-46b5-9145-6037ed8e562b', 'content-type': 'application/x-amz-json-1.1', 'content-length': '277', 'date': 'Mon, 20 Jan 2025 06:09:59 GMT'}, 'RetryAttempts': 1}}


In [60]:
testX[features][0:2]

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
405,1454,1,0.5,1,1,0,34,0.7,83,4,3,250,1033,3419,7,5,5,1,1,0
1190,1092,1,0.5,1,10,0,11,0.5,167,3,14,468,571,737,14,4,11,0,1,0


In [70]:
predictor.predict(testX[features][0:2].values.tolist())

array([3, 0])

In [71]:
sm_boto3.delete_endpoint(EndpointName = endpoint_name)

{'ResponseMetadata': {'RequestId': '36fc14cf-4c01-4e61-853a-737ad6e750b1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '36fc14cf-4c01-4e61-853a-737ad6e750b1',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 20 Jan 2025 06:49:35 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}