In [1]:
import sagemaker
import boto3
import os
import pandas as pd
from sklearn.model_selection import train_test_split

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/mushfiq/.config/sagemaker/config.yaml


In [17]:
# Load key/value pairs from a local .env file into the process environment.
# Later cells read AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION
# and SAGEMAKER_ROLE through os.getenv, so this must run before them.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Build one explicit boto3 session from the values loaded into the environment.
# os.getenv returns None for a variable that is not set.
boto_session = boto3.Session(
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=os.getenv("AWS_DEFAULT_REGION")
)

# Derive the low-level SageMaker client, the SDK session and the region from
# that same session, so every AWS call in this notebook shares one set of
# credentials and one region instead of mixing in boto3's implicit default
# session.
sm_boto3 = boto_session.client('sagemaker')
sagemaker_session = sagemaker.Session(boto_session=boto_session)
region = boto_session.region_name

# Bucket that receives the training/test CSVs uploaded further down.
bucket = "mobbucketsagemakerlearning"
print("Using bucket: ", bucket)

Using bucket:  mobbucketsagemakerlearning


In [4]:
# Read the labelled dataset from the working directory and preview the first rows.
source_csv = "mob_price_classification_train.csv"
df = pd.read_csv(source_csv)
df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 21)

In [6]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [7]:
# Class balance of the target column.
price_range_counts = df["price_range"].value_counts()
price_range_counts

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [8]:
# Column labels of the loaded frame; "price_range" is used as the target below.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [9]:
# Separate the target from the features, then hold out 20% of the rows for
# testing. The fixed random_state keeps the split reproducible across runs.
y = df["price_range"]
X = df.drop(columns="price_range")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Sanity-check the split sizes: features first, then targets.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1600, 20)
(400, 20)
(1600,)
(400,)


In [11]:
label = "price_range"

# Re-attach the target to each feature split so every CSV row carries its label.
# assign() returns a new frame (values aligned on the index), so X_train and
# X_test are left untouched and this cell can be re-run safely.
trainX = X_train.assign(**{label: y_train})
testX = X_test.assign(**{label: y_test})

In [12]:
# Persist both labelled splits next to the notebook, without the index column.
for labelled_frame, csv_name in ((trainX, "train-v1.csv"), (testX, "test-v1.csv")):
    labelled_frame.to_csv(csv_name, index=False)

In [13]:
# Echo the destination bucket name before uploading.
bucket

'mobbucketsagemakerlearning'

In [14]:
# send data to s3, sagemaker will read this data

# Key prefix under which both CSV files are stored in the bucket.
sk_prefix = "sagemaker/mob_price_classification/sklearncontainer"

# Upload the train file first, then the test file; each call returns the
# S3 URI of the uploaded object.
train_data, test_data = (
    sagemaker_session.upload_data(
        path=os.path.abspath(local_csv),
        bucket=bucket,
        key_prefix=sk_prefix,
    )
    for local_csv in ("train-v1.csv", "test-v1.csv")
)

print(train_data)
print(test_data)

s3://mobbucketsagemakerlearning/sagemaker/mob_price_classification/sklearncontainer/train-v1.csv
s3://mobbucketsagemakerlearning/sagemaker/mob_price_classification/sklearncontainer/test-v1.csv


## Script used by AWS SageMaker to train the model

In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score
import sklearn
import joblib
import boto3
import os
import pathlib
from io import StringIO
import argparse
import pandas as pd
import numpy as np


def model_fn(model_dir):
    """Load the fitted classifier saved by the training run.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    # NOTE(review): ``model_fn`` looks like the model-loading hook of the
    # SageMaker scikit-learn serving container -- confirm against the SDK docs.
    return joblib.load(os.path.join(model_dir, "model.joblib"))

if __name__ == '__main__':
    # Training entry point: parse hyperparameters and data locations, fit a
    # random forest, save it as model.joblib, then print hold-out metrics.
    print("[INFO] Extracting arguments...")
    parser = argparse.ArgumentParser()

    # Hyperparameters passed straight to RandomForestClassifier
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--max_depth', type=int, default=6)
    parser.add_argument('--random_state', type=int, default=42)

    # Sagemaker specific arguments (defaults are read from the SM_* environment variables)
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))

    # Arguments for training data
    parser.add_argument('--train_file', type=str, default='train-v1.csv')
    parser.add_argument('--test_file', type=str, default='test-v1.csv')

    # parse_known_args: arguments this parser does not declare are ignored.
    args, _ = parser.parse_known_args()

    # Each location defaults to None when its SM_* variable is unset (for
    # example when the script is run outside a training container). Fail here
    # with a usage message instead of a TypeError from os.path.join below.
    missing_locations = [
        "--{} (or ${})".format(arg_name, env_name)
        for arg_name, env_name in (
            ('model_dir', 'SM_MODEL_DIR'),
            ('train', 'SM_CHANNEL_TRAIN'),
            ('test', 'SM_CHANNEL_TEST'),
        )
        if getattr(args, arg_name) is None
    ]
    if missing_locations:
        parser.error("missing required location(s): {}".format(", ".join(missing_locations)))

    print("Sklearn version: {}".format(sklearn.__version__))

    print("[INFO] Loading the datasets...")

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # 'price_range' is the target column; every other column is a feature.
    X_train = train_df.drop('price_range', axis=1)
    y_train = train_df['price_range']

    X_test = test_df.drop('price_range', axis=1)
    y_test = test_df['price_range']

    print("[INFO] Training dataset shape: {}".format(X_train.shape))
    print("[INFO] Testing dataset shape: {}".format(X_test.shape))

    print("[INFO] Training the model...")

    clf = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        random_state=args.random_state,
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train, y_train)

    print("[INFO] Saving the model...")

    # Same file name that model_fn loads from the model directory.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(clf, model_path)

    print("Model saved at: {}".format(model_path))

    # Evaluate on the held-out test file after the model has been saved.
    y_pred_test = clf.predict(X_test)

    print("[INFO] Calculating metrics...")

    accuracy = accuracy_score(y_test, y_pred_test)
    test_report = classification_report(y_test, y_pred_test)

    print("[INFO] Accuracy: {}".format(accuracy))
    print("[INFO] Classification Report: {}".format(test_report))

Writing script.py


## AWS SageMaker entry point to execute the training script

In [21]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the scikit-learn framework requested for the training job.
FRAMEWORK_VERSION = '0.23-1'

sklearn_estimator = SKLearn(
    entry_point='script.py',  # file written by the %%writefile cell above
    role=os.getenv("SAGEMAKER_ROLE"),  # read from the environment, not hard-coded
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='sklearn-mob-price-classification',
    # Passed to script.py as --n_estimators / --max_depth / --random_state.
    hyperparameters={
        'n_estimators': 100,
        'max_depth': 6,
        'random_state': 42
    },
    use_spot_instances=True,
    # NOTE(review): presumably both limits are in seconds -- confirm in the SDK docs.
    max_run=3600,
    max_wait=7200,
    # Reuse the session that uploaded the data so the training job is created
    # with the same credentials and region instead of an implicit default session.
    sagemaker_session=sagemaker_session,
)

In [22]:
# launch the training job
# Channel names 'train' and 'test' point at the two uploaded CSV locations;
# wait=True blocks the cell until the job finishes.
input_channels = {'train': train_data, 'test': test_data}
sklearn_estimator.fit(input_channels, wait=True)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sklearn-mob-price-classification-2025-07-27-16-54-40-726


2025-07-27 16:54:50 Starting - Starting the training job...
2025-07-27 16:55:27 Downloading - Downloading input data...
2025-07-27 16:55:52 Downloading - Downloading the training image...
2025-07-27 16:56:33 Training - Training image download completed. Training in progress..2025-07-27 16:56:37,433 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-07-27 16:56:37,436 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-07-27 16:56:37,478 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-07-27 16:56:37,663 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-07-27 16:56:37,675 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-07-27 16:56:37,687 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-07-27 16:56:37,696 sagemaker-training-toolkit INFO     Invoking user script
Trai

## Get the trained model artifact location in S3

In [25]:
# Wait for the most recent training job (without streaming its logs), then
# look up where its model artifact was written.
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifacts = job_description['ModelArtifacts']['S3ModelArtifacts']
print(artifacts)


2025-07-27 16:57:01 Starting - Preparing the instances for training
2025-07-27 16:57:01 Downloading - Downloading the training image
2025-07-27 16:57:01 Training - Training image download completed. Training in progress.
2025-07-27 16:57:01 Uploading - Uploading generated training model
2025-07-27 16:57:01 Completed - Training job completed
s3://sagemaker-us-east-1-823151422696/sklearn-mob-price-classification-2025-07-27-16-54-40-726/output/model.tar.gz
