In [2]:
import sagemaker 
import boto3
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os 


In [None]:
# AWS handles and the target bucket used by the rest of the notebook.
sm_boto = boto3.client(service_name='sagemaker')
sess = sagemaker.Session()
region = sess.boto_session.region_name

bucket = 'bucket250420251'
print(f"Using bucket: {bucket}")

Using bucket: bucket250420251


In [7]:
# Read the two raw CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [8]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [10]:
# Separate the target column from the features, then hold out 20% of the rows
# with a fixed seed so the split is reproducible.
y = train_df['price_range']
X = train_df.drop(columns='price_range')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of every split as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1600, 20), (400, 20), (1600,), (400,))

In [12]:
label = 'price_range'

# Re-attach the target as the last column of each split.
# `assign` returns a new frame, so X_train / X_test are left untouched and the
# cell can be re-run safely. Wrapping with pd.DataFrame(X_train) does not copy
# a DataFrame input by default, so adding a column to that wrapper could warn
# (SettingWithCopyWarning) or write through to the split frames.
trainX = X_train.assign(**{label: y_train})
testX = X_test.assign(**{label: y_test})



In [13]:
# Report the shapes of the two frames that will be written to disk.
print(
    f"trainX shape: {trainX.shape}",
    f"testX shape: {testX.shape}",
    sep="\n",
)

trainX shape: (1600, 21)
testX shape: (400, 21)


In [14]:
# Write both splits to local CSV files; the DataFrame index is not written.
trainX.to_csv(path_or_buf='trainX-v1.csv', index=False)
testX.to_csv(path_or_buf='testX-v1.csv', index=False)

In [15]:

# Upload both CSV files to the bucket under a shared key prefix and print the
# locations returned by the session.
sk_prefix = 'sagemaker/mobile-price-classification/sklearncontainer'

train_path = sess.upload_data(
    path='trainX-v1.csv',
    bucket=bucket,
    key_prefix=sk_prefix,
)
test_path = sess.upload_data(
    path='testX-v1.csv',
    bucket=bucket,
    key_prefix=sk_prefix,
)

print(f"train_path: {train_path}\n\ntest_path: {test_path}")

train_path: s3://bucket250420251/sagemaker/mobile-price-classification/sklearncontainer/trainX-v1.csv
test_path: s3://bucket250420251/sagemaker/mobile-price-classification/sklearncontainer/testX-v1.csv


In [None]:
%%writefile script.py

# Dependencies of the training / inference script.
# Deliberately no runtime `pip install` and no SageMaker SDK imports: nothing
# in this script uses the SDK, and an unpinned, network-dependent install at
# import time would run every time this module is imported.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import sklearn
import joblib
import pandas as pd
import os
import argparse
import numpy as np
import logging
import json
import sys
import boto3

def model_fn(model_dir):
    """Load the classifier stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` artifact.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    return joblib.load(os.path.join(model_dir, 'model.joblib'))

if __name__ == '__main__':
    # Training entry point: parse the arguments, fit a random forest on the
    # training CSV, save it to the model directory and print test-set metrics.

    print("[Info] Extracting arguments from the command line")
    parser = argparse.ArgumentParser()

    # Hyperparameters. `n_estimators` must be an optional flag: the estimator
    # supplies it as `--n_estimators <value>`, and a positional argument would
    # ignore `default` and be required.
    parser.add_argument('--n_estimators', type=int, default=100, help='number of trees in the forest')
    parser.add_argument('--random_state', type=int, default=0, help='random state')

    # Data / model locations; defaults are read from the SM_* environment variables.
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'), help='training data location')
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'), help='test data location')
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'), help='model location')
    parser.add_argument('--train-file', type=str, default='trainX-v1.csv', help='train file name')
    parser.add_argument('--test-file', type=str, default='testX-v1.csv', help='test file name')

    # Unknown arguments are tolerated rather than treated as errors.
    args, _ = parser.parse_known_args()
    print(f"Arguments: {args}")

    print(f"sklearn_version: {sklearn.__version__}")

    print(f"reading training data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    print(f"train_df shape: {train_df.shape}")
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    print(f"test_df shape: {test_df.shape}")

    # The last column of the training CSV is taken as the label; every other
    # column is a feature. The test frame is indexed with the same names.
    features = list(train_df.columns)
    label = features.pop(-1)
    x_train = train_df[features]
    x_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print("Buidling training and test datasets")

    print(f'training random forest classfifier')
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(x_train, y_train)
    print(f"model training complete")

    # Saved under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, 'model.joblib')
    print(f"Saving model to {model_path}")
    joblib.dump(model, model_path)

    # Evaluate on the held-out test file. The "test accuracy: " and
    # "test precision: " prefixes are printed verbatim; keep them stable if
    # anything parses this log output.
    y_pred = model.predict(x_test)
    print(f"model prediction complete")
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"test accuracy: {test_accuracy}")
    print(f"test classification report: {classification_report(y_test, y_pred)}")
    print(f"test confusion matrix: {confusion_matrix(y_test, y_pred)}")
    print(f"test precision: {precision_score(y_test, y_pred, average='weighted')}")


    




Overwriting script.py


In [35]:
from sagemaker.sklearn import SKLearn
from sagemaker import get_execution_role

#arn:aws:iam::257334481187:user/sagemakerexample
framework_version = '0.23-1'

# Scikit-learn estimator that runs script.py as a single-instance spot job.
sklearn_estimator = SKLearn(
    # What to run.
    entry_point='script.py',
    framework_version=framework_version,
    py_version='py3',
    # Who runs it, and through which session.
    role="arn:aws:iam::257334481187:role/service-role/AmazonSageMaker-ExecutionRole-20250425T114402",
    sagemaker_session=sess,
    # Where it runs.
    instance_count=1,
    instance_type='ml.m5.large',
    # Values for the n_estimators / random_state arguments defined in script.py.
    hyperparameters={
        'n_estimators': 100,
        'random_state': 0
    },
    # Patterns matching the "test accuracy: " / "test precision: " lines that
    # script.py prints.
    metric_definitions=[
        {'Name': 'accuracy', 'Regex': 'test accuracy: ([0-9\\.]+)'},
        {'Name': 'precision', 'Regex': 'test precision: ([0-9\\.]+)'}
    ],
    # Spot settings: run-time and wait-time limits are both 3600.
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
)



In [36]:
# Launch training with the uploaded CSVs as the 'train' and 'test' channels;
# wait=False is passed so the call is not asked to wait for the job.
sklearn_estimator.fit(
    {'train': train_path, 'test': test_path},
    wait=False,
)