In [3]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd
import joblib

# AWS handles: a low-level SageMaker API client and a SageMaker SDK session.
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
# Region is taken from the boto session that backs the SDK session.
region = sess.boto_session.region_name

# S3 bucket that receives the train/test CSV files.
BUCKET = "royzacawsbucket"
print(f"using bucket: {BUCKET}")

# Raw dataset, read from the notebook's working directory.
df = pd.read_csv("Data_Set.csv")

using bucket: royzacawsbucket


In [4]:

# The last column of the dataset is the label; every column before it is a feature.
*features, label = df.columns

x = df[features]
y = df[label]

# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=0,
)

In [5]:
# Sanity check: container type of the features, then the shape of every split
# (one value per line, same order as before).
print(
    type(X_train),
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
    sep="\n",
)

<class 'pandas.core.frame.DataFrame'>
(1700, 20)
(300, 20)
(1700,)
(300,)


In [6]:
# Re-attach the label to each split so every exported CSV carries the features
# followed by the target as its last column (script.py pops the last column as
# the label).
#
# Work on explicit copies: pd.DataFrame(X_train) does not copy by default, so
# assigning a column through it can alias the frame returned by
# train_test_split (leaking the label into X_train / X_test) or emit
# SettingWithCopyWarning, depending on the pandas version. With .copy() the
# cell is also safe to re-run.
trainX = X_train.copy()
trainX[label] = y_train

testX = X_test.copy()
testX[label] = y_test

In [7]:
# S3 key prefix under which both splits are stored.
sk_prefix = "sagemaker/gender_classification/sklearncontainer"

# Write both splits to local CSV files first (without the index column) ...
trainX.to_csv("train-V-1.csv", index=False)
testX.to_csv("test-V-1.csv", index=False)

# ... then push each file to the bucket; the returned locations are fed to the
# estimator's "train" / "test" channels later on.
trainpath = sess.upload_data(path="train-V-1.csv", bucket=BUCKET, key_prefix=sk_prefix)
testpath = sess.upload_data(path="test-V-1.csv", bucket=BUCKET, key_prefix=sk_prefix)

print(trainpath, testpath, sep="\n")

s3://royzacawsbucket/sagemaker/gender_classification/sklearncontainer/train-V-1.csv
s3://royzacawsbucket/sagemaker/gender_classification/sklearncontainer/test-V-1.csv


In [8]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load and return the estimator stored as ``model.joblib`` inside ``model_dir``.

    NOTE(review): presumably called by the serving container when the endpoint
    starts, since script.py is also the model's entry point -- confirm.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)
    

if __name__ == "__main__":
    # Training entry point. Hyperparameters arrive as CLI arguments; the data
    # and model locations default to the SM_* environment variables.
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as CLI arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories.
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    # Fail fast with a readable message when a location is missing. Without
    # this check a missing SM_* variable only surfaces as a TypeError inside
    # os.path.join -- for model_dir, after the model has already been trained.
    missing = [
        flag
        for flag, value in (
            ("--model_dir", args.model_dir),
            ("--train", args.train),
            ("--test", args.test),
        )
        if not value
    ]
    if missing:
        parser.error(
            "missing location(s): "
            + ", ".join(missing)
            + " (pass the flag or set SM_MODEL_DIR / SM_CHANNEL_TRAIN / SM_CHANNEL_TEST)"
        )

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training CSV is the label; the rest are features.
    features = list(train_df.columns)
    label = features.pop(-1)
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    # Select test columns by the training feature names so both frames share
    # the same column order.
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ", label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training Random Forest Model.....")
    print()
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn() loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the held-out test split.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)
    print("Test Accuracy: ", test_acc)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print("[TESTING] Model Accuracy: ", test_acc)
    # Bug fix: this label used to be followed by the accuracy value again; it
    # now heads the classification report it announces.
    print("[TESTING] Testing Report: ")
    print(test_rep)
    
    
    
    

    

Overwriting script.py


In [9]:
from sagemaker.sklearn.estimator import SKLearn


# scikit-learn container version, shared by the training job and the deployed model.
FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    # What to run and under which identity.
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::954247117315:role/sagemaker",
    base_job_name="RF-customer-sklearn",
    # Parsed by script.py as --n_estimators / --random_state.
    hyperparameters=dict(n_estimators=100, random_state=0),
    # Training hardware.
    instance_type="ml.m5.large",
    instance_count=1,
    # Spot training with its run / wait limits.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [10]:
# Launch the training job on the uploaded splits. wait=True makes this a
# blocking call: the cell streams the job logs and returns once the job ends.
sklearn_estimator.fit(
    inputs={"train": trainpath, "test": testpath},
    wait=True,
)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: RF-customer-sklearn-2023-08-11-01-14-48-385


2023-08-11 01:14:49 Starting - Starting the training job...
2023-08-11 01:15:03 Starting - Preparing the instances for training......
2023-08-11 01:16:07 Downloading - Downloading input data...
2023-08-11 01:16:38 Training - Downloading the training image...
2023-08-11 01:17:13 Training - Training image download completed. Training in progress..2023-08-11 01:17:19,042 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2023-08-11 01:17:19,046 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-08-11 01:17:19,088 sagemaker_sklearn_container.training INFO     Invoking user training script.
2023-08-11 01:17:19,235 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-08-11 01:17:19,247 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-08-11 01:17:19,260 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-08-11 01:

In [11]:
# Block until the latest training job is done, without streaming its logs.
sklearn_estimator.latest_training_job.wait(logs="None")

# Look up where the job stored its model archive.
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact}")


2023-08-11 01:17:39 Starting - Preparing the instances for training
2023-08-11 01:17:39 Downloading - Downloading input data
2023-08-11 01:17:39 Training - Training image download completed. Training in progress.
2023-08-11 01:17:39 Uploading - Uploading generated training model
2023-08-11 01:17:39 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-954247117315/RF-customer-sklearn-2023-08-11-01-14-48-385/output/model.tar.gz


In [12]:
# Display the S3 location of the trained model archive as the cell output.
artifact

's3://sagemaker-us-east-1-954247117315/RF-customer-sklearn-2023-08-11-01-14-48-385/output/model.tar.gz'

In [16]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# UTC timestamp suffix gives every run its own model name.
model_name = strftime("Customer-sklearn-model-%Y-%m-%d-%H-%M-%S", gmtime())

# Wrap the trained artifact in a deployable model; script.py supplies model_fn.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::954247117315:role/sagemaker",
)




In [17]:
# UTC timestamp suffix gives every deployment its own endpoint name.
endpoint_name = strftime("Customer-sklearn-endpoint-%Y-%m-%d-%H-%M-%S", gmtime())
print(f"Endpoint name: {endpoint_name}")

# Deploy the model behind a single-instance real-time endpoint.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

Endpoint name: Customer-sklearn-endpoint-2023-08-11-01-31-38


INFO:sagemaker:Creating model with name: Customer-sklearn-model-2023-08-11-01-29-27
INFO:sagemaker:Creating endpoint-config with name Customer-sklearn-endpoint-2023-08-11-01-31-38
INFO:sagemaker:Creating endpoint with name Customer-sklearn-endpoint-2023-08-11-01-31-38


-----!

In [20]:
# Display the name of the deployed endpoint as the cell output.
endpoint_name

'Customer-sklearn-endpoint-2023-08-11-01-31-38'

In [21]:
# First two test rows, feature columns only, as plain nested lists
# (the payload format sent to the endpoint in the next cell).
testX.loc[:, features].iloc[:2].values.tolist()

[[1454.0,
  1.0,
  0.5,
  1.0,
  1.0,
  0.0,
  34.0,
  0.7,
  83.0,
  4.0,
  3.0,
  250.0,
  1033.0,
  3419.0,
  7.0,
  5.0,
  5.0,
  1.0,
  1.0,
  0.0],
 [1092.0,
  1.0,
  0.5,
  1.0,
  10.0,
  0.0,
  11.0,
  0.5,
  167.0,
  3.0,
  14.0,
  468.0,
  571.0,
  737.0,
  14.0,
  4.0,
  11.0,
  0.0,
  1.0,
  0.0]]

In [22]:
# Send the first two test rows (feature columns only) to the endpoint and
# print the predictions it returns.
print(
    predictor.predict(
        testX.loc[:, features].iloc[:2].values.tolist()
    )
)

[3 0]
