In [None]:
import numpy as np
import pandas as pd

In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3

In [None]:
# Low-level boto3 SageMaker client; used further down to describe the finished training job.
sm_boto3 = boto3.client("sagemaker")

In [None]:
# SageMaker SDK session; later cells read its boto session's region and call its upload_data helper.
sess = sagemaker.Session()

In [None]:
# Region name of the session's underlying boto session.
# NOTE(review): `region` is not referenced again in the visible cells.
region = sess.boto_session.region_name

In [None]:
# S3 bucket that the train/test CSV uploads below are sent to.
bucket_name = '210211060bucket'

In [None]:
# Echo the configured bucket so the upload target is visible in the notebook output.
print("Using Bucket Name:"+bucket_name)

In [None]:
# Load the raw dataset from a CSV file located relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Class balance: fraction of rows (normalize=True) for each distinct price_range value.
df['price_range'].value_counts(normalize=True)

In [None]:
# Show the column labels of the raw frame.
df.columns

In [None]:
# Percentage of missing values per column: the mean of the boolean null mask is
# the null fraction, scaled here to 0-100. (A null *count* multiplied by 100 is
# not a percentage.)
df.isnull().mean() * 100

In [None]:
# Start from every column name of df; the label column is removed from this list in a later cell.
features = list(df.columns)

In [None]:
# Display the current contents of the feature list.
features

In [None]:
# Rebuild the column list before popping so this cell is idempotent: re-running
# it no longer strips one more column from `features` each time.
features = list(df.columns)
# The last column of df is treated as the target; everything before it is a feature.
label = features.pop(-1)

In [None]:
# Display the column name that was selected as the label.
label

In [None]:
# Separate the frame into the feature matrix (x) and the target column (y).
x, y = df[features], df[label]

In [None]:
# Preview the first rows of the target column.
y.head()

In [None]:
# Hold out 15% of the rows for testing; the fixed random_state pins the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Re-attach the target to each split so every CSV carries features + label
# (label last). Work on explicit copies: wrapping an existing frame with
# pd.DataFrame(...) does not guarantee an independent object, so adding a column
# could otherwise alias/mutate X_train / X_test or trigger chained-assignment
# warnings, depending on the pandas version.
trainX = X_train.copy()
trainX[label] = y_train

testX = X_test.copy()
testX[label] = y_test

In [None]:
# Null count per column in the training split (features plus the re-attached label).
trainX.isnull().sum()

In [None]:
# Null count per column in the test split (features plus the re-attached label).
testX.isnull().sum()

In [None]:
# Write both splits to local CSV files, omitting the DataFrame index column.
# These filenames are the ones uploaded to S3 in the following cells.
trainX.to_csv("train-V-1.csv",index=False)
testX.to_csv("test-V-1.csv",index=False)

In [None]:
# S3 key prefix under which the split CSVs are stored.
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"

# Upload the training split; the returned value is kept for use as the "train" channel input.
trainpath = sess.upload_data(
    key_prefix=sk_prefix,
    bucket=bucket_name,
    path="train-V-1.csv",
)

In [None]:
# Upload the test split to the same bucket/prefix as the training split; the returned
# value is passed to the estimator as the "test" channel input later on.
testpath = sess.upload_data(path="test-V-1.csv",bucket=bucket_name,key_prefix=sk_prefix)

In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import joblib
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file written by
            the training section of this script.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

def parse_args(argv=None):
    """Parse the training script's command-line arguments.

    Hyperparameters arrive as command-line flags; directory locations default
    to the ``SM_MODEL_DIR`` / ``SM_CHANNEL_TRAIN`` / ``SM_CHANNEL_TEST``
    environment variables, and the CSV filenames default to the names the
    notebook uploads.

    Args:
        argv: Optional list of argument strings. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``n_estimators``, ``random_state``,
        ``model_dir``, ``train``, ``test``, ``train_file`` and ``test_file``.
        Unrecognized arguments are ignored rather than rejected.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model and output directories.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    # The defaults are the filenames themselves, not environment-variable lookups
    # (looking up an env var named "train-V-1.csv" would always yield None).
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    # parse_known_args tolerates any extra flags the launcher may append.
    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    args = parse_args()

    print("SKLearn Version:", sklearn.__version__)
    print("Joblib Version:", joblib.__version__)

    print("[INFO] Reading Data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the CSV is treated as the label; all others are features.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building Training and Testing Datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    # Targets must come from the label column, not from the feature columns.
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ", label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist under the filename that model_fn() loads at inference time.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the held-out split.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the SageMaker scikit-learn container used for training and inference.
FRAMEWORK_VERSION = "0.23-1"

# `role` must be an IAM *role* ARN. The value previously hard-coded here was a
# policy ARN ("...:policy/service-role/..."), which is not a role. Resolve the
# execution role from the current environment instead.
# NOTE(review): when running outside a SageMaker-managed notebook, replace this
# call with the ARN of a role that SageMaker is allowed to assume.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=sagemaker.get_execution_role(sagemaker_session=sess),
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    # Names match the --n_estimators / --random_state flags parsed by script.py.
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    # Spot training with explicit limits for waiting and running.
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

In [None]:
# Launch the training job and block until it completes (wait=True makes this a synchronous call).
# The "train" / "test" keys name the input channels; presumably they reach script.py
# as SM_CHANNEL_TRAIN / SM_CHANNEL_TEST — confirm against the container docs.
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

In [None]:
# Make sure the latest job has finished (without log output), then ask the
# SageMaker API where its model artifact was stored.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

In [None]:
# Display the S3 location of the trained model artifact.
artifact

In [None]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Timestamped name so repeated runs do not collide with an existing model.
model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Resolve the execution role from the current environment instead of hard-coding
# an ARN: the previous literal pointed at a different AWS account than the one
# used when configuring the training estimator.
# NOTE(review): when running outside a SageMaker-managed notebook, replace this
# call with the ARN of a role that SageMaker is allowed to assume.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role=sagemaker.get_execution_role(sagemaker_session=sess),
    # script.py provides model_fn(), which loads model.joblib from the artifact.
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
# Display the generated model name.
model_name

In [None]:
## Endpoint deployment
# Build a timestamped endpoint name, report it, then deploy the model behind it.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "Custom-sklearn-model-" + deploy_stamp
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)