In [49]:
import pandas as pd
import sagemaker, math, boto3, numpy as np
from pathlib import Path
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

dataset = load_boston()

X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['PRICE'])
df = pd.concat(objs=[y,X], axis=1)


In [4]:
df.head(10)

Unnamed: 0,PRICE,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,24.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,21.6,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,34.7,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,33.4,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,36.2,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,28.7,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,22.9,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,27.1,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,16.5,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,18.9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PRICE    506 non-null    float64
 1   CRIM     506 non-null    float64
 2   ZN       506 non-null    float64
 3   INDUS    506 non-null    float64
 4   CHAS     506 non-null    float64
 5   NOX      506 non-null    float64
 6   RM       506 non-null    float64
 7   AGE      506 non-null    float64
 8   DIS      506 non-null    float64
 9   RAD      506 non-null    float64
 10  TAX      506 non-null    float64
 11  PTRATIO  506 non-null    float64
 12  B        506 non-null    float64
 13  LSTAT    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [6]:
df.describe()

Unnamed: 0,PRICE,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,22.532806,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,9.197104,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,5.0,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,17.025,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,21.2,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,25.0,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,50.0,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [38]:
def get_s3_path(path):
    return f's3://{bucket_name}/xgboost/boston_dataset/{path}'

def get_s3_key(path):
    return f'xgboost/boston_dataset/{path}'

def get_local_path(path):
    dir_path = './datasets/xgboost/boston_dataset'

    Path(dir_path).mkdir(parents=True, exist_ok=True)

    return f'{dir_path}/{path}'

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.8, shuffle=True, random_state=0)

train_dataset = pd.concat(objs=[y_train, X_train], axis=1)
test_dataset = pd.concat(objs=[y_test, X_test], axis=1)

s3_client = boto3.client('s3')
bucket_name = 'ml-bucket-sagemaker'

train_dataset_path = get_local_path('train.csv')
test_dataset_path = get_local_path('test.csv')
train_dataset.to_csv(train_dataset_path, header=False, index=False)
test_dataset.to_csv(test_dataset_path, header=False, index=False)

s3_client.upload_file(train_dataset_path, bucket_name, get_s3_key('train.csv'))
s3_client.upload_file(test_dataset_path, bucket_name, get_s3_key('test.csv'))

In [57]:
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = 'AmazonSageMaker-ExecutionRole-20211003T193315' # sagemaker.get_execution_role()

container = sagemaker.image_uris.retrieve('xgboost', region, '1.2-2')

model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,
    output_path=get_s3_path('model'),
    sagemaker_session=sagemaker_session,
)

model.set_hyperparameters(
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',
    colsample_bytree=0.3,
    alpha=10,
    num_round=30,
    seed=0,
    eval_metric='rmse'
)

train_input = sagemaker.inputs.TrainingInput(get_s3_path('train.csv'), content_type='csv')
test_input = sagemaker.inputs.TrainingInput(get_s3_path('test.csv'), content_type='csv')

model.fit({
    'train': train_input,
    'validation': test_input,
})


2021-10-09 16:20:47 Starting - Starting the training job...
2021-10-09 16:21:10 Starting - Launching requested ML instancesProfilerReport-1633796446: InProgress
......
2021-10-09 16:22:11 Starting - Preparing the instances for training...
2021-10-09 16:23:03 Downloading - Downloading input data...
2021-10-09 16:23:31 Training - Downloading the training image...
2021-10-09 16:24:06 Uploading - Uploading generated training model
2021-10-09 16:24:06 Completed - Training job completed
[34m[2021-10-09 16:23:53.189 ip-10-0-192-227.eu-west-1.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-09:16:23:53:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-10-09:16:23:53:INFO] Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34m[2021-10-09:16:23:53:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[202

In [64]:
# First-time deployment
predictor = model.deploy(
    endpoint_name='xgboost-boston-dataset',
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=sagemaker.serializers.CSVSerializer()
)

# Re-use a pre-existing endpoint
# predictor = sagemaker.predictor.Predictor(
#     endpoint_name='xgboost-boston-dataset',
#     sagemaker=sagemaker_session,
#     serializer=sagemaker.serializers.CSVSerializer()
# )

-------!

In [65]:
def predict(data, rows=1000):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, predictor.predict(array).decode('utf-8')])
    return np.fromstring(predictions[1:], sep=',')

y_pre = predict(X_test.to_numpy())

mse = mean_squared_error(y_test, y_pre)
rmse = math.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 5.151655979721133
