# Hyperparameter tuning

#### Introduction

This notebook performs hyperparameter tuning of the XGBoost algorithm used in the 02_training notebook. \
After the best parameters are found, the model is hosted on an endpoint and used to make predictions on the test dataset.

#### Imports:

In [7]:
import sagemaker
import boto3
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.inputs import TrainingInput
from sklearn.metrics import classification_report

from time import gmtime, strftime

import numpy as np
import pandas as pd
import os

#### Sessions:

In [8]:
# IAM role attached to this notebook environment; handed to every SageMaker job below.
role = sagemaker.get_execution_role()
# AWS region configured for the default boto3 session.
region = boto3.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Bucket paths:

In [9]:
# Project storage layout: bucket, the prefix used by this notebook, and the ingest run to train on.
prefix = "hyp"
bucket = "markos-telco-churn"
bucket_path = f"s3://{bucket}"
input_data_path = "ingest/ingest-2023-10-14-21-32-51"

# Location for model output (presumably an S3 key prefix, given this section is about bucket paths).
# The separator is written explicitly as "/" instead of relying on os.path.join, whose
# separator depends on the operating system the notebook happens to run on.
model_path = "model"
model_output_path = f"{model_path}/output"

In [10]:
# SageMaker session whose default bucket is the project bucket.
sess = sagemaker.Session(default_bucket=bucket)

# Managed XGBoost 1.7-1 training/inference image for this region.
# Reuses `region` from the sessions cell instead of opening another boto3 session
# just to read the same value again.
container = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Define the hyperparameter search space:

In [13]:
# Timestamped base name so each run of this cell gets its own tuning job and output folder.
base_tuning_job_name = f'{prefix}-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'
# S3 URIs always use "/" as separator, so the URI is formatted directly rather than
# built with os.path.join, which would insert the local OS separator.
output_path = f"{bucket_path}/{prefix}/{base_tuning_job_name}"

# Base estimator that every training job launched by the tuner is derived from.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=output_path,
    sagemaker_session=sess,
)

# Static hyperparameters: binary classifier evaluated with the classification error.
# NOTE(review): with autotune enabled the recorded best job ran with num_round=945,
# so num_round=10 is not guaranteed to stay fixed — confirm this is intended.
xgb.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="error",
    num_round=10,
)

# Ranges the tuner samples from for each tuned hyperparameter.
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}

# Metric the tuner optimises, computed on the validation channel.
objective_metric_name = "validation:f1"

# Three training jobs in total, all allowed to run at the same time.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    autotune=True,
    max_jobs=3,
    max_parallel_jobs=3,
    base_tuning_job_name=base_tuning_job_name,
)

#### Define input paths:

In [14]:
# S3 locations of the training and validation CSV files of the selected ingest run.
train_csv_uri = f"{bucket_path}/{input_data_path}/train/train.csv"
validation_csv_uri = f"{bucket_path}/{input_data_path}/val/val.csv"

s3_input_train = TrainingInput(s3_data=train_csv_uri, content_type="csv")
s3_input_validation = TrainingInput(s3_data=validation_csv_uri, content_type="csv")

# Start the tuning job with one data channel per split.
data_channels = {"train": s3_input_train, "validation": s3_input_validation}
tuner.fit(data_channels, include_cls_metadata=False)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.............................................!


In [15]:
# Ask the SageMaker API for the tuning job launched above and show its current status.
tuning_job_status_response = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)
tuning_job_status_response["HyperParameterTuningJobStatus"]

'Completed'

#### Describe and extract the best parameters:

In [16]:
# Describe the tuning job and keep the name of the training job it reports as best.
tuning_job_description = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)
training_job_name = tuning_job_description["BestTrainingJob"]["TrainingJobName"]

In [17]:
# Full description of the best training job, fetched by name.
sagemaker_api = boto3.client("sagemaker")
best_training_job_description = sagemaker_api.describe_training_job(
    TrainingJobName=training_job_name
)

# The description includes the hyperparameters the job was launched with.
best_hyperparameters = best_training_job_description["HyperParameters"]

print("Best Hyperparameters:", best_hyperparameters, sep="\n")

Best Hyperparameters:
{'_tuning_objective_metric': 'validation:f1', 'alpha': '1.5142283504969696', 'eta': '0.5666020934684121', 'eval_metric': 'error', 'max_depth': '7', 'min_child_weight': '1.667369518766365', 'num_round': '945', 'objective': 'binary:logistic'}


#### Deploy model in an Endpoint:

In [34]:
from sagemaker.serializers import CSVSerializer

# NOTE: despite its name, this value is passed as the *endpoint* name below.
endpoint_config_name = f'churn-demo-feature-engineered-xgbpconfig-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'

# Deploy through the tuner to a single-instance real-time endpoint that accepts CSV payloads.
xgb_predictor = tuner.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=CSVSerializer(),
    endpoint_name=endpoint_config_name,
)


2023-10-15 16:00:39 Starting - Preparing the instances for training
2023-10-15 16:00:39 Downloading - Downloading input data
2023-10-15 16:00:39 Training - Training image download completed. Training in progress.
2023-10-15 16:00:39 Uploading - Uploading generated training model
2023-10-15 16:00:39 Completed - Resource retained for reuse
---------!

#### Evaluate on test data

In [35]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` on a deployed endpoint in batches and return the raw predictions.

    Args:
        data: Array-like with a ``shape`` attribute (e.g. a 2-D NumPy array),
            one record per row.
        rows: Maximum number of records sent in a single request.
        predictor: Object exposing ``predict(batch)`` that returns the response
            as UTF-8 encoded bytes with one prediction per line. Defaults to the
            notebook-level ``xgb_predictor``.

    Returns:
        list[str]: One prediction string per record, in input order.

    Raises:
        ValueError: If ``rows`` is smaller than 1.
    """
    if rows < 1:
        raise ValueError(f"rows must be >= 1, got {rows!r}")
    if predictor is None:
        # Fall back to the endpoint deployed earlier in the notebook.
        predictor = xgb_predictor

    n_records = data.shape[0]
    if n_records == 0:
        # Nothing to score: avoid sending an empty payload to the endpoint.
        return []

    # Ceiling division: the smallest number of batches that keeps each one <= rows.
    n_batches = int((n_records + rows - 1) // rows)

    predictions = []
    for batch in np.array_split(data, n_batches):
        response = predictor.predict(batch).decode("utf-8")
        # Parse each response on its own so a missing trailing newline can neither
        # glue two batches together nor drop the final prediction.
        predictions.extend(line for line in response.splitlines() if line.strip())

    return predictions

In [36]:
# Load the test split of the selected ingest run.
test = pd.read_csv(f"{bucket_path}/{input_data_path}/test/test.csv")

# Send every column except the first one to the endpoint.
# NOTE(review): assumes the first column is the label and the rest are features — confirm.
feature_matrix = test.to_numpy()[:, 1:]
predictions = np.array(list(map(float, predict(feature_matrix))))

# Turn the scores into 0/1 predictions using a 0.4 cut-off.
test["pred"] = pd.Series(predictions, index=test.index).gt(0.4).astype(int)
test.head()

Unnamed: 0,Churn,tenure,MonthlyCharges,TotalCharges,gender_M,SeniorCitizen_Y,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No_phone,...,StreamingTV_Yes,StreamingMovies_No_internet,StreamingMovies_Yes,Contract_One_year,Contract_Two_years,PaperlessBilling_Yes,PaymentMethod_Credit_card,PaymentMethod_Electronic_check,PaymentMethod_Mailed_check,pred
0,1.0,0.518765,1.199763,1.084916,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,0.0,0.1923,-0.648933,-0.30318,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,0.0,1.212503,1.306451,1.90117,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0
3,1.0,-1.07275,-0.563916,-0.871316,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,0.0,-0.827902,0.28125,-0.640643,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1


In [38]:
# Per-class precision, recall and F1 of the thresholded predictions against the true labels.
cr = classification_report(y_true=test["Churn"], y_pred=test["pred"])
print(cr)

              precision    recall  f1-score   support

         0.0       0.82      0.87      0.85       300
         1.0       0.63      0.53      0.58       122

    accuracy                           0.77       422
   macro avg       0.73      0.70      0.71       422
weighted avg       0.77      0.77      0.77       422



#### Delete endpoint

In [72]:
# Remove the endpoint behind `xgb_predictor` now that the evaluation is done.
xgb_predictor.delete_endpoint()

#### Register model

In [56]:
# Name of the tuner's best training job; used in the next cell to locate its model artifact.
best_training_job_name = tuner.best_training_job()

In [57]:
# Regional SageMaker client used to find where the best job stored its model artifact.
sagemaker_client = boto3.client(service_name="sagemaker", region_name=region)
best_job_description = sagemaker_client.describe_training_job(
    TrainingJobName=best_training_job_name
)
model_artifacts_s3_uri = best_job_description["ModelArtifacts"]["S3ModelArtifacts"]

In [66]:
# Inference part of the request: which image serves the model, where the artifact
# lives, and the request/response MIME types.
modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [
            {"Image": container, "ModelDataUrl": model_artifacts_s3_uri},
        ],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

# Complete request: package group, description and approval status, followed by
# the inference specification defined above.
create_model_package_input_dict = {
    "ModelPackageGroupName": "test",
    "ModelPackageDescription": "Telco churn model",
    "ModelApprovalStatus": "PendingManualApproval",
    **modelpackage_inference_specification,
}

In [69]:
# Submit the request built above and report the ARN of the resulting model package.
sm_client = boto3.client(service_name="sagemaker", region_name=region)
create_model_package_response = sm_client.create_model_package(
    **create_model_package_input_dict
)
model_package_arn = create_model_package_response["ModelPackageArn"]
print(f"ModelPackage Version ARN : {model_package_arn}")

ModelPackage Version ARN : arn:aws:sagemaker:us-east-1:204113162030:model-package/test/1


#### Delete package

In [65]:
# Inert reference snippet: everything below is a single string literal, so none of it runs.
# It shows a create_model_package call with an explicit package name and a
# delete_model_package call that is given a package ARN.
"""from sagemaker.session import Session
# Specify the model name for registration
model_name = 'my-registered-model3' 
# Register the best model in the Model Registry
model_registry = sess.sagemaker_client
model_registry.create_model_package(
    ModelPackageName=model_name,
    InferenceSpecification={
        'Containers': [
            {
                'Image': container, 
                'ModelDataUrl': model_artifacts_s3_uri
            }
        ],
        'SupportedContentTypes': ['text/csv'],  
        'SupportedResponseMIMETypes': ['text/csv'],
        'SupportedRealtimeInferenceInstanceTypes': ['ml.m4.xlarge'], 
        'SupportedTransformInstanceTypes': ['ml.m4.xlarge'],  
    }
)
#and delete
model_package_arn = 'arn:aws:sagemaker:us-east-1:204113162030:model-package/my-registered-model'

sagemaker_session = sagemaker.Session()
sagemaker_session.sagemaker_client.delete_model_package(ModelPackageName=model_package_arn)
"""