# Hyperparameter tunning

#### Introduction

This notebook makes a hyperparameter tunning to the XGBoost algorithm used in the 02_training notebook. \
After finding the best parameters the models is hosted in an endpoint and makes predictions on test dataset. 

#### Imports:

In [2]:
import sagemaker
import boto3
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.inputs import TrainingInput
from sklearn.metrics import classification_report

from time import gmtime, strftime

import numpy as np
import pandas as pd
import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Sessions:

In [3]:
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Bucket paths:

In [4]:
prefix = "hyp"
bucket = "markos-telco-churn"
bucket_path = f"s3://{bucket}"
input_data_path = "ingest/ingest-2023-10-14-21-32-51"


model_path = "model"
model_output_path = os.path.join(model_path, "output")

In [5]:
sess = sagemaker.Session(default_bucket=bucket)
container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "1.7-1")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Define the grid:

In [6]:
base_tuning_job_name = f'{prefix}-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'
output_path = os.path.join(bucket_path, prefix, base_tuning_job_name)

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=output_path,
    sagemaker_session=sess,
)

xgb.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="error",
    num_round=10
)

hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}

objective_metric_name = "validation:f1"

tuner = HyperparameterTuner(
    xgb, objective_metric_name, hyperparameter_ranges, autotune=True, max_jobs=3, max_parallel_jobs=3, base_tuning_job_name=base_tuning_job_name)

#### Define input paths:

In [7]:
s3_input_train = TrainingInput(
    s3_data=f"{bucket_path}/{input_data_path}/train/train.csv", content_type="csv"
)
s3_input_validation = TrainingInput(
    s3_data=f"{bucket_path}/{input_data_path}/val/val.csv", content_type="csv"
)

tuner.fit({"train": s3_input_train, "validation": s3_input_validation}, include_cls_metadata=False)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................................!


In [8]:
boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)["HyperParameterTuningJobStatus"]

'Completed'

#### Describe and extract the best parameters:

In [9]:
training_job_name=boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)['BestTrainingJob']['TrainingJobName']

In [10]:
# Describe the best training job to get its hyperparameters
best_training_job_description = boto3.client("sagemaker").describe_training_job(
    TrainingJobName=training_job_name
)

# Extract the hyperparameters from the best training job description
best_hyperparameters = best_training_job_description['HyperParameters']

print("Best Hyperparameters:")
print(best_hyperparameters)

Best Hyperparameters:
{'_tuning_objective_metric': 'validation:f1', 'alpha': '1.153907859890421', 'eta': '0.896689140418828', 'eval_metric': 'error', 'max_depth': '1', 'min_child_weight': '9.060994643450572', 'num_round': '685', 'objective': 'binary:logistic'}


#### Deploy model in an Endpoint:

In [11]:
from sagemaker.serializers import CSVSerializer

endpoint_config_name = f'churn-demo-feature-engineered-xgbpconfig-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'
xgb_predictor = tuner.deploy(
    initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=CSVSerializer(), endpoint_name=endpoint_config_name
)


2023-10-16 12:44:20 Starting - Preparing the instances for training
2023-10-16 12:44:20 Downloading - Downloading input data
2023-10-16 12:44:20 Training - Training image download completed. Training in progress.
2023-10-16 12:44:20 Uploading - Uploading generated training model
2023-10-16 12:44:20 Completed - Resource retained for reuse
-----!

#### Evaluate on test data:

In [12]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ""
    for array in split_array:
        predictions = "".join([predictions, xgb_predictor.predict(array).decode("utf-8")])

    return predictions.split("\n")[:-1]

In [13]:
test = pd.read_csv(f"{bucket_path}/{input_data_path}/test/test.csv")
predictions = predict(test.to_numpy()[:, 1:])
predictions = np.array([float(num) for num in predictions])
test['pred'] = predictions
test['pred'] = test['pred'].astype('float')
test['pred'] = (test['pred'] > 0.4).astype(int)
test.head() 

Unnamed: 0,Churn,tenure,MonthlyCharges,TotalCharges,gender_M,SeniorCitizen_Y,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No_phone,...,StreamingTV_Yes,StreamingMovies_No_internet,StreamingMovies_Yes,Contract_One_year,Contract_Two_years,PaperlessBilling_Yes,PaymentMethod_Credit_card,PaymentMethod_Electronic_check,PaymentMethod_Mailed_check,pred
0,1.0,0.518765,1.199763,1.084916,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,0.0,0.1923,-0.648933,-0.30318,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,0.0,1.212503,1.306451,1.90117,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0
3,1.0,-1.07275,-0.563916,-0.871316,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,0.0,-0.827902,0.28125,-0.640643,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1


In [14]:
cr = classification_report(test['Churn'], test['pred'])
print(cr)

              precision    recall  f1-score   support

         0.0       0.84      0.86      0.85       300
         1.0       0.63      0.58      0.61       122

    accuracy                           0.78       422
   macro avg       0.73      0.72      0.73       422
weighted avg       0.78      0.78      0.78       422



#### Delete endpoint:

In [15]:
xgb_predictor.delete_endpoint()

#### Register model:

In [25]:
best_training_job_name = tuner.best_training_job()

In [26]:
sagemaker_client = boto3.client('sagemaker', region_name=region)
best_job_description = sagemaker_client.describe_training_job(TrainingJobName=best_training_job_name)
model_artifacts_s3_uri = best_job_description['ModelArtifacts']['S3ModelArtifacts']

In [27]:
model_artifacts_s3_uri

's3://markos-telco-churn/hyp/hyp-2023-10-18-17-03-43/hyp-2023-10-18-17-03-231018-1703-001-1b1e13f2/output/model.tar.gz'

In [35]:
model_package_group_name = 'test2'

# Create the model package group
response = boto3.client('sagemaker').create_model_package_group(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription='Description for the model package group'
)

In [32]:
modelpackage_inference_specification =  {
    "InferenceSpecification": {
      "Containers": [
         {
            "Image": container,
	    "ModelDataUrl": model_artifacts_s3_uri
         }
      ],
      "SupportedContentTypes": [ "text/csv" ],
      "SupportedResponseMIMETypes": [ "text/csv" ],
   }
 }

create_model_package_input_dict = {
    "ModelPackageGroupName" :"test",
    "ModelPackageDescription" : "Telco churn model (with hyperparameter tunning)",
    "ModelApprovalStatus" : "PendingManualApproval"
}
create_model_package_input_dict.update(modelpackage_inference_specification)

In [33]:
sm_client = boto3.client('sagemaker', region_name=region)
create_model_package_response = sm_client.create_model_package(**create_model_package_input_dict)
model_package_arn = create_model_package_response["ModelPackageArn"]
print('ModelPackage Version ARN : {}'.format(model_package_arn))

ModelPackage Version ARN : arn:aws:sagemaker:us-east-1:204113162030:model-package/test/1


In [None]:
# delete group

In [40]:
try:
    # Delete the Model Registry Group
    sagemaker_client.delete_model_package_group(ModelPackageGroupName="test")
    print(f"Model Registry Group Churn-Xgboost deleted successfully.")
except Exception as e:
    print(f"Error deleting Model Registry Group: {e}")

Model Registry Group Churn-Xgboost deleted successfully.


## Clarify

In [17]:
from sagemaker.s3 import S3Uploader
from sagemaker.inputs import TrainingInput

prefix = "clarify"
bias_report_output_path = "s3://{}/{}/clarify-bias".format(bucket, prefix)
train_uri = "s3://markos-telco-churn/ingest/ingest-2023-10-14-21-32-51/train/train.csv"
train_input = TrainingInput(train_uri, content_type="csv")
test_uri = "s3://markos-telco-churn/ingest/ingest-2023-10-14-21-32-51/test/test.csv"
train_df = pd.read_csv(train_uri)
test_df = pd.read_csv(test_uri)

In [18]:
best_training_job = tuner.best_training_job()
best_model_s3_uri = sagemaker.Session().describe_training_job(best_training_job)['ModelArtifacts']['S3ModelArtifacts']

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Preprocess the data 

For clarify we choose one column for which we want to calculate the bias.
In this example it is Gender.

Columns that are non continues need to be converted to int otherwise th
The columns that are needed to checked for bias are needed to be converted to int since in our dataset
this columns are not continues (one hot encoded) and clarify needs non-continues columns to be integers and not floats.

In [19]:
from sklearn import preprocessing


def number_encode_features(df, convert_to_int_list):
    result = df.copy()
    encoders = {}
    for column in result.columns:
        if column in convert_to_int_list:
            result[column] = pd.to_numeric(result[column], errors='coerce').fillna(0).astype(int)
        if result.dtypes[column] == object:
            encoders[column] = preprocessing.LabelEncoder()
            result[column] = encoders[column].fit_transform(result[column].fillna("None"))
    return result, encoders


training_data = pd.concat([train_df["Churn"], train_df.drop(["Churn"], axis=1)], axis=1)
training_data, _ = number_encode_features(training_data, ['gender_M', 'Churn'])
training_data.to_csv("train_data.csv", index=False, header=False)

testing_data, _ = number_encode_features(test_df, ['gender_M', 'Churn'])
test_features = testing_data.drop(["Churn"], axis=1)
test_target = testing_data["Churn"]
test_features.to_csv("test_features.csv", index=False, header=False)

In [20]:
from sagemaker.s3 import S3Uploader
from sagemaker.inputs import TrainingInput

train_uri = S3Uploader.upload("train_data.csv", "s3://{}/{}".format(bucket, prefix))
train_input = TrainingInput(train_uri, content_type="csv")
test_uri = S3Uploader.upload("test_features.csv", "s3://{}/{}".format(bucket, prefix))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [99]:
#sagemaker.Session().delete_endpoint(clarify_model_name)

In [21]:
clarify_model_name = "clarify-model"
model = sagemaker.model.Model(
    model_data=best_model_s3_uri,
    image_uri=container,  # Replace with the URI for the model's container
    role=role,
    name = clarify_model_name
)
container_def = model.prepare_container_def()
sess.create_model(clarify_model_name, role, container_def)
#model.name = clarify_model_name
#model.deploy(instance_type='ml.m4.xlarge',  initial_instance_count=1,  endpoint_name=clarify_model_name)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Using already existing model: clarify-model


'clarify-model'

In [None]:
#sagemaker.Session().delete_endpoint('clarify-model')

In [22]:
from sagemaker import clarify
from sagemaker.clarify import BiasConfig

clarify_processor = clarify.SageMakerClarifyProcessor(role=role,
                                                      instance_count=1,
                                                      instance_type='ml.m4.xlarge',
                                                      sagemaker_session=sess)

bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    label="Churn",
    headers=train_df.columns.to_list(),
    dataset_type="text/csv",
)

model_config = clarify.ModelConfig(
    model_name=clarify_model_name,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    accept_type="text/csv",
    content_type="text/csv",
)

predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.8)


bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1],  
    facet_name="gender_M", 
    facet_values_or_threshold=[0]
)

In [23]:
# The job takes about 10 minutes to run
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods="all",
    post_training_methods="all",
)

INFO:sagemaker:Creating processing-job with name Clarify-Bias-2023-10-18-17-36-56-038


...........................................[34m2023-10-18 17:44:01,713 logging.conf not found when configuring logging, using default logging configuration.[0m
[34m2023-10-18 17:44:01,714 Starting SageMaker Clarify Processing job[0m
[34m2023-10-18 17:44:01,714 Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34m2023-10-18 17:44:01,714 Analysis result path: /opt/ml/processing/output[0m
[34m2023-10-18 17:44:01,715 This host is algo-1.[0m
[34m2023-10-18 17:44:01,715 This host is the leader.[0m
[34m2023-10-18 17:44:01,715 Number of hosts in the cluster is 1.[0m
[34m2023-10-18 17:44:02,035 Running Python / Pandas based analyzer.[0m
[34m2023-10-18 17:44:02,035 Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
[34m2023-10-18 17:44:02,048 Loading dataset...[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34m2023-10-18 17:44:02,079 Loaded dataset. Dataset info:[0m
[34m<class 'pandas.core.frame.DataFrame'>[0m
[34mRangeIndex: 4

In [24]:
bias_report_output_path = clarify_processor.output_path

AttributeError: 'SageMakerClarifyProcessor' object has no attribute 'output_path'

In [None]:
#clarify_processor.delete_job(clarify_processor.latest_job_name)
#sagemaker.Session().delete_endpoint(clarify_model_name)

In [None]:
#sagemaker.Session().delete_endpoint("sm-clarify-clarify-model-1697566486-2c00")

In [27]:
endpoint_config_name = 'clarify-model'
try:   
    sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
    print(f"Deleted SageMaker endpoint configuration: {endpoint_config_name}")
except Exception as e:
    print(f"Error deleting endpoint configuration: {e}")

Error deleting endpoint configuration: name 'sagemaker_client' is not defined
