# Simple Training

#### Introduction

This notebook trains the XGBoost algorithm used in the 02_training notebook with a fixed set of hyperparameters (no hyperparameter tuning job is run here) and records the run as a SageMaker experiment. \
After training, the model is hosted on an endpoint and makes predictions on the test dataset. 

#### Imports:

In [3]:
import boto3
import os
import re
import numpy as np
import pandas as pd
import io
import sys
import json
from IPython.display import display

import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

from time import gmtime, strftime

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Sessions:

In [4]:
# Execution role that is later handed to the estimator.
role = sagemaker.get_execution_role()
# Region name of the default boto3 session.
region = boto3.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


#### Bucket paths:

In [5]:
# S3 locations used by the rest of the notebook.
prefix = "train"  # key prefix under which training output is written
bucket = "markos-telco-churn"
bucket_path = f"s3://{bucket}"
input_data_path = "ingest/ingest-2023-10-14-21-32-51"  # folder holding the train/val/test CSV files

# S3 keys always use "/" as separator, so the key is built with string
# formatting rather than os.path.join (which uses the local OS separator).
model_path = "model"
model_output_path = f"{model_path}/output"

#### Training

#### Create experiment:

In [38]:
# Timestamp (UTC, from gmtime) appended to the prefix so each run gets its own output folder.
base_training_job_name = f'{prefix}-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'
# An S3 URI must be joined with "/"; os.path.join would use the local OS
# separator and is therefore not appropriate for building it.
output_path = f"{bucket_path}/{prefix}/{base_training_job_name}"

In [41]:
# SageMaker session whose default bucket is the project bucket.
sess = sagemaker.Session(default_bucket=bucket)

# XGBoost 1.7-1 container image for the region of the default boto3 session.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.7-1",
)

# Training and validation channels, both read as CSV from the ingest folder.
s3_input_train = TrainingInput(
    content_type="csv",
    s3_data=f"{bucket_path}/{input_data_path}/train/train.csv",
)
s3_input_validation = TrainingInput(
    content_type="csv",
    s3_data=f"{bucket_path}/{input_data_path}/val/val.csv",
)

# Single-instance estimator that writes its artifacts under `output_path`.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=output_path,
    sagemaker_session=sess,
)

# Fixed hyperparameters for this run (no tuning is performed in this notebook).
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "verbosity": 0,
    "objective": "binary:logistic",
    "num_round": 10,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [44]:
!pip install sagemaker-experiments

Collecting sagemaker-experiments
  Obtaining dependency information for sagemaker-experiments from https://files.pythonhosted.org/packages/2b/2b/47d105bbcc328c58b1a23948c3fd9b86930d10b33d220d20c9819e75c41b/sagemaker_experiments-0.1.45-py3-none-any.whl.metadata
  Downloading sagemaker_experiments-0.1.45-py3-none-any.whl.metadata (10 kB)
Downloading sagemaker_experiments-0.1.45-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.7/42.7 kB[0m [31m540.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sagemaker-experiments
Successfully installed sagemaker-experiments-0.1.45
[0m

In [50]:
from smexperiments import experiment

In [57]:
experiment_name = 'xgboost-training3'

# The created object gets its own name. Assigning it to `experiment` would
# replace the imported `smexperiments.experiment` module, so a second run of
# this cell would raise AttributeError and wrongly report that the experiment
# already exists.
try:
    telco_experiment = experiment.Experiment.create(
        experiment_name=experiment_name,
        description='Model for telco-churn dataset with fixed hyperparameters',
    )
except Exception as exc:  # not a bare `except:`, so KeyboardInterrupt still stops the cell
    # Best effort: creation failures do not abort the cell, but the real error is shown.
    print(f'Experiment "{experiment_name}" was not created (it may already exist): {exc}')

# Train on the two channels and attach the training job to the experiment by name.
xgb.fit(
    {"train": s3_input_train, "validation": s3_input_validation},
    experiment_config={'ExperimentName': experiment_name},
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-10-18-16-59-18-362


Experiment alread exists
2023-10-18 16:59:18 Starting - Starting the training job...
2023-10-18 16:59:42 Starting - Preparing the instances for training.....

KeyboardInterrupt: 

#### Deploy model on an Endpoint

In [17]:
# Host the estimator's model on one ml.m4.xlarge instance; request payloads are serialized as CSV.
xgb_predictor = xgb.deploy(
    serializer=CSVSerializer(),
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-10-15-18-15-19-524
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-10-15-18-15-19-524
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-10-15-18-15-19-524


-------!

#### Evaluate on Test set

In [18]:
def predict(data, rows=500):
    """Score ``data`` on the deployed endpoint in chunks and return the raw predictions.

    ``data`` is cut into ``int(data.shape[0] / rows + 1)`` chunks with
    ``np.array_split``. Each chunk is sent to the notebook-level ``xgb_predictor``,
    and the UTF-8 decoded responses are concatenated in order.

    Args:
        data: Array with a ``shape`` attribute, one row per record to score.
        rows: Divisor that determines how many chunks are sent (default 500).

    Returns:
        list[str]: The concatenated response text split on newlines, without the
        final element (the text following the last newline).
    """
    n_chunks = int(data.shape[0] / float(rows) + 1)
    responses = [
        xgb_predictor.predict(chunk).decode("utf-8")
        for chunk in np.array_split(data, n_chunks)
    ]
    return "".join(responses).split("\n")[:-1]

# Load the test split from the same ingest folder as the training data.
test = pd.read_csv(f"{bucket_path}/{input_data_path}/test/test.csv")
# The first column is left out of the payload (presumably the label column — TODO confirm);
# the returned strings are converted to a float array.
predictions = np.array(
    [float(score) for score in predict(test.to_numpy()[:, 1:])]
)

In [19]:
# Scores strictly above 0.4 are labelled 1, everything else 0.
test['pred'] = (
    pd.Series(predictions, index=test.index)
    .astype('float')
    .gt(0.4)
    .astype(int)
)
test.head()

Unnamed: 0,Churn,tenure,MonthlyCharges,TotalCharges,gender_M,SeniorCitizen_Y,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No_phone,...,StreamingTV_Yes,StreamingMovies_No_internet,StreamingMovies_Yes,Contract_One_year,Contract_Two_years,PaperlessBilling_Yes,PaymentMethod_Credit_card,PaymentMethod_Electronic_check,PaymentMethod_Mailed_check,pred
0,1.0,0.518765,1.199763,1.084916,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,0.0,0.1923,-0.648933,-0.30318,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,0.0,1.212503,1.306451,1.90117,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0
3,1.0,-1.07275,-0.563916,-0.871316,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,0.0,-0.827902,0.28125,-0.640643,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1


In [20]:
from sklearn.metrics import classification_report

# Compare the thresholded predictions with the true `Churn` labels of the test split.
cr = classification_report(y_true=test['Churn'], y_pred=test['pred'])
print(cr)

              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84       300
         1.0       0.62      0.57      0.59       122

    accuracy                           0.77       422
   macro avg       0.72      0.71      0.72       422
weighted avg       0.77      0.77      0.77       422



#### Delete endpoint

In [21]:
# Remove the endpoint behind `xgb_predictor` now that evaluation is finished.
xgb_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2023-10-15-18-15-19-524
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-10-15-18-15-19-524
