# Bank Marketing Dataset - Notebook 03

Predicting Term Deposit Subscriptions

This notebook demonstrates how to train a model using the notebook's instance (no extra computational resources).

In [None]:
# List the contents of the current working directory (long format, including hidden files).
!ls -la

In [None]:
# List the contents of the local `data` directory (long format, including hidden files).
!ls -la data

In [None]:
!pip install sagemaker ipywidgets --upgrade --quiet

## Import libs

In [None]:
import sagemaker, boto3, json
import pprint
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sagemaker import get_execution_role
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sagemaker.debugger import Rule, rule_configs
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sklearn.metrics import roc_curve, roc_auc_score

## Define preproc functions

In [None]:
def rebalance(data, target_col="Exited", random_state=1234):
    """
    Balance the two target classes by downsampling the majority class.

    Rows are split by the value of ``target_col`` (0 vs. 1). The larger group
    is randomly downsampled, without replacement, to the size of the smaller
    group, and both groups are concatenated.

    Args:
        data (pd.DataFrame): DataFrame containing a binary (0/1) target column
        target_col (str): name of the target column. Defaults to "Exited"
        random_state (int): seed forwarded to ``resample``. Defaults to 1234

    Returns:
        pd.DataFrame: balanced DataFrame, with the downsampled majority rows
            first, followed by all minority rows
    """
    negatives = data[data[target_col] == 0]
    positives = data[data[target_col] == 1]

    # On a tie the positive class is treated as the majority; the result is
    # unaffected because both groups already have the same size.
    if len(negatives) > len(positives):
        majority, minority = negatives, positives
    else:
        majority, minority = positives, negatives

    majority_downsampled = resample(
        majority, n_samples=len(minority), replace=False, random_state=random_state
    )

    return pd.concat([majority_downsampled, minority])


def preprocess(df):
    """
    Preprocess and split data into training and test sets.

    Steps: keep the modelling columns, balance the target classes with
    ``rebalance``, split 70/30 into train/test, then scale the numeric columns
    and one-hot encode the categorical ones. The transformer is fitted on the
    training split only and then applied to the test split.

    Args:
        df (pd.DataFrame): DataFrame with features and target variables

    Returns:
        ColumnTransformer: fitted ColumnTransformer with scalers and encoders
        pd.DataFrame: training data with transformed features, target
            ("Exited") as the first column
        pd.DataFrame: test data with transformed features, same column order
            as the training data
    """
    target_col = "Exited"
    filter_feat = [
        "Exited",
        "CreditScore",
        "Geography",
        "Gender",
        "Age",
        "Tenure",
        "Balance",
        "NumOfProducts",
        "HasCrCard",
        "IsActiveMember",
        "EstimatedSalary",
    ]
    cat_cols = ["Geography", "Gender"]
    num_cols = [
        "CreditScore",
        "Age",
        "Tenure",
        "Balance",
        "NumOfProducts",
        "HasCrCard",
        "IsActiveMember",
        "EstimatedSalary",
    ]
    data = df.loc[:, filter_feat]
    data_bal = rebalance(data=data)

    df_train, df_test = train_test_split(
        data_bal, test_size=0.3, random_state=1912
    )
    col_transf = make_column_transformer(
        (StandardScaler(), num_cols),
        (OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False), cat_cols),
        remainder="passthrough",  # the target column is passed through untouched
        verbose_feature_names_out=False
    ).set_output(transform='pandas')

    # Fit on the training split only; the test split reuses the fitted statistics.
    df_train = col_transf.fit_transform(df_train)
    df_test = col_transf.transform(df_test)

    # Put the target first, selecting it by name rather than by its position in
    # the transformer output, so the ordering does not depend on where the
    # passthrough column happens to land.
    feature_cols = [col for col in df_train.columns if col != target_col]
    cols = [target_col] + feature_cols

    df_train = df_train[cols]
    df_test = df_test[cols]

    return col_transf, df_train, df_test

## Start coding!

In [None]:
# IAM execution role returned by the SageMaker SDK; passed to the estimator later on.
aws_role = get_execution_role()
# Region name of the default boto3 session.
aws_region = boto3.Session().region_name
# SageMaker session built with default settings.
sess = sagemaker.Session()

print(f"Role is: {aws_role}")

## Configure Bucket to export results

In [None]:
# Name of the S3 bucket that receives the uploaded data and the training outputs.
# NOTE(review): "YOUR_INSPER_USERNAME" looks like a placeholder to replace before running — confirm.
bucket = "sagemaker-mlops-out-YOUR_INSPER_USERNAME"
# HTTPS URL of the bucket, built from the region and bucket name.
bucket_path = f"https://s3-{aws_region}.amazonaws.com/{bucket}"

Create a bucket to store your experiment results

In [None]:
s3 = boto3.client("s3")

# us-east-1 is the S3 default location and rejects an explicit
# LocationConstraint, so the configuration is only sent for other regions.
create_bucket_kwargs = {"Bucket": bucket}
if aws_region != "us-east-1":
    create_bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": aws_region}

try:
    s3.create_bucket(**create_bucket_kwargs)
except s3.exceptions.BucketAlreadyOwnedByYou:
    # The bucket was created by a previous run of this notebook: reuse it so
    # the cell can be re-executed safely.
    print(f"Bucket {bucket} already exists and is owned by you; reusing it")

## Open Data

In [None]:
# Load the raw churn dataset from the local data directory and preview the first rows.
df = pd.read_csv("data/Churn_Modelling.csv")
df.head(3)

In [None]:
# Balance, split and transform the data; keep the fitted transformer alongside both splits.
col_transf, df_train, df_test = preprocess(df)
df_train.head(2)

Export to parquet

In [None]:
# index=False: after balancing and splitting, the frames no longer have a
# default RangeIndex, and pandas would otherwise store that index in the file
# next to the label and feature columns.
df_train.to_parquet("churn_train.parquet", index=False)
df_test.to_parquet("churn_test.parquet", index=False)

And export to `data` inside the S3 bucket:

In [None]:
# Reuse the SageMaker session created earlier instead of building a new one per upload.
sess.upload_data(
    "churn_train.parquet", bucket=bucket, key_prefix="data/train"
)

sess.upload_data(
    "churn_test.parquet", bucket=bucket, key_prefix="data/test"
)

## Create Sagemaker Training Job


A SageMaker training job is a unit of work in Amazon SageMaker. It involves the process of training an ML model using a specified dataset and a chosen algorithm or framework.

When you initiate a training job in SageMaker, you provide the necessary configuration, including the location of the training data, the ML algorithm or framework to be used, the compute resources required, and any hyperparameter settings. SageMaker then **provisions the necessary infrastructure**, such as CPU or GPU-accelerated instances, to perform the training.

During the training job, the ML model is trained by iteratively processing the training data and adjusting the model's parameters to minimize the defined loss or maximize the defined objective. The training progress, including metrics and logs, is captured and can be monitored in real-time.

Once the training job is completed, the trained model is saved in a specified output location, such as Amazon S3. This trained model can then be used for inference or deployed to make predictions on new data.

In summary, a SageMaker training job encapsulates the process of training an ML model by providing the necessary configuration, managing the required compute resources, and capturing the resulting trained model for further use. It enables efficient and scalable model training in the cloud.


In [None]:
# Low-level boto3 SageMaker client, used below to list and describe training jobs and endpoints
client = boto3.client("sagemaker", region_name=aws_region)

In [None]:
# S3 locations of the parquet files uploaded earlier
train_data_uri = f"s3://{bucket}/data/train/churn_train.parquet"
val_data_uri = f"s3://{bucket}/data/test/churn_test.parquet"

# Wrap each location in a training channel input declared as parquet
train_input = TrainingInput(
    s3_data=train_data_uri,
    content_type="application/x-parquet",
)
val_input = TrainingInput(
    s3_data=val_data_uri,
    content_type="application/x-parquet",
)

In [None]:
# Resolve the container image URI of the built-in XGBoost algorithm (version 1.7-1) for this region
image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=aws_region,
    version="1.7-1",
)
image

In [None]:
# Training job settings: where artifacts go, which hardware to use, and the job name
estimator_output_path = f"s3://{bucket}/training_jobs"
train_instance_count = 1
train_instance_type = "ml.m5.large"
save_interval = 2
model_name = f"churn-new-model-{datetime.now():%Y%m%d-%H%M%S}"

hyperparameters = dict(
    max_depth="4",
    eta="0.2",
    objective="binary:logistic",
    num_round="100",
)

# Debugger collections to capture, all saved every `save_interval` steps
debugger_collections = [
    CollectionConfig(name=collection, parameters={"save_interval": str(save_interval)})
    for collection in ("metrics", "feature_importance", "full_shap", "average_shap")
]

# Built-in rule that watches the "metrics" collection for a loss that stops decreasing
loss_rule = Rule.sagemaker(
    rule_configs.loss_not_decreasing(),
    rule_parameters={
        "collection_names": "metrics",
        "num_steps": str(save_interval * 2),
    },
)

# Create estimator
xgb_estimator = Estimator(
    role=aws_role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    image_uri=image,
    hyperparameters=hyperparameters,
    output_path=estimator_output_path,
    debugger_hook_config=DebuggerHookConfig(
        s3_output_path=estimator_output_path + "/debugger",
        collection_configs=debugger_collections,
    ),
    rules=[loss_rule],
)

In [None]:
# Look up completed training jobs whose name contains `model_name`, newest first.
# NOTE(review): `model_name` embeds the timestamp generated in the configuration cell,
# so this presumably only matches a job already run under that exact name — confirm
# whether matching on a shared name prefix was intended.
response = client.list_training_jobs(
    NameContains=model_name, StatusEquals="Completed", SortBy="CreationTime", SortOrder="Descending"
)
response

Now we train the model with:

<div class="alert alert-info">

While the model trains, access the AWS console (ask the teacher for the URL and credentials) and check the left side menu **Training / Training jobs** in Sagemaker.


Take the opportunity to also see the option **Notebook / Notebook instances**.

</div>

In [None]:
%%time

train_model = True  # True if training a new model, False if wanting to use an existing estimator once you've already trained

if train_model:
    print("Training the model")
    xgb_estimator.fit(inputs={"train": train_input, "validation": val_input}, job_name=model_name)
    s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()
elif len(response["TrainingJobSummaries"]) > 0:
    training_job_name = response["TrainingJobSummaries"][0]["TrainingJobName"]
    xgb_estimator = Estimator.attach(training_job_name)
    s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()
else:
    print("No existing estimator found. You'll need to run as train = True")

In [None]:
# Name of the estimator's latest training job, whether it was just trained or re-attached.
training_job_name = xgb_estimator.latest_training_job.job_name
print(training_job_name)

In [None]:
# Fetch the full description of the training job and pretty-print it.
# The dict is passed as-is: formatting it into a string first would leave
# pprint with a single long string and nothing to lay out.
training_job_info = client.describe_training_job(TrainingJobName=training_job_name)
pprint.pprint(training_job_info)

### Deploy Model

<div class="alert alert-info">

While the model is deployed, access the AWS console (ask the teacher for the URL and credentials) and check the left side menu:
    
- **Inference / Models** in Sagemaker
- **Inference / Endpoints** in Sagemaker
- `sagemaker-mlops-out-YOUR_INSPER_USERNAME` bucket in S3.

</div>

In [None]:
# Display the job name generated in the estimator configuration cell.
model_name

In [None]:
# Endpoint name suffixed with the current timestamp (YYYYMMDD-HHMMSS).
endpoint_name = "churn-model-endpoint-{}".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
print(endpoint_name)

In [None]:
# Look up in-service endpoints whose name contains `endpoint_name`, newest first.
# NOTE(review): `endpoint_name` was just built from the current timestamp, so this
# presumably only matches an endpoint already deployed under that exact name — confirm.
endpoint_list = client.list_endpoints(
    SortBy="CreationTime",
    SortOrder="Descending",
    NameContains=endpoint_name,
    StatusEquals="InService",
)
endpoint_list

In [None]:
%%time

if len(endpoint_list["Endpoints"]) > 0:
    print(f"Using existing endpoint: {endpoint_list['Endpoints'][0]['EndpointName']}")
else:
    # deploy endpoint for model if it doesn't already exist
    xgb_estimator.deploy(
        initial_instance_count=1, instance_type="ml.m4.xlarge", endpoint_name=endpoint_name
    )

#### Create predictor

In [None]:
# Client-side handle bound to the endpoint named `endpoint_name`; used below to send prediction requests.
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name, sagemaker_session=sess
)

In [None]:
def get_data_input(df_inference):
    """
    Serialize each row of a DataFrame as a comma-separated string of feature values.

    The "Exited" target column is dropped when present, so the function accepts
    both labelled data (e.g. a test split) and unlabelled data that only holds
    features.

    Args:
        df_inference (pd.DataFrame): rows to serialize, with or without "Exited"

    Returns:
        list[str]: one CSV-formatted string per row, in row order
    """
    features = df_inference.drop(columns="Exited", errors="ignore")
    return [",".join(str(value) for value in row) for row in features.values]

def get_predictions(data_inputs):
    """
    Request one prediction per input payload from the deployed endpoint.

    Each payload is sent on its own through the module-level ``predictor`` with
    a "text/csv" content type, and the response is decoded with ``json.loads``.

    Args:
        data_inputs (list[str]): CSV-formatted payloads, one per row

    Returns:
        list: decoded predictions, in the same order as ``data_inputs``
    """
    return [
        json.loads(predictor.predict(payload, initial_args={"ContentType": "text/csv"}))
        for payload in data_inputs
    ]

In [None]:
# To simulate new data, predict a sample of df_test
df_inference = df_test.sample(3, random_state=42)

data_inputs = get_data_input(df_inference)
data_inputs

In [None]:
# Send the sampled payloads to the endpoint and display the returned predictions
predictions = get_predictions(data_inputs)
predictions

### Plot ROC Curve

In [None]:
# Score the whole test split through the endpoint
df_inference = df_test
y_true = df_inference["Exited"]

data_inputs = get_data_input(df_inference)
predictions = get_predictions(data_inputs)

# ROC points and the Area Under the ROC Curve (AUC)
fpr, tpr, thresholds = roc_curve(y_true, predictions)
auc = roc_auc_score(y_true, predictions)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()