# Spotify dataset · Model deployment

## Training of the model

In [18]:
# Import the necessary libraries

import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
import io
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix


In [3]:
# Set up the SageMaker session and execution role.
# `session` is reused below to look up the region and to run the training job;
# `role` is handed to the estimator when it is constructed.

session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [4]:
# S3 key prefix under which this project's objects are stored.
# NOTE(review): the training/validation URIs built later also use this same
# value as the bucket name, so the data bucket appears to share it — confirm.

prefix = "spotify-project-b8"

In [5]:
# Default S3 bucket of the SageMaker session; the training output is written here.
# Reuse the existing `session` rather than constructing a second sagemaker.Session().

bucket = session.default_bucket()

In [6]:
# Resolve the URI of the built-in XGBoost image for the session's region

container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=session.boto_region_name,
    version="1.5-1",
)
container

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'

In [7]:
# Describe the S3 locations of the training and validation CSV data.
# Nothing is downloaded here; these objects are passed to `fit` as input channels.

train_data = TrainingInput(
    s3_data="s3://{}/{}/train".format(prefix, prefix),
    content_type="csv",
)
validation_data = TrainingInput(
    s3_data="s3://{}/{}/validation".format(prefix, prefix),
    content_type="csv",
)

In [9]:
# Train the model: configure the estimator, set the hyperparameters, launch the job

xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=session,
)

# Hyperparameters forwarded to the XGBoost training container
hyperparameters = {
    "objective": "binary:logistic",
    "eval_metric": "error",
    "num_round": 100,
    "max_depth": 10,
    "max_leaves": 20,
    "gamma": 5,
    "alpha": 0.05,
    "verbosity": 0,
}
xgb.set_hyperparameters(**hyperparameters)

# Channel names map to the S3 inputs defined earlier
input_channels = {"train": train_data, "validation": validation_data}
xgb.fit(input_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-14-20-48-21-183


2023-03-14 20:48:21 Starting - Starting the training job......
2023-03-14 20:49:07 Starting - Preparing the instances for training......
2023-03-14 20:50:16 Downloading - Downloading input data...
2023-03-14 20:50:46 Training - Downloading the training image......
2023-03-14 20:51:37 Training - Training image download completed. Training in progress..[34m[2023-03-14 20:51:47.317 ip-10-2-122-59.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-03-14 20:51:47.393 ip-10-2-122-59.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-03-14:20:51:47:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-03-14:20:51:47:INFO] Failed to parse hyperparameter eval_metric value error to Json.[0m
[34mReturning the value itself[0m
[34m[2023-03-14:20:51:47:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-03-14:20:51:47:INF

## Deployment of the model

In [10]:
# Deploy the trained model to an endpoint; requests are serialized as CSV

xgb_predictor = xgb.deploy(
    serializer=CSVSerializer(),
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-03-14-20-52-37-437
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-03-14-20-52-37-437
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-03-14-20-52-37-437


----------!

In [11]:
# Keep the name of the deployed endpoint.
# `Predictor.endpoint` is deprecated in SageMaker SDK v2; `endpoint_name` is its replacement.

endpoint = xgb_predictor.endpoint_name
endpoint

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'sagemaker-xgboost-2023-03-14-20-52-37-437'

## Predict the data

In [12]:
# Download the test set from S3 to a local file and load it into a DataFrame

s3 = boto3.client("s3")
bucket_name = "spotify-project-b8"
object_key = "spotify-project-b8/test/test.csv"
local_file_path = "/tmp/test.csv"

s3.download_file(Bucket=bucket_name, Key=object_key, Filename=local_file_path)

# Read the file that was just downloaded. Reading "test.csv" from the working
# directory would miss it, because download_file wrote to `local_file_path`.
# NOTE(review): read_csv treats the first row as a header by default — confirm
# that test.csv really has a header row, otherwise one sample is lost.
test_data = pd.read_csv(local_file_path)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [13]:
# We define the prediction function

def predict(data, rows=500, predictor=None):
    """Score `data` against a deployed endpoint in batches.

    Parameters
    ----------
    data : array-like with a `.shape` attribute
        Feature rows to score (one sample per row, no label column).
    rows : int, default 500
        Approximate upper bound on the number of rows sent per request.
    predictor : object with a `predict(array) -> bytes` method, optional
        Predictor used for the requests. When omitted, the notebook-level
        `xgb_predictor` is used, which keeps existing calls working.

    Returns
    -------
    list of str
        One prediction per scored row, in input order, as returned by the
        endpoint (not yet converted to float).
    """
    if predictor is None:
        # Fall back to the predictor created by the deployment cell.
        predictor = xgb_predictor

    # Nothing to score: avoid sending an empty request to the endpoint.
    if data.shape[0] == 0:
        return []

    n_chunks = int(data.shape[0] / float(rows) + 1)
    predictions = []
    for chunk in np.array_split(data, n_chunks):
        # array_split yields empty chunks when there are fewer rows than chunks.
        if len(chunk) == 0:
            continue
        response = predictor.predict(chunk).decode("utf-8")
        # Accept newline- or comma-separated values, with or without a trailing
        # separator, so no prediction is dropped or glued to its neighbour.
        predictions.extend(response.replace(",", "\n").split())

    return predictions

# Score the test features (every column after the first, which holds the label)
# and convert the returned strings into a float array

raw_predictions = predict(test_data.to_numpy()[:, 1:])
predictions = np.array(list(map(float, raw_predictions)))
print(predictions)

[0.03224378 0.04360343 0.31868514 ... 0.63510233 0.83114851 0.43218032]


In [17]:
# Compare the rounded model scores with the true labels (first column of the test set)
labels = test_data.iloc[:, 0].values
predicted_labels = predictions.round()

accuracy = accuracy_score(labels, predicted_labels)
precision = precision_score(labels, predicted_labels)
recall = recall_score(labels, predicted_labels)

# Report in the order accuracy, recall, precision
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Recall:", recall),
    ("Precision:", precision),
):
    print(metric_name, metric_value)

Accuracy: 0.8045487715884213
Recall: 0.8491689407095013
Precision: 0.7740841248303935


In [19]:
# Compute the confusion matrix of true labels vs. rounded predictions
cm = confusion_matrix(y_true=labels, y_pred=predictions.round())
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[3192  999]
 [ 608 3423]]
