In [6]:
import sagemaker
import boto3
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize SageMaker session
sagemaker_session = sagemaker.Session()

# Manually specify the IAM role ARN for SageMaker
role = os.getenv('AWS_ROLE_ARN')  # Update your .env file with the correct role ARN

# S3 bucket where the dataset will be stored
bucket = os.getenv('AWS_S3_BUCKET_NAME')  # Replace with your actual S3 bucket name
prefix = 'sagemaker/gym-churn'  # Folder name where data will be stored

# S3 client for data upload
s3 = boto3.client('s3')

print(f"SageMaker session and role set up successfully with role: {role}")

SageMaker session and role set up successfully with role: arn:aws:iam::014077742481:role/service-role/AmazonSageMaker-ExecutionRole-20210612T122827


In [10]:
# Load the dataset
file_path = r'C:\Users\big_j\PycharmProjects\Customer-Churn-Prediction-using-AWS-SageMaker\data\gym_churn_us.csv'  # Path to your file

# Upload the churn data to S3
data_key = f'{prefix}/gym_churn_us.csv'
s3.upload_file(file_path, bucket, data_key)
print(f"File uploaded to s3://{bucket}/{data_key}")

File uploaded to s3://my-gym-churn-bucket/sagemaker/gym-churn/gym_churn_us.csv


In [11]:
# Define the XGBoost image for SageMaker
container = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')

# Define the input data location in S3
input_data = f's3://{bucket}/{data_key}'

# Initialize XGBoost estimator
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session
)

# Set XGBoost hyperparameters
xgb.set_hyperparameters(
    objective="binary:logistic",
    num_round=100,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0
)

# Train the model using the input data
train_input = sagemaker.inputs.TrainingInput(s3_data=input_data, content_type='csv')
xgb.fit({'train': train_input})

print("Training job completed.")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: xgboost-2024-10-15-02-10-27-840


2024-10-15 02:10:31 Starting - Starting the training job...
2024-10-15 02:10:45 Starting - Preparing the instances for training...
2024-10-15 02:11:16 Downloading - Downloading input data...
2024-10-15 02:12:01 Downloading - Downloading the training image.....[34mArguments: train[0m
[34m[2024-10-15:02:12:45:INFO] Running standalone xgboost training.[0m
[34m[2024-10-15:02:12:45:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2024-10-15:02:12:45:INFO] File size need to be processed in the node: 0.31mb. Available memory size in the node: 168.83mb[0m
[34m[2024-10-15:02:12:46:INFO] Determined delimiter of CSV input is ','[0m
[34m[02:12:45] S3DistributionType set as FullyReplicated[0m
[34m[02:12:46] 4000x13 matrix with 52000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[02:12:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 2 pruned nodes, max_depth=5[0m
[34m[0]#011train-error:0.47525[

In [12]:
# Deploy the model to an endpoint
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large'
)

print(f"Model deployed to endpoint: {xgb_predictor.endpoint_name}")

INFO:sagemaker:Creating model with name: xgboost-2024-10-15-02-13-23-163
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-10-15-02-13-23-163
INFO:sagemaker:Creating endpoint with name xgboost-2024-10-15-02-13-23-163


------!Model deployed to endpoint: xgboost-2024-10-15-02-13-23-163


In [13]:
# Test data (make sure to match the number of features)
test_data = '0.2,0.1,0.4,0.5,0.6,0.7,0.8,0.3,0.9,0.5,0.4,0.2,0.1'

# Perform prediction and set content type as CSV
prediction = xgb_predictor.predict(test_data, initial_args={'ContentType': 'text/csv'})
print(f"Predicted class: {prediction}")

Predicted class: b'0.5225263237953186'


In [14]:
# Delete the endpoint to save cost
xgb_predictor.delete_endpoint()

print("Endpoint deleted.")

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2024-10-15-02-13-23-163
INFO:sagemaker:Deleting endpoint with name: xgboost-2024-10-15-02-13-23-163


Endpoint deleted.
