In [None]:
import pandas as pd
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost import XGBoost
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('telco-customer-churn.csv')

# Data cleaning and preprocessing
# An identifier column carries no predictive signal, and one-hot encoding it would create
# one indicator column per row. The public Telco churn CSV has such a column named
# 'customerID' (confirm against your copy); the check makes this a no-op for other schemas.
if 'customerID' in data.columns:
    data = data.drop(columns='customerID')

# In the public Telco churn CSV, 'TotalCharges' contains blank strings for some rows, so it
# is read as text (confirm against your copy). Coerce it to numbers; unparseable entries
# become NaN and are removed by the dropna() below. Already-numeric data is left unchanged.
if 'TotalCharges' in data.columns:
    data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Drop rows with null values
data = data.dropna()

# Convert categorical columns to numerical using one-hot encoding.
# dtype=float makes the indicator columns 0.0/1.0; without it, recent pandas versions emit
# booleans, which to_csv() writes as the text "True"/"False" rather than numbers.
data = pd.get_dummies(data, drop_first=True, dtype=float)

# Separate features and target variable.
# 'Churn_Yes' is the indicator produced by get_dummies for a 'Churn' column with Yes/No values.
target_column = 'Churn_Yes'
if target_column not in data.columns:
    raise KeyError(
        f"Expected target column '{target_column}' after one-hot encoding; "
        f"available columns: {list(data.columns)}"
    )
X = data.drop(target_column, axis=1)
y = data[target_column]

# Split the data into train and test sets (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save train and test data to CSV.
# The label is placed in the first column and no header/index is written, which is the
# layout SageMaker's built-in XGBoost expects for CSV training input.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)
train_data.to_csv('train.csv', index=False, header=False)
test_data.to_csv('test.csv', index=False, header=False)

# Stage both CSV files in the session's default bucket under a shared key prefix
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix = 'telco-churn-xgboost'

# Upload train first, then test, keeping the value upload_data returns for each file
train_s3_path, test_s3_path = (
    sagemaker_session.upload_data(local_file, bucket=bucket, key_prefix=prefix)
    for local_file in ('train.csv', 'test.csv')
)

In [None]:
# Resolve the execution role and the built-in XGBoost image (version 1.5-1) for the session's region
role = get_execution_role()
xgboost_container = sagemaker.image_uris.retrieve(
    "xgboost",
    sagemaker_session.boto_region_name,
    "1.5-1",
)

# Estimator backed by SageMaker's built-in XGBoost container; output goes under the project prefix
estimator_settings = dict(
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session,
)
xgboost_estimator = sagemaker.estimator.Estimator(xgboost_container, role, **estimator_settings)

# Hyperparameters: binary classification evaluated with AUC
hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 100,
    'max_depth': 5,
    'eta': 0.2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'auc',
}
xgboost_estimator.set_hyperparameters(**hyperparameters)

# Wrap the uploaded CSV locations as training inputs
train_input = TrainingInput(train_s3_path, content_type="csv")
test_input = TrainingInput(test_s3_path, content_type="csv")

# Launch training; the held-out split is supplied as the 'validation' channel
data_channels = {'train': train_input, 'validation': test_input}
xgboost_estimator.fit(data_channels)

print("Training complete.")

In [None]:
# Deploy the fitted estimator to an endpoint with a fixed, human-readable name
endpoint_name = 'telco-churn-xgboost-endpoint'
hosting_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m5.xlarge',
    'endpoint_name': endpoint_name,
}
xgboost_predictor = xgboost_estimator.deploy(**hosting_settings)

print(f"Endpoint {endpoint_name} is ready to serve predictions.")

In [None]:
# Build a header-less, index-less CSV payload from the first five test rows.
# Casting to float first ensures any boolean columns are written as 1.0/0.0
# rather than the text "True"/"False".
sample_data = X_test.iloc[:5].astype(float).to_csv(header=False, index=False)

# Invoke the endpoint, explicitly setting the request content type to CSV
response = xgboost_predictor.predict(sample_data, initial_args={"ContentType": "text/csv"})

# Decode the response as UTF-8 text for display
print("Predictions:")
print(response.decode('utf-8'))