In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline # Not strictly used for final SM model but good for local
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import xgboost as xgb # For local model training and understanding

# SageMaker specific imports
import sagemaker
import boto3
from sagemaker.xgboost.estimator import XGBoost as SageMakerXGBoost # Alias to avoid confusion
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
import os # For joining paths
import time # For unique endpoint names

# Load the raw Telco churn dataset from the working directory.
# A missing file is re-raised with a friendlier message instead of calling
# exit(): inside a Jupyter kernel exit() only schedules a kernel shutdown and
# does not stop the current cell, so the following statements would still run
# and fail with a confusing NameError on `df`.
# Any other read error is left to propagate with its full traceback.
try:
    df = pd.read_csv('telco-customer-churn.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: 'telco-customer-churn.csv' not found. Make sure the file is in the correct directory."
    ) from err
print("Data loaded successfully.")
print(f"Initial dataset shape: {df.shape}")
# 'TotalCharges' is parsed as numeric; values that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print(f"\nMissing values in 'TotalCharges' after conversion to numeric: {df['TotalCharges'].isnull().sum()}")

# The customer identifier is dropped before modelling.
if 'customerID' in df.columns:
    df = df.drop(columns='customerID')
    print("Dropped 'customerID' column.")

# Impute missing 'TotalCharges' with the median over customers that have tenure > 0.
if df['TotalCharges'].isnull().any():
    median_total_charges = df.loc[df['tenure'] > 0, 'TotalCharges'].median()
    if pd.isna(median_total_charges):
        # No usable value among customers with tenure > 0: fall back to 0.
        median_total_charges = 0
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form warns on pandas 2.x and
    # does not modify `df` once Copy-on-Write is enabled.
    df['TotalCharges'] = df['TotalCharges'].fillna(median_total_charges)
    print(f"Imputed missing 'TotalCharges' with median: {median_total_charges}")
print(f"Missing values in 'TotalCharges' after imputation: {df['TotalCharges'].isnull().sum()}")
# Encode the target column 'Churn' as integers.
# The original cell contained this logic twice; the second copy was a no-op
# because the column is already numeric after the first pass.
if 'Churn' not in df.columns:
    # Raise instead of exit(): in a Jupyter kernel exit() does not stop the
    # cell, so later statements would still run against a frame with no target.
    raise ValueError("Error: 'Churn' column not found. Cannot proceed with target encoding.")

# Encode whenever the column is not numeric yet. This also covers string
# dtypes that are not reported as 'object'.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    label_encoder_churn = LabelEncoder()
    df['Churn'] = label_encoder_churn.fit_transform(df['Churn'])
    print("'Churn' column encoded to numerical (0 and 1).")
    # To see the mapping: print(dict(zip(label_encoder_churn.classes_, label_encoder_churn.transform(label_encoder_churn.classes_))))

In [None]:
# Separate the target from the predictors.
y = df['Churn']

# One-hot encode the non-numeric predictors before anything is written to CSV.
# The built-in SageMaker XGBoost algorithm expects purely numeric CSV input;
# writing the raw string columns would not give the model usable categorical
# features. Encoding before the split keeps the train and test files on the
# same column layout.
# NOTE: X_train / X_test now hold the encoded, all-numeric columns, so any
# later cell that inspects their dtypes or column names sees the encoded layout.
X = pd.get_dummies(df.drop('Churn', axis=1), dtype=int)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save train and test data to CSV: target in the first column, no header and
# no index, which is the layout used for the XGBoost CSV channels.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)
train_data.to_csv('train.csv', index=False, header=False)
test_data.to_csv('test.csv', index=False, header=False)

# Upload data to S3 under the session's default bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix = 'telco-churn-xgboost'

train_s3_path = sagemaker_session.upload_data('train.csv', bucket=bucket, key_prefix=prefix)
test_s3_path = sagemaker_session.upload_data('test.csv', bucket=bucket, key_prefix=prefix)

print(f'Train data uploaded to: {train_s3_path}')
print(f'Test data uploaded to: {test_s3_path}')

In [None]:
import pandas as pd
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost import XGBoost

# Resolve the execution role and the container image of the built-in
# XGBoost algorithm (version tag "1.5-1") for the session's region.
role = get_execution_role()
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sagemaker_session.boto_region_name,
    version="1.5-1",
)

# Generic Estimator running the built-in XGBoost container on a single
# ml.m5.xlarge instance; model artifacts are written below the project prefix.
xgboost_estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session,
)

# Hyperparameters for a binary classifier evaluated on AUC.
hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 100,
    'max_depth': 5,
    'eta': 0.2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'auc',
}
xgboost_estimator.set_hyperparameters(**hyperparameters)

# Both channels are read as CSV; the uploaded test file is used as the
# validation channel.
train_input = TrainingInput(train_s3_path, content_type="csv")
test_input = TrainingInput(test_s3_path, content_type="csv")
channels = {'train': train_input, 'validation': test_input}

xgboost_estimator.fit(channels)

# Location of the trained model artifact
model_artifact = xgboost_estimator.model_data
print(f"Model artifact saved at: {model_artifact}")
print("Training complete.")

In [None]:
# Wrap the trained artifact and the XGBoost container image in a deployable
# Model; deploy() hands back an instance of the generic Predictor class.
model_settings = dict(
    model_data=model_artifact,
    image_uri=xgboost_container,
    role=role,
    sagemaker_session=sagemaker_session,
    predictor_cls=sagemaker.predictor.Predictor,
)
xgboost_model = sagemaker.model.Model(**model_settings)

# Read the artifact location back from the model object and report it.
model_artifact = xgboost_model.model_data
print(f"Model artifact saved at: {model_artifact}")

# The endpoint name carries the current epoch seconds as a suffix so that
# repeated deployments get distinct names.
endpoint_name = f"xgboost-telco-churn-{int(time.time())}"

# Deploy to a single ml.m5.xlarge instance; requests are serialized as CSV.
deploy_settings = dict(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
)
predictor = xgboost_model.deploy(**deploy_settings)
print(f"Model deployed to endpoint: {endpoint_name}")

In [None]:
# Invoke the endpoint using the endpoint name and one row of the test data.
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=CSVSerializer()
)

# Take a single test row as the sample payload and print it for debugging.
sample_row = X_test.iloc[:1]
print(f"Sample data for prediction: {sample_row.to_csv(header=False, index=False)}")

# Convert the row to numeric values. errors='coerce' turns anything that is
# not numeric into NaN, which is written to the CSV payload as an empty field.
sample_numeric = sample_row.apply(pd.to_numeric, errors='coerce')

# Report the columns that lost their value in the conversion, so that a
# payload with blanked-out features is not sent unnoticed.
coerced_mask = (sample_numeric.isna() & sample_row.notna()).any(axis=0)
coerced_columns = coerced_mask[coerced_mask].index.tolist()
if coerced_columns:
    print(f"Warning: non-numeric values were coerced to missing in columns: {coerced_columns}")

sample_data = sample_numeric.to_csv(header=False, index=False)

# Make a prediction
predicted = predictor.predict(sample_data, initial_args={'ContentType': 'text/csv'})
print(f"Predicted value: {predicted}")

In [None]:
import json

# Partition the training columns by dtype: numeric columns versus everything else.
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# One predictor entry per feature: numeric features first, then the rest
# tagged as categorical.
predictor_entries = [{"name": column, "type": "NUMERIC"} for column in numerical_features]
predictor_entries += [{"name": column, "type": "CATEGORICAL"} for column in categorical_features]

# Metadata document: the predictor list followed by a description of the
# model (objective, outcome type, expected performance and its measurement,
# framework, modelling technique). The "outcomes" object carries a "range"
# array, which is written out empty.
metadata = {
    "predictorList": predictor_entries,
    "model": {
        "objective": "Churn",
        "outcomeType": "BINARY",
        "expectedPerformance": 70,
        "expectedPerformanceMeasurement": "AUC",
        "framework": "xgboost",
        "modellingTechnique": "XGBoost",
        "outcomes": {
            "range": []
        },
    },
}

# Persist the metadata as indented JSON in the working directory.
with open("model_metadata.json", "w") as f:
    json.dump(metadata, f, indent=4)

print("Model metadata saved to 'model_metadata.json'")