# Predicting whether to contact a customer because they are at risk of churning

## Part 1: Load and examine the data

In [2]:
!pip install awswrangler

Collecting awswrangler
  Using cached awswrangler-3.14.0-py3-none-any.whl.metadata (16 kB)
Using cached awswrangler-3.14.0-py3-none-any.whl (380 kB)
Installing collected packages: awswrangler
Successfully installed awswrangler-3.14.0


In [3]:
import pandas as pd
import awswrangler as wr  # For efficient S3 reads/writes
from time import sleep
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# Setup
data_bucket = 'machine-learning-for-interview'
subfolder = 'chapter-03'
dataset = 'churn_data.csv'
processed_subfolder = f'{subfolder}/processed'  # For clarity

role = get_execution_role()
sess = Session(boto3.Session())  # Explicit session for reproducibility
region = sess.boto_region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [5]:
# Load data with AWS Data Wrangler (faster than s3fs)
s3_path = f's3://{data_bucket}/{subfolder}/{dataset}'
df = wr.s3.read_csv(path=s3_path, use_threads=True)
print(df.head())



   churned  id  customer_code                        co_name  total_spend  \
0        0   1           1826  Hoffman Martinez and Chandler     68567.34   
1        0   2            772         Lee Martin and Escobar     74335.27   
2        0   3            479       Hobbs Mcdaniel and Baker     48746.22   
3        0   4           1692                Williams-Harris     64416.70   
4        0   5           2578                    Beck-Snyder     71623.20   

   week_minus_4  week_minus_3  week_minus_2  last_week  4-3_delta  3-2_delta  \
0          0.81          0.02          0.74       1.45      -0.79       0.72   
1          1.87          1.02          1.29       1.19      -0.85       0.27   
2          1.21          0.70          1.04       2.12      -0.51       0.34   
3          0.75          2.08          2.40       2.02       1.33       0.32   
4          2.33          0.66          1.97       1.60      -1.67       1.31   

   2-1_delta  
0       0.71  
1      -0.10  
2       1.0

In [6]:
print(f'Number of rows in dataset: {df.shape[0]}')
print(df['churned'].value_counts())



Number of rows in dataset: 2999
churned
0    2833
1     166
Name: count, dtype: int64


In [None]:
## Part 2: Get the data into the right shape

In [8]:
# Drop non-feature columns
columns = df.columns.tolist()
encoded_data = df.drop(['id', 'customer_code', 'co_name'], axis=1)
print(encoded_data.head())

   churned  total_spend  week_minus_4  week_minus_3  week_minus_2  last_week  \
0        0     68567.34          0.81          0.02          0.74       1.45   
1        0     74335.27          1.87          1.02          1.29       1.19   
2        0     48746.22          1.21          0.70          1.04       2.12   
3        0     64416.70          0.75          2.08          2.40       2.02   
4        0     71623.20          2.33          0.66          1.97       1.60   

   4-3_delta  3-2_delta  2-1_delta  
0      -0.79       0.72       0.71  
1      -0.85       0.27      -0.10  
2      -0.51       0.34       1.08  
3       1.33       0.32      -0.38  
4      -1.67       1.31      -0.37  


## Part 3: Create training, validation and test data sets

In [9]:
y = encoded_data['churned']
train_df, test_and_val_data, _, _ = train_test_split(
    encoded_data, y, test_size=0.3, stratify=y, random_state=0
)

y = test_and_val_data['churned']
val_df, test_df, _, _ = train_test_split(
    test_and_val_data, y, test_size=0.333, stratify=y, random_state=0
)

print(train_df.shape, val_df.shape, test_df.shape)
print()
print(f'Number of rows in Train dataset: {train_df.shape[0]}')
print(train_df['churned'].value_counts())
print()
print(f'Number of rows in Validate dataset: {val_df.shape[0]}')
print(val_df['churned'].value_counts())
print()
print(f'Number of rows in Test dataset: {test_df.shape[0]}')
print(test_df['churned'].value_counts())

(2099, 9) (600, 9) (300, 9)

Number of rows in Train dataset: 2099
churned
0    1983
1     116
Name: count, dtype: int64

Number of rows in Validate dataset: 600
churned
0    567
1     33
Name: count, dtype: int64

Number of rows in Test dataset: 300
churned
0    283
1     17
Name: count, dtype: int64


In [10]:
# Prepare CSVs (header=False for train/val, True for test)
train_path = f's3://{data_bucket}/{processed_subfolder}/train.csv'
val_path = f's3://{data_bucket}/{processed_subfolder}/val.csv'
test_path = f's3://{data_bucket}/{processed_subfolder}/test.csv'

wr.s3.to_csv(train_df, path=train_path, header=False, index=False, use_threads=True)
wr.s3.to_csv(val_df, path=val_path, header=False, index=False, use_threads=True)
wr.s3.to_csv(test_df, path=test_path, header=True, index=False, use_threads=True)

print(f"Uploaded: {train_path}, {val_path}, {test_path}")

Uploaded: s3://machine-learning-for-interview/chapter-03/processed/train.csv, s3://machine-learning-for-interview/chapter-03/processed/val.csv, s3://machine-learning-for-interview/chapter-03/processed/test.csv


In [11]:
from sagemaker.inputs import TrainingInput

train_input = TrainingInput(
    s3_data=f's3://{data_bucket}/{processed_subfolder}/train.csv',
    content_type='csv'
)
val_input = TrainingInput(
    s3_data=f's3://{data_bucket}/{processed_subfolder}/val.csv',
    content_type='csv'
)

## Part 4: Train the model

In [14]:

from sagemaker.estimator import Estimator

sess = sagemaker.Session()
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# Modern way to get the XGBoost container
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.5-1"       # recommended stable version
)

estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",   # replacement for m4.xlarge
    output_path=f"s3://{data_bucket}/{subfolder}/output",
    sagemaker_session=sess
)

estimator.set_hyperparameters(
    max_depth=3,
    subsample=0.7,
    objective='binary:logistic',
    eval_metric='auc',
    num_round=100,
    early_stopping_rounds=10,
    scale_pos_weight=17
)

estimator.fit({'train': train_input, 'validation': val_input})


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-16-16-41-15-334


2025-11-16 16:41:17 Starting - Starting the training job...
2025-11-16 16:41:49 Downloading - Downloading input data...
2025-11-16 16:42:14 Downloading - Downloading the training image......
2025-11-16 16:43:15 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-16 16:43:08.191 ip-10-0-244-119.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-16 16:43:08.213 ip-10-0-244-119.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-16:16:43:08:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-16:16:43:08:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-16:16:43:08:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-16:16:43:08:INFO] No GPUs detected (norma

## Part 5: Host the model

In [16]:
from sagemaker.serializers import CSVSerializer

endpoint_name = "customer-churn"

# Delete old endpoint
try:
    sess.delete_endpoint(endpoint_name)
    print("Existing endpoint deleted.")
    sleep(30)
except:
    pass

predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),     # modern serializer
)


INFO:sagemaker:Deleting endpoint with name: customer-churn
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-11-16-16-54-07-191
INFO:sagemaker:Creating endpoint-config with name customer-churn
INFO:sagemaker:Creating endpoint with name customer-churn


------!

## Part 6: Test the model

In [17]:
def get_prediction(row):
    # Predict probability, threshold at 0.5
    prob = float(predictor.predict(row[1:]).decode('utf-8'))  # row[1:] skips target
    return 1 if prob > 0.5 else 0

# Load test data
test_data = wr.s3.read_csv(path=test_path, use_threads=True)

# Apply predictions
test_data['prediction'] = test_data.apply(get_prediction, axis=1)
print(test_data.head(10))

  has_multiple_rows = len(data) > 0 and self._is_sequence_like(data[0])


   churned  total_spend  week_minus_4  week_minus_3  week_minus_2  last_week  \
0        0     76897.46          0.56          2.29          1.14       2.23   
1        0     19604.63          1.95          2.04          0.82       1.62   
2        0     23369.60          1.11          1.54          1.55       1.14   
3        1     40709.47          2.40          1.87          0.07       0.61   
4        0     69953.52          2.01          1.20          1.05       1.41   
5        0     71939.07          0.54          1.17          0.21       2.29   
6        0     45930.53          0.08          1.43          0.41       1.34   
7        0     47080.25          1.54          0.68          0.80       0.54   
8        0     35506.83          1.37          0.93          1.70       0.67   
9        0     39188.12          0.40          1.86          0.10       0.82   

   4-3_delta  3-2_delta  2-1_delta  prediction  
0       1.73      -1.15       1.09           0  
1       0.09      -1.

In [18]:
print(test_data['churned'].value_counts())
print(test_data['prediction'].value_counts())
print(metrics.accuracy_score(test_data['churned'], test_data['prediction']))

churned
0    283
1     17
Name: count, dtype: int64
prediction
0    266
1     34
Name: count, dtype: int64
0.9433333333333334


In [20]:
print(metrics.confusion_matrix(test_data['churned'], test_data['prediction']))

[[266  17]
 [  0  17]]


In [21]:
# Demo confusion matrix (assuming binary; adjust if multi-class)
y_demo = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]  # Fixed to binary example
pred_demo = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
print(metrics.confusion_matrix(y_demo, pred_demo))

[[7 1]
 [1 1]]


## Remove the Endpoint (optional)
Comment out this cell to remove the endpoint if you want the endpoint to exist after "run all"

In [22]:
# Cleanup
sess.delete_endpoint(endpoint_name)

INFO:sagemaker:Deleting endpoint with name: customer-churn
