# Predicting whether to contact a customer because they are at risk of churning

For updates on how SageMaker or AWS behavior has changed since this notebook code was written, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-3/v-5/119

## Part 1: Load and examine the data

In [1]:
# S3 location of the raw input data: bucket name, key prefix and CSV file name.
# The cells below combine these into the S3 paths they read from and write to.
data_bucket = 'mlforbusiness'
subfolder = 'ch03'
dataset = 'churn_data.csv'

In [2]:
import pandas as pd
from time import sleep

import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# Execution role reported by the SageMaker SDK; handed to the estimator in Part 4.
role = sagemaker.get_execution_role()
# Authenticated (non-anonymous) S3 filesystem handle used to write and re-read the processed CSV files.
s3 = s3fs.S3FileSystem(anon=False)

In [3]:
# Load the raw churn data directly from S3 into a DataFrame and preview the first rows.
df = pd.read_csv(f's3://{data_bucket}/{subfolder}/{dataset}')
df.head()

Unnamed: 0,churned,id,customer_code,co_name,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta
0,0,1,1826,Hoffman Martinez and Chandler,68567.34,0.81,0.02,0.74,1.45,-0.79,0.72,0.71
1,0,2,772,Lee Martin and Escobar,74335.27,1.87,1.02,1.29,1.19,-0.85,0.27,-0.1
2,0,3,479,Hobbs Mcdaniel and Baker,48746.22,1.21,0.7,1.04,2.12,-0.51,0.34,1.08
3,0,4,1692,Williams-Harris,64416.7,0.75,2.08,2.4,2.02,1.33,0.32,-0.38
4,0,5,2578,Beck-Snyder,71623.2,2.33,0.66,1.97,1.6,-1.67,1.31,-0.37


In [4]:
# Report the overall size of the dataset, followed by how many rows carry
# each value of the 'churned' label.
row_count = df.shape[0]
print(f'Number of rows in dataset: {row_count}')
churn_counts = df['churned'].value_counts()
print(churn_counts)

Number of rows in dataset: 2999
0    2833
1     166
Name: churned, dtype: int64


## Part 2: Get the data into the right shape

In [5]:
# Keep a list of the original column names (not used by the visible cells).
columns = df.columns.tolist()
# Remove the identifier columns so only the label and the numeric columns remain.
identifier_columns = ['id', 'customer_code', 'co_name']
encoded_data = df.drop(columns=identifier_columns)
encoded_data.head()

Unnamed: 0,churned,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta
0,0,68567.34,0.81,0.02,0.74,1.45,-0.79,0.72,0.71
1,0,74335.27,1.87,1.02,1.29,1.19,-0.85,0.27,-0.1
2,0,48746.22,1.21,0.7,1.04,2.12,-0.51,0.34,1.08
3,0,64416.7,0.75,2.08,2.4,2.02,1.33,0.32,-0.38
4,0,71623.2,2.33,0.66,1.97,1.6,-1.67,1.31,-0.37


## Part 3: Create training, validation and test data sets

In [6]:
# First split: hold out 30% of the rows, stratified on the label so every
# subset keeps the same churned / not-churned proportions.
y = encoded_data['churned']
train_df, test_and_val_data, _, _ = train_test_split(encoded_data, y, test_size=0.3, stratify=y, random_state=0)

# Second split: divide the held-out rows into validation (about two thirds)
# and test (about one third), again stratified on the label.
y = test_and_val_data['churned']
val_df, test_df, _, _ = train_test_split(test_and_val_data, y, test_size=0.333, stratify=y, random_state=0)

print(train_df.shape, val_df.shape, test_df.shape)
# Summarise each split. The messages are f-strings so the row counts are
# interpolated instead of being printed as literal '{...}' placeholders.
for split_name, split_df in (('Train', train_df), ('Validate', val_df), ('Test', test_df)):
    print()
    print(f'Number of rows in {split_name} dataset: {split_df.shape[0]}')
    print(split_df['churned'].value_counts())

(2099, 9) (600, 9) (300, 9)

Number of rows in Train dataset: {train_df.shape[0]}
0    1983
1     116
Name: churned, dtype: int64

Number of rows in Validate dataset: {val_df.shape[0]}
0    567
1     33
Name: churned, dtype: int64

Number of rows in Test dataset: {test_df.shape[0]}
0    283
1     17
Name: churned, dtype: int64


In [7]:
# Serialise each split to CSV bytes. The training and validation files are
# written without a header row; the test file keeps its header because it is
# loaded back with pd.read_csv in Part 6.
headerless = {'header': False, 'index': False}
train_data = train_df.to_csv(None, **headerless).encode()
val_data = val_df.to_csv(None, **headerless).encode()
test_data = test_df.to_csv(None, header=True, index=False).encode()

# Upload the three files under the shared "processed" prefix.
processed_path = f'{data_bucket}/{subfolder}/processed'
uploads = (('train.csv', train_data), ('val.csv', val_data), ('test.csv', test_data))
for file_name, payload in uploads:
    with s3.open(f'{processed_path}/{file_name}', 'wb') as f:
        f.write(payload)

# Describe the training and validation CSV inputs that are passed to estimator.fit.
train_input = sagemaker.s3_input(s3_data=f's3://{processed_path}/train.csv', content_type='csv')
val_input = sagemaker.s3_input(s3_data=f's3://{processed_path}/val.csv', content_type='csv')

## Part 4: Train the model

In [8]:
# Session object used for training here and for endpoint deletion in later cells.
sess = sagemaker.Session()

# Look up the XGBoost container image for the region this notebook runs in.
container = sagemaker.amazon.amazon_estimator.get_image_uri(
                boto3.Session().region_name,
                'xgboost',
                'latest')

# Estimator that trains on a single ml.m4.xlarge instance and writes the
# resulting model artefacts under the "output" prefix of the data bucket.
estimator = sagemaker.estimator.Estimator(
                        container, 
                        role,
                        train_instance_count=1, 
                        train_instance_type='ml.m4.xlarge',
                        output_path=f's3://{data_bucket}/{subfolder}/output',
                        sagemaker_session=sess)

# Binary classifier evaluated on AUC. scale_pos_weight=17 is close to the
# ratio of non-churned to churned rows in the dataset (2833 / 166), which
# presumably is why it was chosen to offset the class imbalance.
estimator.set_hyperparameters(
                        max_depth=3,
                        subsample=0.7,
                        objective='binary:logistic',
                        eval_metric='auc',
                        num_round=100,
                        early_stopping_rounds=10,
                        scale_pos_weight=17)

# Start the training job with the train and validation channels defined in Part 3.
estimator.fit({'train': train_input, 'validation': val_input})

2019-07-28 03:06:41 Starting - Starting the training job...
2019-07-28 03:06:42 Starting - Launching requested ML instances......
2019-07-28 03:08:10 Starting - Preparing the instances for training.........
2019-07-28 03:09:34 Downloading - Downloading input data
2019-07-28 03:09:34 Training - Downloading the training image...
2019-07-28 03:10:05 Uploading - Uploading generated training model
2019-07-28 03:10:05 Completed - Training job completed

[31mArguments: train[0m
[31m[2019-07-28:03:09:55:INFO] Running standalone xgboost training.[0m
[31m[2019-07-28:03:09:55:INFO] File size need to be processed in the node: 0.12mb. Available memory size in the node: 8444.15mb[0m
[31m[2019-07-28:03:09:55:INFO] Determined delimiter of CSV input is ','[0m
[31m[03:09:55] S3DistributionType set as FullyReplicated[0m
[31m[03:09:55] 2099x8 matrix with 16792 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-07-28:03:09:55:INFO] Determined delimi

## Part 5: Host the model

In [9]:
endpoint_name = 'customer-churn'

# Best-effort cleanup: remove any endpoint left over from a previous run so the
# name can be reused by the deploy step. Only ordinary errors are tolerated
# (typically there is simply no such endpoint yet); KeyboardInterrupt and
# SystemExit are no longer swallowed, and the reason for a failed delete is
# reported instead of being silently discarded.
try:
    sess.delete_endpoint(endpoint_name)
except Exception as err:
    print(f'No existing endpoint was deleted ({type(err).__name__}: {err})')
else:
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    # Pause before the next cell runs; presumably this gives the deletion time to complete.
    sleep(30)

In [10]:
# Host the trained model behind an endpoint named `endpoint_name`, served by
# one ml.m4.xlarge instance; the returned predictor is used to query it.
predictor = estimator.deploy(
                initial_instance_count=1,
                instance_type='ml.m4.xlarge', 
                endpoint_name=endpoint_name)

--------------------------------------------------------------------------------------------------------------!

In [11]:
# NOTE(review): json_serializer is imported but not used by any visible cell.
from sagemaker.predictor import csv_serializer, json_serializer
# Send each request to the endpoint as CSV text.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
# No deserializer is set: get_prediction decodes the raw response itself.
predictor.deserializer = None

## Part 6: Test the model

In [12]:
def get_prediction(row, threshold=0.5):
    """Return the predicted class (1 or 0) for one row of the test data.

    The first element of ``row`` is skipped (in the test data written by
    this notebook it is the ``churned`` label) and the remaining values are
    sent to the hosted model through the module-level ``predictor``. The
    response is decoded as UTF-8 text and parsed as a float score.

    Args:
        row: One record, label first and feature values after it.
        threshold: Score above which the row is classified as 1. Defaults
            to 0.5, the cut-off this function previously hard-coded.

    Returns:
        1 if the score is strictly greater than ``threshold``, otherwise 0.
    """
    features = row[1:]
    prob = float(predictor.predict(features).decode('utf-8'))
    return 1 if prob > threshold else 0

# Reload the held-out test set from S3.
test_path = f'{data_bucket}/{subfolder}/processed/test.csv'
with s3.open(test_path) as f:
    test_data = pd.read_csv(f)

# Ask the endpoint for a prediction on every row and store the 0/1 result
# next to the true label, then show the first ten rows.
row_predictions = test_data.apply(get_prediction, axis=1)
test_data['prediction'] = row_predictions
test_data[:10]

Unnamed: 0,churned,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta,prediction
0,0,76897.46,0.56,2.29,1.14,2.23,1.73,-1.15,1.09,0
1,0,19604.63,1.95,2.04,0.82,1.62,0.09,-1.22,0.8,0
2,0,23369.6,1.11,1.54,1.55,1.14,0.43,0.01,-0.41,0
3,1,40709.47,2.4,1.87,0.07,0.61,-0.53,-1.8,0.54,1
4,0,69953.52,2.01,1.2,1.05,1.41,-0.81,-0.15,0.36,0
5,0,71939.07,0.54,1.17,0.21,2.29,0.63,-0.96,2.08,0
6,0,45930.53,0.08,1.43,0.41,1.34,1.35,-1.02,0.93,0
7,0,47080.25,1.54,0.68,0.8,0.54,-0.86,0.12,-0.26,0
8,0,35506.83,1.37,0.93,1.7,0.67,-0.44,0.77,-1.03,0
9,0,39188.12,0.4,1.86,0.1,0.82,1.46,-1.76,0.72,0


In [13]:
# Compare how many rows actually churned with how many were predicted to churn,
# then print the overall accuracy.
# NOTE: the classes are heavily imbalanced (283 of the 300 test rows did not
# churn), so predicting 0 for every row would already score about 0.943;
# accuracy should be read together with the confusion matrix.
print(test_data['churned'].value_counts())
print(test_data['prediction'].value_counts())
print(metrics.accuracy_score(test_data['churned'],test_data['prediction']))

0    283
1     17
Name: churned, dtype: int64
0    267
1     33
Name: prediction, dtype: int64
0.9466666666666667


In [14]:
# Confusion matrix for the test set: rows are the actual 'churned' values and
# columns are the predicted values, in the order 0 then 1.
print(metrics.confusion_matrix(test_data['churned'],test_data['prediction']))

[[267  16]
 [  0  17]]


In [19]:
# Toy example with three classes (0, 1, 2) showing how confusion_matrix lays
# out its result: one row per actual class, one column per predicted class.
# Note that this rebinds the name `y` used by the split cell in Part 3.
y = [1,0,0,0,0,0,0,0,0,2]
pred = [0,0,0,0,0,0,0,0,1,2]
print(metrics.confusion_matrix(y,pred))

[[7 1 0]
 [1 0 0]
 [0 0 1]]


## Remove the Endpoint (optional)
Running this cell deletes the endpoint. Comment it out if you want the endpoint to still exist after "Run All".

In [None]:
# Delete the hosted endpoint created in Part 5.
sess.delete_endpoint(endpoint_name)