# Predicting whether to contact a customer because they are at risk of churning (v2)

For updates on how SageMaker or AWS behavior differs from what the notebook code expects, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-3/v-5/119

## Part 1: Load and examine the data

In [None]:
# S3 location of the input data; together these form s3://<data_bucket>/<subfolder>/<dataset>.
data_bucket = 'doughudgeon-mlforbusiness' # change this to the name of your own bucket
subfolder = 'ch03'
dataset = 'churn_data.csv'

In [None]:
import sys
from time import sleep

import boto3
import pandas as pd
import s3fs
import sagemaker
from sklearn import metrics
from sklearn.model_selection import train_test_split

# This notebook is written against version 2.x of the SageMaker Python SDK.
# If the installed major version is not 2, run a pip upgrade of the sagemaker
# package with the notebook's own interpreter; the kernel then has to be
# restarted so the newly installed package is the one that gets imported.
if int(sagemaker.__version__.split('.')[0]) == 2:
    print("Version is good")
else:
    # `!` is IPython shell syntax, so this line only works inside a notebook.
    !{sys.executable} -m pip install --upgrade sagemaker
    print("Installing latest SageMaker Version. Please restart the kernel")

In [None]:
# Read the raw CSV straight from S3 into a DataFrame and preview the first rows.
df = pd.read_csv(f's3://{data_bucket}/{subfolder}/{dataset}')
df.head()

In [None]:
# Report the dataset size and how the rows are distributed over the values of
# the 'churned' target column.
print(f'Number of rows in dataset: {df.shape[0]}')
print(df['churned'].value_counts())

## Part 2: Get the data into the right shape

In [None]:
# Remember the original column names before any columns are removed.
columns = df.columns.tolist()

# Drop the identifier columns and keep everything else for modelling.
encoded_data = df.drop(columns=['id', 'customer_code', 'co_name'])
encoded_data.head()

## Part 3: Create training, validation and test data sets

In [None]:
# Hold out 30% of the rows. Stratifying on the target keeps the proportion of
# churned customers the same in every split; random_state makes it repeatable.
y = encoded_data['churned']
train_df, test_and_val_data, _, _ = train_test_split(encoded_data, y, test_size=0.3, stratify=y, random_state=0)

# Split the held-out rows again: about two thirds become the validation set
# and about one third (test_size=0.333) becomes the test set.
y = test_and_val_data['churned']
val_df, test_df, _, _ = train_test_split(test_and_val_data, y, test_size=0.333, stratify=y, random_state=0)

# Summarise each split. These messages must be f-strings, otherwise the
# literal placeholder text is printed instead of the row counts.
print(train_df.shape, val_df.shape, test_df.shape)
print()
print(f'Number of rows in Train dataset: {train_df.shape[0]}')
print(train_df['churned'].value_counts())
print()
print(f'Number of rows in Validate dataset: {val_df.shape[0]}')
print(val_df['churned'].value_counts())
print()
print(f'Number of rows in Test dataset: {test_df.shape[0]}')
print(test_df['churned'].value_counts())

In [None]:
# Authenticated S3 filesystem handle used to write (and later read) the
# processed files. It has to be created here: nothing earlier defines `s3`.
s3 = s3fs.S3FileSystem(anon=False)

# Serialise each split to CSV bytes. The train and validation files are
# written without a header row; the test file keeps its header so it can be
# read back later with column names.
train_data = train_df.to_csv(None, header=False, index=False).encode()
val_data = val_df.to_csv(None, header=False, index=False).encode()
test_data = test_df.to_csv(None, header=True, index=False).encode()

with s3.open(f'{data_bucket}/{subfolder}/processed/train.csv', 'wb') as f:
    f.write(train_data)

with s3.open(f'{data_bucket}/{subfolder}/processed/val.csv', 'wb') as f:
    f.write(val_data)

with s3.open(f'{data_bucket}/{subfolder}/processed/test.csv', 'wb') as f:
    f.write(test_data)

# Point the training job at the uploaded train and validation files.
train_input = sagemaker.TrainingInput(s3_data=f's3://{data_bucket}/{subfolder}/processed/train.csv', content_type='csv')
val_input = sagemaker.TrainingInput(s3_data=f's3://{data_bucket}/{subfolder}/processed/val.csv', content_type='csv')

## Part 4: Train the model

In [None]:
sess = sagemaker.Session()

# IAM role passed to the estimator. It has to be looked up here: nothing
# earlier defines `role`.
# NOTE(review): get_execution_role() assumes the notebook runs inside
# SageMaker; outside of it, supply a role ARN explicitly.
role = sagemaker.get_execution_role()

# Container image for the built-in XGBoost algorithm in the current region.
container = sagemaker.image_uris.retrieve(
                'xgboost',
                boto3.Session().region_name,
                'latest')

# One ml.m4.xlarge training instance; the trained model artifact is written
# under the `output` prefix of the data bucket.
estimator = sagemaker.estimator.Estimator(
                        container,
                        role,
                        instance_count=1,
                        instance_type='ml.m4.xlarge',
                        output_path=f's3://{data_bucket}/{subfolder}/output',
                        sagemaker_session=sess)

# Binary classifier evaluated on AUC, with up to 100 boosting rounds and
# early stopping after 10 rounds.
# NOTE(review): scale_pos_weight=17 presumably offsets the imbalance between
# churned and non-churned rows - confirm against the value counts above.
estimator.set_hyperparameters(
                        max_depth=3,
                        subsample=0.7,
                        objective='binary:logistic',
                        eval_metric='auc',
                        num_round=100,
                        early_stopping_rounds=10,
                        scale_pos_weight=17)

# Launch the training job with the train and validation channels.
estimator.fit({'train': train_input, 'validation': val_input})

## Part 5: Host the model

In [None]:
endpoint_name = 'customer-churn'

# Best-effort cleanup: if an endpoint with this name already exists, delete it
# so the deploy step can reuse the name. A failed delete (typically because no
# such endpoint exists) is deliberately ignored. Catching Exception rather
# than using a bare `except:` lets Ctrl-C / kernel interrupts still propagate.
try:
    sess.delete_endpoint(endpoint_name)
except Exception:
    pass
else:
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    # Give the deletion time to complete before the name is reused.
    sleep(30)

In [None]:
# Deploy the trained model to an endpoint named `endpoint_name`, backed by a
# single ml.m4.xlarge instance, and keep the returned predictor for
# sending inference requests.
predictor = estimator.deploy(
                initial_instance_count=1,
                instance_type='ml.m4.xlarge', 
                endpoint_name=endpoint_name)

In [None]:
# Use the SDK's CSV serializer for the data passed to predictor.predict().
from sagemaker.serializers import CSVSerializer
predictor.serializer = CSVSerializer()

## Part 6: Test the model

In [None]:
def get_prediction(row):
    """Return the predicted class (0 or 1) for one row of the test data.

    Every value of ``row`` except the first is sent to the endpoint, and the
    response is decoded as a probability. A probability above 0.5 maps to 1,
    anything else to 0.

    NOTE(review): the first value is skipped on the assumption that it is the
    'churned' label - confirm against the column order of the test file.
    """
    features = row[1:]
    response = predictor.predict(features)
    probability = float(response.decode('utf-8'))
    if probability > 0.5:
        return 1
    return 0

# Load the held-out test set (written with a header row) back from S3. Reading
# through pandas with an s3:// URL mirrors how the raw dataset is loaded and
# does not depend on a separately created filesystem handle.
test_data = pd.read_csv(f's3://{data_bucket}/{subfolder}/processed/test.csv')

# Call the endpoint once per row and store the predicted class next to the
# actual values, then preview the first ten rows.
test_data['prediction'] = test_data.apply(get_prediction, axis=1)
test_data[:10]

In [None]:
# Compare the distribution of actual labels with the distribution of predicted
# labels, then print the accuracy score of the predictions against the labels.
print(test_data['churned'].value_counts())
print(test_data['prediction'].value_counts())
print(metrics.accuracy_score(test_data['churned'],test_data['prediction']))

In [None]:
# Confusion matrix of actual ('churned') versus predicted labels.
print(metrics.confusion_matrix(test_data['churned'],test_data['prediction']))

In [None]:
# Small hand-made three-class example for learning to read a confusion matrix.
# Per the scikit-learn documentation, rows are the actual labels and columns
# are the predicted labels, so this prints [[7 1 0], [1 0 0], [0 0 1]].
y = [1,0,0,0,0,0,0,0,0,2]
pred = [0,0,0,0,0,0,0,0,1,2]
print(metrics.confusion_matrix(y,pred))

## Remove the Endpoint (optional)
Running this cell removes the endpoint. Comment it out if you want the endpoint to still exist after "Run All".

In [None]:
# Delete the hosted endpoint created above so it does not keep running.
sess.delete_endpoint(endpoint_name)