### Predicting Customer Churn
This notebook performs these steps:<br/>
1. Builds a scikit-learn model to predict customer churn
2. Stores the model into the project
3. Stores the model into the deployment space
4. Creates and tests an Online Deployment for the model
5. Creates and tests a Batch Deployment for the model

### Environment Setup

In [None]:
!pip install sklearn-pandas
# Update WML library
!pip install -U ibm-watson-machine-learning

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import sklearn.pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, LabelBinarizer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import json, requests
import matplotlib.pyplot as plt
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline


### Step 1: Load data 
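The two cells below read <b>churn.csv</b> and <b>customer-profile.csv</b> from the project's data assets, so no <b>ibm_api_key_id</b> or <b>bucket name</b> needs to be inserted. Make sure both files have been added to the project before running the cells.  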

In [None]:
# Get the project/space access object provided by ibm_watson_studio_lib.
from ibm_watson_studio_lib import access_project_or_space
wslib = access_project_or_space()

# pandas is already imported as `pd` in the imports cell at the top of the notebook,
# so it is not imported a second time here.
churn_csv_path = wslib.mount.get_data_path('churn.csv')
customer_churn = pd.read_csv(churn_csv_path)
customer_churn.head()

In [None]:
# Resolve the data asset path through wslib (created in the previous cell), the same way
# churn.csv is located, instead of hard-coding the absolute mount path.
customer = pd.read_csv(wslib.mount.get_data_path('customer-profile.csv'))
customer.head()

### Step 2: Merge Files

In [None]:
# Join the customer profiles with their churn records on the shared 'ID' column
# (pandas' default inner join).
data = customer.merge(customer_churn, on='ID')

### Step 3: Rename some columns
This step removes spaces from column names. It is an example of the data preparation you may want to do before creating a model. 

In [None]:
# List the column names of the merged DataFrame
data.columns

In [None]:
# Remove the spaces from the two affected column names. Assigning the result (rather than
# using inplace=True) avoids mutating the frame in place and keeps the call chainable.
data = data.rename(columns={'Est Income': 'EstIncome', 'Car Owner': 'CarOwner'})

In [None]:
# Preview the first rows of the DataFrame
data.head()

In [None]:
# Show the (rows, columns) dimensions of the DataFrame
data.shape

In [None]:
# Uncomment if you would like to see the profile report

#Uncomment and run once to install the package in your runtime environment
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
# Uncomment if you would like to see the profile report
#from pandas_profiling import ProfileReport
    
#profile = ProfileReport(data, title="Data Profiling Report")
#profile.to_widgets()

### Step 4: Data understanding

In [None]:
# Display summary statistics computed by DataFrame.describe()
data.describe()

In [None]:
# Bar chart of the number of customers per CHURN value, most frequent value first
churn_order = data.CHURN.value_counts().index
g1 = sns.countplot(data=data, x='CHURN', order=churn_order)
plt.title('Customer Churn Rates')
plt.ylabel('Count of Churn')
plt.ylim(0, 800)

# Label each bar with its share of all rows (one person per row)
total = float(len(data))
for bar in g1.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    g1.text(label_x, bar_height + 1, '{0:.0%}'.format(bar_height / total), ha="center")
plt.show()

In [None]:
# Box plots of average monthly spend per churn value, coloured by membership plan,
# with one panel per payment method. The trailing ';' hides the object repr.
sns.catplot(
    data=data,
    kind='box',
    x="CHURN",
    y="AvgMonthlySpend",
    hue="MembershipPlan",
    col="Paymethod",
    height=7,
    aspect=.81,
);

### Step 5: Build the sklearn pipeline and the Random Forest model


In [None]:
# Model inputs: every column except the identifier and the target
X = data.drop(columns=['ID', 'CHURN'])

In [None]:
# Encode the CHURN target as integers between 0 and n_classes-1, that is from T/F to 1/0
le = LabelEncoder()
le.fit(data['CHURN'])
y = le.transform(data['CHURN'])

In [None]:
# Show which original CHURN label the encoded values 0 and 1 stand for
label_mapping = le.inverse_transform([0, 1])
for encoded_value in (0, 1):
    print('{}: '.format(encoded_value), label_mapping[encoded_value])

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

#### Use the DataFrameMapper class to declare transformations and variable imputations.

* LabelBinarizer - Converts a categorical variable into a dummy variable (aka binary variable)
* StandardScaler - Standardize features by removing the mean and scaling to unit variance, z = (x - u) / s

See docs: 
* https://github.com/scikit-learn-contrib/sklearn-pandas
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html#sklearn.preprocessing.LabelBinarizer
* https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [None]:

# Columns that get a LabelBinarizer, in the order they appear in the mapper output
categorical_columns = ['Gender', 'Status', 'CarOwner', 'Paymethod', 'MembershipPlan']
# Columns that get a StandardScaler, in the order they appear in the mapper output
numeric_columns = ['Children', 'EstIncome', 'Age', 'AvgMonthlySpend', 'CustomerSupportCalls']

# One transformer instance per column; default=False drops any column not listed above
mapper_good = DataFrameMapper(
    [([column], LabelBinarizer()) for column in categorical_columns]
    + [([column], StandardScaler()) for column in numeric_columns],
    default=False,
)


In [None]:
# Classifier with a fixed seed so repeated runs build the same forest
random_forest = RandomForestClassifier(random_state=5)

# The pipeline first applies the column mapper, then feeds its output to the classifier
steps = [
    ('mapper', mapper_good),
    ('RandonForestClassifier', random_forest),
]
pipeline = sklearn.pipeline.Pipeline(steps)

# Fit on the training split; fit() returns the fitted pipeline
model = pipeline.fit(X_train, y_train)

model

In [None]:
# Repeat the label mapping next to the model results to help interpret them
label_mapping = le.inverse_transform([0, 1])
for code, original_label in enumerate(label_mapping):
    print(str(code) + ': ', original_label)

In [None]:
# Predict the held-out test set with the fitted pipeline
y_prediction = pipeline.predict(X_test)

# Compare the predictions with the true labels and print the resulting text report
report = sklearn.metrics.classification_report(y_test, y_prediction)
print(report)

###  Step 6:  Tune the model to find the best model

In [None]:
# List keys to the model param to tune
#model.get_params().keys()

In [None]:
# Hyper-parameter grid for the search. Keys follow the '<pipeline step name>__<parameter>'
# convention, so they address the 'RandonForestClassifier' step of the pipeline.
parameters = dict(
    RandonForestClassifier__max_depth=[5, 8, 10],
    RandonForestClassifier__n_estimators=[150, 180, 200],
)

In [None]:
# Grid search over `parameters` with the fitted pipeline (`model`) as estimator and 3-fold cross-validation
grid_obj = GridSearchCV(estimator=model, param_grid=parameters,  cv=3)

In [None]:
# Fit the grid search object to the training data to find the optimal parameters
grid_fit = grid_obj.fit(X_train,y_train)


In [None]:
# Get the best estimator found by the grid search
best_clf = grid_fit.best_estimator_

In [None]:
# Predict the test set with the best estimator from the grid search
best_predictions = best_clf.predict(X_test)

In [None]:
# Build the classification report for the best estimator's test-set predictions
best_predictions_report = sklearn.metrics.classification_report( y_test, best_predictions )

In [None]:
# Print the report of the grid-search best estimator
print('Results of best fitted model: \n\n',best_predictions_report)

In [None]:
# Print the report of the default (untuned) pipeline for comparison
print('Results of default model: \n\n',report)

In [None]:
# Fetch the 'mapper' step (the DataFrameMapper) from the default pipeline
m_step=pipeline.named_steps['mapper']

In [None]:
# Show the output column names recorded by the mapper
m_step.transformed_names_

In [None]:
# Keep the mapper's output column names as the feature labels for the importance plot
features = m_step.transformed_names_

In [None]:
# Get the feature importances of the whole random forest.
# named_steps[...] already returns the fitted RandomForestClassifier. Indexing it with [1]
# selects a single decision tree of the ensemble, so the importances would describe only
# that one tree rather than the forest.
importances = pipeline.named_steps['RandonForestClassifier'].feature_importances_
# Feature positions ordered from least to most important
indices = np.argsort(importances)

In [None]:
# Horizontal bar chart of the importances, one bar per transformed feature,
# drawn in the order given by `indices`
plt.figure(1)
bar_positions = range(len(indices))
ranked_labels = (np.array(features))[indices]
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, ranked_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')

### Step 7: Save Model in the Project and WML Deployment Space


<div class="alert alert-block alert-info">
You have a choice to either save the model in the <b>project</b> OR the <b>deployment space</b>:<br><br>
    <li> If you're saving your model in your project, you have to set the default project using the python client.</li><br>
    <li>If you're saving the model in the deployment space, first, we will check if an existing deployment space is already associated with this project and set the associated deployment space as the default space.  If this project is not yet associated with a deployment space, we will create a deployment space. From there you'll be able to deploy and score the model in your deployment space.</li></div>


In [None]:
# Create the WML API client from values supplied by the runtime environment and
# make the current project the client's default location
from ibm_watson_machine_learning import APIClient
import os

# User token and platform URL are read from environment variables
token = os.environ['USER_ACCESS_TOKEN']
platform_url = os.environ['RUNTIME_ENV_APSX_URL']

wml_credentials = dict(
    token=token,
    instance_id="openshift",
    url=platform_url,
    version="4.0",
)

client = APIClient(wml_credentials)

# The id of the project this notebook runs in is also read from the environment
project_id = os.environ['PROJECT_ID']
client.set.default_project(project_id)

In [None]:
# Note - the line below is commented out because we are saving to a Deployment Space. If you would like to save to the project instead, uncomment it.
# The "pc" object is generated by "Insert project token" action from the top menu. Make sure to insert the code and run it if you want to save to the project. 
# client.set.default_project(pc.project_id)

# IMPORTANT
# Replace the space_uid value with the Space ID that you looked up on the Settings tab of your Deployment Space
space_uid='replace_with_your_space_id'

# Set the deployment space as the client's default. The store_model call in the next cell uses the client's default location.
# NOTE(review): the previous cell already called client.set.default_project(project_id); presumably the default set last takes precedence - confirm.
client.set.default_space(space_uid)

In [None]:
# Provide metadata and save the trained pipeline into the repository.
# After running this cell, the model will be displayed in the Assets view.

model_name = 'customer_churn_model_1'
# NOTE(review): the software specification name and the model type below name specific
# Python / scikit-learn releases - confirm they match the runtime used to train the model.
software_spec_uid = client.software_specifications.get_uid_by_name('default_py3.7_opence')

# Model metadata: display name, software specification and model type
meta_names = client.repository.ModelMetaNames
metadata = {
    meta_names.NAME: model_name,
    meta_names.SOFTWARE_SPEC_UID: software_spec_uid,
    meta_names.TYPE: "scikit-learn_0.23",
}

# The default pipeline is stored together with the training data and target
stored_model_details = client.repository.store_model(
    pipeline,
    meta_props=metadata,
    training_data=X_train,
    training_target=y_train,
)

You can choose to stop here, navigate to the project and promote/deploy the saved model into the deployment space using the UI, or continue executing the code cells below to deploy the model programmatically.

### Step 8: Create an Online Deployment for the stored model

In [None]:
# Look up the id of the stored model and create an online deployment for it
model_uid = client.repository.get_model_uid(stored_model_details)

deployment_meta = client.deployments.ConfigurationMetaNames
online_meta_props = {
    deployment_meta.NAME: "Churn Deployment via API-Online",
    deployment_meta.ONLINE: {},
}
deployment = client.deployments.create(artifact_uid=model_uid, meta_props=online_meta_props)

### Step 8 (continued): Test the Online Deployment by Sending a Score Request (with data) to the Scoring Endpoint

In [None]:
# Extract the id of the online deployment; the scoring call below needs it
deployment_id = client.deployments.get_id(deployment)

In [None]:
# One sample customer; the values are listed in the same order as the field names
scoring_fields = ['Gender', 'Status', 'Children', 'EstIncome', 'CarOwner', 'Age', 'AvgMonthlySpend', 'CustomerSupportCalls', 'Paymethod', 'MembershipPlan']
scoring_values = [['M','S',2.0,25000,'Y',25,10,1,'CC',1]]

scoring_data = {
    client.deployments.ScoringMetaNames.INPUT_DATA: [
        {'fields': scoring_fields, 'values': scoring_values}
    ]
}

# Send the payload to the online deployment and print the raw response
predictions = client.deployments.score(deployment_id, scoring_data)
print(predictions)

In [None]:
# Take the first value of the first row of the first prediction entry in the response,
# then map it back to the original CHURN label with the fitted encoder
first_prediction = predictions.get('predictions')[0]
first_row = first_prediction.get('values')[0]
predicted_value = first_row[0]
le.inverse_transform([predicted_value])

### Step 9: Create a Batch Deployment for the stored model

In [None]:
# Choose a deployment name and description for the BATCH deployment

deployment_name = "Churn Deployment via API-Batch"
deployment_desc = 'Churn Model deployed for Batch scoring using a small configuration'

In [None]:
# Create the deployment metadata and then create the BATCH deployment

# Create the metadata
meta_props = {
    client.deployments.ConfigurationMetaNames.NAME: deployment_name,
    client.deployments.ConfigurationMetaNames.DESCRIPTION: deployment_desc,
    client.deployments.ConfigurationMetaNames.BATCH: {},
    # NOTE(review): the WML client's hardware-spec schema appears to name the node count "num_nodes";
    # confirm that the "nodes" key below is honoured by your client/service version.
    client.deployments.ConfigurationMetaNames.HARDWARE_SPEC:{
         "name": "XS",       # XS, S, M, L, XL
         "nodes": 2
     }
}

# Create the deployment (model_uid was set when the online deployment was created)
#model_uid = published_model_details["metadata"]["id"]
deployment_details = client.deployments.create( artifact_uid=model_uid, meta_props=meta_props)

### Step 10: Create a Job to run the Batch Deployment

#### Obtain the BATCH Deployment UID

In [None]:
# Inspect the details returned when the batch deployment was created
deployment_details

In [None]:
# The job needs to be linked to the batch deployment it is going to run, so read that
# deployment's id from the "metadata" section of the batch deployment details
batch_metadata = deployment_details["metadata"]
batch_deployment_uid = batch_metadata["id"]
batch_deployment_uid

#### Obtain the Data Asset Information Required to Create the Job

<font color = blue>This example assumes that the input file for batch scoring "new_customers.csv" is already in the deployment space.<br>
Whilst files/data connections can be promoted to the deployment space using APIs, this file was manually promoted to the deployment space for the GUI based deployment earlier in the lab.</font>

In [None]:
# List the data assets so the input file's name and asset id can be looked up
client.data_assets.list()

In [None]:
# Copy the data file name and asset id from the output of the previous cell into the assignments below:
input_file = 'new_customers.csv'                     #update this filename if you are not using the default file promoted in the last section of the lab
# Placeholder - must be replaced with the asset id of the input file before running the next cell
asset_id = 'replace_with_id_from_the_output_above'

In [None]:
# Fetch the details of the chosen data asset and take its href, which the job payload
# uses to reference the input data
data_asset = client.data_assets.get_details(asset_id)
input_data_href = client.data_assets.get_href(data_asset)
print('Input Data HREF is: {}'.format(input_data_href))


#### Create the JOB Payload metadata

In [None]:
# Specify the name and description of the CSV file which will contain the results of the scoring process.
# Both values are placed in the output data reference of the job payload.
batch_output_file = "Churn Results - Batch API.csv"
batch_output_desc = "Data file containing the scoring results of the churn model processed via API"

In [None]:
# Describe where the batch job reads its input and where it writes its results

# Input: the data asset referenced by its href
input_reference = {
    "name": input_file,
    "type": "data_asset",
    "connection": {},
    "location": {"href": input_data_href},
}

# Output: a data asset identified by the chosen file name and description
output_reference = {
    "type": "data_asset",
    "connection": {},
    "location": {
        "name": batch_output_file,
        "description": batch_output_desc,
    },
}

scoring_meta = client.deployments.ScoringMetaNames
job_payload_ref = {
    scoring_meta.INPUT_DATA_REFERENCES: [input_reference],
    scoring_meta.OUTPUT_DATA_REFERENCE: output_reference,
}

#### Create the JOB
<font color=blue><b>Note:</b> The JOB automatically executes upon creation</font><br><br>


In [None]:
# Create the job for the batch deployment, passing the input/output references defined in job_payload_ref

job = client.deployments.create_job(deployment_id=batch_deployment_uid,meta_props=job_payload_ref)

#### Check to see if the job has successfully completed

In [None]:
# Find the job ID from the details returned by create_job and print it

job_id = client.deployments.get_job_uid(job)
print(job_id)

In [None]:
# Find the status of the job which has just been created and executed.
# Re-run this cell to refresh the status.
client.deployments.get_job_status(job_id)

**Author:**  Sidney Phoon and Elena Lowery <br/>
**Date:**  September 2021