### Predicting Customer Churn

### Environment Setup

In [None]:
#Uncomment and run once to install the package in your runtime environment
!pip install sklearn-pandas

In [None]:
!pip install -U ibm-watson-machine-learning

In [None]:
import pandas as pd
import numpy as np
import pandas_profiling
import sklearn.pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, LabelBinarizer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import json
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline


### Step 1: Load data 

#### 1.1: Download the data files

In [None]:
# Resolve the location of the churn data file through the Watson Studio
# project library and read it into a DataFrame.
import pandas as pd
from ibm_watson_studio_lib import access_project_or_space

wslib = access_project_or_space()

churn_csv_path = wslib.mount.get_data_path('churn.csv')
customer_churn = pd.read_csv(churn_csv_path)
customer_churn.head()


In [None]:
# Read the customer profile file from the path reported by the project library
customer_profile_csv_path = wslib.mount.get_data_path('customer-profile.csv')
customer = pd.read_csv(customer_profile_csv_path)
customer.head()

In [None]:
# Alternative load of the customer profile from a fixed filesystem path
# (presumably for runtimes that expose data assets under /project_data - confirm).
# The file is only re-read when that path actually exists; otherwise the
# `customer` frame loaded by the previous cell is kept and this cell does not
# fail with FileNotFoundError.
import os

legacy_customer_profile_path = '/project_data/data_asset/customer-profile.csv'
if os.path.exists(legacy_customer_profile_path):
    customer = pd.read_csv(legacy_customer_profile_path)
customer.head()

### Step 2: Merge Files

In [None]:
# Join the customer profile with the churn labels on the shared 'ID' column
# (pandas' default join type, i.e. an inner join)
data = customer.merge(customer_churn, on='ID')

### Step 3: Rename some columns
This step removes spaces from column names. It is an example of the data preparation you may want to do before creating a model.

In [None]:
# List the column labels of the merged frame to spot names that contain spaces
data.columns

In [None]:
# Drop the spaces from the two column names that contain them
columns_without_spaces = {'Est Income': 'EstIncome', 'Car Owner': 'CarOwner'}
data = data.rename(columns=columns_without_spaces)

In [None]:
# Preview the first rows to confirm the renamed columns
data.head()

In [None]:
# (number of rows, number of columns) of the merged data set
data.shape

### Step 4: Data understanding

In [None]:
# Summary statistics of the merged data set (pandas describes numeric columns by default)
data.describe()

In [None]:
# Uncomment if you would like to see the profile report

#Uncomment and run once to install the package in your runtime environment
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
# Uncomment if you would like to see the profile report
#from pandas_profiling import ProfileReport
    
#profile = ProfileReport(data, title="Data Profiling Report")
#profile.to_widgets()

### Step 5: Build the sklearn pipeline and the Random Forest model


In [None]:
# Model inputs: every column except the row identifier and the target
X = data.drop(columns=['ID', 'CHURN'])

In [None]:
# Target variable: fit a LabelEncoder on the CHURN column, then map each label
# to an integer between 0 and n_classes-1 (that is, from T/F to 1/0)
le = LabelEncoder().fit(data['CHURN'])
y = le.transform(data['CHURN'])

In [None]:
# Show which original CHURN label each encoded class (0 and 1) stands for
label_mapping = le.inverse_transform([0, 1])
label_for_class_0, label_for_class_1 = label_mapping
print('0: ', label_for_class_0)
print('1: ', label_for_class_1)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=5,
    test_size=0.3,
)

#### Use the DataFrameMapper class to declare transformations and variable imputations.

* LabelBinarizer - Converts a categorical variable into a dummy variable (aka binary variable)
* StandardScaler - Standardizes features by removing the mean and scaling to unit variance, z = (x - u) / s

See docs: 
* https://github.com/scikit-learn-contrib/sklearn-pandas
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html#sklearn.preprocessing.LabelBinarizer
* https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [None]:

# Columns that are binarized (categorical) and columns that are standardized (numeric).
# The order below is the order in which the mapper applies them.
categorical_columns = ['Gender', 'Status', 'CarOwner', 'Paymethod', 'MembershipPlan']
numeric_columns = ['Children', 'EstIncome', 'Age', 'AvgMonthlySpend', 'CustomerSupportCalls']

# Each column gets its own transformer instance. default=False tells the mapper
# not to pass through any column that is not listed here.
mapper_good = DataFrameMapper(
    [([column], LabelBinarizer()) for column in categorical_columns]
    + [([column], StandardScaler()) for column in numeric_columns],
    default=False,
)


In [None]:
# Classifier with a fixed seed so repeated runs give the same forest
random_forest = RandomForestClassifier(random_state=5)

# Pipeline = column transformations followed by the estimator.
# NOTE: the estimator step is named 'RandonForestClassifier' (sic); the grid-search
# parameter keys and the named_steps lookups in later cells rely on this exact spelling.
steps = [
    ('mapper', mapper_good),
    ('RandonForestClassifier', random_forest),
]
pipeline = sklearn.pipeline.Pipeline(steps=steps)

# Train on the training split; the result of fit() is kept as `model`
model = pipeline.fit(X_train, y_train)

model

In [None]:
# Show the original label behind each encoded class, to help read the model's output
label_mapping = le.inverse_transform([0, 1])
for prefix, original_label in zip(('0: ', '1: '), label_mapping):
    print(prefix, original_label)

In [None]:
# Predict the held-out test rows with the fitted pipeline
y_prediction = pipeline.predict(X_test)

# Build the classification report comparing true and predicted labels, then print it
report = sklearn.metrics.classification_report(y_true=y_test, y_pred=y_prediction)
print(report)

### Step 6: Tune the model to find the best parameters

In [None]:
# List keys to the model param to tune
#model.get_params().keys()

In [None]:
# Candidate values for the grid search. Keys follow the '<step name>__<parameter>'
# convention, so they must use the pipeline's step name exactly as it is spelled.
max_depth_candidates = [5, 8, 10]
n_estimators_candidates = [150, 180, 200]

parameters = {
    'RandonForestClassifier__max_depth': max_depth_candidates,
    'RandonForestClassifier__n_estimators': n_estimators_candidates,
}

In [None]:
# Exhaustive search over `parameters` for the pipeline, using 3-fold cross-validation
grid_obj = GridSearchCV(model, parameters, cv=3)

In [None]:
# Fit the grid search object to the training data to find the optimal parameters
grid_fit = grid_obj.fit(X_train,y_train)


In [None]:
# Get the estimator that the grid search selected as best
best_clf = grid_fit.best_estimator_

In [None]:
# Predict the held-out test rows with the tuned estimator
best_predictions = best_clf.predict(X_test)

In [None]:
# Classification report of the tuned estimator on the same test split
best_predictions_report = sklearn.metrics.classification_report( y_test, best_predictions )

In [None]:
# Report for the tuned model
print('Results of best fitted model: \n\n',best_predictions_report)

In [None]:
# Report for the default (untuned) model, shown again for side-by-side comparison
print('Results of default model: \n\n',report)

In [None]:
# Fetch the DataFrameMapper step from the default pipeline
m_step=pipeline.named_steps['mapper']

In [None]:
# Display the names the mapper assigned to its output columns
m_step.transformed_names_

In [None]:
# Keep the mapper's output column names; used as labels in the feature-importance plot
features = m_step.transformed_names_

In [None]:
# Get the feature importances of the fitted random forest.
# The forest step itself exposes `feature_importances_` for the whole ensemble.
# Indexing the ensemble (for example `[1]`) selects one single decision tree and
# would report only that tree's importances, so no index is applied here.
importances = pipeline.named_steps['RandonForestClassifier'].feature_importances_

# Positions that sort the importances in ascending order
indices = np.argsort(importances)

In [None]:
# Horizontal bar chart of the feature importances, drawn in the order given by `indices`
fig = plt.figure(1)
ax = fig.gca()
bar_positions = range(len(indices))

ax.set_title('Feature Importances')
ax.barh(bar_positions, importances[indices], color='b', align='center')
# One tick per bar, labelled with the matching feature name
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(features)[indices])
ax.set_xlabel('Relative Importance')

### Step 7: Save Model in the Project


In [None]:
# Create the Watson Machine Learning client from values read from environment
# variables, then make the current project the default location for saved assets.
import os
from ibm_watson_machine_learning import APIClient

token = os.environ['USER_ACCESS_TOKEN']
cluster_url = os.environ['RUNTIME_ENV_APSX_URL']

wml_credentials = {
    "token": token,
    "instance_id": "openshift",
    "url": cluster_url,
    "version": "4.0",
}
client = APIClient(wml_credentials)

# Project ID of this notebook's project, also taken from the environment
project_id = os.environ['PROJECT_ID']
client.set.default_project(project_id)

In [None]:
# Describe the model and save it to the repository. After running this cell,
# the model will be displayed in the Assets view.
# NOTE(review): this stores `pipeline` (the default fit), not the tuned `best_clf`
# from the grid search - confirm that is intended.
# NOTE(review): the software specification name and the model type string below
# presumably have to match the runtime that trained the model - verify.

model_name = 'customer_churn_model'
software_spec_uid = client.software_specifications.get_uid_by_name('default_py3.7_opence')

meta_names = client.repository.ModelMetaNames
metadata = {
    meta_names.NAME: model_name,
    meta_names.SOFTWARE_SPEC_UID: software_spec_uid,
    meta_names.TYPE: "scikit-learn_0.23",
}

stored_model_details = client.repository.store_model(
    pipeline,
    meta_props=metadata,
    training_data=X_train,
    training_target=y_train,
)

**In this version of the notebook we will perform deployment steps in the UI.**

**Author:**  Sidney Phoon and Elena Lowery<br/>
**Date:**  September 2021