# Telco Customer Churn for ICP4D

We'll use this notebook to create a machine learning model to predict customer churn.

# 1.0 Install required packages

In [None]:
# Show the packages (and versions) currently installed in this environment.
!pip freeze

In [None]:
# Install or upgrade the Watson Machine Learning client for the current user;
# `tail -n 1` keeps only the last line of pip's output.
!pip install --user watson-machine-learning-client --upgrade | tail -n 1

# 2.0 Load and Clean data
We'll load our data as a pandas data frame.

* Highlight the cell below by clicking it.
* Click the `10/01` "Find data" icon in the upper right of the notebook.
* To load the virtualized data created in Exercise-1, choose the `Remote` tab.
* Choose your virtualized data (i.e. `User<xyz>.billingProductCustomers`), click `Insert to code` and choose `Insert Pandas DataFrame`.
* The code to bring the data into the notebook environment and create a Pandas DataFrame will be added to the cell below.
* Run the cell


In [None]:
# Place cursor below and insert the Pandas DataFrame for the Telco churn data
# Make sure the variable is named `df1` for the line `df1 = pd.read_sql(query, con=conn)`


We'll use the Pandas naming convention `df` for our DataFrame

In [None]:
# Alias the inserted DataFrame (`df1`) as `df`; this binds a second name to the
# same object, it does not copy the data.
df = df1

### 2.1 Drop the `customerID` feature (column)

In [None]:
# Remove the customerID column, then preview the first five rows.
df = df.drop(columns=['customerID'])
df.head()

### 2.2 Examine the data types of the features

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

### 2.3 Any NaN values should be replaced to create a more accurate model. Prior examination shows NaN values for `TotalCharges`

In [None]:
# True if at least one cell anywhere in the DataFrame is missing (NaN/None)
df.isna().values.any()

In [None]:
# Handle missing values in the TotalCharges column by filling them with the
# column's most frequent value.
import numpy as np
from sklearn.impute import SimpleImputer

# SimpleImputer matches missing entries by value. pandas stores missing numbers
# as np.nan, so the placeholder must be np.nan; the string "NaN" was only
# understood by the older sklearn.preprocessing.Imputer.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

# Address the column by name rather than by position so the cell does not
# depend on the column order of the loaded table.
# fit_transform works on 2-D input/output, hence the double brackets on the way
# in and ravel() to flatten the single imputed column on the way out.
df['TotalCharges'] = imp.fit_transform(df[['TotalCharges']]).ravel()

In [None]:
# Re-check for missing cells anywhere in the DataFrame
df.isna().values.any()

# 3.0 Create a model

In [None]:
import json

import pandas as pd
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise create one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Convert the pandas DataFrame into a Spark DataFrame and show its first row.
df_data = spark.createDataFrame(df)
df_data.head()

### 3.1 Split the data into training and test sets

In [None]:
spark_df = df_data

# 80% / 20% random split, seeded with 24.
train_data, test_data = spark_df.randomSplit(weights=[0.8, 0.2], seed=24)

# Report how many rows ended up in each subset.
for caption, subset in (("Number of records for training: ", train_data),
                        ("Number of records for evaluation: ", test_data)):
    print(caption + str(subset.count()))

### 3.2 Examine the Spark DataFrame Schema
Look at the data types to determine requirements for feature engineering

In [None]:
# Print the column names and Spark data types of the DataFrame.
spark_df.printSchema()

### 3.3 Use StringIndexer to encode a string column of labels to a column of label indices

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, Model


def _make_indexer(column):
    """Return an unfitted StringIndexer that writes `column`'s indices to `<column>_IX`."""
    return StringIndexer(inputCol=column, outputCol=column + '_IX')


# One indexer per string feature column; each is fitted later as a pipeline stage.
si_gender = _make_indexer('gender')
si_Partner = _make_indexer('Partner')
si_Dependents = _make_indexer('Dependents')
si_PhoneService = _make_indexer('PhoneService')
si_MultipleLines = _make_indexer('MultipleLines')
si_InternetService = _make_indexer('InternetService')
si_OnlineSecurity = _make_indexer('OnlineSecurity')
si_OnlineBackup = _make_indexer('OnlineBackup')
si_DeviceProtection = _make_indexer('DeviceProtection')
si_TechSupport = _make_indexer('TechSupport')
si_StreamingTV = _make_indexer('StreamingTV')
si_StreamingMovies = _make_indexer('StreamingMovies')
si_Contract = _make_indexer('Contract')
si_PaperlessBilling = _make_indexer('PaperlessBilling')
si_PaymentMethod = _make_indexer('PaymentMethod')


In [None]:
# Fit the label indexer right away: its learned label list is needed to build
# the converter that maps predicted indices back to the original Churn strings.
label_indexer = StringIndexer(inputCol="Churn", outputCol="label")
si_Label = label_indexer.fit(spark_df)

label_converter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=si_Label.labels,
)

### 3.4 Create a single vector

In [None]:
# Columns that are combined, in this order, into the single "features" vector.
feature_columns = [
    'gender_IX',
    'SeniorCitizen',
    'Partner_IX',
    'Dependents_IX',
    'PhoneService_IX',
    'MultipleLines_IX',
    'InternetService_IX',
    'OnlineSecurity_IX',
    'OnlineBackup_IX',
    'DeviceProtection_IX',
    'TechSupport_IX',
    'StreamingTV_IX',
    'StreamingMovies_IX',
    'Contract_IX',
    'PaperlessBilling_IX',
    'PaymentMethod_IX',
    'TotalCharges',
    'MonthlyCharges',
]

va_features = VectorAssembler(inputCols=feature_columns, outputCol="features")

### 3.5 Create a pipeline, and fit a model using RandomForestClassifier 
Assemble all the stages into a pipeline. We don't expect the classes to be linearly separable, so we'll use a RandomForestClassifier, which trains an ensemble of decision trees and combines their votes.

In [None]:
classifier = RandomForestClassifier(featuresCol="features")

# Stage order: index every string feature, index the label, assemble the
# feature vector, classify, then map predicted indices back to label strings.
feature_indexers = [
    si_gender,
    si_Partner,
    si_Dependents,
    si_PhoneService,
    si_MultipleLines,
    si_InternetService,
    si_OnlineSecurity,
    si_OnlineBackup,
    si_DeviceProtection,
    si_TechSupport,
    si_StreamingTV,
    si_StreamingMovies,
    si_Contract,
    si_PaperlessBilling,
    si_PaymentMethod,
]
pipeline = Pipeline(
    stages=feature_indexers + [si_Label, va_features, classifier, label_converter]
)

# Fit all stages on the training subset only.
model = pipeline.fit(train_data)

In [None]:
# Score the held-out test subset with the fitted pipeline.
predictions = model.transform(test_data)

# Build the ROC curve from the classifier's continuous scores ("rawPrediction").
# Passing the hard 0/1 "prediction" column instead collapses the curve to a
# single operating point, so the reported area is not the model's real AUC.
evaluatorDT = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                            labelCol="label",
                                            metricName="areaUnderROC")
area_under_curve = evaluatorDT.evaluate(predictions)

print("areaUnderROC = %g" % area_under_curve)

# 4.0 Save the model to Cloud Pak for Data

In [None]:
from dsx_ml.ml import save

In [None]:
# Name under which the model is saved.
MODEL_NAME = "telco churn model"

In [None]:
# Save the fitted pipeline model under MODEL_NAME, passing the test subset
# along with the algorithm type and a description.
save(name=MODEL_NAME,
    model=model,
    test_data = test_data,
    algorithm_type='Classification',
    description='This is a SparkML Model to Classify Telco Customer Churn Risk')

### 4.1 Write the test data without label to a .csv so that we can later use it for batch scoring

In [None]:
# Test rows without the Churn label, written as comma-separated values with no index column.
write_score_CSV = test_data.toPandas().drop(columns=['Churn'])
write_score_CSV.to_csv(
    '../datasets/TelcoCustomerSparkMLBatchScore.csv',
    index=False,
)

### 4.2 Write the test data to a .csv so that we can later use it for evaluation

In [None]:
# Full test rows (label included), written as comma-separated values with no index column.
write_eval_CSV = test_data.toPandas()
write_eval_CSV.to_csv(
    '../datasets/TelcoCustomerSparkMLEval.csv',
    index=False,
)

# 5.0 Deploy the model to Watson Machine Learning

### 5.1 Set up Watson Machine Learning client to communicate with Cloud Pak for Data

In [None]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

In [None]:
import os

# Connection details for the Watson Machine Learning client.
# Each value can be supplied through an environment variable so that real
# credentials do not have to be saved inside the notebook. When a variable is
# not set, the placeholder default is used and must be replaced with a real value.
wml_credentials = {
  "url": os.environ.get("WML_URL", "https://169.x.y.z"),
  "instance_id": os.environ.get("WML_INSTANCE_ID", "icp"),
  "username": os.environ.get("WML_USERNAME", "username"),
  "password": os.environ.get("WML_PASSWORD", "password")
}

client = WatsonMachineLearningAPIClient(wml_credentials)
print(client)

### 5.2 List existing deployments and models

In [None]:
# List the models in the repository, then the existing deployments.
client.repository.list_models()
client.deployments.list()

### 5.3 Save the model to the Watson Machine Learning repository

In [None]:
# Metadata attached to the stored model: author and display name.
meta_names = client.repository.ModelMetaNames
model_props = {
    meta_names.AUTHOR_NAME: "IBM",
    meta_names.NAME: MODEL_NAME,
}

# Store the fitted model together with its pipeline definition and training data.
stored_model = client.repository.store_model(
    model=model,
    pipeline=pipeline,
    meta_props=model_props,
    training_data=train_data,
)

### 5.4 Deploy the model to Watson Machine Learning

In [None]:
# Look up the unique id of the model stored above; it is needed to create a deployment.
model_uid = client.repository.get_model_uid(stored_model)
print(model_uid)

In [None]:
# Create a deployment for the stored model, then list models and deployments
# again so the new entry can be seen.
created_deployment = client.deployments.create(model_uid, name=MODEL_NAME + " deployment")
client.repository.list_models()
client.deployments.list()

## Congratulations, you have created a model based on customer churn data, and deployed it to Watson Machine Learning!