# <font color="#FF5733">Let's start building simple ELEMENTS of a PIPELINE for the Orange Churn dataset</font>

In [2]:
# Render the notebook title as a large HTML banner.
# NOTE(review): displayHTML is not imported in this file; presumably it is supplied by the notebook environment -- confirm.
displayHTML("<font size=8>Let's start building simple ELEMENTS of a <font size=8 color='green'>PIPELINE</font> for</font> <font color=orange size=8>Orange Churn dataset</font>")

![How to create a DataFrame](https://blog.cloudera.com/wp-content/uploads/2017/04/Spark.png)

### [MSTC](http://mstc.ssr.upm.es/big-data-track) and MUIT:

## Importing Churn Data

###  Load churn-bigml-80.csv into a DataFrame

In [6]:
%fs ls /FileStore/tables

path,name,size
dbfs:/FileStore/tables/churn_bigml_80-bf1a8.csv,churn_bigml_80-bf1a8.csv,223998


In [7]:
from pyspark.sql import SQLContext

# Wrap the SparkContext `sc` (not defined in this file) to get a DataFrame reader.
sqlContext = SQLContext(sc)

# Read the churn CSV: header='true' takes column names from the first row and
# inferSchema='true' asks the reader to infer each column's type from the data.
CV_data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('/FileStore/tables/churn_bigml_80-bf1a8.csv')
)


In [8]:
# Preview the freshly loaded churn DataFrame.
display(CV_data)

State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
AL,118,510,Yes,No,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False
MA,121,510,No,Yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,False
MO,147,415,Yes,No,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,False
WV,141,415,Yes,Yes,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,False
RI,74,415,No,No,0,187.7,127,31.91,163.4,148,13.89,196.0,94,8.82,9.1,5,2.46,0,False


In [9]:
# Print the column names and the types produced by schema inference at load time.
CV_data.printSchema()

This cell simply illustrates how to apply a UDF (user-defined function) to a column of a Spark DataFrame.

In [11]:
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.functions import UserDefinedFunction

# UDF that turns the Churn label into its string form.  The conversion is done
# explicitly with str() so the resulting text does not depend on how Spark
# coerces a non-string return value into StringType; nulls are passed through
# unchanged instead of becoming the literal string "None".
toStr = UserDefinedFunction(lambda k: None if k is None else str(k), StringType())
CV_data = CV_data.withColumn('Churn', toStr(CV_data['Churn']))

# Alternative kept for reference: map the label strings directly to 0.0 / 1.0.
#binary_map = {'Yes':1.0, 'No':0.0, 'True':1.0, 'False':0.0}
#toNum = UserDefinedFunction(lambda k: binary_map[k], DoubleType())
#CV_data = CV_data.withColumn('Churn', toNum(CV_data['Churn']))

In [12]:
# Print the schema again to check the type of the Churn column after the UDF was applied.
CV_data.printSchema()

## Spark: ML Pipelines
https://spark.apache.org/docs/latest/ml-pipeline.html

##  <font color="#e38009">Transformer A: StringIndexer</font>
  https://spark.apache.org/docs/latest/ml-features.html#stringindexer

<font face="calibri" size=3.5>StringIndexer converts the string values of a column into categorical indices (via a look-up of the distinct values), which can then be used by the machine learning algorithms in the ml library.</font>

***Notice we provide the input column name and the output column name as parameters at the time of initialization of the StringIndexer.***

In [15]:
from pyspark.ml.feature import StringIndexer

# Index labels: StringIndexer encodes the string column Churn
# ("True" / "False" strings, NOT booleans) into a column of label
# indices named indexedChurn.
stringindexer = StringIndexer(outputCol='indexedChurn', inputCol='Churn')

# fit() produces the indexer model from CV_data; transform() then appends
# the indexedChurn column to the same DataFrame.
model = stringindexer.fit(CV_data)
dataframe_transformedA = model.transform(CV_data)

In [16]:
# Show the DataFrame with the new indexedChurn column appended by the StringIndexer model.
display(dataframe_transformedA)

State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn,indexedChurn
KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False,0.0
OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False,0.0
NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False,0.0
OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False,0.0
OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False,0.0
AL,118,510,Yes,No,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False,0.0
MA,121,510,No,Yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,False,0.0
MO,147,415,Yes,No,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,False,0.0
WV,141,415,Yes,Yes,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,False,0.0
RI,74,415,No,No,0,187.7,127,31.91,163.4,148,13.89,196.0,94,8.82,9.1,5,2.46,0,False,0.0


##  <font color="#e38009">Transformer B: VectorAssembler</font>

### ...after “feature engineering” … the feature engineering results are then combined using the VectorAssembler, before being passed to ML Estimator

###  For simplicity, we first drop all columns that are:
* categorical
* numerical and highly correlated with another column

### This will be our list with predictors

In [20]:
# Names of the numeric columns used as model inputs.  Kept as a list (not a
# tuple) to match how the surrounding text describes it and how it is passed
# to VectorAssembler(inputCols=...).
predictors = [
    'Number vmail messages',
    'Total day minutes',
    'Total day calls',
    'Total eve minutes',
    'Total eve calls',
    'Total night minutes',
    'Total night calls',
    'Total intl minutes',
    'Total intl calls',
    'Customer service calls',
]

#### Notice that we give *VectorAssembler* a list of input columns (they MUST BE NUMERIC!) and the name of an output column, in which all of them are assembled into a single vector

In [22]:
from pyspark.ml.feature import VectorAssembler

# Combine every predictor column into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=predictors)

# Assemble the features and keep only the two columns used downstream:
# the indexed label and the feature vector.
dataframe_transformedB = (
    assembler
    .transform(dataframe_transformedA)
    .select('indexedChurn', 'features')
)


In [23]:
# Inspect the first five (indexedChurn, features) rows.
dataframe_transformedB.take(5)

##  <font color="#FF5733">Estimators</font>

<font face="calibri" size=3.5>An Estimator abstracts the concept of a learning algorithm, or of any algorithm that fits or trains on data.</font>

Technically, an Estimator implements a method fit(), which accepts a DataFrame and produces a Model, which is a Transformer. <br><br>
***For example, a learning algorithm such as LogisticRegression is an Estimator, and calling fit() trains a LogisticRegressionModel, which is a Model and hence a Transformer.***

In [25]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the decision-tree estimator: it reads the 'features' vector,
# learns the 'indexedChurn' label and is limited to maxDepth=2.
dTree_algorithm = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='indexedChurn',
    maxDepth=2,
)

In [26]:
# Fit the estimator on the assembled training data; the result is the trained model.
dTree_model=dTree_algorithm.fit(dataframe_transformedB)

In [27]:
# Print a textual description of the learned tree, using the model's public
# toDebugString property rather than the private _call_java helper.
print(dTree_model.toDebugString)

##  <font color="#e38009">Transformers include learned models</font>

***E.g., a learned model takes a DataFrame, reads the column containing feature vectors, predicts the label for each feature vector, and outputs a new DataFrame with the predicted labels appended as a column.***

In [29]:
# Use the trained model as a transformer to score the same data it was fitted on.
predictions=dTree_model.transform(dataframe_transformedB)

In [30]:
# Print the schema of the scored DataFrame to see which columns the model appended.
predictions.printSchema()

In [31]:
# Show the scored rows.
display(predictions)

indexedChurn,features,rawPrediction,probability,prediction
0.0,"List(1, 10, List(), List(25.0, 265.1, 110.0, 197.4, 99.0, 244.7, 91.0, 10.0, 3.0, 1.0))","List(1, 2, List(), List(35.0, 6.0))","List(1, 2, List(), List(0.8536585365853658, 0.14634146341463414))",0.0
0.0,"List(1, 10, List(), List(26.0, 161.6, 123.0, 195.5, 103.0, 254.4, 103.0, 13.7, 3.0, 1.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0
0.0,"List(1, 10, List(), List(0.0, 243.4, 114.0, 121.2, 110.0, 162.6, 104.0, 12.2, 5.0, 0.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0
0.0,"List(1, 10, List(), List(0.0, 299.4, 71.0, 61.9, 88.0, 196.9, 89.0, 6.6, 7.0, 2.0))","List(1, 2, List(), List(35.0, 91.0))","List(1, 2, List(), List(0.2777777777777778, 0.7222222222222222))",1.0
0.0,"List(1, 10, List(), List(0.0, 166.7, 113.0, 148.3, 122.0, 186.9, 121.0, 10.1, 3.0, 3.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0
0.0,"List(1, 10, List(), List(0.0, 223.4, 98.0, 220.6, 101.0, 203.9, 118.0, 6.3, 6.0, 0.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0
0.0,"List(1, 10, List(), List(24.0, 218.2, 88.0, 348.5, 108.0, 212.6, 118.0, 7.5, 7.0, 3.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0
0.0,"List(1, 10, List(), List(0.0, 157.0, 79.0, 103.1, 94.0, 211.8, 96.0, 7.1, 6.0, 0.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0
0.0,"List(1, 10, List(), List(37.0, 258.6, 84.0, 222.0, 111.0, 326.4, 97.0, 11.2, 5.0, 0.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0
0.0,"List(1, 10, List(), List(0.0, 187.7, 127.0, 163.4, 148.0, 196.0, 94.0, 9.1, 5.0, 0.0))","List(1, 2, List(), List(2111.0, 188.0))","List(1, 2, List(), List(0.9182253153545019, 0.08177468464549804))",0.0


In [32]:
import pandas as pd

# Fetch the first five prediction rows and show them as a pandas DataFrame,
# reusing the Spark DataFrame's column names.
pd.DataFrame(predictions.take(5), columns=predictions.columns)

## <font color="#938882">Model Evaluation</font>

### ***For evaluation we reuse the training CSV file, so the score below measures training (in-sample) performance, i.e. the train error.***

In [34]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [35]:
# Evaluator that computes the area under the ROC curve, comparing the
# 'rawPrediction' column against the 'indexedChurn' label.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    rawPredictionCol='rawPrediction',
    labelCol='indexedChurn',
)

In [36]:
# NOTE: despite its name, this variable holds the area under the ROC curve
# (the evaluator is configured with metricName='areaUnderROC'), not accuracy.
accuracy=evaluator.evaluate(predictions)

In [37]:
# Display the metric value computed by the evaluator.
accuracy

In [38]:
# Since dTree_model is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# decision-tree instance.
print("dTree_model was fit using parameters: ")
print(dTree_model.extractParamMap())

In [39]:
# List only the keys (the Param objects) of the model's parameter map.
dTree_model.extractParamMap().keys()

In [40]:
# Display the maxDepth attribute of the fitted model (used as a paramMap key in a later cell).
dTree_model.maxDepth

In [41]:
# Parameters may alternatively be specified with a Python dictionary (a paramMap).
# The keys are Param objects of the estimator being fitted, so we use
# dTree_algorithm.maxDepth here rather than the Param of an already fitted model.
paramMap = {dTree_algorithm.maxDepth: 7}

# Now learn a new model using the paramMap parameters.
# Values in paramMap take precedence over the ones set on dTree_algorithm
# (which was constructed with maxDepth=2).
dTree_model2 = dTree_algorithm.fit(dataframe_transformedB, paramMap)

# Show the parameters of the new model to confirm the override took effect.
dTree_model2.extractParamMap()

In [42]:
# Score the training data with the second tree and evaluate it with the
# same evaluator used for the first model.
predictions2 = dTree_model2.transform(dataframe_transformedB)
accuracy2 = evaluator.evaluate(predictions2)

print(accuracy2)



In [43]:
# Print a textual description of the second tree, using the model's public
# toDebugString property rather than the private _call_java helper.
print(dTree_model2.toDebugString)

##  Model selection via cross-validation

In this example we will use CrossValidator to select from a grid of parameters in the Tree model

In [45]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Build the search grid: one candidate per decision-tree maxDepth from 2 to 7.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dTree_algorithm.maxDepth, list(range(2, 8)))
    .build()
)

In [46]:
# Set up 3-fold cross validation over the maxDepth grid, scored with the
# evaluator defined earlier.  A fixed seed makes the fold assignment, and
# therefore the selected model, reproducible across runs.
crossval = CrossValidator(estimator=dTree_algorithm,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)

In [47]:
# Run the cross-validated grid search on the assembled training data.
Cross_res=crossval.fit(dataframe_transformedB)

In [48]:
# Print the model selected by cross-validation.
print(Cross_res.bestModel)

In [49]:
# Print a textual description of the selected tree, using the model's public
# toDebugString property rather than the private _call_java helper.
print(Cross_res.bestModel.toDebugString)

In [50]:
# Fetch the best model found by cross-validation so we can make predictions with it:
Best_tree_model = Cross_res.bestModel
print(Best_tree_model)

In [51]:
# Score the training data with the cross-validated best model.
predictions_CV=Best_tree_model.transform(dataframe_transformedB)

In [52]:
# Show the first five rows scored by the best model as a pandas DataFrame.
# Column names are taken from predictions_CV itself, so the header always
# matches the frame being displayed.
pd.DataFrame(predictions_CV.take(5), columns=predictions_CV.columns)

In [53]:
# Evaluate the best model's predictions with the same evaluator as before
# and print the resulting score.
accuracy_CV = evaluator.evaluate(predictions_CV)
print(accuracy_CV)

## Now let's create a PIPELINE! see MSTC_Pipeline_PySpark_2.ipynb