In [1]:
# Render the notebook's title banner as HTML.
# NOTE(review): displayHTML is not defined or imported in this notebook; presumably
# it is the built-in provided by the Databricks notebook runtime -- confirm.
displayHTML("<font size=8 color=black> Let's start building a PIPELINE using the elements in<br> <font color=  #FA5733> MSTC_Pipeline_PySpark_1.ipynb")

### [MSTC](http://mstc.ssr.upm.es/big-data-track) and MUIT:

## Importing Churn Data

###  Load churn-bigml-80.csv into a DataFrame

In [4]:
# Load the churn-bigml-80 training CSV into a Spark DataFrame.
# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext injected by the notebook runtime -- confirm before running elsewhere.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

CV_data = sqlContext.read.load('/FileStore/tables/churn_bigml_80-bf1a8.csv',
                               format='com.databricks.spark.csv',
                               header='true',
                               inferSchema='true')

# Imports retained from the original cell; other cells may rely on these names.
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.functions import UserDefinedFunction

# The 'Churn' label is converted to a string column so it can be fed to the
# StringIndexer stage. A native column cast is used instead of an identity
# Python UDF: it yields the same string values without shipping every row
# through a Python worker.
CV_data = CV_data.withColumn('Churn', CV_data['Churn'].cast('string'))


## Spark: ML Pipelines
https://spark.apache.org/docs/2.2.0/ml-pipeline.html

##  <font color= #e38009> Transformer A: StringIndexer

<font face="calibri" size=3.5>StringIndexer encodes a column of string values as a column of numeric category indices, which the machine learning algorithms in the Spark ML library can then consume.

##  <font color= #e38009> Transformer B: VectorAssembler

<font face="calibri" size=3.5>After feature engineering, the resulting feature columns are combined into a single vector column with VectorAssembler before being passed to the ML Estimator.

***Note: the input is a list of columns (which MUST BE NUMERIC!) and the output is a single vector column that assembles all of them.***

### <font color= #C70039 > list with predictors to Assemble

In [9]:
# Names of the numeric columns that are assembled into the feature vector.
# The order here is the order of the entries in the assembled vector.
predictors = (
    'Number vmail messages',
    # usage per time-of-day band: minutes, then calls
    'Total day minutes',
    'Total day calls',
    'Total eve minutes',
    'Total eve calls',
    'Total night minutes',
    'Total night calls',
    # international usage
    'Total intl minutes',
    'Total intl calls',
    'Customer service calls',
)

##  <font color=#FF5733> Estimators

<font face="calibri" size=3.5>
An Estimator abstracts the concept of a learning algorithm or any algorithm that fits or trains on data. 

Technically, an Estimator implements a method fit(), which accepts a DataFrame and produces a Model, which is a Transformer. <br><br>
***For example, a learning algorithm such as LogisticRegression is an Estimator, and calling fit() trains a LogisticRegressionModel, which is a Model and hence a Transformer.***

In [11]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Stage 1: map the string 'Churn' label to a numeric 'indexedLabel' column.
stringindexer = StringIndexer(
    inputCol='Churn',
    outputCol='indexedLabel',
)

# Stage 2: pack the predictor columns into a single 'features' vector column.
assembler = VectorAssembler(
    inputCols=predictors,
    outputCol='features',
)

# Stage 3: configure (not yet train) a decision tree classifier that reads the
# columns produced by the two stages above. Nothing is fitted in this cell.
dTree_algorithm = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='indexedLabel',
    maxDepth=2,
)


# Chain indexers and tree in a Pipeline

In [13]:
from pyspark.ml import Pipeline

# Chain label indexing, feature assembly and the decision tree, in that order,
# into a single estimator.
pipeline_stages = [stringindexer, assembler, dTree_algorithm]
pipeline = Pipeline(stages=pipeline_stages)

## <font color=#938882>Model Evaluation using:

* Hyperparameters selection
* Cross-validation

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Score candidate models by area under the ROC curve, computed from the
# 'rawPrediction' column against the indexed label.
evaluator = BinaryClassificationEvaluator(
    labelCol='indexedLabel',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)

# Hyperparameter grid: search over the decision tree's maxDepth only.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dTree_algorithm.maxDepth, [2, 3, 4, 5, 6, 7])
    .build()
)

# 3-fold cross-validation over the whole pipeline.
# A fixed seed makes the fold assignment reproducible; without it the folds
# (and therefore the reported metric and possibly the selected maxDepth)
# change on every run.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [16]:
# Run the cross-validated search over the maxDepth grid on the training data.
CrossvalModel = crossval.fit(CV_data)

# <font face="calibri" color=#d63de2> Evaluate TEST DATA

##  <font color= #e38009> Transformer : Making predictions with the TRAINED model

### <font color=red>Evaluation on TEST data

In [20]:
%sh ls /dbfs/FileStore/tables
# Lists the files under /dbfs/FileStore/tables (presumably to look up the exact
# uploaded file names used in the read.load calls -- confirm).

In [21]:
# Load the churn-bigml-20 test CSV into a Spark DataFrame, using the same
# reader options as for the training data.
Test_data = sqlContext.read.load('/FileStore/tables/churn_bigml_20-55239.csv',
                                 format='com.databricks.spark.csv',
                                 header='true',
                                 inferSchema='true')

# Imports retained from the original cell; other cells may rely on these names.
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.functions import UserDefinedFunction

# The 'Churn' label is converted to a string column so the fitted pipeline's
# StringIndexer stage sees the same kind of values as during training.
# A native column cast is used instead of an identity Python UDF: it yields the
# same string values without shipping every row through a Python worker.
Test_data = Test_data.withColumn('Churn', Test_data['Churn'].cast('string'))

In [22]:
# Apply the cross-validated model to the test data and score the predictions.
predictions_Test = CrossvalModel.transform(Test_data)

# NOTE: despite the variable name, this value is whatever metric `evaluator`
# is configured with (areaUnderROC in this notebook), not classification accuracy.
accuracy_Test = evaluator.evaluate(predictions_Test)
print(accuracy_Test)

In [23]:
# Disabled alternative, kept for reference: fit the plain pipeline (no
# cross-validation) on the training data, then predict and evaluate on the
# test data.
#pipelineModel=pipeline.fit(CV_data)
#predictions_Test = pipelineModel.transform(Test_data)
#accuracy_Test=evaluator.evaluate(predictions_Test)

#print(accuracy_Test)

### <font color=red>Evaluation on <font color=green> TRAIN data

In [25]:
# Apply the cross-validated model back to the training data and score it,
# for comparison with the test-set score.
predictions_Train = CrossvalModel.transform(CV_data)

# NOTE: despite the variable name, this value is whatever metric `evaluator`
# is configured with (areaUnderROC in this notebook), not classification accuracy.
accuracy_Train = evaluator.evaluate(predictions_Train)
print(accuracy_Train)

In [26]:
# Disabled alternative, kept for reference: fit the plain pipeline (no
# cross-validation) on the training data and evaluate it on that same data.
#pipelineModel=pipeline.fit(CV_data)
#predictions_Train = pipelineModel.transform(CV_data)
#accuracy_Train=evaluator.evaluate(predictions_Train)

#print(accuracy_Train)

# <font color= #9e9b9e >..... ANALYZE BEST MODEL

In [28]:
# Best model selected by cross-validation. The cross-validated estimator is
# the 3-stage pipeline, so index 2 is the decision tree stage
# (after the label indexer and the vector assembler).
Best_tree_model = CrossvalModel.bestModel
fitted_tree = Best_tree_model.stages[2]
print(fitted_tree)

In [29]:
# Print the full text description of the fitted decision tree (stage index 2
# of the best pipeline). Uses the model's public `toDebugString` property
# rather than the private `_call_java` helper.
print(Best_tree_model.stages[2].toDebugString)