In [1]:
from pyspark.sql import SQLContext
# `sc` is not defined anywhere in this notebook; presumably the kernel
# injects the SparkContext -- confirm before running outside that environment.
sqlContext = SQLContext(sc)

# Training split, read through the spark-csv data source.  The first row is
# used as the header and column types are inferred from the data.  The frame
# is cached because cross-validation re-reads it for every fold/parameter pair.
CV_data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('/resources/data/MSTC/churn-bigml-80.csv')
    .cache()
)


In [2]:
# Columns fed to the VectorAssembler, in the order they appear in the
# assembled feature vector.
predictors = (
    # voice-mail usage
    'Number vmail messages',
    # per-period usage (minutes, calls)
    'Total day minutes', 'Total day calls',
    'Total eve minutes', 'Total eve calls',
    'Total night minutes', 'Total night calls',
    'Total intl minutes', 'Total intl calls',
    # support contact
    'Customer service calls',
    # indexed plan flags (outputs of the StringIndexer stages)
    'IntlPlan',
    'VmailPlan',
)

In [3]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

from pyspark.ml.classification import RandomForestClassifier

# Index labels, adding metadata to the label column.
# 'Churn' -> 'indexedLabel' is the column the classifier and evaluator use.
stringindexer = StringIndexer(inputCol='Churn',
                              outputCol='indexedLabel')

# Index the two categorical plan columns.  Their outputs ('IntlPlan',
# 'VmailPlan') are the names listed at the end of `predictors`.
stringindexerIntlPlan = StringIndexer(inputCol='International plan',
                                      outputCol='IntlPlan')

stringindexerVmailPlan = StringIndexer(inputCol='Voice mail plan',
                                       outputCol='VmailPlan')

# Pack every predictor column into the single 'features' vector column.
# `inputCols` is documented as a list of strings, so pass a list explicitly
# rather than relying on tuple conversion.
assembler = VectorAssembler(inputCols=list(predictors), outputCol='features')

# Random forest estimator (fitted later as the last pipeline stage).
# The seed is pinned so that re-running the notebook on a fresh kernel
# rebuilds the same forests; without it the sampling seed is not fixed by
# this notebook and results can drift between runs.
rf_algorithm = RandomForestClassifier(labelCol='indexedLabel',
                                      featuresCol='features',
                                      seed=42)

# Train a DecisionTree model
#dTree_algorithm = DecisionTreeClassifier(maxDepth=2,
#                                        labelCol='indexedLabel', featuresCol='features')


In [4]:
from pyspark.ml import Pipeline

# Stage order matters: the three indexers must run before the assembler,
# which reads the 'IntlPlan' / 'VmailPlan' columns they produce, and the
# classifier consumes the assembler's 'features' column.
pipeline = Pipeline(stages=[
    stringindexer,
    stringindexerIntlPlan,
    stringindexerVmailPlan,
    assembler,
    rf_algorithm,
])

#                            assembler, dTree_algorithm])

In [5]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Model-selection metric: area under the ROC curve, computed from the
# 'rawPrediction' column against the 'indexedLabel' column.
evaluator = BinaryClassificationEvaluator(
    labelCol='indexedLabel',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)


# Hyper-parameter grid for the random forest: 4 values of numTrees crossed
# with 6 values of maxDepth (24 candidate settings).
# Decision-tree alternative, kept for reference:
#paramGrid = ParamGridBuilder().addGrid(dTree_algorithm.maxDepth, [2,3,4,5,6,7]).build()
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf_algorithm.numTrees, [100, 200, 400, 800])
    .addGrid(rf_algorithm.maxDepth, [2, 3, 4, 5, 6, 7])
    .build()
)

# 3-fold cross-validation over the whole pipeline: every parameter map in
# `paramGrid` is fitted and scored with `evaluator`.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

In [6]:
from time import time

# Run the full grid search on the training data and report how long the
# fit took (wall-clock seconds, rounded to milliseconds).
t0 = time()
CrossvalModel = crossval.fit(CV_data)
tt = time() - t0

print("Task completed in {} seconds".format(round(tt, 3)))


### <font color=red>Evaluation on TEST data</font>

In [8]:
# Test split, read with the same spark-csv settings as the training data
# (header row, inferred column types).  Not cached.
Test_data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('/resources/data/MSTC/churn-bigml-20.csv')
)

In [9]:
# Apply the cross-validated model to the test split and score it.
# NOTE: despite its name, `accuracy_Test` holds the area under the ROC
# curve -- `evaluator` is configured with metricName='areaUnderROC'.
predictions_Test = CrossvalModel.transform(Test_data)
accuracy_Test = evaluator.evaluate(predictions_Test)

print(accuracy_Test)

In [10]:
# Confusion matrix on the test split: actual 'Churn' value (rows) against
# the model's 'prediction' (columns).
predictions_Test.crosstab('Churn', 'prediction').show()

In [11]:
# make predictions and evaluate result
#pipelineModel=pipeline.fit(CV_data)
#predictions_Test = pipelineModel.transform(Test_data)
#accuracy_Test=evaluator.evaluate(predictions_Test)

#print(accuracy_Test)

### <font color=red>Evaluation on</font> <font color=green>TRAIN data</font>

In [13]:
# Apply the cross-validated model back to the training data and score it,
# for comparison with the test-split score.
# NOTE: despite its name, `accuracy_Train` holds the area under the ROC
# curve -- `evaluator` is configured with metricName='areaUnderROC'.
predictions_Train = CrossvalModel.transform(CV_data)
accuracy_Train = evaluator.evaluate(predictions_Train)

print(accuracy_Train)

In [14]:
# Confusion matrix on the training data: actual 'Churn' value (rows)
# against the model's 'prediction' (columns).
predictions_Train.crosstab('Churn', 'prediction').show()

In [15]:
#pipelineModel=pipeline.fit(CV_data)
#predictions_Train = pipelineModel.transform(CV_data)
#accuracy_Train=evaluator.evaluate(predictions_Train)

#print(accuracy_Train)