In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Create Spark Context

In [2]:
# PySpark :
from pyspark.sql import SparkSession
import os
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"]="/usr/bin/python3"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages \
                                    org.postgresql:postgresql:42.1.1,org.apache.hadoop:hadoop-aws:2.7.1,com.datastax.spark:spark-cassandra-connector_2.11:2.3.0 \
                                    --executor-memory 4G \
                                    pyspark-shell'

spark = SparkSession.builder.appName('Classification with Spark')\
                            .master("spark://localhost:7077")\
                            .getOrCreate()


In [3]:
spark

In [4]:
df = spark.read.parquet("./data/fireservice_data_for_ML.parquet")

In [5]:
df = df.select("features", "FinalPriority")

In [6]:
df.show()

+--------------------+-------------+
|            features|FinalPriority|
+--------------------+-------------+
|(198,[1,5,27,48,8...|            3|
|(198,[1,5,27,48,8...|            3|
|(198,[0,5,27,48,8...|            3|
|(198,[0,5,27,55,8...|            3|
|(198,[1,5,22,55,8...|            3|
|(198,[0,5,27,55,8...|            3|
|(198,[0,7,22,33,8...|            3|
|(198,[1,7,27,33,8...|            3|
|(198,[0,7,27,33,8...|            3|
|(198,[1,10,27,48,...|            3|
|(198,[1,10,27,48,...|            3|
|(198,[1,10,27,48,...|            3|
|(198,[0,10,27,48,...|            3|
|(198,[0,11,27,48,...|            3|
|(198,[0,11,27,48,...|            3|
|(198,[0,9,27,33,8...|            3|
|(198,[1,9,27,33,8...|            3|
|(198,[0,9,27,33,8...|            3|
|(198,[1,5,20,48,8...|            3|
|(198,[1,5,27,48,8...|            3|
+--------------------+-------------+
only showing top 20 rows



In [12]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='FinalPriority',
                        outputCol = 'label',
                        handleInvalid="keep",        # last index will be for invalid values not encountered before
                        stringOrderType='alphabetAsc')

new_df = indexer.fit(df).transform(df)

In [13]:
new_df.show(5)

+--------------------+-------------+-----+
|            features|FinalPriority|label|
+--------------------+-------------+-----+
|(198,[1,5,27,48,8...|            3|  1.0|
|(198,[1,5,27,48,8...|            3|  1.0|
|(198,[0,5,27,48,8...|            3|  1.0|
|(198,[0,5,27,55,8...|            3|  1.0|
|(198,[1,5,22,55,8...|            3|  1.0|
+--------------------+-------------+-----+
only showing top 5 rows



### Test Train Split

In [16]:
train, test = new_df.randomSplit([0.8,0.2])

In [17]:
train = train.repartition(3)
test = test.repartition(3)

### Classification Evaluator

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import accuracy_score
evaluator = BinaryClassificationEvaluator(labelCol="FinalPriority")

In [19]:
evaluator.extractParamMap()

{Param(parent='BinaryClassificationEvaluator_09ed4aecb1ec', name='labelCol', doc='label column name.'): 'FinalPriority',
 Param(parent='BinaryClassificationEvaluator_09ed4aecb1ec', name='metricName', doc='metric name in evaluation (areaUnderROC|areaUnderPR)'): 'areaUnderROC',
 Param(parent='BinaryClassificationEvaluator_09ed4aecb1ec', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction'}

### Logistic Regression

In [54]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='label', standardization=False, maxIter=2)

In [61]:
# Fit the model
lrModel = lr.fit(train)

In [51]:
# predict
predictions = lrModel.transform(test)

In [52]:
predictions.select('label', 'prediction').show(15)

+-----+----------+
|label|prediction|
+-----+----------+
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 15 rows



In [53]:
# areaUnderROC
auc = evaluator.evaluate(predictions)
print("Auc : {}".format(auc))

Auc : 1.0


### Decision Tree Model

In [45]:
### Classification Model
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol="features",
                            labelCol="label")

In [46]:
dt_model = dt.fit(train)

In [47]:
# prdict
predictions = dt_model.transform(test)

# areaUnderROC
auc = evaluator.evaluate(predictions)
print("Auc : {}".format(auc))

Auc : 1.0


In [48]:
from pprint import pprint

pprint(dt_model.toDebugString)

('DecisionTreeClassificationModel (uid=DecisionTreeClassifier_209fd8ebd4d3) of '
 'depth 5 with 31 nodes\n'
 '  If (feature 189 in {0.0})\n'
 '   If (feature 190 in {0.0})\n'
 '    If (feature 48 in {0.0})\n'
 '     If (feature 57 in {0.0})\n'
 '      If (feature 31 in {0.0})\n'
 '       Predict: 1.0\n'
 '      Else (feature 31 not in {0.0})\n'
 '       Predict: 0.0\n'
 '     Else (feature 57 not in {0.0})\n'
 '      Predict: 1.0\n'
 '    Else (feature 48 not in {0.0})\n'
 '     If (feature 27 in {0.0})\n'
 '      Predict: 1.0\n'
 '     Else (feature 27 not in {0.0})\n'
 '      If (feature 185 in {0.0})\n'
 '       Predict: 1.0\n'
 '      Else (feature 185 not in {0.0})\n'
 '       Predict: 0.0\n'
 '   Else (feature 190 not in {0.0})\n'
 '    If (feature 20 in {0.0})\n'
 '     If (feature 80 in {0.0})\n'
 '      If (feature 48 in {0.0})\n'
 '       Predict: 1.0\n'
 '      Else (feature 48 not in {0.0})\n'
 '       Predict: 0.0\n'
 '     Else (feature 80 not in {0.0})\n'
 '      Predict

### Cross Validation Linear Regression

In [56]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5])
             .addGrid(lr.elasticNetParam, [0.1, 0.9])
             .build())


In [57]:
# Create k-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)
# Run cross validations
cvModel = cv.fit(train)

In [58]:
# prdict
predictions = cvModel.transform(test)

# areaUnderROC
auc = evaluator.evaluate(predictions)
print("Auc : {}".format(auc))


Auc : 1.0


In [59]:
cvModel.bestModel.extractParamMap()

{Param(parent='LogisticRegression_f75fbd9a598c', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2,
 Param(parent='LogisticRegression_f75fbd9a598c', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.1,
 Param(parent='LogisticRegression_f75fbd9a598c', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial.'): 'auto',
 Param(parent='LogisticRegression_f75fbd9a598c', name='featuresCol', doc='features column name'): 'features',
 Param(parent='LogisticRegression_f75fbd9a598c', name='fitIntercept', doc='whether to fit an intercept term'): True,
 Param(parent='LogisticRegression_f75fbd9a598c', name='labelCol', doc='label column name'): 'label',
 Param(parent='LogisticRegression_f75fbd9a598c', name='maxIter', doc='maximum number of iterations (>= 0)'): 