In [1]:
import findspark
findspark.init('/usr/hdp/current/spark2-client')
findspark.find()

'/usr/hdp/current/spark2-client'

In [2]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("yarn")\
.appName("ch24_MLOverview").getOrCreate()

In [3]:
sc = spark.sparkContext

In [4]:
for x in sc._conf.getAll():
    if "/proxy/" in x[1]:
        print(x[1])

http://rm01.itversity.com:19288/proxy/application_1533622723243_15474


In [5]:
from pyspark.ml.linalg import Vectors

In [6]:
denseVec = Vectors.dense(1.0, 2.0, 3.0)

In [7]:
type(denseVec)

pyspark.ml.linalg.DenseVector

In [8]:
denseVec

DenseVector([1.0, 2.0, 3.0])

In [9]:
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)

In [10]:
sparseVec

SparseVector(3, {1: 2.0, 2: 3.0})

In [11]:
# COMMAND ----------
df = spark.read.json("/user/kranthidr/dataSets/spark-guide/simple-ml")
df.orderBy("value2").show()

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
|green| bad|    16|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green| bad|    16|14.386294994851129|
|green|good|    12|14.386294994851129|
|  red|good|    35|14.386294994851129|
|  red|good|    35|14.386294994851129|
|  red| bad|     2|14.386294994851129|
|  red| bad|    16|14.386294994851129|
|  red| bad|    16|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|green|good|     1|14.386294994851129|
|green|good|    12|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|  red|good|    35|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|  red| bad|    16|14.386294994851129|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 20 rows



In [12]:
df.count()

110

In [13]:
# COMMAND ----------
from pyspark.ml.feature import RFormula

In [14]:
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")

In [15]:
print(supervised.explainParams())

featuresCol: features column name. (default: features)
forceIndexLabel: Force to index label whether it is numeric or string (default: False)
formula: R model formula (current: lab ~ . + color:value1 + color:value2)
handleInvalid: how to handle invalid entries. Options are 'skip' (filter out rows with invalid values), 'error' (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels). (default: error)
labelCol: label column name. (default: label)
stringIndexerOrderType: How to order categories of a string feature column used by StringIndexer. The last category after ordering is dropped when encoding strings. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. The default value is frequencyDesc. When the ordering is set to alphabetDesc, RFormula drops the same category as R when encoding strings. (default: frequencyDesc)


In [16]:
# COMMAND ----------
fittedRF = supervised.fit(df)

In [17]:
preparedDF = fittedRF.transform(df)
preparedDF.show(5, False)

+-----+----+------+------------------+----------------------------------------------------------------------+-----+
|color|lab |value1|value2            |features                                                              |label|
+-----+----+------+------------------+----------------------------------------------------------------------+-----+
|green|good|1     |14.386294994851129|(10,[1,2,3,5,8],[1.0,1.0,14.386294994851129,1.0,14.386294994851129])  |1.0  |
|blue |bad |8     |14.386294994851129|(10,[2,3,6,9],[8.0,14.386294994851129,8.0,14.386294994851129])        |0.0  |
|blue |bad |12    |14.386294994851129|(10,[2,3,6,9],[12.0,14.386294994851129,12.0,14.386294994851129])      |0.0  |
|green|good|15    |38.97187133755819 |(10,[1,2,3,5,8],[1.0,15.0,38.97187133755819,15.0,38.97187133755819])  |1.0  |
|green|good|12    |14.386294994851129|(10,[1,2,3,5,8],[1.0,12.0,14.386294994851129,12.0,14.386294994851129])|1.0  |
+-----+----+------+------------------+----------------------------------

In [18]:
preparedDF.printSchema()

root
 |-- color: string (nullable = true)
 |-- lab: string (nullable = true)
 |-- value1: long (nullable = true)
 |-- value2: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [19]:
# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])

In [20]:
# COMMAND ----------
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")

# __init__(self, 
#          featuresCol="features", 
#          labelCol="label", 
#          predictionCol="prediction",                  
#          maxIter=100, 
#          regParam=0.0, 
#          elasticNetParam=0.0, 
#          tol=1e-6, 
#          fitIntercept=True,                  
#          threshold=0.5, 
#          thresholds=None, 
#          probabilityCol="probability",                  
#          rawPredictionCol="rawPrediction", 
#          standardization=True, 
#          weightCol=None,                  
#          aggregationDepth=2, 
#          family="auto",                  
#          lowerBoundsOnCoefficients=None, 
#          upperBoundsOnCoefficients=None,                  
#          lowerBoundsOnIntercepts=None, 
#          upperBoundsOnIntercepts=None)

In [21]:
# COMMAND ----------

print lr.explainParams()

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [22]:
# COMMAND ----------

fittedLR = lr.fit(train)

In [23]:
# COMMAND ----------

train, test = df.randomSplit([0.7, 0.3])

In [24]:
# COMMAND ----------

rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

In [25]:
# COMMAND ----------

from pyspark.ml import Pipeline
stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

In [26]:
# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder()\
  .addGrid(rForm.formula, [
    "lab ~ . + color:value1",
    "lab ~ . + color:value1 + color:value2"])\
  .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
  .addGrid(lr.regParam, [0.1, 2.0])\
  .build()

In [27]:
# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()\
  .setMetricName("areaUnderROC")\
  .setRawPredictionCol("prediction")\
  .setLabelCol("label")

In [28]:
# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
  .setTrainRatio(0.75)\
  .setEstimatorParamMaps(params)\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)

In [29]:
# COMMAND ----------

tvsFitted = tvs.fit(train)

In [30]:
predictions = tvsFitted.transform(test)

In [31]:
predictions.columns

['color',
 'lab',
 'value1',
 'value2',
 'features',
 'label',
 'rawPrediction',
 'probability',
 'prediction']

In [32]:
predictions.select('features',
 'label',
 'rawPrediction',
 'probability',
 'prediction').take(2)

[Row(features=SparseVector(7, {2: 8.0, 3: 14.3863, 6: 8.0}), label=1.0, rawPrediction=DenseVector([-1.8056, 1.8056]), probability=DenseVector([0.1412, 0.8588]), prediction=1.0),
 Row(features=SparseVector(7, {2: 8.0, 3: 14.3863, 6: 8.0}), label=1.0, rawPrediction=DenseVector([-1.8056, 1.8056]), probability=DenseVector([0.1412, 0.8588]), prediction=1.0)]

In [33]:
predictions.select('features',
 'label',
 'rawPrediction',
 'probability',
 'prediction').head(3)

[Row(features=SparseVector(7, {2: 8.0, 3: 14.3863, 6: 8.0}), label=1.0, rawPrediction=DenseVector([-1.8056, 1.8056]), probability=DenseVector([0.1412, 0.8588]), prediction=1.0),
 Row(features=SparseVector(7, {2: 8.0, 3: 14.3863, 6: 8.0}), label=1.0, rawPrediction=DenseVector([-1.8056, 1.8056]), probability=DenseVector([0.1412, 0.8588]), prediction=1.0),
 Row(features=SparseVector(7, {2: 8.0, 3: 14.3863, 6: 8.0}), label=1.0, rawPrediction=DenseVector([-1.8056, 1.8056]), probability=DenseVector([0.1412, 0.8588]), prediction=1.0)]

In [34]:
evaluator.evaluate(predictions)

0.9

In [35]:
summary = tvsFitted.bestModel.stages[1].summary

In [36]:
summary.objectiveHistory

[0.6909233093138178,
 0.6433899078202643,
 0.48680869448203506,
 0.4568181473278105,
 0.44967917190467177,
 0.4429750531158449,
 0.4348269943975538,
 0.4291589873786701,
 0.4268293999878343,
 0.4231952235317365,
 0.4208194405755006,
 0.4201506440118763,
 0.4201106432617527,
 0.42011009888212203,
 0.42011004123017776,
 0.4201100327703955,
 0.4201100326350431,
 0.4201100321469774,
 0.4201100321229984,
 0.42011003209462594,
 0.4201100320896239,
 0.42011003208812403]

In [37]:
summary.areaUnderROC

0.95

In [38]:
summary.roc.show()

+-----+-------------------+
|  FPR|                TPR|
+-----+-------------------+
|  0.0|                0.0|
|  0.0|                0.2|
|  0.0|0.37142857142857144|
|  0.0|0.45714285714285713|
|  0.0| 0.6857142857142857|
|  0.0| 0.8571428571428571|
|  0.2| 0.8571428571428571|
| 0.35| 0.8571428571428571|
| 0.35|                1.0|
| 0.55|                1.0|
|0.775|                1.0|
|  1.0|                1.0|
|  1.0|                1.0|
+-----+-------------------+



In [39]:
#ERROR how to persist and load models

tvsFitted.bestModel.write().overwrite().save("/user/kranthidr/savedModels/tvsFitted")

# AttributeError: 'Pipeline' object has no attribute '_transfer_param_map_to_java'
# it will not work for tvsFitted (TrainValidationSplit) ...call it on tvsFitted.bestModel

In [40]:
from pyspark.ml.pipeline import PipelineModel

loadedPipeline = PipelineModel.load("/user/kranthidr/savedModels/tvsFitted")

In [41]:
predictions = loadedPipeline.transform(test)

In [42]:
predictions.select('features',
 'label',
 'rawPrediction',
 'probability',
 'prediction').take(2)

[Row(features=SparseVector(7, {2: 8.0, 3: 14.3863, 6: 8.0}), label=1.0, rawPrediction=DenseVector([-1.8056, 1.8056]), probability=DenseVector([0.1412, 0.8588]), prediction=1.0),
 Row(features=SparseVector(7, {2: 8.0, 3: 14.3863, 6: 8.0}), label=1.0, rawPrediction=DenseVector([-1.8056, 1.8056]), probability=DenseVector([0.1412, 0.8588]), prediction=1.0)]