In [1]:
# Load the cuse_binary CSV: the first row supplies the column names and the
# column types are inferred from the values.
cuse = spark.read.csv(
    'data/cuse_binary.csv',
    header=True,
    inferSchema=True,
)
# preview the first five rows
cuse.show(n=5)

+---+---------+---------+---+
|age|education|wantsMore|  y|
+---+---------+---------+---+
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
+---+---------+---------+---+
only showing top 5 rows



## Process categorical columns

The following code uses a `Pipeline` to do three things:

* `StringIndexer` all categorical columns
* `OneHotEncoder` all categorical index columns
* `VectorAssembler` all feature columns into one vector column

In [49]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# categorical predictor columns: the first three columns of `cuse`
categorical_columns = cuse.columns[0:3]

# Build the label by casting `y` to double instead of running it through
# StringIndexer. StringIndexer gives index 0 to the most frequent value, so
# whether y=1 ends up as label 1.0 would depend on the class balance of the
# data. The cast keeps y=0 -> 0.0 and y=1 -> 1.0 regardless of frequencies.
# (assumes `y` is already coded 0/1, as the preview of `cuse` suggests)
cuse_labeled = cuse.withColumn('label', cuse['y'].cast('double'))

# build StringIndexer stages, one per categorical column
stringindexer_stages = [StringIndexer(inputCol=c, outputCol='strindexed_' + c) for c in categorical_columns]

# build OneHotEncoder stages, one per indexed categorical column
onehotencoder_stages = [OneHotEncoder(inputCol='strindexed_' + c, outputCol='onehot_' + c) for c in categorical_columns]

# build VectorAssembler stage: concatenate the one-hot vectors into `features`
feature_columns = ['onehot_' + c for c in categorical_columns]
vectorassembler_stage = VectorAssembler(inputCols=feature_columns, outputCol='features')

# all stages, in execution order: index -> one-hot encode -> assemble
all_stages = stringindexer_stages + onehotencoder_stages + [vectorassembler_stage]

# build pipeline model
pipeline = Pipeline(stages=all_stages)

# fit pipeline model
pipeline_model = pipeline.fit(cuse_labeled)

# transform the data, keeping only the encoded columns, the assembled
# feature vector and the label
final_columns = feature_columns + ['features', 'label']
cuse_df = pipeline_model.transform(cuse_labeled).\
            select(final_columns)

cuse_df.show(5)

+-------------+----------------+----------------+-------------------+-----+
|   onehot_age|onehot_education|onehot_wantsMore|           features|label|
+-------------+----------------+----------------+-------------------+-----+
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
+-------------+----------------+----------------+-------------------+-----+
only showing top 5 rows



**Split data into training and test datasets**

In [37]:
# 80/20 train/test split; the fixed seed makes the split repeatable
training, test = cuse_df.randomSplit(weights=[0.8, 0.2], seed=1234)

**Build cross-validation model**

In [38]:
## ======= build cross validation model ===========
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

# estimator: generalized linear model with a binomial family, reading the
# assembled `features` vector and the `label` column
glm = GeneralizedLinearRegression(
    family='binomial',
    featuresCol='features',
    labelCol='label',
)

# parameter grid: candidate values for the regularization parameter
param_grid = (
    ParamGridBuilder()
    .addGrid(glm.regParam, [0, 0.5, 1, 2, 4])
    .build()
)

# evaluator: scores each candidate using the model's `prediction` column
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')

# cross-validator: 4 folds over every parameter combination in the grid
cv = CrossValidator(
    estimator=glm,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=4,
)

**Fit model**

In [44]:
# To compare with R results, the model is fitted on the whole dataset.
# Only this fit is kept: a fit on `training` alone would be overwritten by this
# assignment, so running it first would just spend a full cross-validation run.
# NOTE: `test` was split off from `cuse_df`, so predictions on `test` made with
# this model are not out-of-sample.
cv_model = cv.fit(cuse_df)

**Prediction**

In [45]:
# apply the fitted cross-validation model to both splits
pred_training_cv = cv_model.transform(training)
pred_test_cv = cv_model.transform(test)

# preview the first five rows of each result; the test preview is not truncated
pred_training_cv.show(n=5)
pred_test_cv.show(n=5, truncate=False)


+----------+----------------+----------------+---------+-----+------------------+
|onehot_age|onehot_education|onehot_wantsMore| features|label|        prediction|
+----------+----------------+----------------+---------+-----+------------------+
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|0.5140024065151293|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|0.5140024065151293|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|0.5140024065151293|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|0.5140024065151293|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|0.5140024065151293|
+----------+----------------+----------------+---------+-----+------------------+
only showing top 5 rows

+----------+----------------+----------------+---------+-----+------------------+
|onehot_age|onehot_education|onehot_wantsMore|features |label|prediction        |
+----------+----------------+----------------+---------+-----+-----------

**Estimated intercept and coefficients**

In [61]:
# report the intercept and coefficient vector of the best model, one per line
print('\n'.join([
    'Intercept: ' + str(cv_model.bestModel.intercept),
    'coefficients: ' + str(cv_model.bestModel.coefficients),
]))

Intercept: 0.0560242751692
coefficients: [-0.280625443036,-0.799857303154,-1.18923892948,0.324994652864,-0.832954761554]


In [47]:
# display the intercept of the best model selected by cross-validation
cv_model.bestModel.intercept

0.056024275169240606

In [48]:
# count how many rows fall into each education level
(
    cuse
    .select('education')
    .rdd
    .countByValue()
)

defaultdict(int, {Row(education=u'high'): 994, Row(education=u'low'): 613})