# Logistic regression using Spark ML

In [1]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import functions as F, types as T, SparkSession

In [2]:
spark = SparkSession.builder\
        .appName("Spark ML")\
        .getOrCreate()

Read in the [Wine Quality](https://archive.ics.uci.edu/ml/datasets/wine+quality) dataset.

In [3]:
whites = spark.read\
         .option("header", True)\
         .option("inferSchema", True)\
         .option("sep", ";")\
         .csv("datasets/wine-quality.csv")

In [4]:
whites.show(5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|          7.2|            0.23|       0.32|           8.5|    0.058|               47.0|           

Define a new variable 'label' for wines with quality ≥ 7.

In [5]:
whites = whites.withColumn("label", (F.col("quality") >= 7).cast(T.IntegerType()))

Define a pipeline for a logistic regression model for 'good_quality' using all predictors.

In [6]:
assembler = VectorAssembler(inputCols=[c for c in whites.columns if c not in {"quality", "good_quality"}],
                            outputCol="features")
lr = LogisticRegression(maxIter=10, family="binomial")
pipeline = Pipeline(stages=[assembler, lr])

Fit a regularised logistic regression model using 3-fold cross-validation with AUC scoring to determine the 'optimal' values for the hyperparameters.

In [7]:
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [10.**x for x in range(-4, 5)])\
    .addGrid(lr.elasticNetParam, [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.])\
    .build()

In [8]:
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(metricName="areaUnderROC"),
                    numFolds=3,
                    seed=42)

In [9]:
cvModel = cv.fit(whites)

Extract classification metrics and confusion matrix for the 'best' model.

In [10]:
lrModel = cvModel.bestModel.stages[1]

In [11]:
lrModel.summary.accuracy

1.0

In [12]:
lrModel.summary.areaUnderROC

0.9999225717010628

In [13]:
predictions = cvModel.transform(whites)

In [14]:
predictions.groupBy(F.col("label"), F.col("prediction"))\
           .count()\
           .show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0| 3838|
|    1|       1.0| 1060|
+-----+----------+-----+

