## Classification with MLlib

Let's import all the necessary packages and load the data.

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.classification.{SVMWithSGD, SVMModel}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater, HingeGradient}

Intitializing Scala interpreter ...

Spark Web UI available at http://d7f1b9bc7dde:4040
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1607800469041)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.classification.{SVMWithSGD, SVMModel}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater, HingeGradient}


In [2]:
// Session handle for this notebook, registered under the application name "LinearSVC".
// getOrCreate() is used rather than constructing a session directly, so a session the
// kernel has already started (see the startup banner) can be picked up.
val spark = SparkSession.builder.appName("LinearSVC").getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@50d34161


In [3]:
// Load the labelled examples from the LIBSVM-formatted file "dataset.libsvm" as an RDD.
val data = MLUtils.loadLibSVMFile(sc, "dataset.libsvm")
// Feature-vector dimensionality, read off the first example.
// first() is used instead of take(1)(0): on an empty dataset it fails with a descriptive
// "empty collection" error rather than a bare ArrayIndexOutOfBoundsException.
val numFeatures = data.first().features.size

data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[6] at map at MLUtils.scala:86
numFeatures: Int = 185316


In [25]:
// Total number of labelled examples in the dataset (count() is an action, hence the parentheses).
val numEntries = data.count()

numEntries: Long = 149389


## Using SVMWithSGD

The first model that will be used is SVMWithSGD.

In [4]:
// 70% / 30% train/test split; the fixed seed keeps the split reproducible between runs.
val splits = data.randomSplit(Array(0.7, 0.3), seed = 11L)
// The training split is cached because the iterative optimizers scan it many times.
val train_data = splits(0).cache()
// The test split is cached as well: it is scanned by several separate actions below
// (accuracy and ROC evaluation for each model), and without caching every one of them
// would re-read and re-split the input file.
val test_data = splits(1).cache()

splits: Array[org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint]] = Array(MapPartitionsRDD[7] at randomSplit at <console>:32, MapPartitionsRDD[8] at randomSplit at <console>:32)
train_data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[7] at randomSplit at <console>:32
test_data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[8] at randomSplit at <console>:32


In [5]:
// Number of iterations passed to the SGD trainer.
val numIterations = 100
// Train a linear SVM on the cached training split; all other trainer settings are left
// at the library defaults (the resulting model reports intercept = 0.0).
val svm_with_sgd = SVMWithSGD.train(train_data, numIterations)

numIterations: Int = 100
svm_with_sgd: org.apache.spark.mllib.classification.SVMModel = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 185316, numClasses = 2, threshold = 0.0


In [6]:
// Remove the decision threshold (it shows as `threshold = None` afterwards) so that
// predict() returns raw scores; the cells below threshold them manually and feed them
// to the ROC evaluation.
svm_with_sgd.clearThreshold()

res0: svm_with_sgd.type = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 185316, numClasses = 2, threshold = None


In [7]:
// Hard predictions for the SGD model: a negative raw score maps to class 0.0, anything
// else to class 1.0. Each element is a (predicted class, true label) pair.
val score_labels_sgd = test_data.map { point =>
    val predicted = if (svm_with_sgd.predict(point.features) < 0.0) 0.0 else 1.0
    (predicted, point.label)
}

score_labels_sgd: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[212] at map at <console>:34


In [8]:
// Single pass over the (prediction, label) pairs that accumulates
// (number of correct predictions, number of examples seen).
val correct_counter = score_labels_sgd
    .map { case (predicted, actual) => if (predicted == actual) 1 else 0 }
    .aggregate((0, 0))(
        // fold one 0/1 hit flag into a partition-local (hits, seen) pair
        { case ((hits, seen), hit) => (hits + hit, seen + 1) },
        // merge the partial pairs coming from two partitions
        { case ((hitsA, seenA), (hitsB, seenB)) => (hitsA + hitsB, seenA + seenB) }
    )

// Accuracy = correct / total, computed in floating point.
val acc = {
    val (hits, total) = correct_counter
    hits.toDouble / total
}
println(s"Accuracy for SVM with SGD: $acc")

Accuracy for SVM with SGD: 0.9990000444424693


correct_counter: (Int, Int) = (44957,45002)
acc: Double = 0.9990000444424693


In [9]:
// Raw (un-thresholded) SGD scores paired with the true labels, as input for the ROC metrics.
// This deliberately re-binds score_labels_sgd, replacing the thresholded version defined earlier.
val score_labels_sgd = test_data.map(point => (svm_with_sgd.predict(point.features), point.label))

score_labels_sgd: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[214] at map at <console>:34


In [10]:
// Binary-classification evaluator built from the (raw score, label) pairs of the SGD model.
val metrics_sgd = new BinaryClassificationMetrics(score_labels_sgd)

metrics_sgd: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@21e3801e


In [12]:
// Area under the ROC curve for the SGD-trained model, computed from the raw scores.
val auROC = metrics_sgd.areaUnderROC
println(s"Area under ROC = $auROC")

Area under ROC = 0.40210812920341155


auROC: Double = 0.40210812920341155


## Using L-BFGS instead of SGD

Let's apply L-BFGS optimization algorithm instead of SGD.

In [13]:
// Training input for the L-BFGS optimizer: (label, features) pairs where every feature
// vector gets a bias term appended, so the intercept is learned as one extra weight.
// Cached because the optimizer iterates over it repeatedly.
val train_data_lbfgs = splits(0)
    .map(point => (point.label, MLUtils.appendBias(point.features)))
    .cache()

train_data_lbfgs: org.apache.spark.rdd.RDD[(Double, org.apache.spark.mllib.linalg.Vector)] = MapPartitionsRDD[229] at map at <console>:32


In [14]:
// Settings handed to LBFGS.runLBFGS in the next cell.
val numCorrections = 10
val convergenceTol = 1e-4
val maxNumIterations = 100
val regParam = 0.1
// All-zero dense starting point: one weight per feature plus one slot for the appended bias term.
val initialWeightsWithIntercept = Vectors.zeros(numFeatures + 1)

numCorrections: Int = 10
convergenceTol: Double = 1.0E-4
maxNumIterations: Int = 100
regParam: Double = 0.1
initialWeightsWithIntercept: org.apache.spark.mllib.linalg.Vector = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...


In [15]:
// Fit the linear SVM weights with L-BFGS, using the hinge-loss gradient and the squared-L2
// updater together with the settings defined above.
// The result is destructured into:
//   weightsWithIntercept - the learned weights; the training vectors carry an appended bias
//                          term, so the last entry is used as the intercept downstream
//   loss                 - second element of the returned pair (presumably the loss recorded
//                          per iteration - confirm against the LBFGS documentation)
val (weightsWithIntercept, loss) = LBFGS.runLBFGS(
    train_data_lbfgs,
    new HingeGradient(),
    new SquaredL2Updater(),
    numCorrections,
    convergenceTol,
    maxNumIterations,
    regParam,
    initialWeightsWithIntercept)

weightsWithIntercept: org.apache.spark.mllib.linalg.Vector = [0.0,6.960410288618664E-10,2.088123086585592E-9,4.176246173171184E-9,4.176246173171184E-9,1.3920820577237328E-9,-9.601829098911453E-5,2.6449559096750754E-8,6.960410288618664E-10,2.7841641154474657E-9,0.0,4.176246173171184E-9,2.088123086585592E-9,9.587890133624112E-5,6.960410288618664E-10,4.872287202033028E-9,6.960410288618664E-10,0.0,3.4802051443092993E-9,0.0,6.960410288618664E-10,4.176246173171184E-9,2.5997239735029336E-7,6.960410288618664E-10,-9.584131512068252E-5,6.960410288618664E-10,6.960410288618664E-10,6.960410288618664E-10,6.960410288618664E-10,2.7841641154474657E-9,6.960410288618664E-10,1.3920820577237328E-9,5.97416648615017E-7,6.960410288618664E-10,0.0,1.531290263496102E-8,2.088123086585592E-9,0.0,2.088123086585592E-...


In [16]:
// Wrap the L-BFGS solution in an SVMModel: every entry except the last is a feature weight,
// and the last entry (the slot of the appended bias term) becomes the intercept.
val svm_with_lbfgs = {
    val coefficients = weightsWithIntercept.toArray
    new SVMModel(Vectors.dense(coefficients.init), coefficients.last)
}

svm_with_lbfgs: org.apache.spark.mllib.classification.SVMModel = org.apache.spark.mllib.classification.SVMModel: intercept = -0.1946827812723564, numFeatures = 185316, numClasses = 2, threshold = 0.0


In [17]:
// Remove the decision threshold (it shows as `threshold = None` afterwards) so that
// predict() returns raw scores; the cells below threshold them manually and feed them
// to the ROC evaluation.
svm_with_lbfgs.clearThreshold()

res4: svm_with_lbfgs.type = org.apache.spark.mllib.classification.SVMModel: intercept = -0.1946827812723564, numFeatures = 185316, numClasses = 2, threshold = None


In [18]:
// Hard predictions for the L-BFGS model: a negative raw score maps to class 0.0, anything
// else to class 1.0. Each element is a (predicted class, true label) pair.
val score_labels_lbfgs = test_data.map { point =>
    val predicted = if (svm_with_lbfgs.predict(point.features) < 0.0) 0.0 else 1.0
    (predicted, point.label)
}

score_labels_lbfgs: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[271] at map at <console>:34


In [19]:
// Eyeball the first 100 (predicted class, true label) pairs as a quick sanity check.
score_labels_lbfgs.take(100)

res5: Array[(Double, Double)] = Array((0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,1.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), (0.0,0.0), ...


In [20]:
// Single pass over the (prediction, label) pairs that accumulates
// (number of correct predictions, number of examples seen).
val correct_counter = score_labels_lbfgs
    .map { case (predicted, actual) => if (predicted == actual) 1 else 0 }
    .aggregate((0, 0))(
        // fold one 0/1 hit flag into a partition-local (hits, seen) pair
        { case ((hits, seen), hit) => (hits + hit, seen + 1) },
        // merge the partial pairs coming from two partitions
        { case ((hitsA, seenA), (hitsB, seenB)) => (hitsA + hitsB, seenA + seenB) }
    )

// Accuracy = correct / total, computed in floating point.
val acc = {
    val (hits, total) = correct_counter
    hits.toDouble / total
}
println(s"Accuracy for L-BFGS: $acc")

Accuracy for L-BFGS: 0.999022265677081


correct_counter: (Int, Int) = (44958,45002)
acc: Double = 0.999022265677081


In [21]:
// Raw (un-thresholded) L-BFGS scores paired with the true labels, as input for the ROC metrics.
// This deliberately re-binds score_labels_lbfgs, replacing the thresholded version defined earlier.
val score_labels_lbfgs = test_data.map(point => (svm_with_lbfgs.predict(point.features), point.label))

score_labels_lbfgs: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[273] at map at <console>:34


In [22]:
// Binary-classification evaluator built from the (raw score, label) pairs of the L-BFGS model.
val metrics_lbfgs = new BinaryClassificationMetrics(score_labels_lbfgs)

metrics_lbfgs: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@6df66d61


In [23]:
// Area under the ROC curve for the L-BFGS-trained model, computed from the raw scores.
val auROC = metrics_lbfgs.areaUnderROC
println(s"Area under ROC = $auROC")

Area under ROC = 0.5195197335695125


auROC: Double = 0.5195197335695125


## Conclusions

This data is very unbalanced, and we see that the models learn to predict only one class. Therefore, their accuracy is almost perfect, while the ROC AUC score is not. I think that L-BFGS is more suitable for this task: the data has a lot of features (185316), which is even greater than the number of entries in the dataset (149389), and L-BFGS might be more stable and less noisy because of its Hessian approximation.