## Classification with MLlib

Let's import all the necessary packages and load the data.

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.classification.{SVMWithSGD, SVMModel}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}

Intitializing Scala interpreter ...

Spark Web UI available at http://d05198b55fe7:4040
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1607725569467)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.classification.{SVMWithSGD, SVMModel}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{LBFGS, LogisticGradient, SquaredL2Updater}


In [2]:
// Obtain a SparkSession for this notebook under the application name "LinearSVC".
// The kernel banner already reports a session exposed as `spark`, so getOrCreate()
// is expected to hand back that existing session rather than start a new one.
val spark = SparkSession.builder().appName("LinearSVC").getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@12a5c7c1


In [3]:
// Parse the LIBSVM-formatted file into an RDD of LabeledPoint records.
val data = MLUtils.loadLibSVMFile(sc, "dataset.libsvm")
// Feature dimensionality, read off the first record of the dataset.
val numFeatures = data.first().features.size

data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[6] at map at MLUtils.scala:86
numFeatures: Int = 185316


## Using SVMWithSGD

The first model that will be used is SVMWithSGD.

In [4]:
// 70/30 random train/test split; the fixed seed keeps the partitioning reproducible.
val splits = data.randomSplit(Array(0.7, 0.3), seed = 11L)
// The training part is cached because it is reused by the training cells below.
val train_data = splits.head.cache()
val test_data = splits.last

splits: Array[org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint]] = Array(MapPartitionsRDD[7] at randomSplit at <console>:32, MapPartitionsRDD[8] at randomSplit at <console>:32)
train_data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[7] at randomSplit at <console>:32
test_data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[8] at randomSplit at <console>:32


In [5]:
// Number of SGD iterations to run during training.
val numIterations = 100
// Train a linear SVM with stochastic gradient descent on the cached training split.
// Only the iteration count is set here; all other hyper-parameters are left at the
// values chosen by SVMWithSGD.train.
val svm_with_sgd = SVMWithSGD.train(train_data, numIterations)

numIterations: Int = 100
svm_with_sgd: org.apache.spark.mllib.classification.SVMModel = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 185316, numClasses = 2, threshold = 0.0


In [6]:
// Remove the decision threshold (the model reports `threshold = None` afterwards) so
// that predict() can be used to obtain scores for the ROC computation below.
svm_with_sgd.clearThreshold()

res0: svm_with_sgd.type = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 185316, numClasses = 2, threshold = None


In [7]:
// Pair the SGD model's score for every test point with that point's true label.
val score_labels_sgd =
  test_data.map(point => (svm_with_sgd.predict(point.features), point.label))

score_labels_sgd: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[212] at map at <console>:34


In [8]:
// Build binary-classification metrics from the (score, label) pairs of the SGD model
// and compute the area under the ROC curve.
val metrics_sgd = new BinaryClassificationMetrics(score_labels_sgd)
val aucROC_sgd = metrics_sgd.areaUnderROC()

metrics_sgd: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@40f267ef
aucROC_sgd: Double = 0.40210812920343003


In [9]:
// Report the test-set area under the ROC curve for the SGD-trained model.
println(s"Area under ROC for SVM with SGD = $aucROC_sgd")

Area under ROC for SVM with SGD = 0.40210812920343003


## Using L-BFGS instead of SGD

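Let's apply the L-BFGS optimization algorithm instead of SGD. Note that the cells below minimize the logistic loss (`LogisticGradient`) with L2 regularization and then wrap the resulting weights in an `SVMModel`, so this model is not trained with the hinge loss used by `SVMWithSGD`.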

In [10]:
// LBFGS.runLBFGS consumes (label, features) pairs. appendBias adds a constant term to
// each feature vector so that the intercept is learned as one more weight.
// `train_data` is the cached handle of splits(0), i.e. the same training split.
val train_data_lbfgs = train_data
  .map(point => (point.label, MLUtils.appendBias(point.features)))
  .cache()

train_data_lbfgs: org.apache.spark.rdd.RDD[(Double, org.apache.spark.mllib.linalg.Vector)] = MapPartitionsRDD[227] at map at <console>:32


In [11]:
// Settings passed to LBFGS.runLBFGS further down.
val numCorrections = 10
val convergenceTol = 1e-4
val maxNumIterations = 100
val regParam = 0.1
// All-zero starting point: one weight per feature plus one extra slot for the
// intercept that goes with the appended bias term.
val initialWeightsWithIntercept = Vectors.zeros(numFeatures + 1)

numCorrections: Int = 10
convergenceTol: Double = 1.0E-4
maxNumIterations: Int = 100
regParam: Double = 0.1
initialWeightsWithIntercept: org.apache.spark.mllib.linalg.Vector = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...


In [12]:
// Run L-BFGS on the bias-augmented training pairs. The result is the optimized weight
// vector (with the intercept in its last slot) and the recorded loss values.
// NOTE(review): LogisticGradient optimizes the logistic loss, yet the weights are later
// wrapped in an SVMModel -- confirm this is intended rather than a hinge-loss gradient.
val (weightsWithIntercept, loss) = LBFGS.runLBFGS(
  data = train_data_lbfgs,
  gradient = new LogisticGradient(),
  updater = new SquaredL2Updater(),
  numCorrections = numCorrections,
  convergenceTol = convergenceTol,
  maxNumIterations = maxNumIterations,
  regParam = regParam,
  initialWeights = initialWeightsWithIntercept)

weightsWithIntercept: org.apache.spark.mllib.linalg.Vector = [0.0,-8.453627018328698E-8,-3.790526836658803E-6,-1.1038575527441863E-5,-9.57737544954984E-7,-9.71442139125112E-8,-2.0270416786332444E-5,-8.274583862475016E-5,-4.050405672202671E-6,-8.670932751760524E-6,0.0,-8.343250947621132E-6,-4.988237970347788E-7,2.7566262862127508E-5,-1.1034059455300929E-7,-2.6109600506465002E-6,-1.768503585250929E-7,0.0,-1.0295566332357653E-5,0.0,-2.3715478997203128E-7,-2.5091293643306628E-5,-7.446079864594622E-6,-1.1974778183093067E-5,-4.593376894059787E-5,-3.1380448783051945E-7,-6.126283798147832E-8,-1.3475873437764575E-6,-1.788447335690164E-7,-2.5889204345670688E-6,-3.765576875415585E-6,-1.0080012252339971E-5,-3.651889804009591E-5,-2.490306103124338E-6,0.0,-2.18351251219886E-5,-4.085157288299821E-7,0....


In [13]:
// Unpack the optimizer result: every entry except the last is a feature weight, and
// the last entry is the intercept learned through the appended bias term.
val svm_with_lbfgs = {
  val packed = weightsWithIntercept.toArray
  new SVMModel(Vectors.dense(packed.init), packed.last)
}

svm_with_lbfgs: org.apache.spark.mllib.classification.SVMModel = org.apache.spark.mllib.classification.SVMModel: intercept = -0.18663851545993435, numFeatures = 185316, numClasses = 2, threshold = 0.0


In [14]:
// Remove the decision threshold (the model reports `threshold = None` afterwards) so
// that predict() can be used to obtain scores for the ROC computation below.
svm_with_lbfgs.clearThreshold()

res2: svm_with_lbfgs.type = org.apache.spark.mllib.classification.SVMModel: intercept = -0.18663851545993435, numFeatures = 185316, numClasses = 2, threshold = None


In [15]:
// Pair the L-BFGS model's score for every test point with that point's true label.
val score_labels_lbfgs =
  test_data.map(point => (svm_with_lbfgs.predict(point.features), point.label))

score_labels_lbfgs: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[243] at map at <console>:34


In [16]:
// Build binary-classification metrics from the (score, label) pairs of the L-BFGS model
// and compute the area under the ROC curve.
val metrics_lbfgs = new BinaryClassificationMetrics(score_labels_lbfgs)
val aucROC_lbfgs = metrics_lbfgs.areaUnderROC()

metrics_lbfgs: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@6cc94fb6
aucROC_lbfgs: Double = 0.4475328488407433


In [17]:
// Report the test-set area under the ROC curve for the L-BFGS-trained model.
println(s"Area under ROC for SVM with L-BFGS = $aucROC_lbfgs")

Area under ROC for SVM with L-BFGS = 0.4475328488407433
