---

In [1]:
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{HingeGradient, LBFGS, SquaredL2Updater}
import org.apache.spark.mllib.util.MLUtils

Intitializing Scala interpreter ...

Spark Web UI available at http://98f358221d01:4040
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1607796816619)
SparkSession available as 'spark'


import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{HingeGradient, LBFGS, SquaredL2Updater}
import org.apache.spark.mllib.util.MLUtils


In [2]:
val sc = org.apache.spark.sql.SparkSession.builder()
    .appName("KIS-DE-HW-3")
    .master("local")
    .getOrCreate()
    .sparkContext

sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@17841ec2


Предопределим гиперпараметры:

In [3]:
val kIterations:    Integer = 1024    
val numCorrections: Integer = 20
val convergenceTol: Double  = 1e-6
val regParam:       Double  = 0.1,

kIterations: Integer = 1024
numCorrections: Integer = 20
convergenceTol: Double = 1.0E-6
regParam: Double = 0.1


Загрузим данные. Сделаем разбиение на `train` и `test`:

In [4]:
val trainTestSplit: Array[Double] = Array(0.8,0.2)
val randomSeed:     Long          = 42L

trainTestSplit: Array[Double] = Array(0.8, 0.2)
randomSeed: Long = 42


In [5]:
val dataset     = MLUtils.loadLibSVMFile(sc, "dataset.libsvm")
val numFeatures = dataset.take(1)(0).features.size
val splits      = dataset.randomSplit(trainTestSplit, seed = randomSeed)
val train       = splits(0)
val test        = splits(1)

dataset: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[6] at map at MLUtils.scala:86
numFeatures: Int = 185316
splits: Array[org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint]] = Array(MapPartitionsRDD[7] at randomSplit at <console>:36, MapPartitionsRDD[8] at randomSplit at <console>:36)
train: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[7] at randomSplit at <console>:36
test: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[8] at randomSplit at <console>:36


---

In [6]:
val modelSVMWithSGD          = SVMWithSGD.train(train, kIterations)
val scoreAndLabelsSVMWithSGD = test.map { point =>
  val score = modelSVMWithSGD.predict(point.features)
  (score, point.label)
}

modelSVMWithSGD: org.apache.spark.mllib.classification.SVMModel = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 185316, numClasses = 2, threshold = 0.0
scoreAndLabelsSVMWithSGD: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[300] at map at <console>:35


In [7]:
val (weightsWithIntercept, _) = LBFGS.runLBFGS(
    train.map{ x => 
        (x.label, MLUtils.appendBias(x.features))
    },
    new HingeGradient,
    new SquaredL2Updater,
    numCorrections   = numCorrections,
    convergenceTol   = convergenceTol,
    maxNumIterations = kIterations,
    regParam         = regParam,
    initialWeights   = Vectors.dense(new Array[Double](numFeatures + 1))
)

val modelSVMWithLBFGS = new SVMModel(
    Vectors.dense(weightsWithIntercept.toArray.init),
    weightsWithIntercept.toArray.last
)

val scoreAndLabelsSVMWithLBFGS = test.map { point =>
  val score = modelSVMWithLBFGS.predict(point.features)
  (score, point.label)
}

weightsWithIntercept: org.apache.spark.mllib.linalg.Vector = [2.822551961091748E-10,0.0,2.822551961091748E-10,8.467655883274513E-10,8.467655883274513E-10,4.2338279416372566E-10,-8.330058803622971E-5,6.7741247066196105E-9,1.411275980545874E-10,8.467655883274513E-10,1.411275980545874E-10,1.5524035786003775E-9,7.056379902729038E-10,8.368481087798259E-5,2.822551961091748E-10,1.5524035786003775E-9,1.411275980545874E-10,1.411275980545874E-10,8.467655883274513E-10,1.411275980545874E-10,2.822551961091748E-10,9.87893186382057E-10,1.411275980545874E-10,1.411275980545874E-10,-8.367577871170714E-5,1.411275980545874E-10,1.411275980545874E-10,1.411275980545874E-10,1.411275980545874E-10,4.2338279416372566E-10,1.411275980545874E-10,0.0,-5.443430465663548E-7,1.411275980545874E-10,-8.36776133704819E-5,3....


In [8]:
val metricsSVMWithSGD     = new BinaryClassificationMetrics(scoreAndLabelsSVMWithSGD)
val areaUnderROCWithSGD   = metricsSVMWithSGD.areaUnderROC
val metricsSVMWithLBFGS   = new BinaryClassificationMetrics(scoreAndLabelsSVMWithLBFGS)
val areaUnderROCWithLBFGS = metricsSVMWithLBFGS.areaUnderROC

println(s"""areaUnderROCWithSGD   = ${areaUnderROCWithSGD}
           |areaUnderROCWithLBFGS = ${areaUnderROCWithLBFGS}""".stripMargin)

areaUnderROCWithSGD   = 0.5
areaUnderROCWithLBFGS = 0.5


metricsSVMWithSGD: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@5d91b736
areaUnderROCWithSGD: Double = 0.5
metricsSVMWithLBFGS: org.apache.spark.mllib.evaluation.BinaryClassificationMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@c4e3954
areaUnderROCWithLBFGS: Double = 0.5


In [9]:
val metricSVMWithSGD   = new MulticlassMetrics(scoreAndLabelsSVMWithSGD)
val metricSVMWithLBFGS = new MulticlassMetrics(scoreAndLabelsSVMWithLBFGS)

println(s"""accuracyWithSGD   = ${metricSVMWithSGD.accuracy}
           |accuracyWithLBFGS = ${metricSVMWithLBFGS.accuracy}""".stripMargin)

accuracyWithSGD   = 0.9993303421951383
accuracyWithLBFGS = 0.9993303421951383


metricSVMWithSGD: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@257265ad
metricSVMWithLBFGS: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@4f210128


---

### Вывод:  
AUC и accuracy вышли одинаковы. ML-ресерчер с меня никакой:)  
Есть единственное предположение - это значительное преобладание одного класса над другим, давайте проверим это:

In [10]:
dataset.map(_.label).countByValue

res2: scala.collection.Map[Double,Long] = Map(0.0 -> 149211, 1.0 -> 178)


Датасет фигня, будем предсказывать только нули. Весь код приложенный выше чисто для демонстрации, что я могу простой ML на Spark