## Meta
### System:
```bash
macOS 10.15.7 (19H1419)
```
### Scala:
```bash
scala -version
Scala code runner version 2.13.8 -- Copyright 2002-2021, LAMP/EPFL and Lightbend, Inc.
```
### Java:
```bash
java -version
openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.8+10)
OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.8+10, mixed mode)
```

## Настройка окружения
### Установка python
```bash
brew install python@3.7 
```
### Установка scala
```bash
brew install scala
brew install sbt
```

### Настройка Jupyter
```bash
cd ~
mkdir jupyter_tests
cd jupyter_tests
python3 -m venv venv
source venv/bin/activate
python3 -m pip install jupyter notebook 
```

### Настройка Scala kernel
```bash
curl -Lo coursier https://git.io/coursier-cli
chmod +x coursier
./coursier launch --fork almond:0.10.9 --scala 2.13 -- --install --jupyter-path ./venv/share/jupyter/kernels/
```

## Установка зависимостей

In [1]:
import $ivy.`org.scalanlp:breeze_2.13:1.3` 
import $ivy.`org.scalanlp:breeze-viz_2.13:1.3`

[32mimport [39m[36m$ivy.$                              
[39m
[32mimport [39m[36m$ivy.$                                 [39m

## Загрузка данных для обучения
#### Source:
[Kaggle_Facebook_Dataset](https://www.kaggle.com/kiranraje/prediction-facebook-comment)

#### Move to local notebook storage:
```bash
mkdir -p ./data
cp ~/Downloads/archive.zip .
unzip archive.zip
```
Now we have dataset on path: `./data/Dataset.csv`

## Загрузка датасета в breeze.DenseMatrix

In [2]:
import breeze.linalg._
import scala.collection.mutable.ArrayBuffer


def loadCsvIntoDenseMatrix(filepath: String): (DenseMatrix[String], Map[String, Int]) = {
    val rows = ArrayBuffer[DenseVector[String]]()        // массив со строками из файла
    val bufferedSource = scala.io.Source.fromFile(filepath)    // буфер с данными в файле
    val lineIter = bufferedSource.getLines               // Полуачем итератор по буферу
    val header = lineIter.next().split(",").map(_.trim)  // Забираем header файла из буфера
    var indexedHeader = header.zipWithIndex.map{case(value, index) => (value.toString, index.toInt)}.toMap
    
    for (line <- lineIter) {  // Выгружаем данные из буфера в массив изменяемой длины 
        rows += DenseVector(line.split(",").map(_.trim):_*)
    }
    bufferedSource.close()

    val matrix = DenseMatrix(rows.toArray: _*)
    return (matrix, indexedHeader)
}
var (stringMatrix, header) = loadCsvIntoDenseMatrix("data/Dataset.csv")

[32mimport [39m[36mbreeze.linalg._
[39m
[32mimport [39m[36mscala.collection.mutable.ArrayBuffer


[39m
defined [32mfunction[39m [36mloadCsvIntoDenseMatrix[39m
[36mstringMatrix[39m: [32mDenseMatrix[39m[[32mString[39m] = 634995  0  463  1  0   0   0   0   0    65  166  2   24  0  0  ... (28 total)
634995  0  463  1  0   0   0   0   0    10  132  1   24  0  0  ...
634995  0  463  1  0   0   0   0   0    14  133  2   24  0  0  ...
634995  0  463  1  7   0   3   7   -3   62  131  1   24  0  0  ...
634995  0  463  1  1   0   0   1   0    58  142  5   24  0     ...
634995  0  463  1  0   0       0   0    60  166  1   24  0  0  ...
634995  0  463  1  0   0       0   0    68  145  2   24  0  0  ...
634995  0  463  1  1   0   1   1   -1   32  157  2   24  0  0  ...
634995  0  463  1  0   0       0   0    35  177  5   24  0  0  ...
634995  0  463  1  0   0       0   0    48  126  1   24  0  0  ...
634995  0  463  1  0   0       0   0    52  188  1   24  0  0  ...
634995  0  463 

## Топ значений по столбцам

In [4]:
import $ivy.`org.plotly-scala::plotly-almond:0.7.6`
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._

repl.pprinter() = repl.pprinter().copy(defaultHeight = 3)

[32mimport [39m[36m$ivy.$                                      
[39m
[32mimport [39m[36mplotly._, plotly.element._, plotly.layout._, plotly.Almond._

[39m

In [5]:
import scala.collection.mutable
import scala.collection.immutable.ListMap


val TOP_N = 10

def valueCounts[A](x: Seq[A]): mutable.Map[A, Int] = {
    val response = mutable.Map[A, Int]()
    for (variable <- x) {
        x
        if (response.get(variable) == None) {
            response(variable) = 1
        } else {
            response(variable) += 1
        }
    }
    return response
}

for ((column, index) <- header.toSeq.sortWith(_._2 < _._2)) {
    val topValueCounts = ListMap(valueCounts(stringMatrix(::,header(column)).toArray).toSeq.sortWith(_._2  > _._2):_*).slice(0, TOP_N)
    Seq(
        Bar(
            topValueCounts.keys.toArray.map(x => if (x== "") "value: NULL" else"value: "+x).toSeq,
            topValueCounts.values.toArray.toSeq,
        )
    ).plot(
        title=column
    )
}  

[32mimport [39m[36mscala.collection.mutable
[39m
[32mimport [39m[36mscala.collection.immutable.ListMap


[39m
[36mTOP_N[39m: [32mInt[39m = [32m10[39m
defined [32mfunction[39m [36mvalueCounts[39m

## Количество пустых данных в каждом столбце

In [6]:
for ((column, index) <- header.toSeq.sortWith(_._2 < _._2)) {
    println(s"$column: ${valueCounts(stringMatrix(::,header(column)).toArray).get("")}")
}

likes: None
Checkins: None
Returns: Some(51)
Category: Some(57)
commBase: Some(60)
comm24: None
comm48: Some(48)
comm24_1: None
diff2448: None
baseTime: None
length: None
shares: Some(2449)
hrs: None
sun_pub: None
mon_pub: Some(1927)
tue_pub: None
wed_pub: None
thu_pub: Some(3045)
fri_pub: None
sat_pub: None
sun_base: None
mon_base: Some(1970)
tue_base: None
wed_base: None
thu_base: None
fri_base: None
sat_base: None
output: None


## Удаляем столбцы с большим количеством пустых значений

In [7]:
val removeCols = Seq("shares", "mon_pub", "thu_pub", "mon_base")

val clearStringMatrix = stringMatrix.delete(
    removeCols.map{value => header(value)}, 
    breeze.linalg.Axis._1
)

val clearHeader = Map(header.filter(x => !removeCols.contains(x._1)).toSeq.sortWith(_._2 < _._2).zipWithIndex.map{
    case (value, index) => (value._1, index)
}:_*)


[36mremoveCols[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m([32m"shares"[39m, [32m"mon_pub"[39m, [32m"thu_pub"[39m, [32m"mon_base"[39m)
[36mclearStringMatrix[39m: [32mDenseMatrix[39m[[32mString[39m] = 634995  0  463  1  0   0   0   0   0    65  166  24  0  0  1  0  ... (24 total)
634995  0  463  1  0   0   0   0   0    10  132  24  0  0  0  0  ...
...
[36mclearHeader[39m: [32mMap[39m[[32mString[39m, [32mInt[39m] = [33mHashMap[39m(
  [32m"comm48"[39m -> [32m6[39m,
...

### Преобразовываем матрицу к Double типу, заполняя все оставшиеся пустые значения медианным значением и нормируем величины


In [8]:
import breeze.stats.median

def convertToDoubleWithMedian(x: DenseVector[String]): DenseVector[Double] = {
    val nonEmpty = (x: String) => if (x == "") false else true
    val medianValue = median(x(x.findAll(nonEmpty)).map(x => x.toDouble))
    val maxValue = x(x.findAll(nonEmpty)).map(x => x.toDouble).toArray.max.toDouble
    val minValue = x(x.findAll(nonEmpty)).map(x => x.toDouble).toArray.min.toDouble
    return x.map(x => if (x == "") medianValue else x.toDouble).map(x => (x-minValue)/(maxValue-minValue)).toDenseVector
}


val Y = clearStringMatrix(::, clearHeader("output")).map(x => x.toDouble)
val X = clearStringMatrix(::, breeze.linalg.*).map(convertToDoubleWithMedian).delete(clearHeader("output"), breeze.linalg.Axis._1)

[32mimport [39m[36mbreeze.stats.median

[39m
defined [32mfunction[39m [36mconvertToDoubleWithMedian[39m
[36mY[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0...
[36mX[39m: [32mDenseMatrix[39m[[32mDouble[39m] = 0.0013038915167285063  0.0  7.602699664463143E-5  0.0  ... (23 total)
0.0013038915167285063  0.0  7.602699664463143E-5  0.0  ...
...

In [9]:
val TRAIN_PERCENT = 0.7

val rowCount = X.rows
val trainRowCount = (TRAIN_PERCENT * rowCount).toInt
val testRowCout = rowCount - trainRowCount

val YTrain = Y(List.range(0,trainRowCount+1)).toDenseVector
val XTrain = X(List.range(0,trainRowCount+1),::).toDenseMatrix

val YTest = Y(List.range(trainRowCount+2,rowCount)).toDenseVector
val XTest = X(List.range(trainRowCount+2,rowCount),::).toDenseMatrix

println(s"Source 'X' size: ${X.rows}, ${X.cols}")

println(s"Train 'XTrain' size: ${XTrain.rows}, ${XTrain.cols}")

println(s"Test 'XTest' size: ${XTest.rows}, ${XTest.cols}")

Source 'X' size: 40949, 23
Train 'XTrain' size: 28665, 23
Test 'XTest' size: 12283, 23


[36mTRAIN_PERCENT[39m: [32mDouble[39m = [32m0.7[39m
[36mrowCount[39m: [32mInt[39m = [32m40949[39m
[36mtrainRowCount[39m: [32mInt[39m = [32m28664[39m
[36mtestRowCout[39m: [32mInt[39m = [32m12285[39m
[36mYTrain[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0...
[36mXTrain[39m: [32mDenseMatrix[39m[[32mDouble[39m] = 0.0013038915167285063  0.0  7.602699664463143E-5  0.0  ... (23 total)
0.0013038915167285063  0.0  7.602699664463143E-5  0.0  ...
...
[36mYTest[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0...
[36mXTest[39m: [32mDenseMatrix[39m[[32mDouble[39m] = 6.508502134169815E-4   0.0                   ... (23 total)
6.508502134169815E-

## Обучение модели

### Метод наименьших квадратов

In [10]:
import breeze.stats.regression.leastSquares
val result = leastSquares(XTrain, YTrain)

val ModelYTrain = result(XTrain)
val ModelYTest = result(XTest)

янв. 29, 2022 1:49:24 PM dev.ludovic.netlib.InstanceBuilder$NativeLAPACK getInstanceImpl
янв. 29, 2022 1:49:24 PM dev.ludovic.netlib.InstanceBuilder$LAPACK getInstanceImpl
янв. 29, 2022 1:49:24 PM dev.ludovic.netlib.InstanceBuilder$NativeBLAS getInstanceImpl
янв. 29, 2022 1:49:24 PM dev.ludovic.netlib.InstanceBuilder$NativeBLAS getInstanceImpl
янв. 29, 2022 1:49:24 PM dev.ludovic.netlib.InstanceBuilder$BLAS getInstanceImpl


[32mimport [39m[36mbreeze.stats.regression.leastSquares
[39m
[36mresult[39m: [32mbreeze[39m.[32mstats[39m.[32mregression[39m.[32mLeastSquaresRegressionResult[39m = [33mLeastSquaresRegressionResult[39m(
  DenseVector(-109.79208688702836, -7.558332959880783, 222.91153287262253, -2.04...
[36mModelYTrain[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(-3.660358468184705, 6.538365801064895, 5.392113557153127, -2.4867856604443386, -1.6915075961767818, -2.0890023018007198, -4.194626661264784, 1.32384...
[36mModelYTest[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(-0.6428619140203728, 1.1400451872369612, 10.274355794957991, 5.061702035587466, 0.122724708647163, 5.047312756413554, 6.627576019290208, 4.062669340...

### Средняя абсолютная ошибка (MAE)

In [11]:
import breeze.numerics.abs
var MAETrain = sum(abs(YTrain - ModelYTrain)) / YTrain.size.toDouble
var MAETest = sum(abs(YTest - ModelYTest)) / YTest.size.toDouble

[32mimport [39m[36mbreeze.numerics.abs
[39m
[36mMAETrain[39m: [32mDouble[39m = [32m7.856499088055895[39m
[36mMAETest[39m: [32mDouble[39m = [32m8.647954769284581[39m

### Средняя квадратичная ошибка (MSE)

In [12]:
import breeze.numerics.{pow, sqrt}
val MSETrain = sum(pow(YTrain - ModelYTrain, 2)) / YTrain.size.toDouble
val RMSETrain = sqrt(MSETrain)
val MSETest = sum(pow(YTest - ModelYTest, 2)) / YTest.size.toDouble
val RMSETest = sqrt(MSETest)

[32mimport [39m[36mbreeze.numerics.{pow, sqrt}
[39m
[36mMSETrain[39m: [32mDouble[39m = [32m891.6494629246345[39m
[36mRMSETrain[39m: [32mDouble[39m = [32m29.860500044785493[39m
[36mMSETest[39m: [32mDouble[39m = [32m888.9311249168065[39m
[36mRMSETest[39m: [32mDouble[39m = [32m29.814948011304775[39m

In [13]:
Seq(
    Scatter(
        List.range(0, YTest.size).toSeq,
        ModelYTest.toArray.toSeq,
        mode = ScatterMode(ScatterMode.Markers),
        name = "model"
    ),
    Scatter(
        List.range(0, YTest.size).toSeq,
        YTest.toArray.toSeq,
        mode = ScatterMode(ScatterMode.Markers),
        name = "data"
    ),
).plot(title="[TEST] model score vs data score")

[36mres12[39m: [32mString[39m = [32m"plot-e08a2c32-b73a-4f7c-af59-cfa4fd4e3357"[39m

In [14]:
Seq(
    Scatter(
        ModelYTest.toArray.toSeq,
        YTest.toArray.toSeq,
        mode = ScatterMode(ScatterMode.Markers)
    )
).plot(
    title="Currect value vs model predict"
)

[36mres13[39m: [32mString[39m = [32m"plot-c4152466-f59d-436c-8c9b-59f32ab0fb11"[39m

In [15]:
val YTestDiff = ModelYTest-YTest

Seq(
    Bar(
        List.range(0,ModelYTest.size).toSeq,
        YTestDiff.toArray.toSeq
    )
).plot(title="model/data score diff")

[36mYTestDiff[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(-0.6428619140203728, 1.1400451872369612, 9.274355794957991, 5.061702035587466, 0.122724708647163, 5.047312756413554, 6.627576019290208, 3.0626693403...
[36mres14_1[39m: [32mString[39m = [32m"plot-020bfc06-e0cc-4df8-9e88-4a0679541967"[39m

### Метод наименьших квадратов  (Destructive)

In [16]:
import breeze.stats.regression.leastSquaresDestructive
val resultDestructive = leastSquaresDestructive(XTrain, YTrain)

val ModelDestructiveYTrain = resultDestructive(XTrain)
val ModelDestructiveYTest = resultDestructive(XTest)

[32mimport [39m[36mbreeze.stats.regression.leastSquaresDestructive
[39m
[36mresultDestructive[39m: [32mbreeze[39m.[32mstats[39m.[32mregression[39m.[32mLeastSquaresRegressionResult[39m = [33mLeastSquaresRegressionResult[39m(
  DenseVector(-109.79208688702836, -7.558332959880783, 222.91153287262253, -2.04...
[36mModelDestructiveYTrain[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(-519.5077057444367, 447.33740043105104, 1586.8949898910705, -342.44675408591195, -1782.717675352714, -2690.6358747742574, -152.1860499723791, 226.77...
[36mModelDestructiveYTest[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(-0.6428619140203728, 1.1400451872369612, 10.274355794957991, 5.061702035587466, 0.122724708647163, 5.047312756413554, 6.627576019290208, 4.062669340...

#### Средняя абсолютная ошибка (MAE)

In [17]:
import breeze.numerics.abs
var MAETrain = sum(abs(YTrain - ModelDestructiveYTrain)) / YTrain.size.toDouble
val MAETest = sum(abs(YTest - ModelDestructiveYTest)) / YTest.size.toDouble

[32mimport [39m[36mbreeze.numerics.abs
[39m
[36mMAETrain[39m: [32mDouble[39m = [32m8.699852263324612[39m
[36mMAETest[39m: [32mDouble[39m = [32m8.647954769284581[39m

#### Средняя квадратичная ошибка (MSE)

In [18]:
import breeze.numerics.{pow, sqrt}
val MSETrain = sum(pow(YTrain - ModelDestructiveYTrain, 2)) / YTrain.size.toDouble
val RMSETrain = sqrt(MSETrain)
val MSETest = sum(pow(YTest - ModelDestructiveYTest, 2)) / YTest.size.toDouble
val RMSETest = sqrt(MSETest)

[32mimport [39m[36mbreeze.numerics.{pow, sqrt}
[39m
[36mMSETrain[39m: [32mDouble[39m = [32m1551.908813936521[39m
[36mRMSETrain[39m: [32mDouble[39m = [32m39.394273872436344[39m
[36mMSETest[39m: [32mDouble[39m = [32m888.9311249168065[39m
[36mRMSETest[39m: [32mDouble[39m = [32m29.814948011304775[39m

In [19]:
Seq(
    Scatter(
        List.range(0, YTest.size).toSeq,
        ModelDestructiveYTest.toArray.toSeq,
        mode = ScatterMode(ScatterMode.Markers),
        name = "model"
    ),
    Scatter(
        List.range(0, YTest.size).toSeq,
        YTest.toArray.toSeq,
        mode = ScatterMode(ScatterMode.Markers),
        name = "data"
    ),
).plot(title="[TEST] model (Destructive) score vs data score")

[36mres18[39m: [32mString[39m = [32m"plot-d27b52a0-0246-4020-bcd3-c763c806eeb4"[39m

In [20]:
Seq(
    Scatter(
        ModelDestructiveYTest.toArray.toSeq,
        YTest.toArray.toSeq,
        mode = ScatterMode(ScatterMode.Markers)
    )
).plot(
    title="Currect value vs model predict"
)

[36mres19[39m: [32mString[39m = [32m"plot-e83721bc-1f14-4bb8-b218-23574dd934fd"[39m

In [21]:
val DestructiveYTestDiff = ModelDestructiveYTest-YTest

Seq(
    Bar(
        List.range(0,ModelYTest.size).toSeq,
        DestructiveYTestDiff.toArray.toSeq
    )
).plot(title="model/data score diff")

[36mDestructiveYTestDiff[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(-0.6428619140203728, 1.1400451872369612, 9.274355794957991, 5.061702035587466, 0.122724708647163, 5.047312756413554, 6.627576019290208, 3.0626693403...
[36mres20_1[39m: [32mString[39m = [32m"plot-56fdb009-7410-4a34-81dd-d3929037f26d"[39m