## _Анализ данных [Facebook Comments](https://www.kaggle.com/kiranraje/prediction-facebook-comment) и построение модели линейной регрессии_

In [1]:
import $ivy.`org.scalanlp::breeze:1.0`

[32mimport [39m[36m$ivy.$                         [39m

In [2]:
import $ivy.`com.github.tototoshi::scala-csv:1.3.6`

[32mimport [39m[36m$ivy.$                                      [39m

In [3]:
import breeze.linalg._
import breeze.numerics._
import breeze.stats.regression.{lasso, leastSquares}
import com.github.tototoshi.csv._
import java.io.File
import scala.collection.mutable.ListBuffer

[32mimport [39m[36mbreeze.linalg._
[39m
[32mimport [39m[36mbreeze.numerics._
[39m
[32mimport [39m[36mbreeze.stats.regression.{lasso, leastSquares}
[39m
[32mimport [39m[36mcom.github.tototoshi.csv._
[39m
[32mimport [39m[36mjava.io.File
[39m
[32mimport [39m[36mscala.collection.mutable.ListBuffer[39m

### _Загрузка данных_

In [4]:
val dataFilePath = "./data/Dataset.csv"
val reader = CSVReader.open(new File(dataFilePath))
val readerAll = reader.all()  // прочитать csv-файл
val readerWoDubl = readerAll.distinct  // удалить строки-дубли

[36mdataFilePath[39m: [32mString[39m = [32m"./data/Dataset.csv"[39m
[36mreader[39m: [32mCSVReader[39m = com.github.tototoshi.csv.CSVReader@64045cb6
[36mreaderAll[39m: [32mList[39m[[32mList[39m[[32mString[39m]] = [33mList[39m(
  [33mList[39m(
    [32m"likes"[39m,
    [32m"Checkins"[39m,
    [32m"Returns"[39m,
    [32m"Category"[39m,
    [32m"commBase"[39m,
    [32m"comm24"[39m,
    [32m"comm48"[39m,
    [32m"comm24_1"[39m,
    [32m"diff2448"[39m,
    [32m"baseTime"[39m,
    [32m"length"[39m,
    [32m"shares"[39m,
    [32m"hrs"[39m,
    [32m"sun_pub"[39m,
    [32m"mon_pub"[39m,
    [32m"tue_pub"[39m,
    [32m"wed_pub"[39m,
    [32m"thu_pub"[39m,
    [32m"fri_pub"[39m,
    [32m"sat_pub"[39m,
    [32m"sun_base"[39m,
    [32m"mon_base"[39m,
    [32m"tue_base"[39m,
    [32m"wed_base"[39m,
    [32m"thu_base"[39m,
    [32m"fri_base"[39m,
    [32m"sat_base"[39m,
    [32m"output"[39m
  ),
  [33mList[39m(
    [32m"63

### _Полезные функции_

In [5]:
def matrixShape(m: DenseMatrix[Double]): (Int, Int) = {
  // Возвращает форму массива
  (m.rows, m.cols)
}

defined [32mfunction[39m [36mmatrixShape[39m

In [6]:
def row2denseVector(row: List[String]): DenseVector[Double] = {
  // Забивает пустые строки Double.NaN и
  // преобразует строку из csv-файла в полносвязный вектор
  val arr = row.map(
    elem => if (elem == "") Double.NaN.toString // обязательно Double.NaN.toString!!!
    else elem
  ).map(_.toDouble).toArray
  DenseVector(arr)
}

defined [32mfunction[39m [36mrow2denseVector[39m

In [7]:
def denseMatrixWoNanRows(input: DenseMatrix[Double]): DenseMatrix[Double] = {
  // Удаляет из полносвязной матрицы строки, в которых есть хотя бы один NaN
  val lstDenseVectorsWoNaN = ListBuffer[DenseVector[Double]]()
  for (idx <- 0 until input.rows) {
    val densVecRow = input(idx, ::).t
    val sumRow = densVecRow.map(_.isNaN).toArray.collect{ case true => 1; case false => 0}.sum
    if (sumRow == 0) {
      lstDenseVectorsWoNaN += densVecRow
    }
  }
  DenseMatrix(lstDenseVectorsWoNaN.toList: _*)
}

defined [32mfunction[39m [36mdenseMatrixWoNanRows[39m

In [8]:
def nanColCounts(m: DenseMatrix[Double], colNames: DenseVector[String]): DenseMatrix[String] = {
  // Вычисляет число NaN в каждом столбце
  val lstNanColsCounts = ListBuffer[Int]()
  val indexes = DenseVector(0 until m.cols: _*).map(_.toString)
  for (idx <- 0 until m.cols) {
    val densVecCol = m(::, idx)
    val sumCol = densVecCol.map(_.isNaN).toArray.collect {
      case true => 1; case false => 0
    }.sum
    lstNanColsCounts += sumCol
  }
  DenseMatrix(
    indexes,
    colNames,
    DenseVector(lstNanColsCounts.toArray.map(_.toString))
  ).t
}

defined [32mfunction[39m [36mnanColCounts[39m

In [9]:
def readCSV2DenseMatrix(input: List[List[String]], skip: Int = 0): DenseMatrix[Double] = {
  // читает csv-файл в полносвязную вещественную матрицу
  val lstDenseVectors = ListBuffer[DenseVector[Double]]()
  for (idx <- skip until input.size) {
    lstDenseVectors += row2denseVector(input(idx))
  }
  DenseMatrix( // DenseMatrix строится на базе объектов НЕИЗМЕНЯЕМОГО типа данных!
    lstDenseVectors.toList: _* // распаковать список полносвязных векторов
  )
}

defined [32mfunction[39m [36mreadCSV2DenseMatrix[39m

In [10]:
def removeColumns(input: DenseMatrix[Double], lstIdxRemoveCols: List[Int]): DenseMatrix[Double] = {
  val restCols = Set(0 until input.cols: _*).diff(lstIdxRemoveCols.toSet).toList
  input(::, restCols).toDenseMatrix
}

defined [32mfunction[39m [36mremoveColumns[39m

In [11]:
def trainTestSplit(input: DenseMatrix[Double],
                   testSize: Double = 0.20): (DenseMatrix[Double], DenseMatrix[Double],
                                              DenseVector[Double], DenseVector[Double]) = {
  val shuffleIdx = scala.util.Random.shuffle(List(0 until input.rows: _*))
  val inputShaked = input(shuffleIdx, ::).toDenseMatrix  // перемешанный набор данных
  val nRowsTrain = ((1 - testSize)*input.rows).toInt
  val xtrain = inputShaked(0 until nRowsTrain, 0 until inputShaked.cols).toDenseMatrix
  val xtest = inputShaked(nRowsTrain to -1, 0 until inputShaked.cols).toDenseMatrix
  val ytrain = inputShaked(0 until nRowsTrain, -1).toDenseVector
  val ytest = inputShaked(nRowsTrain to -1, -1).toDenseVector
  (xtrain, xtest, ytrain, ytest)
}

defined [32mfunction[39m [36mtrainTestSplit[39m

In [12]:
def rootMeanSquaredError(ytrue: DenseVector[Double], ypred: DenseVector[Double]): Double = {
  sqrt(sum(pow(ytrue - ypred, 2))/ytrue.size)
}

defined [32mfunction[39m [36mrootMeanSquaredError[39m

### _Чтение CSV-файла в полносвязную вещественную матрицу_

In [13]:
val data = readCSV2DenseMatrix(readerWoDubl, skip=1)
val colNames = DenseVector(readerWoDubl(0).toArray) // имена столбцов

[36mdata[39m: [32mDenseMatrix[39m[[32mDouble[39m] = 634995.0  0.0  463.0  1.0  0.0   0.0   0.0   0.0   0.0    65.0  ... (28 total)
634995.0  0.0  463.0  1.0  0.0   0.0   0.0   0.0   0.0    10.0  ...
634995.0  0.0  463.0  1.0  0.0   0.0   0.0   0.0   0.0    14.0  ...
634995.0  0.0  463.0  1.0  7.0   0.0   3.0   7.0   -3.0   62.0  ...
634995.0  0.0  463.0  1.0  1.0   0.0   0.0   1.0   0.0    58.0  ...
634995.0  0.0  463.0  1.0  0.0   0.0   NaN   0.0   0.0    60.0  ...
634995.0  0.0  463.0  1.0  0.0   0.0   NaN   0.0   0.0    68.0  ...
634995.0  0.0  463.0  1.0  1.0   0.0   1.0   1.0   -1.0   32.0  ...
634995.0  0.0  463.0  1.0  0.0   0.0   NaN   0.0   0.0    35.0  ...
634995.0  0.0  463.0  1.0  0.0   0.0   NaN   0.0   0.0    48.0  ...
634995.0  0.0  463.0  1.0  0.0   0.0   NaN   0.0   0.0    52.0  ...
634995.0  0.0  463.0  1.0  1.0   0.0   NaN   1.0   0.0    69.0  ...
634995.0  0.0  463.0  1.0  0.0   0.0   NaN   0.0   0.0    3.0   ...
634995.0  0.0  463.0  1.0  1.0   1.0   0.0   1.

In [14]:
val (rowsBeforeRemoveNan, colsBeforeRemoveNan) = matrixShape(data)
println(s"--> Shape of DenseMatrix (before remove NaN): (${rowsBeforeRemoveNan}, ${colsBeforeRemoveNan})")

--> Shape of DenseMatrix (before remove NaN): (40941, 28)


[36mrowsBeforeRemoveNan[39m: [32mInt[39m = [32m40941[39m
[36mcolsBeforeRemoveNan[39m: [32mInt[39m = [32m28[39m

In [15]:
nanColCounts(data, colNames)

[36mres14[39m: [32mDenseMatrix[39m[[32mString[39m] = 0   likes     0     
1   Checkins  0     
2   Returns   51    
3   Category  57    
4   commBase  60    
5   comm24    0     
6   comm48    48    
7   comm24_1  0     
8   diff2448  0     
9   baseTime  0     
10  length    0     
11  shares    2448  
12  hrs       0     
13  sun_pub   0     
14  mon_pub   1927  
15  tue_pub   0     
16  wed_pub   0     
17  thu_pub   3045  
18  fri_pub   0     
19  sat_pub   0     
20  sun_base  0     
21  mon_base  1969  
22  tue_base  0     
23  wed_base  0     
24  thu_base  0     
25  fri_base  0     
26  sat_base  0     
27  output    0     

In [16]:
// удалить следующие столбцы: shares, mon_pub, thu_pub, mon_base
val lstIdxRemoveCols = List(11, 14, 17, 21)
val dataWoNanCols = removeColumns(data, lstIdxRemoveCols)
println(matrixShape(dataWoNanCols))
val dataWoNanRows = denseMatrixWoNanRows(dataWoNanCols)

(40941,24)


[36mlstIdxRemoveCols[39m: [32mList[39m[[32mInt[39m] = [33mList[39m([32m11[39m, [32m14[39m, [32m17[39m, [32m21[39m)
[36mdataWoNanCols[39m: [32mDenseMatrix[39m[[32mDouble[39m] = 634995.0  0.0   166.0  0.0  0.0  0.0  0.0  0.0   65.0  0.0  ... (24 total)
634995.0  0.0   132.0  0.0  1.0  0.0  0.0  0.0   10.0  0.0  ...
634995.0  0.0   133.0  0.0  0.0  0.0  0.0  0.0   14.0  0.0  ...
634995.0  0.0   131.0  0.0  0.0  0.0  0.0  3.0   62.0  0.0  ...
634995.0  0.0   142.0  0.0  0.0  0.0  0.0  0.0   58.0  0.0  ...
634995.0  0.0   166.0  0.0  1.0  0.0  0.0  NaN   60.0  0.0  ...
634995.0  0.0   145.0  0.0  0.0  0.0  0.0  NaN   68.0  0.0  ...
634995.0  0.0   157.0  0.0  0.0  0.0  0.0  1.0   32.0  0.0  ...
634995.0  0.0   177.0  0.0  0.0  0.0  0.0  NaN   35.0  0.0  ...
634995.0  0.0   126.0  0.0  0.0  1.0  0.0  NaN   48.0  0.0  ...
634995.0  0.0   188.0  0.0  0.0  1.0  0.0  NaN   52.0  0.0  ...
634995.0  0.0   172.0  1.0  0.0  0.0  0.0  NaN   69.0  0.0  ...
634995.0  0.0   157.0  

In [17]:
val (rowsAfterRemoveNan, colsAfterRemoveNan) = matrixShape(dataWoNanRows)
println(s"--> Shape of DenseMatrix (after remove NaN): (${rowsAfterRemoveNan}, ${colsAfterRemoveNan})")

--> Shape of DenseMatrix (after remove NaN): (40726, 24)


[36mrowsAfterRemoveNan[39m: [32mInt[39m = [32m40726[39m
[36mcolsAfterRemoveNan[39m: [32mInt[39m = [32m24[39m

In [18]:
// Подготовленный набор данных
val x = dataWoNanRows(::, 0 until dataWoNanRows.cols-1)

val (
  xtrain: DenseMatrix[Double],
  xtest: DenseMatrix[Double],
  ytrain: DenseVector[Double],
  ytest: DenseVector[Double]) = trainTestSplit(x, testSize = 0.15)

matrixShape(xtrain)
matrixShape(xtest)
ytrain.size
ytest.size

[36mx[39m: [32mDenseMatrix[39m[[32mDouble[39m] = 634995.0  0.0   166.0  0.0  0.0  0.0  0.0  0.0   65.0  0.0  ... (23 total)
634995.0  0.0   132.0  0.0  1.0  0.0  0.0  0.0   10.0  0.0  ...
634995.0  0.0   133.0  0.0  0.0  0.0  0.0  0.0   14.0  0.0  ...
634995.0  0.0   131.0  0.0  0.0  0.0  0.0  3.0   62.0  0.0  ...
634995.0  0.0   142.0  0.0  0.0  0.0  0.0  0.0   58.0  0.0  ...
634995.0  0.0   157.0  0.0  0.0  0.0  0.0  1.0   32.0  0.0  ...
634995.0  1.0   126.0  0.0  1.0  0.0  0.0  0.0   37.0  0.0  ...
634995.0  0.0   103.0  0.0  0.0  0.0  0.0  0.0   23.0  0.0  ...
634995.0  0.0   158.0  1.0  0.0  0.0  0.0  3.0   40.0  0.0  ...
634995.0  0.0   151.0  0.0  1.0  0.0  0.0  3.0   54.0  0.0  ...
634995.0  0.0   133.0  0.0  1.0  0.0  0.0  0.0   29.0  0.0  ...
634995.0  0.0   137.0  0.0  0.0  0.0  0.0  1.0   36.0  0.0  ...
634995.0  0.0   106.0  0.0  0.0  0.0  0.0  0.0   11.0  0.0  ...
634995.0  0.0   149.0  1.0  0.0  0.0  0.0  0.0   62.0  0.0  ...
634995.0  46.0  135.0  0.0  0.0  0.0  

### _Модель линейной регрессии_

In [19]:
// Linear Regression
val lr = leastSquares(xtrain, ytrain)
val ls = lasso(xtrain, ytrain, 0.05)

Dec 08, 2020 8:30:15 PM com.github.fommil.netlib.LAPACK <clinit>
Dec 08, 2020 8:30:15 PM com.github.fommil.netlib.LAPACK <clinit>


[36mlr[39m: [32mbreeze[39m.[32mstats[39m.[32mregression[39m.[32mLeastSquaresRegressionResult[39m = [33mLeastSquaresRegressionResult[39m(
  DenseVector(2.307145089606019E-22, -0.016858291008173536, 5.6349405265055E-19, 6.454199984230192E-16, 1.1032784490437055E-15, 1.23815893960071E-15, 6.934048553787697E-20, 0.016858291008173893, -3.4155005225093958E-18, 1.2620614516019079E-15, -9.457980829551907E-21, -5.4091405159351225E-16, -5.1712659938513194E-17, 1.217320680717843E-16, -3.4682231173670154E-15, -4.846513779296854E-17, -1.8413128300353244E-16, 1.3149068108704379E-15, 6.333554515341546E-16, 3.232012315941283E-16, 0.016858291008173938, -1.9615469311016516E-16, 1.000000000000003),
  [32m3.116283455449835E-23[39m
)
[36mls[39m: [32mbreeze[39m.[32mstats[39m.[32mregression[39m.[32mLassoResult[39m = [33mLassoResult[39m(
  DenseVector(0.0, 0.0, 0.0, 2.8056594680250018, 2.3542718229538986, 1.9720393383555848, 0.0, 0.09175705068609953, 0.0, 1.4250903750596768, 0.0, 2.1

In [20]:
val lrcoefs = lr.coefficients // веса Linear Regression
lrcoefs.foreach(
  println
)

2.307145089606019E-22
-0.016858291008173536
5.6349405265055E-19
6.454199984230192E-16
1.1032784490437055E-15
1.23815893960071E-15
6.934048553787697E-20
0.016858291008173893
-3.4155005225093958E-18
1.2620614516019079E-15
-9.457980829551907E-21
-5.4091405159351225E-16
-5.1712659938513194E-17
1.217320680717843E-16
-3.4682231173670154E-15
-4.846513779296854E-17
-1.8413128300353244E-16
1.3149068108704379E-15
6.333554515341546E-16
3.232012315941283E-16
0.016858291008173938
-1.9615469311016516E-16
1.000000000000003


[36mlrcoefs[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(2.307145089606019E-22, -0.016858291008173536, 5.6349405265055E-19, 6.454199984230192E-16, 1.1032784490437055E-15, 1.23815893960071E-15, 6.934048553787697E-20, 0.016858291008173893, -3.4155005225093958E-18, 1.2620614516019079E-15, -9.457980829551907E-21, -5.4091405159351225E-16, -5.1712659938513194E-17, 1.217320680717843E-16, -3.4682231173670154E-15, -4.846513779296854E-17, -1.8413128300353244E-16, 1.3149068108704379E-15, 6.333554515341546E-16, 3.232012315941283E-16, 0.016858291008173938, -1.9615469311016516E-16, 1.000000000000003)

In [21]:
matrixShape(xtrain)
lrcoefs.size

[36mres20_0[39m: ([32mInt[39m, [32mInt[39m) = ([32m34617[39m, [32m23[39m)
[36mres20_1[39m: [32mInt[39m = [32m23[39m

In [22]:
val ypredTrain = xtrain*lrcoefs // прогноз на обучающем наборе данных
val ypredTest = xtest*lrcoefs // прогноз на тестовом наборе данных

Dec 08, 2020 8:30:47 PM com.github.fommil.netlib.BLAS <clinit>
Dec 08, 2020 8:30:47 PM com.github.fommil.netlib.BLAS <clinit>


[36mypredTrain[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(26.000000000000004, 2188.000000000001, 27.99999999999999, 10.000000000000005, 3.000000000000002, 9.000000000000004, 1.000000000000001, 12.000000000000016, 2.584535304468965E-15, 130.99999999999994, 5.000000000000004, 4.000000000000001, 36.99999999999999, 757.0, 12.999999999999998, 2.000000000000002, 68.99999999999999, 157.99999999999997, 2.4139837203170128E-15, 4.000000000000001, 53.999999999999986, 1.0000000000000047, 12.000000000000005, 41.0, 189.0, 4.000000000000001, 36.99999999999999, 62.0, 4.321248223573401E-15, 2.000000000000001, 2.0000000000000053, 2.1613845904207764E-15, 8.000000000000002, 4.252455705409971E-15, 5.000000000000005, 8.000000000000004, 17.00000000000001, 4.000000000000002, 4.000000000000001, 32.99999999999999, 13.000000000000004, 3.000000000000004, 4.058459727136249E-15, 221.99999999999994, 2.0000000000000018, 17.000000000000004, 3.0000000000000053, 16.000000000000004, 1.0000000000000002, 

In [24]:
// Решение с использованием псевдообратной матрицы
val coefsFromPinv = pinv(xtrain) * ytrain

: 

### _Результаты_

In [None]:
val ypredPinvTrain = xtrain*coefsFromPinv // прогноз на обучающем наборе данных
val ypredPinvTest = xtest*coefsFromPinv // прогноз на тестовом наборе данных

In [23]:
val rmseTrain = rootMeanSquaredError(ytrain, ypredTrain)
val rmseTest = rootMeanSquaredError(ytest, ypredTest)

[36mrmseTrain[39m: [32mDouble[39m = [32m3.142859829070192E-14[39m
[36mrmseTest[39m: [32mDouble[39m = [32m3.0809916860819736E-14[39m

In [None]:
val rmsePinvTrain = rootMeanSquaredError(ytrain, ypredPinvTrain)
val rmsePinvTest = rootMeanSquaredError(ytest, ypredPinvTest)