### Linear regression

In [45]:
import scala.util.Random
import breeze.linalg._

/**
 * Linear regression without an intercept term, trained by an iterative
 * weight update.
 *
 * Each step applies `w <- w - lr * pinv(X) * (X * w - y)`, i.e. the residual
 * is mapped back to weight space through the pseudo-inverse of the design
 * matrix.
 */
class LinearRegression() {

  /** Learned coefficients, one per column of the training matrix; empty until `fit` runs. */
  var w: DenseVector[Double] = DenseVector.ones[Double](0)

  /**
   * Computes `X * w` for every row of `X`.
   *
   * @param X feature matrix whose column count must equal `w.length`
   * @return one prediction per row of `X`
   */
  def predict(X: DenseMatrix[Double]): DenseVector[Double] = X * w

  /**
   * Mean absolute error between targets and predictions.
   *
   * @param y       reference values
   * @param y_preds predicted values, same length as `y`
   * @return the average of `|y(i) - y_preds(i)|`
   * @throws IllegalArgumentException if the two vectors differ in length
   */
  def mae(y: DenseVector[Double], y_preds: DenseVector[Double]): Double = {
    // zip would silently drop the tail of the longer vector; fail loudly instead.
    require(y.length == y_preds.length,
      s"length mismatch: y has ${y.length} entries, y_preds has ${y_preds.length}")
    val absErrors = y.toArray.zip(y_preds.toArray).map { case (a, b) => scala.math.abs(a - b) }
    absErrors.sum / y.length.toDouble
  }

  /**
   * Mean squared error between targets and predictions.
   *
   * @param y       reference values
   * @param y_preds predicted values, same length as `y`
   * @return the average of `(y(i) - y_preds(i))^2`
   * @throws IllegalArgumentException if the two vectors differ in length
   */
  def mse(y: DenseVector[Double], y_preds: DenseVector[Double]): Double = {
    // zip would silently drop the tail of the longer vector; fail loudly instead.
    require(y.length == y_preds.length,
      s"length mismatch: y has ${y.length} entries, y_preds has ${y_preds.length}")
    val sqErrors = y.toArray.zip(y_preds.toArray).map { case (a, b) => scala.math.pow(a - b, 2) }
    sqErrors.sum / y.length.toDouble
  }

  /**
   * Fits `w` to the data, starting from an all-ones weight vector.
   *
   * Runs at most `iter` updates and stops early as soon as the MSE after an
   * update is larger than the MSE after the previous one. Progress is printed
   * every 256 iterations.
   *
   * @param X    feature matrix, one sample per row
   * @param y    target values, one per row of `X`
   * @param lr   step size applied to each update
   * @param iter maximum number of updates
   * @throws IllegalArgumentException if `X.rows` differs from `y.length`
   */
  def fit(X: DenseMatrix[Double], y: DenseVector[Double], lr: Double, iter: Int): Unit = {
    require(X.rows == y.length, s"X has ${X.rows} rows but y has ${y.length} entries")

    w = DenseVector.ones[Double](X.cols)

    // The pseudo-inverse depends only on X, so compute it once instead of per iteration.
    val xPinv = pinv(X)

    var loss = Double.MaxValue
    var lossBest = Double.MaxValue
    var i = 0
    // Explicit loop so that the stop condition really ends the iteration
    // (a `for` guard would keep walking the remaining range without doing work).
    while (i < iter && lossBest >= loss) {
      lossBest = loss
      val grad = xPinv * (predict(X) - y)
      w -= lr * grad
      val yPreds = predict(X)
      loss = mse(y, yPreds)
      // Low eight bits all set => report once every 256 iterations.
      if ((i & 0xff) == 0xff) {
        val loss_mae = mae(y, yPreds)
        println(s"iter: $i, mse: $loss, mae: $loss_mae, weight: $w")
      }
      i += 1
    }
  }
}

import scala.util.Random
import breeze.linalg._
defined class LinearRegression


### Train on synthetic dataset

In [26]:
import breeze.linalg.{DenseMatrix, DenseVector}
import scala.util.Random

// Ground-truth coefficients used to generate the targets.
val real_weight = DenseVector[Double](1.5, 0.3, -0.7)
val rand = new Random
val sampleCount = 1000

// Features are uniform in [0, 100); the additive noise is uniform in [0, 10).
val x = DenseMatrix.fill[Double](sampleCount, real_weight.length)(100 * rand.nextDouble())
val noise = DenseVector.fill[Double](sampleCount)(10 * rand.nextDouble())
val y = (x * real_weight) + noise

// Train a fresh model on the generated data.
val model = new LinearRegression
val lr = 1e-3
val iter = 10000
model.fit(X = x, y = y, lr = lr, iter = iter)

iter: 255, mse: 6801.293594484323, mae: 71.24078006431142, weight: DenseVector(1.1197905924507394, 0.8481835800506964, 0.6230973195209634)
iter: 511, mse: 4079.1068671593794, mae: 55.10137047549481, weight: DenseVector(1.2125136403042793, 0.7306711704038267, 0.33135850628688895)
iter: 767, mse: 2448.129689984001, mae: 42.619522302447066, weight: DenseVector(1.2842852496381703, 0.6397115335896733, 0.10554017292016607)
iter: 1023, mse: 1470.9422965677622, mae: 32.96340030735457, weight: DenseVector(1.339839548441132, 0.5693048799075181, -0.06925288638939403)
iter: 1279, mse: 885.468008383434, mae: 25.508706920790587, weight: DenseVector(1.3828409544863938, 0.5148071152248472, -0.20455019873461794)
iter: 1535, mse: 534.6856053004556, mae: 19.755026700916186, weight: DenseVector(1.4161258840366338, 0.472623511828276, -0.3092761117665656)
iter: 1791, mse: 324.51703462885905, mae: 15.316968613450948, weight: DenseVector(1.4418898447312753, 0.4399715965454999, -0.39033845269755496)
iter: 2047

import breeze.linalg.{DenseMatrix, DenseVector}
import scala.util.Random
create_data: (weight: breeze.linalg.DenseVector[Double], size: Int)(breeze.linalg.DenseMatrix[Double], breeze.linalg.DenseVector[Double])
real_weight: breeze.linalg.DenseVector[Double] = DenseVector(1.5, 0.3, -0.7)
rand: scala.util.Random = scala.util.Random@343ff464
x: breeze.linalg.DenseMatrix[Double] =
5.633309532292497   55.8299015529841     40.05329223856968
90.38651874313315   27.84349514902794    10.725925408446614
80.61272854978668   74.93174837484277    81.81988584687561
7.445887822248132   69.23431627347023    38.529371732600794
69.45014579103332   24.27202156522782    97.20728519366615
34.62311886714482   36.63235113315707    33.09704447004728
56.372717895375935  0.04432400202863951  27.781107179269625
2...


In [29]:
// Score the fitted model on the synthetic data it was trained on and
// compare the learned coefficients with the generating ones.
val y_preds = model.predict(x)
val weight = model.w
val mse = model.mse(y, y_preds)

Seq(
  s"\nreal_weight: $real_weight",
  s"\npredicted weight: $weight",
  s"\nMSE: $mse"
).foreach(line => println(line))


real_weight: DenseVector(1.5, 0.3, -0.7)

predicted weight: DenseVector(1.5301233884939232, 0.3281489528851135, -0.6679517316032598)

MSE: 10.388445038055304


y_preds: breeze.linalg.DenseVector[Double] = DenseVector(0.18651649711318896, 140.27493966160836, 93.28446170039427, 8.376534927403583, 49.30235640064606, 42.89128346257063, 67.71532035328912, 40.75369009338049, 73.89604138480013, 86.03649075518354, 20.243537011988785, 113.22696007703611, 21.002606973352947, 57.547727795810744, 14.939525206751377, 51.793307388003555, 135.9383766501384, 68.51575103618636, 59.213499589417054, 132.316327106871, 79.65599201074859, 64.68074503791817, 4.840224538286861, 73.560994335234, -11.851651009728798, 67.21437207066495, 29.40061603871476, 42.15110288053914, 124.25405635029095, 82.14487234599014, 26.872165042051407, 162.72605667257884, 66.62539435233153, 67.0796292658882, 81.75784315902025, 93.0584886595591, 7.107898548397593, 19.042127764740854, -24.396...


### Train and predict on Insurance.csv

In [35]:
/** Reads all non-blank lines of `path`, closing the file even when reading fails. */
def readLines(path: String): Array[String] = {
  val source = scala.io.Source.fromFile(path)
  try source.getLines().map(_.trim).filter(_.nonEmpty).toArray
  finally source.close()
}

/** Parses a whitespace-separated numeric table, one row per line. */
def readRows(path: String): Array[Array[Double]] =
  readLines(path).map(_.split("\\s+").map(_.toDouble))

/** Parses a file holding one number per line. */
def readColumn(path: String): Array[Double] =
  readLines(path).map(_.toDouble)

// Training split: feature rows and the matching targets.
val dataTrain = readRows("X_insurance_train.csv")
val X_train = DenseMatrix(dataTrain:_*)

val data_vectorTrain = readColumn("Y_insurance_train.csv")
val y_train = DenseVector(data_vectorTrain)

// Test split, loaded the same way.
val dataTest = readRows("X_insurance_test.csv")
val X_test = DenseMatrix(dataTest:_*)

val data_vectorTest = readColumn("Y_insurance_test.csv")
val y_test = DenseVector(data_vectorTest)

source: scala.io.BufferedSource = <iterator>
dataTrain: Array[Array[Double]] = Array(Array(46.0, 1.0, 19.95, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0), Array(47.0, 1.0, 24.32, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0), Array(52.0, 1.0, 24.86, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0), Array(39.0, 1.0, 34.32, 5.0, 0.0, 0.0, 0.0, 1.0, 0.0), Array(54.0, 1.0, 21.47, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0), Array(63.0, 0.0, 41.47, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0), Array(22.0, 1.0, 24.3, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0), Array(18.0, 0.0, 21.565, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0), Array(40.0, 0.0, 41.23, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0), Array(37.0, 0.0, 34.2, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0), Array(34.0, 0.0, 22.42, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0), Array(50.0, 0.0, 37.07, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0), Array(49.0, 1.0, 29.925, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0...


In [36]:
// Fresh model plus the step size and iteration budget for the insurance run.
val model = new LinearRegression
val lr: Double = 3.0e-4
val epochs: Int = 50000

model: LinearRegression = LinearRegression@7801ce99
lr: Double = 3.0E-4
epochs: Int = 50000


In [37]:
// Train on the insurance training split with the hyper-parameters chosen above.
model.fit(X = X_train, y = y_train, lr = lr, iter = epochs)

iter: 255, mse: 2.800678079716961E8, mae: 12291.996450393392, weight: DenseVector(19.925722598429974, 2.300652705873521, 25.84921131567277, 32.3693158797116, 1749.5869467586308, -882.5906058979458, -909.9968637440714, -931.2302093202591, -942.4636240160715)
iter: 511, mse: 2.4549333714254475E8, mae: 11383.763497297825, weight: DenseVector(37.45216015576465, 3.5051408470639775, 48.86118077723944, 61.41932192955621, 3368.8910328534976, -1700.8523922036647, -1753.6386092202893, -1794.5353997951484, -1816.1716797024656)
iter: 767, mse: 2.1584243569865286E8, mae: 10543.759473748538, weight: DenseVector(53.68276967957925, 4.620574410317953, 70.1717460009365, 88.32149776061442, 4868.47086678982, -2458.615485721162, -2534.905178882273, -2594.0115854513974, -2625.2815882772766)
iter: 1023, mse: 1.9041396389991483E8, mae: 9767.20779704711, weight: DenseVector(68.71335901941318, 5.653537701101183, 89.90670136457571, 113.23464453254226, 6257.1783354231975, -3160.352894832762, -3258.4083200489818, 

iter: 8447, mse: 3.9057052687171556E7, mae: 4072.2504812422167, weight: DenseVector(236.68217188096, 17.19703834232217, 310.4474233122585, 391.64233656070326, 21776.16695552539, -11002.360741899369, -11343.651085700569, -11608.070104613496, -11747.959905881573)
iter: 8703, mse: 3.880366176987184E7, mae: 4078.3288310125713, weight: DenseVector(238.18258689334064, 17.300152970344012, 312.4174474162227, 394.12926894752366, 21914.793424962656, -11072.41104528899, -11415.87413364415, -11681.976513934685, -11822.75689065964)
iter: 8959, mse: 3.8586354926543444E7, mae: 4084.18003047841, weight: DenseVector(239.5720677909819, 17.395643754374778, 314.24181656593436, 396.43232844552887, 22043.17046038362, -11137.282136041003, -11482.757325733646, -11750.418607034055, -11892.023713967012)
iter: 9215, mse: 3.839999361312079E7, mae: 4089.9775092083387, weight: DenseVector(240.8588165564763, 17.484074368049445, 315.9312998507944, 398.56510981082135, 22162.055860050645, -11197.356942454795, -11544.69

iter: 16639, mse: 3.729072236031199E7, mae: 4192.120086799926, weight: DenseVector(255.23840325826663, 18.472298107232838, 334.8114977785843, 422.3992220948905, 23490.615838448488, -11868.70080583001, -12236.86234882233, -12522.100169837895, -12673.004073271315)
iter: 16895, mse: 3.72888652977039E7, mae: 4193.275954581575, weight: DenseVector(255.36685179220376, 18.481125613414505, 334.9801489223593, 422.6121250693685, 23502.48346614957, -11874.697719149312, -12243.045267913552, -12528.427199258098, -12679.407343672076)
iter: 17151, mse: 3.728727268965974E7, mae: 4194.349326700694, weight: DenseVector(255.4858034040996, 18.489300452450415, 335.1363307391944, 422.80928693035173, 23513.473653434354, -11880.251246962098, -12248.771049046125, -12534.286435833064, -12685.337184303418)
iter: 17407, mse: 3.7285906876589425E7, mae: 4195.345557963, weight: DenseVector(255.59596025477688, 18.496870879690842, 335.28096515651265, 422.99187150683224, 23523.651274404372, -11885.394171250868, -12254.

iter: 24831, mse: 3.7277777200167984E7, mae: 4206.801342312513, weight: DenseVector(256.8269775501411, 18.581471399848294, 336.89727379396044, 425.0322781645337, 23637.387527417297, -11942.867026631875, -12313.328977253277, -12600.349068231732, -12752.195874304802)
iter: 25087, mse: 3.727776359004507E7, mae: 4206.906680165664, weight: DenseVector(256.83797385831895, 18.582227110913575, 336.9117117941042, 425.05050450505763, 23638.403499243035, -11943.380414406367, -12313.85828873454, -12600.890716806623, -12752.74404976866)
iter: 25343, mse: 3.727775191806852E7, mae: 4207.004229799284, weight: DenseVector(256.8481571476292, 18.582926948012435, 336.92508231175117, 425.06738326978814, 23639.344354561617, -11943.85584453726, -12314.348465243069, -12601.392318258117, -12753.251695539535)
iter: 25599, mse: 3.727774190823692E7, mae: 4207.094928482839, weight: DenseVector(256.8575875291277, 18.58357504222122, 336.9374642718808, 425.0830140925774, 23640.21564715849, -11944.296123446484, -12314

iter: 33023, mse: 3.727768232710339E7, mae: 4208.111716012818, weight: DenseVector(256.9629732827602, 18.59081758024886, 337.07583430494134, 425.2576905931821, 23649.952456530922, -11949.216298167446, -12319.87518362689, -12607.047852366759, -12758.975379080894)
iter: 33279, mse: 3.7277682227356896E7, mae: 4208.120798670763, weight: DenseVector(256.96391466207984, 18.590882275668207, 337.077070322934, 425.2592509259564, 23650.03943252786, -11949.26024861209, -12319.920497278914, -12607.09422218096, -12759.022307653384)
iter: 33535, mse: 3.7277682141814895E7, mae: 4208.129209796778, weight: DenseVector(256.9647864399357, 18.590942187791587, 337.07821495516947, 425.26069589455955, 23650.11997790052, -11949.300949553359, -12319.962460637964, -12607.137163614174, -12759.065766532733)
iter: 33791, mse: 3.727768206845434E7, mae: 4208.136999040968, weight: DenseVector(256.9655937623553, 18.59099767027496, 337.0792749583107, 425.26203402851235, 23650.19456810111, -11949.338641245287, -12320.00

iter: 41215, mse: 3.727768163179345E7, mae: 4208.2240448692655, weight: DenseVector(256.9746156965803, 18.59161769432424, 337.09112063283453, 425.2769878514417, 23651.02812340337, -11949.759850860299, -12320.435595649249, -12607.621326368377, -12759.555763469352)
iter: 41471, mse: 3.727768163106242E7, mae: 4208.224822423531, weight: DenseVector(256.97469628681193, 18.591623232813074, 337.0912264466914, 425.27712142945563, 23651.035569302807, -11949.763613399426, -12320.439474890745, -12607.625296026505, -12759.559780962038)
iter: 41727, mse: 3.7277681630435474E7, mae: 4208.225542488841, weight: DenseVector(256.97477091855467, 18.59162836181029, 337.0913244371349, 425.2772451312959, 23651.042464685048, -11949.76709775288, -12320.443067318103, -12607.628972185506, -12759.563501418892)
iter: 41983, mse: 3.727768162989784E7, mae: 4208.226209315685, weight: DenseVector(256.97484003235365, 18.5916331115919, 337.0914151825944, 425.2773596871641, 23651.04885025293, -11949.770324488492, -12320.

In [42]:
// Predict on both splits. The training predictions must come from X_train;
// no visible cell defines a bare `X`.
val y_pred_train = model.predict(X_train)
val y_pred_test = model.predict(X_test)

// Metrics are called as (targets, predictions), matching the method signatures.
val mse_train = model.mse(y_train, y_pred_train)
val mse_test = model.mse(y_test, y_pred_test)
val mae_train = model.mae(y_train, y_pred_train)
val mae_test = model.mae(y_test, y_pred_test)

println(s"\nMSE on train: $mse_train")
println(s"\nMSE on test: $mse_test")
println(s"\nMAE on train: $mae_train")
println(s"\nMAE on test: $mae_test")


MSE on train: 3.727768162688414E7

MSE on test: 3.3596916398922496E7

MAE on train: 4208.232364660341

MAE on test: 4181.192594980418


y_pred_train: breeze.linalg.DenseVector[Double] = DenseVector(7094.533840317377, 8344.722622457019, 9153.76613735711, 11128.369720726445, 10087.996305820692, 17561.007038089905, 1103.7951866022377, 23596.26072085609, 12652.811012026708, 33163.23383471584, 5195.5312852686475, 13162.398328638741, 10377.398662118681, 17773.060463656282, 9884.645446737806, 11384.352799636188, 32252.06433096952, 3313.145709713972, 13660.446370474136, 39347.181851844995, 5503.657977174022, 6112.821354135427, 3454.334881777704, 11360.295800700515, 25561.28550140542, 14095.048010334362, 9277.33746093285, 10028.88799949866, 13479.021859007884, 1040.0603780027504, 2519.8260836643585, 30745.109416184532, 30011.116683321994, 13579.037721387158, 3742.857504274536, 25412.954531406478, 13416.086144440542, 30729.596076...


In [43]:
// Bare expression: the notebook echoes the test-split predictions for inspection.
y_pred_test

res19: breeze.linalg.DenseVector[Double] = DenseVector(8969.542376874415, 7068.741212034653, 36858.378177765495, 9454.670185673836, 26973.149483479425, 10864.103565615434, 170.2807283673974, 16903.435332911118, 1092.4300028708167, 11218.33330429887, 28101.65960452978, 9377.72634349369, 5263.054916216428, 38416.00813998628, 40255.78765915444, 37098.220618695086, 15240.38048312438, 35912.85077447107, 9112.515972984682, 31461.89313959843, 3847.685128182886, 10130.111080608607, 2370.539817920331, 7140.209207419106, 11301.757875146344, 12961.64220221925, 14509.459681530225, 6159.892217428784, 9963.849793181216, 2177.855303103297, 9115.928723441542, 13073.677766570177, 4561.819769995949, 3408.2046093194767, 4459.809679971124, 13032.053524646213, 1979.99185667866, 8813.27525777867, 33271.26170...


In [44]:
// Bare expression: the notebook echoes the test-split targets for comparison with the predictions.
y_test

res20: breeze.linalg.DenseVector[Double] = DenseVector(9095.068, 5272.176, 29330.983, 9301.894, 33750.292, 4536.259, 2117.339, 14210.536, 3732.625, 10264.442, 18259.216, 7256.723, 3947.413, 46151.124, 48673.559, 44202.654, 9800.888, 42969.853, 8233.097, 21774.322, 5080.096, 7441.501, 1256.299, 2755.021, 11085.587, 10923.933, 12644.589, 18804.752, 9715.841, 1131.507, 15828.822, 11842.624, 2020.552, 5693.431, 2904.088, 7448.404, 2597.779, 7337.748, 23887.663, 38709.176, 4687.797, 2643.269, 11674.13, 12124.992, 4889.999, 12333.828, 3579.829, 4391.652, 42124.515, 4463.205, 13887.204, 1719.436, 28476.735, 1708.926, 10594.226, 25333.333, 3645.089, 38746.355, 11848.141, 10564.885, 13880.949, 4753.637, 27941.288, 8017.061, 23045.566, 4133.642, 17942.106, 25992.821, 3594.171, 1682.597, 6079.672,...
