In [0]:
# Data source: the sample set used in the Spark ML linear-regression guide
# http://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression
source_query = 'select * from sample_linear_regression_data'
data = sqlContext.sql(source_query)

In [0]:
# Peek at the first five rows (label + feature vector) before modelling.
sample_rows = data.take(5)
display(sample_rows)

label,features
-9.490009878824548,"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(0.4551273600657362, 0.36644694351969087, -0.38256108933468047, -0.4458430198517267, 0.33109790358914726, 0.8067445293443565, -0.2624341731773887, -0.44850386111659524, -0.07269284838169332, 0.5658035575800715))"
0.2577820163584905,"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(0.8386555657374337, -0.1270180511534269, 0.499812362510895, -0.22686625128130267, -0.6452430441812433, 0.18869982177936828, -0.5804648622673358, 0.651931743775642, -0.6555641246242951, 0.17485476357259122))"
-4.438869807456516,"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(0.5025608135349202, 0.14208069682973434, 0.16004976900412138, 0.505019897181302, -0.9371635223468384, -0.2841601610457427, 0.6355938616712786, -0.1646249064941625, 0.9480713629917628, 0.42681251564645817))"
-19.782762789614537,"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.0388509668871313, -0.4166870051763918, 0.8997202693189332, 0.6409836467726933, 0.273289095712564, -0.26175701211620517, -0.2794902492677298, -0.1306778297187794, -0.08536581111046115, -0.05462315824828923))"
-7.966593841555266,"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.06195495876886281, 0.6546448480299902, -0.6979368909424835, 0.6677324708883314, -0.07938725467767771, -0.43885601665437957, -0.608071585153688, -0.6414531182501653, 0.7313735926547045, -0.026818676347611925))"


In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# 70/30 train/test split. The seed is fixed so that the split -- and every
# metric computed from it in later cells -- is reproducible when the notebook
# is re-run from a fresh kernel. Without it, randomSplit draws a new random
# partition on every execution.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Summary statistics of the training split.
train_stats = train_data.describe()
train_stats.show()

In [0]:
# Summary statistics of the held-out test split.
test_stats = test_data.describe()
test_stats.show()

In [0]:
# Configure the estimator with the input/output column names spelled out.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

# Fit on the training split only; the test split is kept for evaluation.
lrModel = lr.fit(train_data)

In [0]:
# Evaluate the fitted model on the held-out test split.
# `results` is the evaluation summary object whose residuals and
# rootMeanSquaredError are inspected in the following cells.
results = lrModel.evaluate(test_data)

In [0]:
# Show the first five residuals from the test-set evaluation.
first_residuals = results.residuals.head(5)
display(first_residuals)

residuals
-28.767804744874702
-18.45379015668744
-21.21257199605114
-21.27804516097566
-19.020387721421248


In [0]:
# Root mean squared error from the evaluation on test_data.
results.rootMeanSquaredError

In [0]:
# Coefficients learned by the model fitted on train_data.
lrModel.coefficients

In [0]:
# Intercept term of the model fitted on train_data.
lrModel.intercept

In [0]:
# Summary attached to the fitted model itself.
# NOTE(review): per the Spark docs this summary is computed on the training
# set, so its metrics are not directly comparable to `results` (test set) --
# confirm against the Spark version in use.
model_summary = lrModel.summary

In [0]:
# R^2 reported by the model's own summary object.
model_summary.r2

In [0]:
# RMSE reported by the model's own summary object (compare with the
# test-set RMSE from `results.rootMeanSquaredError`).
model_summary.rootMeanSquaredError

In [0]:
# Simulate deployment on unlabeled data: keep only the feature column of the
# test split, dropping the label.
unlabeled_data = test_data.select('features')

unlabeled_preview = unlabeled_data.take(5)
display(unlabeled_preview)

features
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(0.4572552704218824, -0.576096954000229, -0.20809839485012915, 0.9140086345619809, -0.5922981637492224, -0.8969369345510854, 0.3741080343476908, -0.01854004246308416, 0.07834089512221243, 0.3838413057880994))"
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.17363144173810174, -0.3340314573781735, 0.9351424971322297, -0.6430601902397572, -0.13363305808148818, -0.42446359566938585, -0.4093070316761178, -0.9302259781839204, 0.47004365892170585, -0.6231289889808045))"
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.4785033857256795, 0.520350718059089, -0.2988515012130126, -0.46260150057299754, 0.5394344995663083, 0.39320468081626836, 0.1890560923345248, 0.13123799325264507, 0.43613839380760355, 0.39541998419731494))"
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(0.5609065808412279, -0.9201904391147984, 0.908305865183735, 0.9255146658282842, 0.6871419344095282, 0.4201876217923466, -0.42906289792612684, 0.5787691868233418, 0.7260522064761288, 0.28251641556690554))"
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.9325810772922609, -0.6411471147334535, 0.9949216290375054, 0.483048267470493, -0.8736297429070232, -0.36222771685582544, 0.26397860162786957, 0.45527588775737704, -0.9424989711186325, 0.6251162293059616))"


In [0]:
# Score the unlabeled rows with the fitted model; the displayed result has
# a `prediction` column next to `features`.
predict = lrModel.transform(unlabeled_data)

In [0]:
# Inspect the first five scored rows.
prediction_preview = predict.take(5)
display(prediction_preview)

features,prediction
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(0.4572552704218824, -0.576096954000229, -0.20809839485012915, 0.9140086345619809, -0.5922981637492224, -0.8969369345510854, 0.3741080343476908, -0.01854004246308416, 0.07834089512221243, 0.3838413057880994))",1.962321316391634
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.17363144173810174, -0.3340314573781735, 0.9351424971322297, -0.6430601902397572, -0.13363305808148818, -0.42446359566938585, -0.4093070316761178, -0.9302259781839204, 0.47004365892170585, -0.6231289889808045))",-4.383670260231905
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.4785033857256795, 0.520350718059089, -0.2988515012130126, -0.46260150057299754, 0.5394344995663083, 0.39320468081626836, 0.1890560923345248, 0.13123799325264507, 0.43613839380760355, 0.39541998419731494))",-0.2198157681146683
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(0.5609065808412279, -0.9201904391147984, 0.908305865183735, 0.9255146658282842, 0.6871419344095282, 0.4201876217923466, -0.42906289792612684, 0.5787691868233418, 0.7260522064761288, 0.28251641556690554))",1.0659679020169857
"Map(vectorType -> sparse, length -> 10, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), values -> List(-0.9325810772922609, -0.6411471147334535, 0.9949216290375054, 0.483048267470493, -0.8736297429070232, -0.36222771685582544, 0.26397860162786957, 0.45527588775737704, -0.9424989711186325, 0.6251162293059616))",-0.8526033166471594
