In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Start (or reuse) the Spark session that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName('Linear')
    .getOrCreate()
)

In [4]:
# Load the LIBSVM-formatted sample file; each row becomes (label, features).
# NOTE(review): numFeatures is pinned to 780, yet the fitted coefficients
# printed further down are non-zero only in the first 10 positions -- confirm
# whether the file really has 780 features or this value should be lowered.
training = (
    spark.read
    .format('libsvm')
    .option("numFeatures", "780")
    .load('data/sample_linear_regression_data.txt')
)

In [5]:
training.show(5)  # preview only: do not fit on the full dataset; it is split into training and test sets further down

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(780,[0,1,2,3,4,5...|
| 0.2577820163584905|(780,[0,1,2,3,4,5...|
| -4.438869807456516|(780,[0,1,2,3,4,5...|
|-19.782762789614537|(780,[0,1,2,3,4,5...|
| -7.966593841555266|(780,[0,1,2,3,4,5...|
+-------------------+--------------------+
only showing top 5 rows



In [7]:
# Full dataset that gets split into train/test below.
# This reads the same file, with the same options, as the `training` frame
# loaded earlier in the notebook.
all_data = (
    spark.read
    .format('libsvm')
    .option("numFeatures", "780")
    .load('data/sample_linear_regression_data.txt')
)

In [8]:
all_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                501|
|   mean|0.25688882219498976|
| stddev| 10.317884030544564|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [9]:
# Hold out roughly 30% of the rows for testing. randomSplit assigns rows at
# random, so without a seed the partition (and every metric computed from it)
# changes on each run; a fixed seed keeps the split repeatable for the same input.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [10]:
train_data.show(5)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.046018037776633|(780,[0,1,2,3,4,5...|
|-26.805483428483072|(780,[0,1,2,3,4,5...|
|-26.736207182601724|(780,[0,1,2,3,4,5...|
| -23.51088409032297|(780,[0,1,2,3,4,5...|
|-23.487440120936512|(780,[0,1,2,3,4,5...|
+-------------------+--------------------+
only showing top 5 rows



In [11]:
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                349|
|   mean|  0.055118864021097|
| stddev| 10.277545059813779|
|    min|-28.046018037776633|
|    max| 27.111027963108548|
+-------+-------------------+



In [12]:
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                152|
|   mean| 0.7201632656337295|
| stddev| 10.429204420527412|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [13]:
# Configure the (not yet fitted) linear regression estimator and the column
# names it reads from and writes to.
# NOTE(review): predictionCol is spelled 'predicton' (missing "i"). The
# prediction table shown later in the notebook uses this exact column name,
# so if the spelling is corrected, anything reading that column must change too.
lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='predicton')   # estimator (unfitted model)

In [14]:
lrModel = lr.fit(train_data)    # fit the estimator on the training split only

In [15]:
lrModel.coefficients

DenseVector([-0.6819, 0.5039, -1.0164, 2.6711, 0.1989, 1.849, -0.3281, -0.8597, -0.3791, 0.3666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [16]:
lrModel.intercept

-0.10806112374768163

In [17]:
# Keep a handle on the fitted model's summary object; its metrics are read in the next cell.
training_summary = lrModel.summary

In [18]:
# RMSE from the model summary; compare it with the label stddev reported by describe() above.
training_summary.rootMeanSquaredError

10.06979413842118

In [19]:
# Score the fitted model against the held-out test split (labels included).
test_results = lrModel.evaluate(test_data)

In [20]:
# RMSE on the held-out test split; compare with the training RMSE shown earlier.
test_results.rootMeanSquaredError

10.467141511659351

### Prediction on Unlabeled data

In [21]:
unlabeled_data = test_data.select('features')   # keep only the feature vectors (drops the label column) to mimic unlabeled data

In [22]:
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|(780,[0,1,2,3,4,5...|
|(780,[0,1,2,3,4,5...|
|(780,[0,1,2,3,4,5...|
|(780,[0,1,2,3,4,5...|
|(780,[0,1,2,3,4,5...|
+--------------------+
only showing top 5 rows



In [23]:
# Run the fitted model over the label-free frame; the result carries the
# features plus the model's prediction column.
predictions = lrModel.transform(unlabeled_data)

In [24]:
predictions.show(5)

+--------------------+--------------------+
|            features|           predicton|
+--------------------+--------------------+
|(780,[0,1,2,3,4,5...| -0.5650657006275394|
|(780,[0,1,2,3,4,5...|  3.5493427252871443|
|(780,[0,1,2,3,4,5...| -3.1100981411253015|
|(780,[0,1,2,3,4,5...|-0.49120019889845135|
|(780,[0,1,2,3,4,5...|  -0.251160244033078|
+--------------------+--------------------+
only showing top 5 rows

