In [2]:
from pyspark.sql import SparkSession

# Start a Spark session named "lrex", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("lrex")
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Read the sample regression data set, which is stored in LIBSVM format.
training = spark.read.load("sample_linear_regression_data.txt", format="libsvm")

In [6]:
# Spark ML estimators expect exactly this layout: a "label" column plus a single
# "features" column whose value is one vector holding all the features of that row.
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [7]:
# Create the (not yet trained) linear regression estimator.
# The three column names are spelled out for clarity only; each one is
# identical to the estimator's default ("features", "label", "prediction").
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)

In [8]:
# Train the estimator on the loaded data; the result is a fitted model.
lr_Model = lr.fit(training)

In [9]:
# Display the fitted coefficients: one weight per entry of the feature vector.
lr_Model.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [10]:
# Display the fitted intercept (the constant term of the linear model).
lr_Model.intercept

0.14228558260358093

In [11]:
# Summary of the fit on the data the model was trained on; the metrics
# read from it below (r2, rootMeanSquaredError) are therefore training metrics.
training_summary = lr_Model.summary

In [13]:
# First line:  R^2  -- the fraction of the label variance explained by the model.
# Second line: RMSE -- root mean squared error, in the same units as the label.
print(
    training_summary.r2,
    training_summary.rootMeanSquaredError,
    sep="\n",
)

0.027839179518600154
10.16309157133015


## Actually, we've committed the error of training our model on _all_ of the available data!

Let us see in the following how we can perform a train/test split.

In [14]:
# Load the full data set again; it is split into train and test parts below.
all_data = spark.read.load("sample_linear_regression_data.txt", format="libsvm")

In [16]:
# randomSplit returns a list with one DataFrame per weight:
#   - the first holds roughly 70% of the rows,
#   - the second holds roughly 30% of the rows.
split_object = all_data.randomSplit(weights=[0.7, 0.3])

# Displaying the result confirms that we get back a list of 2 DataFrames.
split_object

[DataFrame[label: double, features: vector],
 DataFrame[label: double, features: vector]]

In [17]:
# Tuple-unpack the two DataFrames returned by randomSplit: ~70% of the rows
# for training and ~30% for testing.
# Rows are assigned at random (it is NOT "top 70% / bottom 30%"), and the
# weights are only approximate, so the counts will not be an exact 70/30 split.
# A fixed seed makes the split, and every metric computed from it further
# down, reproducible when the notebook is re-run on the same input.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Summary statistics of the label column, first for the training part and
# then for the test part, to check that the two splits look alike.
for subset in (train_data, test_data):
    subset.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                361|
|   mean| 0.2800458889274214|
| stddev| 10.073894730679347|
|    min|-28.571478869743427|
|    max| 26.903524792043335|
+-------+-------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                140|
|   mean| 0.1971766715492226|
| stddev| 10.959265603140585|
|    min|-28.046018037776633|
|    max|  27.78383192005107|
+-------+-------------------+



In [21]:
# Fit the same estimator again, this time on the training part only.
correct_model = lr.fit(dataset=train_data)

In [22]:
# Judge the model on data it has not seen during training: the test part.
# evaluate() compares the true labels in test_data with the labels predicted
# by the trained model and returns the resulting summary.
test_results = correct_model.evaluate(test_data)

In [24]:
# Per-row residuals on the test part.
test_residuals = test_results.residuals
test_residuals.show()

# Root mean squared error on the test part.
test_rmse = test_results.rootMeanSquaredError
print(test_rmse)

+-------------------+
|          residuals|
+-------------------+
| -27.03755538787312|
| -27.25365238524946|
| -21.54844689306325|
|-19.830761582535636|
|-20.016220607064138|
| -19.01886284128381|
|-15.162437869421884|
| -16.85583236685477|
|-18.729534334061096|
| -16.64722581254161|
|-16.984828866170478|
|-15.506648073408895|
|-11.622945117308959|
|-13.428786664602901|
|-16.964773941721454|
| -8.886664209243794|
|-13.805522067728921|
|-11.420648238188766|
|  -9.02719799547021|
|-10.680619015242364|
+-------------------+
only showing top 20 rows

10.979761016808233


Once you are comfortable with your model, your model is ready for deployment.  
It makes sense to deploy your model onto data which has no labels.  
Let's mimic that in the following:

In [26]:
# Mimic unlabeled production data by keeping only the "features" column.
unlabeled_data = test_data.select(test_data["features"])

# The label column is gone; only the feature vectors remain.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [27]:
# Apply the trained model to the unlabeled rows; this adds a "prediction" column.
predictions = correct_model.transform(dataset=unlabeled_data)

In [28]:
# Each row shows the feature vector next to the value the model predicts for it.
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -1.0084626499035132|
|(10,[0,1,2,3,4,5,...|   4.303826449053384|
|(10,[0,1,2,3,4,5,...|  0.1160591288974433|
|(10,[0,1,2,3,4,5,...|  0.1634429671639186|
|(10,[0,1,2,3,4,5,...|  0.6138845768495844|
|(10,[0,1,2,3,4,5,...|  0.7436492752791787|
|(10,[0,1,2,3,4,5,...|  -2.331762487461459|
|(10,[0,1,2,3,4,5,...|-0.47088836582117644|
|(10,[0,1,2,3,4,5,...|  1.6641347081850795|
|(10,[0,1,2,3,4,5,...|-0.07187102106348087|
|(10,[0,1,2,3,4,5,...|  0.8334795148933665|
|(10,[0,1,2,3,4,5,...| 0.13079035009659873|
|(10,[0,1,2,3,4,5,...| -1.5303904890565716|
|(10,[0,1,2,3,4,5,...|  0.3888586004982866|
|(10,[0,1,2,3,4,5,...|  4.0425508383510325|
|(10,[0,1,2,3,4,5,...|  -3.886562790007404|
|(10,[0,1,2,3,4,5,...|  1.3047482823738663|
|(10,[0,1,2,3,4,5,...| -1.0586319732627305|
|(10,[0,1,2,3,4,5,...|  -3.383748407369951|
|(10,[0,1,2,3,4,5,...| -1.449734