# Linear Regression with PySpark

<a href='https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression'>Link</a> to documentation


## Content
1. [Example from the documentation](#doc)
2. [Example with ecommerce data](#ecom)

<a id='doc'></a>
## 1. Example from the documentation

In [2]:
# set up PySpark
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [3]:
# Build the Spark session for the documentation example
# (getOrCreate returns an existing session if there is one).
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [5]:
# Load the training data; the file is stored in libsvm format.
training = spark.read.load('sample_linear_regression_data.txt', format='libsvm')

In [6]:
# print the column names and types of the loaded DataFrame
training.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [7]:
# Spark ML expects all features packed into a single vector column, so a
# supervised dataset has just two columns: label and features
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 20 rows

In [9]:
# Linear regression estimator; the column names below are Spark's defaults,
# spelled out so it is obvious which columns of `training` are used.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

# Fitting the estimator on the data returns the trained model.
lrModel = lr.fit(training)

In [10]:
# coefficients of the fitted model: one weight per feature (10 here)
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

### Get metrics from model 
`.summary` exposes general metrics of the fitted linear regression, computed on the data the model was trained on.

In [18]:
# Metrics of the fitted model, read from its summary object.
training_summary = lrModel.summary
rmse = training_summary.rootMeanSquaredError
r2 = training_summary.r2
# The printed list has 11 p-values for 10 coefficients; per Spark's
# documentation the last entry belongs to the intercept when one is fitted.
pvalue = training_summary.pValues

# Print every metric next to its caption.
for caption, metric in (
    ('root mean squared error: ', rmse),
    ('r2: ', r2),
    ('p value: ', pvalue),
):
    print(caption, metric)

root mean squared error:  10.16309157133015
r2:  0.027839179518600154
p value:  [0.9927505031240562, 0.30967074330990396, 0.3178269194409711, 0.003972477331573909, 0.5201486327242175, 0.16213017210149872, 0.7102819001865635, 0.5266812209137877, 0.46256007153356316, 0.37825808848978526, 0.7592692146070568]


### Split data in train and test
`.randomSplit` can be used to split the data into train and test.

In [26]:
# Randomly split the rows into roughly 70 % train and 30 % test.
# The fixed seed makes the split repeatable when the notebook is re-run
# (assuming the input data is loaded and partitioned the same way).
train, test = training.randomSplit([0.7, 0.3], seed=42)

In [28]:
# summary statistics (count, mean, stddev, min, max) of the train split's label
train.describe().show()

+-------+--------------------+
|summary|               label|
+-------+--------------------+
|  count|                 342|
|   mean|0.013897725894133819|
| stddev|  10.132893652003213|
|    min| -28.571478869743427|
|    max|  27.111027963108548|
+-------+--------------------+



In [29]:
# summary statistics (count, mean, stddev, min, max) of the test split's label
test.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                159|
|   mean| 0.7795489161251338|
| stddev|  10.71889983030931|
|    min|-20.212077258958672|
|    max|  27.78383192005107|
+-------+-------------------+



In [38]:
# fit on the train split only, so the test split stays unseen for evaluation
correct_model = lr.fit(train)

In [39]:
# evaluate the model on the held-out test split
test_results = correct_model.evaluate(test)

In [40]:
# root mean squared error on the test split
test_results.rootMeanSquaredError

10.956008749492854

### Make prediction on unlabeled data
Call `.transform` on the trained model and pass it the unlabeled data.

In [35]:
# create unlabeled data by keeping only the features column of the test split
unlabeled_data = test.select('features')

In [36]:
# preview the unlabeled data (features column only)
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [41]:
# use transform to get the predictions for the unlabeled data
preds = correct_model.transform(unlabeled_data)

In [42]:
# show the features together with the appended prediction column
preds.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  2.6583969552816993|
|(10,[0,1,2,3,4,5,...|  1.1398125000378085|
|(10,[0,1,2,3,4,5,...|  2.1990693041097664|
|(10,[0,1,2,3,4,5,...|  0.2795713619866958|
|(10,[0,1,2,3,4,5,...| -2.7924028557941023|
|(10,[0,1,2,3,4,5,...|  1.2987132064268772|
|(10,[0,1,2,3,4,5,...| -1.2429343191114903|
|(10,[0,1,2,3,4,5,...| -3.0186387404689357|
|(10,[0,1,2,3,4,5,...|  -0.894030222946144|
|(10,[0,1,2,3,4,5,...|  1.0074809063787016|
|(10,[0,1,2,3,4,5,...|-0.07537250549319433|
|(10,[0,1,2,3,4,5,...|-0.17744263622502174|
|(10,[0,1,2,3,4,5,...|  1.3685817631905408|
|(10,[0,1,2,3,4,5,...|   2.661268475726091|
|(10,[0,1,2,3,4,5,...|  0.8671677276779142|
|(10,[0,1,2,3,4,5,...|  1.0810631984499863|
|(10,[0,1,2,3,4,5,...|  0.0879488502015591|
|(10,[0,1,2,3,4,5,...|  -0.839627126562648|
|(10,[0,1,2,3,4,5,...|  -2.932291650139565|
|(10,[0,1,2,3,4,5,...| -3.252311

In [43]:
# stop the session so that a new one can be created for the next example
spark.stop()

<a id='ecom'></a>
## 2. Example with ecommerce data
Goal: create a model that predicts the yearly spending of a customer for a product

In [45]:
# Build a fresh Spark session for the ecommerce example.
spark = (
    SparkSession.builder
    .appName('lr_example_2')
    .getOrCreate()
)

In [48]:
# header=True: the first row of the file holds the column names
# inferSchema=True: let Spark derive each column's type from the data
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [51]:
# first row of the data as a Row object
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [55]:
# show the first row in a more readable way by converting the Row to a dict
data.head().asDict()

{'Email': 'mstephenson@fernandez.com',
 'Address': '835 Frank TunnelWrightmouth, MI 82180-9605',
 'Avatar': 'Violet',
 'Avg Session Length': 34.49726772511229,
 'Time on App': 12.65565114916675,
 'Time on Website': 39.57766801952616,
 'Length of Membership': 4.0826206329529615,
 'Yearly Amount Spent': 587.9510539684005}

In [50]:
# column names and the types inferred while reading the CSV
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



`Yearly Amount Spent` is the label we want to predict with the provided features

### Transform dataframe for machine learning
- `Vectors`: helper class for creating dense and sparse vectors by hand
- `VectorAssembler`: A feature transformer that merges multiple columns into a vector column

In [56]:
# imports
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [57]:
# list all column names to pick the feature columns from
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [58]:
# Numeric input columns that get merged into a single vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# The assembler writes the combined vector to a new column called 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [59]:
# apply the assembler: adds the 'features' vector column to the data
output = assembler.transform(data)

In [62]:
# preview the assembled feature vectors
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [63]:
# keep only the feature vector and the label column for modelling
final_data = output.select('features', 'Yearly Amount Spent')

### Split data in train and test

In [66]:
# Randomly split the rows into roughly 70 % train and 30 % test.
# The fixed seed makes the split repeatable when the notebook is re-run
# (assuming the input data is loaded and partitioned the same way).
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

### Train and Test Model

In [68]:
# Estimator that predicts 'Yearly Amount Spent' from the assembled vector
# column ('features' is also Spark's default features column name).
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

# Train on the train split ...
lrModel = lr.fit(train)

# ... and evaluate on the held-out test split.
test_results = lrModel.evaluate(test)

In [69]:
# root mean squared error on the test split
test_results.rootMeanSquaredError

10.736820526313888

In [72]:
# R-squared (coefficient of determination) on the test split
test_results.r2

0.9799957531461406

### Make predictions

In [75]:
# create unlabeled data by keeping only the features column of the test split
unlabeled_data = test.select('features')

# use transform to get the predictions for the unlabeled data
preds = lrModel.transform(unlabeled_data)

# show the features together with the appended prediction column
preds.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|399.56050346986785|
|[30.7377203726281...|451.15676799880134|
|[30.8364326747734...|471.52828062336107|
|[31.3091926408918...| 429.8306938264004|
|[31.4252268808548...| 534.9386303187684|
|[31.5257524169682...|450.19904893786565|
|[31.5316044825729...| 432.0719853344576|
|[31.5741380228732...| 559.2845068661395|
|[31.5761319713222...| 543.8941230476541|
|[31.6005122003032...|461.19030760838723|
|[31.6098395733896...| 426.8225711425032|
|[31.6610498227460...|417.53312414136303|
|[31.6739155032749...|502.59262646203024|
|[31.7216523605090...|348.85431836644807|
|[31.7366356860502...| 495.8913513131672|
|[31.8093003166791...| 549.2532862862183|
|[31.8124825597242...| 396.5433068847433|
|[31.8164283341993...| 517.9121815165729|
|[31.8512531286083...| 464.5393202969494|
|[31.8627411090001...| 557.8244286511037|
+--------------------+------------