# Linear Regression Code Along

This notebook is the reference for the video lecture on the Linear Regression Code Along. We examine a dataset of e-commerce customer data for a company's website and mobile app, then build a regression model that predicts each customer's yearly spend on the company's product.

The first thing to do is start a Spark session:

In [2]:
from pyspark.sql import SparkSession
# Reuse an already-running session if there is one, otherwise start a new
# one registered under the application name 'lr_example'.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Use Spark to read in the Ecommerce Customers csv file.
# header=True takes the column names from the first line of the file;
# inferSchema=True lets Spark infer column types instead of reading
# everything as strings (see the schema printed in the next cell).
data = spark.read.csv('Ecommerce_Customers.csv', header=True, inferSchema=True)

In [5]:
# Print the schema of the DataFrame (column names, inferred types, nullability)
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [11]:
# Peek at the first record; called without an argument, head() returns a single Row
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [9]:
# Print the first five rows, one per paragraph.
# A plain loop is used instead of a list comprehension: print() returns None,
# so the comprehension built a throwaway list and made the cell echo
# "[None, None, None, None, None]" as its output.
for row in data.head(5):
    print(f'{row} \n')

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005) 

Row(Email='hduke@hotmail.com', Address='4547 Archer CommonDiazchester, CA 06566-8576', Avatar='DarkGreen', Avg Session Length=31.92627202636016, Time on App=11.109460728682564, Time on Website=37.268958868297744, Length of Membership=2.66403418213262, Yearly Amount Spent=392.2049334443264) 

Row(Email='pallen@yahoo.com', Address='24645 Valerie Unions Suite 582Cobbborough, DC 99414-7564', Avatar='Bisque', Avg Session Length=33.000914755642675, Time on App=11.330278057777512, Time on Website=37.110597442120856, Length of Membership=4.104543202376424, Yearly Amount Spent=487.54750486747207) 

Row(Email='riverarebecca@gmail.com', Address='1414 David ThroughwayPort Jason, OH 22070-1220', Avatar='Sadd

[None, None, None, None, None]

In [10]:
# Iterating over a single Row yields its field values in column order.
# A plain loop avoids building (and echoing) a list of None values, which is
# what the side-effect-only list comprehension did.
for value in data.head():
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


[None, None, None, None, None, None, None, None]

## Setting Up DataFrame for Machine Learning 

In [12]:
# A few things we need to do before Spark can accept the data!
# It needs to be in the form of two columns
# ("label","features")

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [13]:
# List the column names so the numeric feature columns can be picked out below
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [14]:
# Combine the four numeric customer metrics into one vector column named
# 'features'. The string columns (Email, Address, Avatar) and the label
# ('Yearly Amount Spent') are deliberately not part of the inputs.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
)

In [16]:
# Apply the assembler: the result keeps the original columns and adds 'features'
output = assembler.transform(data)

In [17]:
# Look at just the assembled vectors (show() prints the first 20 rows, truncated)
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [20]:
# output.head() is a single Row, so this loop walks its field values: the
# eight original columns followed by the assembled 'features' vector.
# A plain loop replaces the side-effect-only list comprehension, which made
# the cell echo a list of None values; the loop variable is named for what
# it actually holds (a field value, not a row).
for value in output.head():
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005
[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]


[None, None, None, None, None, None, None, None, None]

In [21]:
# Keep only what the regression needs: the feature vector and the label column
final_data = output.select('features', 'Yearly Amount Spent')

In [22]:
# Roughly 70% of rows go to training and 30% to testing.
# A fixed seed makes the split (and therefore every metric computed below)
# repeatable across Restart & Run All, given the same input data.
# NOTE: outputs captured from an earlier, unseeded run will not match the
# numbers produced after re-running with this seed.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [24]:
# Summary statistics (count, mean, stddev, min, max) of the label in the training split
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                337|
|   mean|  497.4162612666706|
| stddev|  79.73116700313672|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [25]:
# Same summary for the test split, to sanity-check that both splits look alike
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                163|
|   mean| 503.23766308237725|
| stddev|  78.54508605008368|
|    min|  275.9184206503857|
|    max|  744.2218671047146|
+-------+-------------------+



In [26]:
# Configure the (not yet fitted) linear regression estimator.
# featuresCol is spelled out for readability; 'features' is also the default.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [28]:
# Fit the model to the training split only and call this lrModel;
# test_data is held back for the evaluation further down.
lrModel = lr.fit(train_data)

In [29]:
# Report the learned weights (one per assembled feature column, in the
# assembler's input order) together with the intercept.
print(f'Coefficients: {lrModel.coefficients} Intercept: {lrModel.intercept}')

Coefficients: [26.212805337022,38.507772265226514,0.22355051088576328,61.37072895272366] Intercept: -1055.592338046795


In [31]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the residuals and error metrics inspected in the cells below.
test_results = lrModel.evaluate(test_data)

In [33]:
# Per-row residuals on the test split. Comparing with the label and prediction
# tables shown further down, these appear to be label minus prediction
# (e.g. 282.47 - 287.94 ≈ -5.46 for the first row).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -5.464732493066833|
|   6.91406402273401|
|-12.841359241581529|
|  4.075478460507952|
| 0.5465893020877957|
| -7.734025480146897|
| -4.774456884673384|
| 3.9521213672738327|
|-14.080389638069846|
| 17.965385567098565|
| -4.890558814063354|
| -26.40737125111798|
| -4.475839136350771|
| -11.36868249902841|
|-4.5310855158138565|
| -2.827928976664623|
|-2.5874838860943328|
|-1.6593131748574024|
|  4.248995380538872|
| -9.255733751025446|
+-------------------+
only showing top 20 rows



In [34]:
# Drop the label column to mimic new data for which the spend is unknown
unlabeled_data = test_data.select('features')

In [35]:
# transform() returns the input rows with a 'prediction' column appended
predictions = lrModel.transform(unlabeled_data)

In [36]:
# Inspect the predicted yearly spend next to each feature vector
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.4925366965402...|287.93597821298135|
|[30.9716756438877...| 487.7245457341587|
|[31.0662181616375...| 461.7746524492559|
|[31.3584771924370...|491.10047198896746|
|[31.3895854806643...| 409.5230217578951|
|[31.4474464941278...| 426.3367675753709|
|[31.5171218025062...| 280.6928775350591|
|[31.5316044825729...|432.56348436208873|
|[31.5741380228732...| 558.4896617986567|
|[31.6005122003032...|461.20746592399837|
|[31.6253601348306...|381.22745957098755|
|[31.6739155032749...| 502.1324391609992|
|[31.7656188210424...| 501.0299207719579|
|[31.8093003166791...| 548.1405818618696|
|[31.8124825597242...| 397.3414304996111|
|[31.8186165667690...| 449.2466023468003|
|[31.8530748017465...| 461.8726073484463|
|[31.8627411090001...| 557.9574543489041|
|[31.9764800614612...|326.34545065356133|
|[32.0085045178551...|452.45295477978084|
+--------------------+------------

In [37]:
# Show the actual labels of the test rows, for eyeballing against the predictions above
test_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[30.4925366965402...|  282.4712457199145|
|[30.9716756438877...|  494.6386097568927|
|[31.0662181616375...| 448.93329320767435|
|[31.3584771924370...|  495.1759504494754|
|[31.3895854806643...|  410.0696110599829|
|[31.4474464941278...|   418.602742095224|
|[31.5171218025062...|  275.9184206503857|
|[31.5316044825729...| 436.51560572936256|
|[31.5741380228732...|  544.4092721605869|
|[31.6005122003032...| 479.17285149109694|
|[31.6253601348306...|  376.3369007569242|
|[31.6739155032749...|  475.7250679098812|
|[31.7656188210424...| 496.55408163560713|
|[31.8093003166791...|  536.7718993628412|
|[31.8124825597242...|  392.8103449837972|
|[31.8186165667690...| 446.41867337013565|
|[31.8530748017465...|   459.285123462352|
|[31.8627411090001...|  556.2981411740467|
|[31.9764800614612...|  330.5944460341002|
|[32.0085045178551...|  443.1972210287554|
+----------

In [39]:
# Headline error metrics on the test split. RMSE is in the same units as the
# label, so it can be read against the label statistics shown by describe().
print(
    f"RMSE: {test_results.rootMeanSquaredError}",
    f"MSE: {test_results.meanSquaredError}",
    sep="\n",
)

RMSE: 10.868120011444327
MSE: 118.11603258315665
