# **Linear Regression Code Along**

This notebook works with a dataset of e-commerce customer data collected from a company's website and mobile app. We build a linear regression model that predicts each customer's yearly spend on the company's products.

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 45 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 48.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=d4703103b3f75901c55a0cd47d5b347df286e3bf7b4bb844020ca042aa927952
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [5]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [6]:
from pyspark.ml.regression import LinearRegression

In [7]:
# Load the customer CSV: the first line is the header row and the column
# types are inferred from the values. Then preview the first five records.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)
data.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [9]:
# Look at the first record as a Row, with every column shown untruncated.
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [8]:
# Print each field of the first record on its own line.
first_row = data.head()
for field_value in first_row:
    print(field_value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


**Model building**

In [10]:
# The raw columns cannot be fed to the regressor as they are: the predictor
# columns must first be packed into a single vector-valued column (named
# "features" below), kept next to the column that holds the label.

# Import VectorAssembler and Vectors
# NOTE(review): Vectors is not referenced in the cells visible here — confirm whether it is still needed.
from pyspark.ml.linalg import Vectors 
from pyspark.ml.feature import VectorAssembler

In [12]:
# List the column names so the numeric predictors can be picked out by name.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [17]:
# Pack the four numeric predictor columns into one vector column called
# 'features'. Email, Address and Avatar are not included.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [18]:
# Apply the assembler: the result keeps the original columns and adds 'features'.
output = assembler.transform(data)

In [21]:
# Preview the assembled feature vectors on their own.
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [23]:
# Preview the first five rows of the full frame, now including 'features'.
output.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

In [25]:
# Keep only the two columns the model needs: the feature vector and the target.
final_data = output.select(
    'features',
    'Yearly Amount Spent',
)
# Show one row without truncation to check what the vector contains.
final_data.show(n=1, truncate=False)

+--------------------------------------------------------------------------+-------------------+
|features                                                                  |Yearly Amount Spent|
+--------------------------------------------------------------------------+-------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]|587.9510539684005  |
+--------------------------------------------------------------------------+-------------------+
only showing top 1 row



In [27]:
# Split the data into training (~70%) and testing (~30%) sets.
# A fixed seed keeps the split repeatable across Restart & Run All. Without it
# every run draws a different split, so the fitted coefficients and the error
# metrics reported below would change from run to run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [30]:
# Summary statistics (count, mean, stddev, min, max) of the target in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                377|
|   mean| 500.85410936974694|
| stddev|  82.46376660587752|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [31]:
# The same summary for the test split, to compare against the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                123|
|   mean| 494.59365769838035|
| stddev|  68.86605059739848|
|    min|  314.4385182951061|
|    max|  700.9170916173961|
+-------+-------------------+



In [32]:
# Set up the linear regression estimator: the predictors come from the
# 'features' vector column and the target is the yearly amount spent.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [33]:
# Fit the estimator on the training split only; the fitted model is kept as lrModel.
lrModel = lr.fit(train_data)

In [34]:
# Report the fitted parameters. There is one coefficient per assembled feature,
# in the same order as the assembler's input columns.
print(f"Coefficients: {lrModel.coefficients} Intercept: {lrModel.intercept}")

Coefficients: [25.706349395099977,39.23790057562983,0.3571075937861314,61.48667860603226] Intercept: -1053.2837022709684


In [35]:
# Evaluate the fitted model on the held-out test split. The returned summary
# object provides the residuals and error metrics used below.
test_results = lrModel.evaluate(test_data)

In [36]:
# Inspect the per-row residuals that the evaluation produced on the test split.
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -13.331586803671769|
| -2.1550360294100983|
|  2.3558975186946327|
|  18.630382600220912|
|  -5.126640239150902|
|  -6.083662324156876|
| -27.089733575117975|
| -19.519078716490014|
|  -2.418120387045235|
|   7.636850623044381|
| -3.4598111137275964|
|   4.213065564573185|
|   8.299215572910384|
| -18.713446048004016|
|  -4.458947056940474|
|-0.00788773938171...|
|   4.539705260615733|
| -0.3822492332888032|
|  -6.684088684937933|
|  2.1201832830473677|
+--------------------+
only showing top 20 rows



In [37]:
# Drop the label column so the frame resembles new data with no known target.
unlabeled_data = test_data.select('features')

In [38]:
# Score the unlabeled rows; the result adds a 'prediction' column next to 'features'.
predictions = lrModel.transform(unlabeled_data)

In [39]:
# Preview the predicted yearly spend for the first rows of the test features.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[31.0662181616375...| 462.2648800113461|
|[31.2606468698795...|423.48166728636147|
|[31.3091926408918...|  430.364820321239|
|[31.3123495994443...| 444.9610354277197|
|[31.4252268808548...| 535.8933588939128|
|[31.5147378578019...| 495.8961503206183|
|[31.6739155032749...| 502.8148014849992|
|[31.8164283341993...| 520.6415702201464|
|[31.8186165667690...| 448.8367937571809|
|[31.8209982016720...|417.03843039016897|
|[31.8627411090001...| 559.7579522877743|
|[31.9480174211613...| 457.7078113283246|
|[31.9549038566348...| 431.6986643670166|
|[31.9563005605233...| 565.8393777952028|
|[31.9673209478824...|450.20878829659273|
|[32.0047530203648...|463.75386886001115|
|[32.0215955013870...| 517.0324694972117|
|[32.0444861274404...| 448.6120784198388|
|[32.0883806304482...|  518.849955073096|
|[32.1164001209281...| 461.3648121212186|
+--------------------+------------

In [40]:
# Report the test-set error metrics taken from the evaluation summary.
print(f"RMSE: {test_results.rootMeanSquaredError}")
print(f"MSE: {test_results.meanSquaredError}")

RMSE: 10.408804958257027
MSE: 108.34322065903609
