In [1]:
pip install pyspark


Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 71 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 67.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=e79f5ce82a02a3f980246d845158535aec8c54ec704988916dc6742d5aa686bf
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('LinReg')
    .getOrCreate()
)

In [5]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from pyspark.ml.regression import LinearRegression

In [8]:
# Directory on the mounted Drive that contains the dataset.
file_path = '/content/drive/MyDrive/'

# Read the CSV, taking column names from the header row and letting Spark
# infer each column's type.
data = spark.read.csv(
    file_path + 'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Show the column names and the types Spark inferred for them.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [10]:
# Peek at the first two records as Row objects.
data.head(2)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005),
 Row(Email='hduke@hotmail.com', Address='4547 Archer CommonDiazchester, CA 06566-8576', Avatar='DarkGreen', Avg Session Length=31.92627202636016, Time on App=11.109460728682564, Time on Website=37.268958868297744, Length of Membership=2.66403418213262, Yearly Amount Spent=392.2049334443264)]

In [11]:
# Tabular preview of the dataset (long cell values are truncated in the display).
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [12]:
# List the column names available for feature selection.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [13]:
from pyspark.ml.feature import VectorAssembler

# Combine the four numeric predictor columns into one vector column, which is
# the column the regression estimator is later pointed at.
featureassembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="Independent Features",
)

In [14]:
# Add the assembled feature-vector column to the dataset.
output = featureassembler.transform(data)

In [15]:
# Confirm the vector column was appended alongside the original columns.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- Independent Features: vector (nullable = true)



In [16]:
# Inspect only the assembled feature vectors.
output.select('Independent Features').show()

+--------------------+
|Independent Features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [17]:
# First three full rows, including the new DenseVector feature column.
output.head(3)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, Independent Features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826])),
 Row(Email='hduke@hotmail.com', Address='4547 Archer CommonDiazchester, CA 06566-8576', Avatar='DarkGreen', Avg Session Length=31.92627202636016, Time on App=11.109460728682564, Time on Website=37.268958868297744, Length of Membership=2.66403418213262, Yearly Amount Spent=392.2049334443264, Independent Features=DenseVector([31.9263, 11.1095, 37.269, 2.664])),
 Row(Email='pallen@yahoo.com', Address='24645 Valerie Unions Suite 582Cobbborough, DC 99414-7564', Avatar='Bisque', Avg Session Length=33.000914755642675, Time on App=11.330278057777512, Time on Website=37.110597442120856, Length of Membership=4.104543202376424, Yearly A

In [18]:
# Keep only what the regression needs: the feature vector and the target column.
finaldata = output.select(
    'Independent Features',
    'Yearly Amount Spent',
)

In [19]:
# Preview the modelling table (features + label).
finaldata.show()

+--------------------+-------------------+
|Independent Features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [20]:
# Hold out roughly 30% of the rows for evaluation. The split is random, so a
# fixed seed is supplied to keep the partition -- and every metric computed
# from it -- repeatable across Restart & Run All.
train_data, test_data = finaldata.randomSplit([0.7, 0.3], seed=42)

In [21]:
# Summary statistics for the training split (count, mean, stddev, min, max of
# the label column).
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                348|
|   mean| 497.10313824269963|
| stddev|  79.98563993283894|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [22]:
from pyspark.ml.regression import LinearRegression

In [23]:
# Configure the estimator: which column holds the feature vectors and which
# column is the regression target.
lr = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Yearly Amount Spent',
)

In [24]:
# Fit the linear regression on the training split.
lr_model = lr.fit(train_data)

In [25]:
# Evaluate the fitted model on the held-out split. Despite its name, `pred` is
# an evaluation summary: the following cells read its predictions, r2 and
# rootMeanSquaredError attributes.
pred = lr_model.evaluate(test_data)

In [27]:

# Compare actual labels with the model's predictions on the test split.
pred.predictions.show()

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[29.5324289670579...|  408.6403510726275| 399.7806471335791|
|[30.4925366965402...|  282.4712457199145| 289.0886373903404|
|[30.5743636841713...| 442.06441375806565|  443.313526722847|
|[30.8364326747734...|  467.5019004269896|473.95771828891316|
|[31.0613251567161...|  487.5554580579016| 495.5438783253064|
|[31.1239743499119...|  486.9470538397658|510.44856176350254|
|[31.1280900496166...|  557.2526867470547| 566.6474270219662|
|[31.3123495994443...|  463.5914180279406|445.50053469912723|
|[31.3584771924370...|  495.1759504494754| 492.9698136690299|
|[31.4474464941278...|   418.602742095224| 426.5619058443149|
|[31.5147378578019...|  489.8124879964614|496.15384922055637|
|[31.5257524169682...|  443.9656268098819|  450.474248649994|
|[31.5702008293202...|  545.9454921414049| 565.6776384490611|
|[31.600

In [28]:
# R-squared on the test split.
pred.r2

0.9808938697908688

In [29]:
# Root mean squared error on the test split, in the same units as the label.
pred.rootMeanSquaredError

10.715822836964982

In [33]:
# Drop the label to mimic scoring unseen data: keep the feature vectors only.
unlabelled_data = test_data.select('Independent Features')

In [34]:
# Score the unlabelled rows; the result gains a `prediction` column.
pre = lr_model.transform(unlabelled_data)

In [35]:
# Display the feature vectors next to their predicted values.
pre.show()

+--------------------+------------------+
|Independent Features|        prediction|
+--------------------+------------------+
|[29.5324289670579...| 399.7806471335791|
|[30.4925366965402...| 289.0886373903404|
|[30.5743636841713...|  443.313526722847|
|[30.8364326747734...|473.95771828891316|
|[31.0613251567161...| 495.5438783253064|
|[31.1239743499119...|510.44856176350254|
|[31.1280900496166...| 566.6474270219662|
|[31.3123495994443...|445.50053469912723|
|[31.3584771924370...| 492.9698136690299|
|[31.4474464941278...| 426.5619058443149|
|[31.5147378578019...|496.15384922055637|
|[31.5257524169682...|  450.474248649994|
|[31.5702008293202...| 565.6776384490611|
|[31.6005122003032...| 462.4021922035222|
|[31.6098395733896...| 429.0485981620459|
|[31.6548096756927...|469.99057747908387|
|[31.6610498227460...| 418.2682024285125|
|[31.6739155032749...| 503.5111745777781|
|[31.8093003166791...| 549.3698620498844|
|[31.8164283341993...| 520.9169682416943|
+--------------------+------------