In [3]:
# Install pyspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 4.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=7ac46e596de41fe14b116ec1066501ea78e7d44c4996013f07af695603c17067
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [4]:
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [8]:
import findspark

# Must run before the pyspark imports below, as in the original cell order.
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Stop a SparkContext left over from an earlier run of this cell so that a new
# one can be created. On a fresh kernel `sc` does not exist yet, and an
# unconditional sc.stop() raises NameError under Restart & Run All.
try:
    sc.stop()
except NameError:
    pass

# Local-mode context named "first app".
sc = SparkContext("local", "first app")

# Session used by every DataFrame operation in the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("PySpark Create RDD example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [10]:

# Load the customer CSV: the first row supplies the column names and the
# column types are inferred from the values.
df = spark.read.csv(
    "/content/ecommerce_customers.csv",
    header=True,
    inferSchema=True,
)

In [11]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [12]:
# Preview the first 5 rows.
# NOTE(review): the preview includes the Email column; clear this output
# before sharing the notebook if the addresses are real.
df.show(5)

+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|          Avatar|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|          Violet|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|       DarkGreen|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|          Bisque|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|     SaddleBrown|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|MediumAquaMarine|       33.33067252|12.79518855|     37.5366533|         4.4463083

In [13]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:

# Pack the four numeric predictors into a single vector column named
# 'features'; the remaining columns of df are carried through unchanged.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)
output = assembler.transform(df)

In [15]:
# Keep only the assembled feature vector and the regression target, then
# display the rows without truncating the vector column.
final_df = output.select(['features', 'Yearly Amount Spent'])
final_df.show(truncate=False)

+-------------------------------------------------+-------------------+
|features                                         |Yearly Amount Spent|
+-------------------------------------------------+-------------------+
|[34.49726773,12.65565115,39.57766802,4.082620633]|587.951054         |
|[31.92627203,11.10946073,37.26895887,2.664034182]|392.2049334        |
|[33.00091476,11.33027806,37.11059744,4.104543202]|487.5475049        |
|[34.30555663,13.71751367,36.72128268,3.120178783]|581.852344         |
|[33.33067252,12.79518855,37.5366533,4.446308318] |599.406092         |
|[33.87103788,12.02692534,34.47687763,5.493507201]|637.1024479        |
|[32.0215955,11.36634831,36.68377615,4.685017247] |521.5721748        |
|[32.73914294,12.35195897,37.37335886,4.434273435]|549.9041461        |
|[33.9877729,13.38623528,37.53449734,3.273433578] |570.200409         |
|[31.93654862,11.81412829,37.14516822,3.202806072]|427.1993849        |
|[33.99257277,13.33897545,37.22580613,2.482607771]|492.6060127  

In [16]:
# Split the rows roughly 70/30 into training and test sets. The fixed seed
# keeps the split, and every metric computed from it, repeatable across
# runs; without a seed each run draws a different partition.
train_data, test_data = final_df.randomSplit([0.7, 0.3], seed=42)

# Summary statistics of each split, to check the two target distributions
# look comparable.
train_data.describe().show()
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                354|
|   mean|   501.110543436723|
| stddev|  81.80063986294292|
|    min|        256.6705823|
|    max|        765.5184619|
+-------+-------------------+

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                146|
|   mean| 494.95812845068474|
| stddev|  73.02191790378373|
|    min|        282.4712457|
|    max|        712.3963268|
+-------+-------------------+



In [17]:
# Linear regression predicting 'Yearly Amount Spent' from the assembled
# 'features' vector, fitted on the training split only.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)
model = lr.fit(train_data)

In [18]:
# Score the fitted model on the held-out split and print, one value per line:
# mean absolute error, mean squared error, root mean squared error, R^2.
res = model.evaluate(test_data)
print(
    res.meanAbsoluteError,
    res.meanSquaredError,
    res.rootMeanSquaredError,
    res.r2,
    sep="\n",
)

8.320211906475746
104.28219010971677
10.211865163118674
0.9803080589825842


In [19]:
# Apply the fitted model to the held-out rows; the displayed result has a
# 'prediction' column next to the actual 'Yearly Amount Spent' value.
predictions = model.transform(test_data)
predictions.show()

+--------------------+-------------------+------------------+
|            features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.4925367,11.56...|        282.4712457| 287.3886267833184|
|[30.57436368,11.3...|        442.0644138| 441.6244209338877|
|[30.87948434,13.2...|           490.2066| 494.7089602869521|
|[30.97167564,11.7...|        494.6386098| 487.5288950664951|
|[31.06132516,12.3...|        487.5554581| 493.7262642865808|
|[31.06621816,11.7...|        448.9332932|461.70561074923967|
|[31.26810421,12.1...|        423.4705332|  427.206228855297|
|[31.28344748,12.7...|        591.7810894| 569.7792421052368|
|[31.30919264,11.9...|        432.7207178|429.68722280795646|
|[31.36621217,11.1...|        430.5888826|426.51936135492883|
|[31.42522688,13.2...|        530.7667187|  535.508216022459|
|[31.57613197,12.5...|         541.226584|  543.550990159398|
|[31.65480968,13.0...|        475.2634237| 469.4778426782293|
|[31.661