In [26]:
import findspark

In [27]:
findspark.init('/home/vboxuser/spark-3.5.0-bin-hadoop3')

In [28]:
import pyspark

In [29]:
from pyspark.sql import SparkSession

In [30]:
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [34]:
data = spark.read.csv('/home/vboxuser/Downloads/Ecommerce_Customers.csv',inferSchema=True, header=True)

In [35]:
data = data.na.drop(subset=['Yearly Amount Spent'])

In [37]:
data.show()

+------------------+---------------+----------------+------------------+----------------+----------------+--------------------+-------------------+
|             Email|        Address|          Avatar|Avg Session Length|     Time on App| Time on Website|Length of Membership|Yearly Amount Spent|
+------------------+---------------+----------------+------------------+----------------+----------------+--------------------+-------------------+
|       Wrightmouth| MI 82180-9605"|          Violet|  34.4972677251123|12.6556511491668|39.5776680195262|    4.08262063295296|   587.951053968401|
|       Diazchester| CA 06566-8576"|       DarkGreen|  31.9262720263602|11.1094607286826|37.2689588682977|    2.66403418213262|   392.204933444326|
|       Cobbborough| DC 99414-7564"|          Bisque|  33.0009147556427|11.3302780577775|37.1105974421208|    4.10454320237642|   487.547504867472|
|        Port Jason| OH 22070-1220"|     SaddleBrown|  34.3055566297555|13.7175136651425|36.7212826779031|    3.

In [39]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [40]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [41]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [42]:
assembler = VectorAssembler(
    inputCols=["Avg Session Length", "Time on App", "Time on Website", "Length of Membership"],
    outputCol="features",
    handleInvalid="keep"  # or "skip"
)


In [43]:
output = assembler.transform(data)

In [44]:
output.head(1)

[Row(Email='Wrightmouth', Address=' MI 82180-9605"', Avatar='Violet', Avg Session Length=34.4972677251123, Time on App=12.6556511491668, Time on Website=39.5776680195262, Length of Membership=4.08262063295296, Yearly Amount Spent=587.951053968401, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [45]:
final_data = output.select('features','Yearly Amount Spent')

In [46]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251123...|   587.951053968401|
|[31.9262720263602...|   392.204933444326|
|[33.0009147556427...|   487.547504867472|
|[34.3055566297555...|   581.852344035218|
|[33.3306725236464...|   599.406092045764|
|[33.871037879342,...|   637.102447915074|
|[32.021595501387,...|   521.572174757827|
|[33.9877728956856...|   570.200408963619|
|[33.9925727749537...|   492.606012717997|
|[29.5324289670579...|   408.640351072628|
|[33.1903340437227...|   573.415867331387|
|[32.3879758531539...|   470.452733300956|
|[30.7377203726282...|    461.78074219623|
|[32.1253868972878...|   457.847695944949|
|[32.3388993230672...|   407.704547549544|
|[32.1878120459322...|   452.315675480036|
|[32.6178560628234...|   605.061038804892|
|[32.9127851111598...|   534.705743806023|
|[34.5075509985266...|   700.917091617396|
|[33.0293319535069...|   423.179991680598|
+----------

In [47]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [48]:
from pyspark.ml.regression import LinearRegression

# Now you can use LinearRegression in your code
lr = LinearRegression(labelCol='Yearly Amount Spent', regParam=0.01)


In [24]:
# Assuming 'label' is your label column
data = data.na.drop(subset=['Yearly Amount Spent'])


In [49]:
lr_model = lr.fit(train_data)

24/01/10 22:14:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/01/10 22:14:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [50]:
test_results = lr_model.evaluate(test_data)

In [51]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  4.907099923562953|
| -4.680772856896851|
| 4.7517971847234435|
| 19.025434617524127|
|-3.8260329731140246|
|-0.8225975741752904|
| -8.421046371832972|
| 0.7100483722109061|
|  3.981077624315219|
|  17.75701527196628|
| 17.882875942013925|
|  5.071181679215215|
|-11.185533051730658|
|-2.1108715810296985|
|  0.810929639620042|
| -2.393573042398657|
| 3.1752142618909147|
|0.18476938929546805|
| 3.0190307489078805|
| -1.916806401206884|
+-------------------+
only showing top 20 rows



In [52]:
unlabled_data = test_data.select('features')

In [53]:
unlabled_data.show()

+--------------------+
|            features|
+--------------------+
|[31.0472221394875...|
|[31.0613251567161...|
|[31.3662121671877...|
|[31.6005122003033...|
|[31.6253601348306...|
|[31.721652360509,...|
|[31.8854062999118...|
|[32.0047530203648...|
|[32.0123007682454...|
|[32.0180740106321...|
|[32.0478146331398...|
|[32.0542618511848...|
|[32.0637746203137...|
|[32.0775900443291...|
|[32.0961089938452...|
|[32.1223647957978...|
|[32.1898447292735...|
|[32.2046546456785...|
|[32.2301491159331...|
|[32.2620031069949...|
+--------------------+
only showing top 20 rows



In [55]:
predictions = lr_model.transform(unlabled_data)

In [56]:
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[31.0472221394875...|387.59029926545804|
|[31.0613251567161...|492.23623091479885|
|[31.3662121671877...|425.83708537176153|
|[31.6005122003033...|460.14741687357287|
|[31.6253601348306...|380.16293373003805|
|[31.721652360509,...| 348.5995242060483|
|[31.8854062999118...|398.52431934430797|
|[32.0047530203648...| 463.0359327484191|
|[32.0123007682454...| 488.9639754416428|
|[32.0180740106321...|340.02609547334873|
|[32.0478146331398...| 479.5066818168291|
|[32.0542618511848...| 556.8034759897678|
|[32.0637746203137...|389.51643995853465|
|[32.0775900443291...| 403.1440068001357|
|[32.0961089938452...|374.58752577062296|
|[32.1223647957978...| 531.3271916074186|
|[32.1898447292735...| 530.2213395249521|
|[32.2046546456785...| 478.3995166190475|
|[32.2301491159331...| 514.1461048450701|
|[32.2620031069949...| 444.2799237850709|
+--------------------+------------