<a href="https://colab.research.google.com/github/JarekMaleszyk/data-science-project-sandbox/blob/main/apache_spark_lrex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [84]:
try:
  import pyspark
except:
  !pip install pyspark -q
  import pyspark
finally:
  from pyspark.sql import SparkSession

In [85]:
# Get (or create) the Spark session used by every cell below; the application is named "lrex".
sparkSession = (
    SparkSession.builder
    .appName("lrex")
    .getOrCreate()
)

In [86]:
import os
import urllib.request

# Local cache location and remote source of the e-commerce customers dataset.
FILE_PATH1 = "/content/csvdata1/Ecommerce_Customers.csv"
FILE_URL1 = "https://raw.githubusercontent.com/JarekMaleszyk/data-science-project-sandbox/refs/heads/main/Ecommerce_Customers.csv"

# Download only when the CSV is not cached yet, so re-running the cell is cheap.
if not os.path.isfile(FILE_PATH1):
  os.makedirs(os.path.dirname(FILE_PATH1), exist_ok=True)
  # Fetch under a temporary name and rename afterwards: an interrupted download
  # then never leaves a truncated file that would pass the isfile() check above.
  partial_path1 = FILE_PATH1 + ".part"
  # Unlike a `!wget` shell escape, this raises on network/HTTP errors, so a
  # failed download stops the notebook here instead of at the Spark read.
  urllib.request.urlretrieve(FILE_URL1, partial_path1)
  os.replace(partial_path1, FILE_PATH1)

In [87]:
from pyspark.ml.regression import LinearRegression

In [88]:
# Load the customers CSV: comma-separated, first row is the header,
# and column types are inferred from the file contents.
data = (
    sparkSession.read
    .options(sep=",", header=True, inferSchema=True)
    .csv(FILE_PATH1)
)

In [89]:
# Print the column names and inferred types to check which columns are numeric
# and can therefore be used as regression inputs.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [90]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [91]:
# Numeric customer columns that are packed into the single "features" vector column.
customer_feature_cols = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
]
assembler = VectorAssembler(inputCols=customer_feature_cols, outputCol="features")

In [92]:
# Apply the assembler to the raw customers frame; the result keeps the original
# columns and adds the assembled "features" vector column.
output = assembler.transform(data)

In [93]:
# Confirm that the assembled frame now carries the extra "features" vector column.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [94]:
# Preview a single row to see the assembled features vector next to its source columns.
output.show(1)

+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avatar|Avg Session Length|      Time on App|  Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|Violet| 34.49726772511229|12.65565114916675|39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+--------------------+
only showing top 1 row



In [95]:
# Keep only what the regression needs: the assembled feature vector and the
# label column ("Yearly Amount Spent").
final_data = output.select("features", "Yearly Amount Spent")

In [96]:
# Split roughly 70% / 30% into training and held-out test rows.
# A fixed seed makes the split, and every metric computed from it, repeatable
# across kernel restarts; without it each run evaluates a different test set.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [97]:
# Configure the linear regression estimator with "Yearly Amount Spent" as the label.
# featuresCol is left at its default, which has to match the assembler's
# outputCol ("features") for the fit below to find its inputs.
lr = LinearRegression(labelCol="Yearly Amount Spent")

In [98]:
# Fit the regression on the training split only; the test split stays unseen.
lr_model = lr.fit(train_data)

In [99]:
# Evaluate the fitted model on the held-out test split; the returned summary
# object exposes the residuals and error metrics inspected in the next cells.
test_result = lr_model.evaluate(test_data)

In [100]:
# Show the per-row residuals on the test split (first 20 rows by default).
test_result.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  9.308365276320728|
|-12.175875021040952|
| 10.342380598011118|
|-3.9030317347791765|
|   6.20165908235299|
| 3.9734410169716057|
| -21.86047427467804|
|  -4.15775025120638|
| -4.778470383110118|
| -6.003728323643713|
|-1.9077546548932673|
| -11.67875905549181|
|-4.0588560393305215|
|  8.214099183507926|
|-2.5257703858601417|
|-1.5039731360053565|
|-17.308959014176537|
| 3.5594850829861002|
| 1.4369822839946664|
| 22.784562238149476|
+-------------------+
only showing top 20 rows



In [101]:
# Root mean squared error on the test split, in the same units as the label.
test_result.rootMeanSquaredError

9.649386467363483

In [102]:
# Coefficient of determination (R^2) on the test split.
test_result.r2

0.9846659109364552

In [103]:
import os
import urllib.request

# Local cache location and remote source of the cruise ship dataset.
FILE_PATH2 = "/content/csvdata2/cruise_ship_info.csv"
FILE_URL2 = "https://raw.githubusercontent.com/LeondraJames/Hyundai-Cruise-Ship-Crew-Prediction/refs/heads/master/cruise_ship_info.csv"

# Download only when the CSV is not cached yet, so re-running the cell is cheap.
if not os.path.isfile(FILE_PATH2):
  os.makedirs(os.path.dirname(FILE_PATH2), exist_ok=True)
  # Fetch under a temporary name and rename afterwards: an interrupted download
  # then never leaves a truncated file that would pass the isfile() check above.
  partial_path2 = FILE_PATH2 + ".part"
  # Unlike a `!wget` shell escape, this raises on network/HTTP errors, so a
  # failed download stops the notebook here instead of at the Spark read.
  urllib.request.urlretrieve(FILE_URL2, partial_path2)
  os.replace(partial_path2, FILE_PATH2)

In [104]:
# Load the cruise ship CSV: comma-separated, first row is the header,
# and column types are inferred from the file contents.
cruise_ship_data = (
    sparkSession.read
    .options(sep=",", header=True, inferSchema=True)
    .csv(FILE_PATH2)
)

In [105]:
# Print the column names and inferred types to see which columns are numeric
# and which are strings that need encoding before modelling.
cruise_ship_data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [106]:
# Count ships per cruise line to see how many distinct categories the string
# column has and how unevenly they are populated.
cruise_ship_data.groupBy("Cruise_line").count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [107]:
from pyspark.ml.feature import StringIndexer

In [111]:
# Map each cruise line name to a numeric index ("Cruise_category") so the
# string column can be passed to the VectorAssembler.
# NOTE(review): a linear model will read this index as an ordered numeric value;
# consider one-hot encoding it if that ordering is not intended.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_category")
indexer_model = indexer.fit(cruise_ship_data)
indexed = indexer_model.transform(cruise_ship_data)

# Peek at the first rows to check the new index column.
indexed.head(3)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_category=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_category=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, Cruise_category=1.0)]

In [112]:
# Input columns for the crew-size model: the numeric ship attributes plus the
# indexed cruise line. "crew" is the label and is therefore left out.
cruise_feature_cols = [
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "Cruise_category",
]
cruise_assembler = VectorAssembler(inputCols=cruise_feature_cols, outputCol="features")

In [113]:
# Apply the assembler to the indexed frame, adding the "features" vector column.
cruise_output = cruise_assembler.transform(indexed)

In [114]:
# Preview the assembled feature vectors next to the "crew" label (first 20 rows by default).
cruise_output.select("features", "crew").show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [115]:
# Keep only the assembled feature vector and the "crew" label.
cruise_final_data = cruise_output.select(["features", "crew"])

# Split roughly 70% / 30% into training and held-out test rows.
# A fixed seed makes the split, and every metric computed from it, repeatable
# across kernel restarts; without it each run evaluates a different test set.
cruise_train_data, cruise_test_data = cruise_final_data.randomSplit([0.7, 0.3], seed=42)

In [116]:
# Configure a linear regression with "crew" as the label. featuresCol is left at
# its default, which has to match the assembler's outputCol ("features").
cruise_lr = LinearRegression(labelCol="crew")
# Fit on the training split only; the test split stays unseen.
cruise_lr_model = cruise_lr.fit(cruise_train_data)

In [118]:
# Evaluate the fitted model on the held-out test split; the returned summary
# object exposes the error metrics read in the cells below.
cruise_lr_results = cruise_lr_model.evaluate(cruise_test_data)

In [119]:
# Root mean squared error on the test split, in the same units as the "crew" label.
cruise_lr_results.rootMeanSquaredError

1.0985271404248407

In [120]:
# Coefficient of determination (R^2) on the test split.
cruise_lr_results.r2

0.9050375016981465

In [121]:
# Mean squared error on the test split (the square of the RMSE reported above).
cruise_lr_results.meanSquaredError

1.2067618782499776

In [122]:
# Mean absolute error on the test split, in the same units as the "crew" label.
cruise_lr_results.meanAbsoluteError

0.756192855999996