<a href="https://colab.research.google.com/github/JarekMaleszyk/data-science-project-sandbox/blob/main/apache_spark_lrex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
  import pyspark
except:
  !pip install pyspark -q
  import pyspark
finally:
  from pyspark.sql import SparkSession

In [2]:
# Get the active Spark session, or create one with the application name "lrex".
sparkSession = (
    SparkSession.builder
    .appName("lrex")
    .getOrCreate()
)

In [3]:
import os.path
# Local path where the e-commerce customers CSV is stored after download.
FILE_PATH1 = "/content/csvdata1/Ecommerce_Customers.csv"

# Fetch the file only when it is not already on disk, so re-running the cell skips the download.
if not os.path.isfile(FILE_PATH1):
  # Remove the target directory first, then let wget save the CSV into it (-P sets the directory).
  !rm -rf /content/csvdata1
  !wget -P "/content/csvdata1/" "https://raw.githubusercontent.com/JarekMaleszyk/data-science-project-sandbox/refs/heads/main/Ecommerce_Customers.csv"

--2024-11-04 11:01:46--  https://raw.githubusercontent.com/JarekMaleszyk/data-science-project-sandbox/refs/heads/main/Ecommerce_Customers.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 86871 (85K) [text/plain]
Saving to: ‘/content/csvdata1/Ecommerce_Customers.csv’


2024-11-04 11:01:46 (3.32 MB/s) - ‘/content/csvdata1/Ecommerce_Customers.csv’ saved [86871/86871]



In [4]:
from pyspark.ml.regression import LinearRegression

In [5]:
# Read the customers CSV: the first row supplies column names and column types are inferred.
data = sparkSession.read.csv(
    FILE_PATH1,
    sep=",",
    header=True,
    inferSchema=True,
)

In [6]:
# Check the inferred column names and types before choosing features.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# Combine the four numeric customer columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="features",
)

In [9]:
# Apply the assembler: the result keeps the original columns and adds the "features" vector column.
output = assembler.transform(data)

In [10]:
# Confirm that the "features" vector column was appended to the schema.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [11]:
# Print a single row to eyeball the assembled feature vector next to its source columns.
output.show(1)

+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avatar|Avg Session Length|      Time on App|  Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|Violet| 34.49726772511229|12.65565114916675|39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
+--------------------+--------------------+------+------------------+-----------------+-----------------+--------------------+-------------------+--------------------+
only showing top 1 row



In [12]:
# Keep only what the regression needs: the feature vector and the label column.
final_data = output.select("features", "Yearly Amount Spent")

In [13]:
# 70/30 train/test split. A fixed seed makes the split, and so the metrics
# computed from it, repeatable when the notebook is re-run on the same input.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Linear regression estimator; the label is "Yearly Amount Spent" and the
# features column is left at the estimator's default name.
lr = LinearRegression(labelCol="Yearly Amount Spent")

In [15]:
# Fit the model on the training split only.
lr_model = lr.fit(train_data)

In [16]:
# Evaluate the fitted model on the held-out test split; the result exposes the metrics used below.
test_result = lr_model.evaluate(test_data)

In [17]:
# Print the residuals for the test split (show() prints the first 20 rows).
test_result.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|   9.522088035573233|
|-0.44017872444698014|
| -7.9902137857459365|
| -1.1945799854214556|
|   3.338698344073805|
| -6.8340439912199145|
|  -9.960096291199932|
| -17.747140088436026|
| -14.577422485223678|
|   6.001196613443142|
| -1.1152558259062744|
| -1.9832262223521298|
|  -17.90060555423844|
|  0.7178587774976677|
|   8.375464074109061|
| -10.601064533619478|
|  -9.476178644060553|
| -17.476975234261147|
|  -5.979095278624186|
|   7.825250904926065|
+--------------------+
only showing top 20 rows



In [18]:
# Root mean squared error on the test split.
test_result.rootMeanSquaredError

9.910401194790387

In [19]:
# Coefficient of determination (R^2) on the test split.
test_result.r2

0.9812751993030568

In [20]:
import os.path
# Local path where the cruise ship CSV is stored after download.
FILE_PATH2 = "/content/csvdata2/cruise_ship_info.csv"

# Fetch the file only when it is not already on disk, so re-running the cell skips the download.
if not os.path.isfile(FILE_PATH2):
  # Remove the target directory first, then let wget save the CSV into it (-P sets the directory).
  !rm -rf /content/csvdata2
  !wget -P "/content/csvdata2/" "https://raw.githubusercontent.com/LeondraJames/Hyundai-Cruise-Ship-Crew-Prediction/refs/heads/master/cruise_ship_info.csv"

--2024-11-04 11:02:10--  https://raw.githubusercontent.com/LeondraJames/Hyundai-Cruise-Ship-Crew-Prediction/refs/heads/master/cruise_ship_info.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8734 (8.5K) [text/plain]
Saving to: ‘/content/csvdata2/cruise_ship_info.csv’


2024-11-04 11:02:10 (41.9 MB/s) - ‘/content/csvdata2/cruise_ship_info.csv’ saved [8734/8734]



In [21]:
# Read the cruise ship CSV: the first row supplies column names and column types are inferred.
cruise_ship_data = sparkSession.read.csv(
    FILE_PATH2,
    sep=",",
    header=True,
    inferSchema=True,
)

In [22]:
# Check the inferred column names and types of the cruise ship data.
cruise_ship_data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [23]:
# Count ships per cruise line to see how the categorical column is distributed.
cruise_ship_data.groupBy("Cruise_line").count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [24]:
from pyspark.ml.feature import StringIndexer

In [25]:
# Map each "Cruise_line" string to a numeric index stored in "Cruise_category".
# NOTE(review): this index is later assembled as an ordinary numeric feature, which
# gives the cruise lines an arbitrary order in a linear model — consider one-hot
# encoding it instead. The indexer is also fitted on the full data set before the
# train/test split; confirm that is intended.
indexer = StringIndexer(inputCol="Cruise_line",
                        outputCol="Cruise_category")
indexed = indexer.fit(cruise_ship_data).transform(cruise_ship_data)
# Peek at the first three rows to verify the new index column.
indexed.head(3)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_category=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_category=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, Cruise_category=1.0)]

In [26]:
# Combine the numeric ship attributes and the indexed cruise line into one "features" vector.
cruise_assembler = VectorAssembler(
    inputCols=[
        "Age",
        "Tonnage",
        "passengers",
        "length",
        "cabins",
        "passenger_density",
        "Cruise_category",
    ],
    outputCol="features",
)

In [27]:
# Apply the assembler to the indexed data, adding the "features" vector column.
cruise_output = cruise_assembler.transform(indexed)

In [28]:
# Show the feature vector beside the "crew" label (show() prints the first 20 rows).
cruise_output.select("features", "crew").show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [29]:
# Keep only what the regression needs: the feature vector and the "crew" label.
cruise_final_data = cruise_output.select(["features", "crew"])
# 70/30 train/test split. A fixed seed makes the split, and so the metrics
# computed from it, repeatable when the notebook is re-run on the same input.
cruise_train_data, cruise_test_data = cruise_final_data.randomSplit([0.7, 0.3], seed=42)

In [30]:
# Linear regression estimator with "crew" as the label; fitted on the training split only.
cruise_lr = LinearRegression(labelCol="crew")
cruise_lr_model = cruise_lr.fit(cruise_train_data)

In [31]:
# Evaluate the fitted model on the held-out test split; the result exposes the metrics used below.
cruise_lr_results = cruise_lr_model.evaluate(cruise_test_data)

In [32]:
# Root mean squared error on the test split.
cruise_lr_results.rootMeanSquaredError

0.8021584201724185

In [33]:
# Coefficient of determination (R^2) on the test split.
cruise_lr_results.r2

0.9382336303374784

In [34]:
# Mean squared error on the test split.
cruise_lr_results.meanSquaredError

0.6434581310535102

In [35]:
# Mean absolute error on the test split.
cruise_lr_results.meanAbsoluteError

0.6176775745469526

In [36]:
from pyspark.sql.functions import corr

In [37]:
# Correlation between the "crew" and "passengers" columns over the full data set.
cruise_ship_data.select(corr('crew', 'passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [38]:
# Correlation between the "crew" and "cabins" columns over the full data set.
cruise_ship_data.select(corr('crew', 'cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



In [39]:
# Peek at the first three rows of the cruise ship data as loaded (without the index column).
cruise_ship_data.head(3)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7)]