In [1]:
!pip install delta-spark==3.2.0

Collecting delta-spark==3.2.0
  Downloading delta_spark-3.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.0->delta-spark==3.2.0)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading delta_spark-3.2.0-py3-none-any.whl (21 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: py4j, delta-spark
Successfully installed delta-spark-3.2.0 py4j-0.10.9.7


In [13]:
# Every import used by the notebook lives in this one cell.
# The Delta names are imported explicitly (no wildcard) so it is obvious where
# `DeltaTable` and `configure_spark_with_delta_pip` come from.
from delta import DeltaTable, configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [3]:
# Describe the session: the cluster master to connect to, the warehouse
# directory that holds the tables, and the two settings that enable Delta Lake
# (its SQL extension and its catalog implementation for `spark_catalog`).
builder = (
    SparkSession.builder
    .master("spark://spark:7077")
    .config("spark.sql.warehouse.dir", "/opt/prefect/spark-warehouse")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through the delta-spark pip helper before creating the
# session (or reusing an already running one).
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

## BRONZE

In [4]:
# Open the bronze Delta table directly from its warehouse path.
delta_bronze = DeltaTable.forPath(
    spark,
    "/opt/prefect/spark-warehouse/delta_bronze",
)

# Print its rows with full-width columns so long values such as image URLs
# are not cut off.
bronze_df = delta_bronze.toDF()
bronze_df.show(truncate=False)

+-----+---+------+------------------------------------------------------------------------------------------------------------------------------------+----------+-----+---------+-----------+-----------------------------+--------+----------+------------+-----------+------+
|id   |sol|camera|img_src                                                                                                                             |earth_date|rover|camera_id|camera_name|camera_full_name             |rover_id|rover_name|landing_date|launch_date|status|
+-----+---+------+------------------------------------------------------------------------------------------------------------------------------------+----------+-----+---------+-----------+-----------------------------+--------+----------+------------+-----------+------+
|727  |0  |NULL  |http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/00000/opgs/edr/fcam/FRA_397502305EDR_D0010000AUT_04096M_.JPG|2012-08-06|NULL |20       |FHA

## SILVER

In [5]:
# Open the silver Delta table directly from its warehouse path.
delta_silver = DeltaTable.forPath(
    spark,
    "/opt/prefect/spark-warehouse/delta_silver",
)

# Print its rows with full-width columns so the image URLs stay readable.
silver_df = delta_silver.toDF()
silver_df.show(truncate=False)

+-----+---+-----------+------------------------------------------------------------------------------------------------------------------------------------+-------------------+----------+
|id   |sol|camera_name|img_src                                                                                                                             |earth_date         |rover_name|
+-----+---+-----------+------------------------------------------------------------------------------------------------------------------------------------+-------------------+----------+
|727  |0  |FHAZ       |http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/00000/opgs/edr/fcam/FRA_397502305EDR_D0010000AUT_04096M_.JPG|2012-08-06 00:00:00|Curiosity |
|9720 |0  |FHAZ       |http://mars.jpl.nasa.gov/msl-raw-images/proj/msl/redops/ods/surface/sol/00000/opgs/edr/fcam/FRA_397506083EDR_F0010008AUT_04096M_.JPG|2012-08-06 00:00:00|Curiosity |
|9721 |0  |FHAZ       |http://mars.jpl.nasa.gov/msl-raw-imag

## GOLD

### Агреговані дані для підрахунку кількості знімків за камерами та солами

In [7]:
# Gold table holding photo counts per camera and sol.
delta_agg_sol_cam_gold = DeltaTable.forPath(
    spark,
    "/opt/prefect/spark-warehouse/agg_sol_cam_gold",
)

# Preview the aggregated rows with the default display settings.
agg_sol_cam_df = delta_agg_sol_cam_gold.toDF()
agg_sol_cam_df.show()

+-----------+---+-------------------+-----------+
|camera_name|sol|         earth_date|photo_count|
+-----------+---+-------------------+-----------+
|    CHEMCAM| 45|2012-09-21 00:00:00|          8|
|       RHAZ|115|2012-12-02 00:00:00|          1|
|       FHAZ| 73|2012-10-20 00:00:00|          1|
|    CHEMCAM|120|2012-12-07 00:00:00|          2|
|     NAVCAM|147|2013-01-04 00:00:00|         57|
|       MAST|132|2012-12-19 00:00:00|         26|
|       MAST|171|2013-01-28 00:00:00|        120|
|       MAST|  3|2012-08-09 00:00:00|        338|
|       RHAZ|198|2013-02-25 00:00:00|          1|
|      MAHLI|170|2013-01-27 00:00:00|         16|
|       MAST|135|2012-12-22 00:00:00|        140|
|       MAST|186|2013-02-13 00:00:00|        562|
|    CHEMCAM|174|2013-01-31 00:00:00|          4|
|     NAVCAM|128|2012-12-15 00:00:00|          8|
|     NAVCAM|227|2013-03-27 00:00:00|         32|
|       MAST|150|2013-01-07 00:00:00|         82|
|       RHAZ|158|2013-01-15 00:00:00|          1|


### Результати підрахунку середньої кількості знімків на сол для активних камер

In [8]:
# Gold table holding the average number of photos per sol for each camera.
avg_photos_per_sol_gold = DeltaTable.forPath(
    spark,
    "/opt/prefect/spark-warehouse/avg_photos_per_sol_gold",
)

# Preview the per-camera averages with the default display settings.
avg_photos_per_sol_df = avg_photos_per_sol_gold.toDF()
avg_photos_per_sol_df.show()

+-----------+------------------+
|camera_name|avg_photos_per_sol|
+-----------+------------------+
|    CHEMCAM|               5.7|
|     NAVCAM|             27.03|
|      MARDI|             21.27|
|      MAHLI|              21.8|
|       MAST|            119.73|
|       FHAZ|              2.81|
|       RHAZ|              1.24|
+-----------+------------------+



### Визначаємо найбільш активні періоди місії

In [14]:
# Gold table holding the total photo count per sol.
active_periods_gold = DeltaTable.forPath(
    spark,
    "/opt/prefect/spark-warehouse/active_periods_gold",
)

# Sort so the sols with the most photos come first, then preview the top rows.
active_periods_ranked = active_periods_gold.toDF().orderBy(desc("total_photos"))
active_periods_ranked.show()

+---+-------------------+------------+
|sol|         earth_date|total_photos|
+---+-------------------+------------+
|  0|2012-08-06 00:00:00|        3702|
| 42|2012-09-18 00:00:00|        2683|
| 20|2012-08-26 00:00:00|        1046|
| 37|2012-09-13 00:00:00|        1006|
|173|2013-01-30 00:00:00|         759|
|192|2013-02-19 00:00:00|         744|
|172|2013-01-29 00:00:00|         728|
| 78|2012-10-25 00:00:00|         716|
| 66|2012-10-12 00:00:00|         703|
|184|2013-02-11 00:00:00|         686|
|186|2013-02-13 00:00:00|         584|
| 44|2012-09-20 00:00:00|         558|
|137|2012-12-24 00:00:00|         550|
|109|2012-11-25 00:00:00|         534|
| 64|2012-10-10 00:00:00|         510|
|111|2012-11-28 00:00:00|         501|
| 69|2012-10-15 00:00:00|         490|
| 74|2012-10-21 00:00:00|         478|
|110|2012-11-27 00:00:00|         476|
| 19|2012-08-25 00:00:00|         444|
+---+-------------------+------------+
only showing top 20 rows



In [28]:
# Stop the Spark session created at the top of the notebook now that every
# table has been displayed.
spark.stop()