In [1]:
pip install delta-spark

Collecting delta-spark
  Downloading delta_spark-2.2.0-py3-none-any.whl (20 kB)
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.7/199.7 kB 4.9 MB/s eta 0:00:00
Installing collected packages: py4j, delta-spark
Successfully installed delta-spark-2.2.0 py4j-0.10.9.5
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

# Build a Hive-enabled session with the Delta Lake SQL extension and catalog.
# The chain is parenthesized instead of joined with trailing backslashes, so no
# dangling line continuation is left after the last call.
# "spark.jars.packages" is intentionally not set here: configure_spark_with_delta_pip
# sets it to the delta-core artifact that matches the installed delta-spark package,
# so a hand-written version pin would only be overwritten (and can go stale).
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# List the databases the metastore knows about before anything is changed.
databases_query = """
    SHOW DATABASES
    """
spark.sql(databases_query).show()

+-----------------+
|        namespace|
+-----------------+
|      areasverdes|
|          default|
|             demo|
|  gold_sales_demo|
|silver_sales_demo|
+-----------------+



In [6]:
# Remove any previous AreasVerdes database so the notebook starts from a clean
# state; IF EXISTS keeps this from failing when the database is not there yet.
drop_database_query = """
    DROP DATABASE IF EXISTS AreasVerdes CASCADE
    """
spark.sql(drop_database_query)

DataFrame[]

In [7]:
# Any HDFS location works for the database, but keep the layout organized:
# a data lake grows over time and turns into a swamp if it is not.
create_database_query = """
    CREATE DATABASE AreasVerdes LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse/AreasVerdes.db/'
    """
spark.sql(create_database_query)

DataFrame[]

In [8]:
# Confirm that the AreasVerdes database is registered again.
databases_query = """
    SHOW DATABASES
    """
spark.sql(databases_query).show()

+-----------------+
|        namespace|
+-----------------+
|      areasverdes|
|          default|
|             demo|
|  gold_sales_demo|
|silver_sales_demo|
+-----------------+



In [9]:
# List the tables currently registered in the AreasVerdes database.
tables_query = """
    SHOW TABLES FROM AreasVerdes
    """
spark.sql(tables_query).show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [10]:
# Drop a previous definition of the table, if any, so the CREATE that follows
# in the notebook can be re-run.
drop_table_query = """
    DROP TABLE IF EXISTS AreasVerdes.InfQuintais_Table
    """
spark.sql(drop_table_query)


DataFrame[]

In [11]:

# Register the InfQuintais table as an external Delta table whose data lives at
# the explicit HDFS location below rather than inside the database directory.
infquintais_ddl = """
    CREATE EXTERNAL TABLE AreasVerdes.InfQuintais_Table (
    ParksID STRING,
    Borough STRING,
    InspectionID INTEGER,
    TotalFenceLength FLOAT,
    TotalSidewalkArea FLOAT,
    TotalSidewalkLength FLOAT,
    OnSiteService BOOLEAN,
    HydrantW_in5m BOOLEAN,
    HydrantOnGardenSide BOOLEAN,
    RainHarvesting BOOLEAN,
    RainLitres DOUBLE,
    SolarPanels BOOLEAN,
    Composting STRING,
    Plants STRING,
    OpenLawnOrCommunalArea BOOLEAN,
    PavedArea BOOLEAN,
    TreesInGarden BOOLEAN,
    FruitTrees BOOLEAN,
    StreetTrees BOOLEAN,
    EmptyTreePits BOOLEAN,
    Murals BOOLEAN,
    BlankShed BOOLEAN,
    ParksSign BOOLEAN,
    Chickens BOOLEAN,
    Pond BOOLEAN,
    Turtles BOOLEAN,
    Aquaponics BOOLEAN,
    FarmersMarket BOOLEAN,
    CSApickup BOOLEAN,
    Greenhouse BOOLEAN,
    StructureForSeasonExtension BOOLEAN
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/silver/InfQuintais'
    """
spark.sql(infquintais_ddl)

DataFrame[]

In [12]:
# Verify that the new table now appears in the AreasVerdes database.
tables_query = """
    SHOW TABLES FROM AreasVerdes
    """
spark.sql(tables_query).show()

+-----------+-----------------+-----------+
|  namespace|        tableName|isTemporary|
+-----------+-----------------+-----------+
|areasverdes|infquintais_table|      false|
+-----------+-----------------+-----------+



In [13]:
# Let's look into HDFS: the table was created at hdfs://hdfs-nn:9000/AreasVerdes/silver/InfQuintais

In [14]:
# Peek at the table contents; show() prints only the first rows, not the whole table.
select_all_query = """
    SELECT *
    FROM areasverdes.infquintais_table
    """
spark.sql(select_all_query).show()

+-----------+---------+------------+----------------+-----------------+-------------------+-------------+-------------+-------------------+--------------+------------------+-----------+--------------------+-------+----------------------+---------+-------------+----------+-----------+-------------+------+---------+---------+--------+-----+-------+----------+-------------+---------+----------+---------------------------+
|    ParksID|  Borough|InspectionID|TotalFenceLength|TotalSidewalkArea|TotalSidewalkLength|OnSiteService|HydrantW_in5m|HydrantOnGardenSide|RainHarvesting|        RainLitres|SolarPanels|          Composting| Plants|OpenLawnOrCommunalArea|PavedArea|TreesInGarden|FruitTrees|StreetTrees|EmptyTreePits|Murals|BlankShed|ParksSign|Chickens| Pond|Turtles|Aquaponics|FarmersMarket|CSApickup|Greenhouse|StructureForSeasonExtension|
+-----------+---------+------------+----------------+-----------------+-------------------+-------------+-------------+-------------------+--------------+

In [16]:
# Inspect the table's columns and metadata; converting to pandas lets the
# notebook render the result as a regular table.
describe_query = """
    DESCRIBE FORMATTED areasverdes.infquintais_table
    """
spark.sql(describe_query).toPandas()

Unnamed: 0,col_name,data_type,comment
0,ParksID,string,
1,Borough,string,
2,InspectionID,int,
3,TotalFenceLength,float,
4,TotalSidewalkArea,float,
5,TotalSidewalkLength,float,
6,OnSiteService,boolean,
7,HydrantW_in5m,boolean,
8,HydrantOnGardenSide,boolean,
9,RainHarvesting,boolean,


In [17]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()