In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
spark.sql(
    """
    DROP TABLE IF EXISTS AreasVerdes.park_number_lotsizeAVG
    """
)
spark.sql(
    """
    CREATE EXTERNAL TABLE AreasVerdes.park_number_lotsizeAVG (
      borough STRING,
      number_park INT,
      avg_lotsize INT
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/park_number_lotsizeAVG'
    """
)

DataFrame[]

In [3]:
spark.sql(
    """
    SELECT *
    FROM AreasVerdes.park_number_lotsizeAVG
    """
).show()

+-------------+-----------+-----------+
|      borough|number_park|avg_lotsize|
+-------------+-----------+-----------+
|       Queens|         74|       1317|
|     Brooklyn|        483|       1418|
|Staten Island|         21|          0|
|    Manhattan|        247|        505|
|        Bronx|        216|        594|
+-------------+-----------+-----------+



In [4]:
spark.sql(
    """
    DROP TABLE IF EXISTS AreasVerdes.areacovered
    """
)
spark.sql(
    """
    CREATE EXTERNAL TABLE AreasVerdes.areacovered (
      borough STRING,
      areacovered STRING,
      number_park INT
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/areacovered'
    """
)

DataFrame[]

In [5]:
spark.sql(
    """
    SELECT *
    FROM AreasVerdes.areacovered
    """
).show()

+-------------+-----------+-----------+
|      borough|areacovered|number_park|
+-------------+-----------+-----------+
|        Bronx|   Full Lot|         17|
|     Brooklyn|   Full Lot|        152|
|       Queens|   Full Lot|          5|
|       Queens|Partial Lot|          6|
|Staten Island|    Unknown|         21|
|       Queens|    Unknown|         63|
|     Brooklyn|Partial Lot|         15|
|     Brooklyn|    Unknown|        316|
|        Bronx|Partial Lot|         10|
|        Bronx|    Unknown|        189|
|    Manhattan|Partial Lot|         10|
|    Manhattan|    Unknown|        216|
|    Manhattan|   Full Lot|         21|
+-------------+-----------+-----------+



In [21]:
spark.sql(
    """
    DROP TABLE IF EXISTS AreasVerdes.status
    """
)
spark.sql(
    """
    CREATE EXTERNAL TABLE AreasVerdes.status (
      borough STRING,
      status STRING,
      numberStatus INT
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/status'
    """
)

DataFrame[]

In [19]:
spark.sql(
    """
    DROP TABLE IF EXISTS AreasVerdes.status
    """
)

DataFrame[]

In [18]:
spark.sql(
    """
    SELECT Distinct status
    FROM AreasVerdes.status
    WHERE borough == "Queens"
    
    """
).show()

+--------------------+
|              status|
+--------------------+
|              Active|
|Inactive (Group F...|
|      Not GreenThumb|
| Inactive (No Group)|
+--------------------+



In [3]:
spark.sql(
    """
    DROP TABLE IF EXISTS AreasVerdes.potential
    """
)
spark.sql(
    """
    CREATE EXTERNAL TABLE AreasVerdes.potential (
      Borough STRING,
      Potencial_Urban_Ag STRING,
      Potencial INT,
      Total_Area INT
        
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/potential'
    """
)

DataFrame[]

In [2]:
spark.sql(
    """
    DROP TABLE IF EXISTS AreasVerdes.potential
    """
)

DataFrame[]