In [1]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
#ver data bases existentes
spark.sql(
    """
    SHOW DATABASES
    """
).show()

+-----------+
|  namespace|
+-----------+
|areasverdes|
|    default|
|       demo|
+-----------+



In [3]:
#criar nova database
spark.sql(
    """
    CREATE DATABASE AreasVerdes LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse/AreasVerdes.db/'
    """
)

AnalysisException: Namespace 'AreasVerdes' already exists

In [None]:
#ver se a database foi criada com sucesso
spark.sql(
    """
    SHOW DATABASES
    """
).show()

In [None]:
#tables da database que criei
spark.sql(
    """
    SHOW TABLES FROM AreasVerdes
    """
).show()

In [None]:
#crio a tabela
spark.sql(
    """
    DROP TABLE IF EXISTS AreasVerdes.edificiosverdes_Table
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE AreasVerdes.edificiosverdes_Table (
   The_geom STRING,      
   Asset_Id INTEGER,
   GI_ID STRING,
   DEP_ContrA STRING,
   Project_Ty STRING,
   Row_Onsite STRING,
   Project_Na STRING,
   Asset_Type STRING,
   Status STRING,
   Asset_X_Co FLOAT,
   Asset_Y_Co FLOAT,
   Borough STRING,
   Sewer_Type STRING,
   Outfall STRING,
   Waterbody STRING,
   BBL DOUBLE,
   Community INTEGER,
   City_Counc INTEGER,
   Assembly_D STRING,
   Asset_leng DOUBLE,
   Asset_Widt DOUBLE,
   Asset_Area DOUBLE,
   GI_Feature STRING,
   Tree_Latin STRING,
   Tree_Commo STRING,
   Constructi STRING,
   Status_Gro STRING
    )
    USING DELTA
    
    LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/silver/edificiosverdes'
    """
)

In [None]:
#tables da database que criei depois de criar a tabela
spark.sql(
    """
    SHOW TABLES FROM AreasVerdes
    """
).show()

In [4]:
#Verificar se a tabela foi carregada corretamente
spark.sql(
    """
    SELECT *
    FROM AreasVerdes.edificiosverdes_Table
    """
).show()

+--------------------+--------+-------+----------+----------+----------+--------------------+----------+--------------------+----------+----------+-------+----------+-------+--------------------+-----------------+---------+----------+--------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+------------+
|            The_geom|Asset_Id|  GI_ID|DEP_ContrA|Project_Ty|Row_Onsite|          Project_Na|Asset_Type|              Status|Asset_X_Co|Asset_Y_Co|Borough|Sewer_Type|Outfall|           Waterbody|              BBL|Community|City_Counc|          Assembly_D|        Asset_leng|        Asset_Widt|        Asset_Area|          GI_Feature|          Tree_Latin|          Tree_Commo|          Constructi|  Status_Gro|
+--------------------+--------+-------+----------+----------+----------+--------------------+----------+--------------------+----------+----------+-------+----------+-------+------

In [5]:
spark.stop()