In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Default HDFS location for managed databases and tables; passed to Spark
# below as `spark.sql.warehouse.dir`.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Assemble the session builder: Hive support, a remote Hive metastore reached
# over thrift, and the Delta Lake SQL extension and catalog. The chain is
# wrapped in parentheses so no line depends on a trailing backslash.
# NOTE(review): configure_spark_with_delta_pip (presumably provided by the
# wildcard `from delta import *`) appears to set `spark.jars.packages` itself
# from the installed delta package version, which would override the 1.0.0
# pin given here -- confirm against the Delta Lake documentation.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Create the session (or reuse one that is already active in this kernel).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Ask Delta to write a symlink-format manifest for the numero_jardins gold
# table; the external table defined later in this notebook points at the
# resulting `_symlink_format_manifest/` directory.
# NOTE(review): the manifest presumably reflects the table only as of this
# run and needs regenerating after the Delta table changes -- confirm.
jardins_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_jardins`
"""
spark.sql(jardins_manifest_sql).show()

++
||
++
++



In [3]:
# Drop any previous definition of the numero_jardins_presto table.
# NOTE(review): the next cell starts with this exact same DROP statement, so
# this cell looks redundant -- consider removing it.
drop_jardins_presto_sql = """
DROP TABLE IF EXISTS AreasVerdes.numero_jardins_presto
"""
spark.sql(drop_jardins_presto_sql).show()

++
||
++
++



In [4]:
# Recreate the external table AreasVerdes.numero_jardins_presto over the
# symlink manifest directory of the numero_jardins Delta table (presumably so
# Presto can query it, going by the table-name suffix).
drop_jardins_presto_sql = """
DROP TABLE IF EXISTS AreasVerdes.numero_jardins_presto
"""

# Parquet SerDe combined with SymlinkTextInputFormat: the table reads the
# files listed in the manifest.
# NOTE(review): the column list presumably has to match the Delta table's
# schema (names and types) -- verify against the job that writes the table.
create_jardins_presto_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS AreasVerdes.numero_jardins_presto (
    Borough STRING,
      number_gardens INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_jardins/_symlink_format_manifest/'
"""

# Drop first, then create; each statement's result is shown.
for ddl_statement in (drop_jardins_presto_sql, create_jardins_presto_sql):
    spark.sql(ddl_statement).show()

++
||
++
++

++
||
++
++



In [5]:
# Ask Delta to write a symlink-format manifest for the numero_plantas gold
# table; the external table defined later in this notebook points at the
# resulting `_symlink_format_manifest/` directory.
plantas_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_plantas`
"""
spark.sql(plantas_manifest_sql).show()

++
||
++
++



In [6]:
# Drop any previous definition of the numero_plantas_presto table.
# NOTE(review): the next cell starts with this exact same DROP statement, so
# this cell looks redundant -- consider removing it.
drop_plantas_presto_sql = """
DROP TABLE IF EXISTS AreasVerdes.numero_plantas_presto
"""
spark.sql(drop_plantas_presto_sql).show()

++
||
++
++



In [7]:
# Recreate the external table AreasVerdes.numero_plantas_presto over the
# symlink manifest directory of the numero_plantas Delta table (presumably so
# Presto can query it, going by the table-name suffix).
drop_plantas_presto_sql = """
DROP TABLE IF EXISTS AreasVerdes.numero_plantas_presto
"""

# Parquet SerDe combined with SymlinkTextInputFormat: the table reads the
# files listed in the manifest.
# NOTE(review): the column list presumably has to match the Delta table's
# schema (names and types) -- verify against the job that writes the table.
create_plantas_presto_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS AreasVerdes.numero_plantas_presto (
  Borough STRING,
  Plants STRING,
  number_plants INT
    
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_plantas/_symlink_format_manifest/'
"""

# Drop first, then create; each statement's result is shown.
for ddl_statement in (drop_plantas_presto_sql, create_plantas_presto_sql):
    spark.sql(ddl_statement).show()

++
||
++
++

++
||
++
++



In [8]:
# Ask Delta to write a symlink-format manifest for the numero_compostagem
# gold table; the external table defined in the next cell points at the
# resulting `_symlink_format_manifest/` directory.
compostagem_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_compostagem`
"""
spark.sql(compostagem_manifest_sql).show()

++
||
++
++



In [9]:
# Recreate the external table AreasVerdes.numero_compostagem_presto over the
# symlink manifest directory of the numero_compostagem Delta table
# (presumably so Presto can query it, going by the table-name suffix).
drop_compostagem_presto_sql = """
DROP TABLE IF EXISTS AreasVerdes.numero_compostagem_presto
"""

# Parquet SerDe combined with SymlinkTextInputFormat: the table reads the
# files listed in the manifest.
# NOTE(review): the column list presumably has to match the Delta table's
# schema (names and types) -- verify against the job that writes the table.
create_compostagem_presto_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS AreasVerdes.numero_compostagem_presto (
 Borough STRING,
      Composting STRING,
      number_compost INT
    
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_compostagem/_symlink_format_manifest/'
"""

# Drop first, then create; each statement's result is shown.
for ddl_statement in (drop_compostagem_presto_sql, create_compostagem_presto_sql):
    spark.sql(ddl_statement).show()

++
||
++
++

++
||
++
++



In [10]:
# Ask Delta to write a symlink-format manifest for the area_sidewalk gold
# table; the external table defined in the next cell points at the resulting
# `_symlink_format_manifest/` directory.
sidewalk_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/area_sidewalk`
"""
spark.sql(sidewalk_manifest_sql).show()

++
||
++
++



In [11]:
# Recreate the external table AreasVerdes.area_sidewalk_presto over the
# symlink manifest directory of the area_sidewalk Delta table (presumably so
# Presto can query it, going by the table-name suffix).
drop_sidewalk_presto_sql = """
DROP TABLE IF EXISTS AreasVerdes.area_sidewalk_presto
"""

# Parquet SerDe combined with SymlinkTextInputFormat: the table reads the
# files listed in the manifest.
# NOTE(review): area_sidewalk is declared FLOAT; this presumably has to match
# the column type stored in the Delta table -- verify against the writer.
create_sidewalk_presto_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS AreasVerdes.area_sidewalk_presto (
 Borough STRING,
area_sidewalk FLOAT
    
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/area_sidewalk/_symlink_format_manifest/'
"""

# Drop first, then create; each statement's result is shown.
for ddl_statement in (drop_sidewalk_presto_sql, create_sidewalk_presto_sql):
    spark.sql(ddl_statement).show()

++
||
++
++

++
||
++
++



In [12]:
# Ask Delta to write a symlink-format manifest for the
# avg_capacity_capturesystem gold table; the external table defined in the
# next cell points at the resulting `_symlink_format_manifest/` directory.
capturesystem_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/avg_capacity_capturesystem`
"""
spark.sql(capturesystem_manifest_sql).show()

++
||
++
++



In [13]:
# Recreate the external table AreasVerdes.avg_capacity_capturesystem_presto
# over the symlink manifest directory of the avg_capacity_capturesystem Delta
# table (presumably so Presto can query it, going by the table-name suffix).
drop_capturesystem_presto_sql = """
DROP TABLE IF EXISTS AreasVerdes.avg_capacity_capturesystem_presto
"""

# Parquet SerDe combined with SymlinkTextInputFormat: the table reads the
# files listed in the manifest.
# NOTE(review): avg_capacity_capturesystem is declared FLOAT; this presumably
# has to match the column type stored in the Delta table -- verify against
# the writer.
create_capturesystem_presto_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS AreasVerdes.avg_capacity_capturesystem_presto (
 Borough STRING,
avg_capacity_capturesystem FLOAT
    
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/avg_capacity_capturesystem/_symlink_format_manifest/'
"""

# Drop first, then create; each statement's result is shown.
for ddl_statement in (drop_capturesystem_presto_sql, create_capturesystem_presto_sql):
    spark.sql(ddl_statement).show()

++
||
++
++

++
||
++
++

