In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import when, col, concat, lit, substring, avg, sum, count, countDistinct

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
infquintais_gold = spark.table("AreasVerdes.infquintais_table")

In [4]:
tipo_compostagem_gold = infquintais_gold \
    .groupBy("Borough","Composting") \
    .agg(
        count(infquintais_gold.Composting).alias("number_compost"),
    )

tipo_compostagem_gold = tipo_compostagem_gold.withColumn("number_compost",col("number_compost").cast(IntegerType())) 

In [5]:
passeios_filtrados = infquintais_gold.filter((infquintais_gold.TotalSidewalkArea > 0))
area_passeios_gold = passeios_filtrados \
    .groupBy("Borough") \
    .agg(
        avg(infquintais_gold.TotalSidewalkArea).alias("area_sidewalk"),
    )

area_passeios_gold = area_passeios_gold.withColumn("area_sidewalk",col("area_sidewalk").cast(FloatType()))

In [6]:
capturaagua_filtrados = infquintais_gold.filter((infquintais_gold.RainHarvesting == True))
media_capturaagua_gold = capturaagua_filtrados \
    .groupBy("Borough") \
    .agg(
        avg(infquintais_gold.RainLitres).alias("avg_capacity_capturesystem"),
    )
media_capturaagua_gold = media_capturaagua_gold.withColumn("avg_capacity_capturesystem",col("avg_capacity_capturesystem").cast(FloatType()))

In [7]:
tipo_plantas_gold = infquintais_gold \
    .groupBy("Borough","Plants") \
    .agg(
        count(infquintais_gold.Plants).alias("number_plants"),
    )

tipo_plantas_gold = tipo_plantas_gold.withColumn("number_plants",col("number_plants").cast(IntegerType()))

In [4]:
tipo_plantas_gold.printSchema();

NameError: name 'tipo_plantas_gold' is not defined

In [6]:
#guarda os anos
numero_fontesagua_gold \
    .select("Borough","OnSiteService","number_onsiteservices") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/number_onsiteservices")

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: fd07e06d-21bb-4a25-b759-926c87ba5f72).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- Borough: string (nullable = true)
-- number_onsiteservices: integer (nullable = true)


Data schema:
root
-- Borough: string (nullable = true)
-- OnSiteService: boolean (nullable = true)
-- number_onsiteservices: integer (nullable = true)

         
To overwrite your schema or change partitioning, please set:
'.option("overwriteSchema", "true")'.

Note that the schema can't be overwritten when using
'replaceWhere'.
         

In [10]:
tipo_plantas_gold \
    .select("Borough","Plants","number_plants") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_plantas")

In [11]:
tipo_compostagem_gold \
    .select("Borough","Composting","number_compost") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/numero_compostagem")

In [12]:
area_passeios_gold \
    .select("Borough","area_sidewalk") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/area_sidewalk")

In [13]:
media_capturaagua_gold \
    .select("Borough","avg_capacity_capturesystem") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/infquintais/avg_capacity_capturesystem")

In [1]:
spark.sql(
    """
    SELECT *
    FROM AreasVerdes.avg_capacity_capturesystem
    """
).show()

NameError: name 'spark' is not defined