In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit, substring, avg, sum, count, countDistinct

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
arvores_gold = spark.table("AreasVerdes.arvores")

In [3]:
#Cria a coluna problems_general e cria um count para essa coluna
arvores_problems_gold = arvores_gold.select("year","boroname", "problems_canopy_general","problems_guard_general","problems_wires_general","problems_paving_general","problems_outlet_general","problems_shoes_general","problems_lights_general","problems_trunk_general","problems_trunk_light","problems_trunk_wire","problems_trunk_other","problems_root_stone","problems_root_grate","problems_root_other", "problems_branch_light", "problems_branch_shoe","problems_branch_other")
arvores_problems_gold = arvores_problems_gold.withColumn("problems_general", lit("None"))
arvores_problems_gold = arvores_problems_gold.withColumn("problems_general", when((arvores_problems_gold.problems_guard_general == "Yes") & (arvores_problems_gold.problems_wires_general == "Yes") & (arvores_problems_gold.problems_shoes_general== "Yes") & (arvores_problems_gold.problems_lights_general== "Yes") & (arvores_problems_gold.problems_trunk_general== "Yes") & (arvores_problems_gold.problems_root_stone== "Yes") & (arvores_problems_gold.problems_root_grate== "Yes") & (arvores_problems_gold.problems_branch_light== "Yes") & (arvores_problems_gold.problems_branch_shoe== "Yes") ,"all") \
      .when((arvores_problems_gold.problems_root_stone== "Yes") & (arvores_problems_gold.problems_lights_general== "Yes") & (arvores_problems_gold.problems_wires_general== "Yes") ,"Wires, Light, Stones" ) \
      .when((arvores_problems_gold.problems_root_stone== "Yes") & (arvores_problems_gold.problems_trunk_general== "Yes") & (arvores_problems_gold.problems_wires_general== "Yes") ,"Wires, Trunk, Stones" ) \
      .when((arvores_problems_gold.problems_root_stone== "Yes") & (arvores_problems_gold.problems_lights_general== "Yes") & (arvores_problems_gold.problems_trunk_general== "Yes") & (arvores_problems_gold.problems_wires_general== "Yes") ,"Trunk, Light, Stones, Wires" )                              
      .when((arvores_problems_gold.problems_lights_general== "Yes") & (arvores_problems_gold.problems_trunk_general== "Yes") ,"Trunk, Light" ) \
      .when((arvores_problems_gold.problems_root_stone== "Yes") & (arvores_problems_gold.problems_trunk_general== "Yes") ,"Trunk, Stone" ) \
      .when((arvores_problems_gold.problems_lights_general== "Yes") & (arvores_problems_gold.problems_root_stone== "Yes") ,"Stone, Light" ) \
      .when((arvores_problems_gold.problems_lights_general== "Yes") & (arvores_problems_gold.problems_wires_general== "Yes") ,"Wires, Light" ) \
      .when((arvores_problems_gold.problems_wires_general== "Yes") & (arvores_problems_gold.problems_root_stone== "Yes") ,"Stone, Wires" ) \
      .when((arvores_problems_gold.problems_trunk_general== "Yes") & (arvores_problems_gold.problems_wires_general== "Yes") ,"Trunk, Wires" ) \
      .when(arvores_problems_gold.problems_shoes_general== "Yes" ,"Shoes" ) \
      .when(arvores_problems_gold.problems_lights_general== "Yes" ,"Light" ) \
      .when(arvores_problems_gold.problems_root_stone== "Yes" ,"Stone" ) \
      .when(arvores_problems_gold.problems_trunk_general== "Yes" ,"Trunk" ) \
      .when(arvores_problems_gold.problems_wires_general== "Yes" ,"Wires" ) \
      .when(arvores_problems_gold.year== 1995 ,"Unknown" ) \
      .otherwise(arvores_problems_gold.problems_general))

arvores_problems_gold = arvores_problems_gold \
    .groupBy("year", "boroname", "problems_general") \
    .agg(
        count(arvores_problems_gold.problems_general).alias("number_problems"),
    )
arvores_problems_gold = arvores_problems_gold.withColumn("number_problems",col("number_problems").cast(IntegerType())) 


In [5]:
# estatisticas para as species
arvores_species_gold = arvores_gold \
    .groupBy("year", "boroname", "comun_species_name") \
    .agg(
        count(arvores_gold.comun_species_name).alias("number_species"),
    )
arvores_species_gold = arvores_species_gold.withColumn("number_species",col("number_species").cast(IntegerType())) 

In [3]:
# estatisticas para os passeios
arvores_sidewalk_gold = arvores_gold \
    .groupBy("year", "boroname", "sidewalk") \
    .agg(
        count(arvores_gold.sidewalk).alias("number_sidewalk"),
    )

arvores_sidewalk_gold = arvores_sidewalk_gold.withColumn("number_sidewalk",col("number_sidewalk").cast(IntegerType())) 

In [4]:
# estatisticas para a saude das arvores
arvores_health_gold = arvores_gold \
    .groupBy("year", "boroname", "health") \
    .agg(
        count(arvores_gold.health).alias("number_health"),
    )

arvores_health_gold = arvores_health_gold.withColumn("number_health",col("number_health").cast(IntegerType())) 

In [5]:
# estatisticas para o numero das arvores
arvores_year_gold = arvores_gold \
    .groupBy("year") \
    .agg(
        count(arvores_gold.health).alias("number_year"),
    )

arvores_year_gold = arvores_year_gold.withColumn("number_year",col("number_year").cast(IntegerType())) 
arvores_year_gold = arvores_year_gold.withColumn("objective", lit("316212"))
arvores_year_gold = arvores_year_gold.withColumn("objective", when(arvores_year_gold.year == 2005,407611) \
      .when(arvores_year_gold.year == 1995,483011) \
      .otherwise(arvores_year_gold.objective))
arvores_year_gold = arvores_year_gold.withColumn("objective",col("objective").cast(IntegerType())) 

+----+-----------+---------+
|year|number_year|objective|
+----+-----------+---------+
|2015|     683788|   316212|
|2005|     592389|   407611|
|1995|     516989|   483011|
+----+-----------+---------+



In [6]:
#guarda os anos
arvores_year_gold \
    .select("year","number_year","objective") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/arvores_ano")

In [9]:
#guarda a saude
arvores_health_gold \
    .select("year","boroname","health","number_health") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/arvores_health")

In [10]:
#guarda a sidewalk
arvores_sidewalk_gold \
    .select("year","boroname","sidewalk","number_sidewalk") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/arvores_sidewalk")

In [6]:
#guarda as species
arvores_species_gold \
    .select("year","boroname","comun_species_name","number_species") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/arvores_species")

In [5]:
#guardar problemas
arvores_problems_gold \
    .select("year","boroname","problems_general","number_problems") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/arvores_problems")

In [6]:
spark.sql(
    """
    SELECT *
    FROM AreasVerdes.arvores_problems
    """
).show()

+----+-------------+--------------------+---------------+
|year|     boroname|    problems_general|number_problems|
+----+-------------+--------------------+---------------+
|2015|    Manhattan|Wires, Trunk, Stones|            210|
|2015|        Bronx|               Light|           4674|
|2005|    Manhattan|        Wires, Light|             38|
|2015|Staten Island|                None|          87481|
|2015|       Queens|               Light|          15313|
|2015|Staten Island|Wires, Light, Stones|            239|
|2015|    Manhattan|        Stone, Light|            185|
|2005|    Manhattan|        Trunk, Light|            267|
|2015|       Queens|               Stone|          39385|
|2005|        Bronx|               Trunk|          17934|
|2015|    Manhattan|               Stone|          10378|
|2005|     Brooklyn|               Shoes|            175|
|2015|     Brooklyn|        Trunk, Wires|           1682|
|2015|Staten Island|Wires, Trunk, Stones|             88|
|2015|    Manh

In [7]:
spark.stop()