In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit, substring, avg, sum, count, countDistinct

# Default HDFS location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

# Session builder: local master with two threads, Hive metastore support,
# and the Delta Lake SQL extension and catalog registered.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Pass the builder through the Delta helper, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Catalog table that every aggregation below reads from.
SOURCE_TABLE = "AreasVerdes.edificiosverdes_Table"
edificiosverdes_gold = spark.table(SOURCE_TABLE)

In [3]:
# Number of trees per borough and tree species.
numero_arvores_gold = (
    edificiosverdes_gold
    .groupBy("borough", "Tree_Commo")
    .agg(count(edificiosverdes_gold.Tree_Commo).alias("numero_arvores"))
    # Store the count as an IntegerType column.
    .withColumn("numero_arvores", col("numero_arvores").cast(IntegerType()))
)

numero_arvores_gold.show()

+--------+--------------------+--------------+
| borough|          Tree_Commo|numero_arvores|
+--------+--------------------+--------------+
|  Queens|October Glory Red...|            14|
|  Queens|        Chestnut Oak|             2|
|Brooklyn|Summer Glow Bird ...|             4|
|   Bronx|   Sweetbay Magnolia|             6|
|   Bronx|     Dura-Heat Birch|             2|
|Brooklyn|         Field Maple|             5|
|  Queens|Não temos informação|           378|
|Brooklyn|Fastigiata Europe...|             1|
|Brooklyn|Spring Snow Craba...|             3|
|  Queens|      Eastern Redbud|            83|
|Brooklyn|Regent Japanese P...|             2|
|   Bronx|   Hardy Rubber Tree|             3|
|  Queens|  Crimson Point Plum|             4|
|   Bronx|Emerald Avenue Ho...|             1|
|Brooklyn|        Sawtooth Oak|            45|
|   Bronx|Village Green Zel...|             1|
|Brooklyn|Thornless Common ...|            82|
|Brooklyn|   Eastern Black Oak|             5|
|   Bronx|   

In [4]:
# Number of records per borough and construction status.
# Note: the "estado" column holds that count, not the status text itself.
estado_construcao_gold = (
    edificiosverdes_gold
    .groupBy("borough", "Status")
    .agg(count(edificiosverdes_gold.Status).alias("estado"))
    # Store the count as an IntegerType column.
    .withColumn("estado", col("estado").cast(IntegerType()))
)

estado_construcao_gold.show()

+-------------+--------------------+------+
|      borough|              Status|estado|
+-------------+--------------------+------+
|       Queens|         Constructed|   176|
|    Manhattan|Constructed (Full...|     6|
|       Queens|100% Design Accepted|   214|
|     Brooklyn|Constructed (Full...|  3539|
|        Bronx|Constructed (In G...|   353|
|        Bronx|Constructed (Full...|   378|
|Staten Island|         Constructed|    39|
|        Bronx|         Constructed|   190|
|        Bronx|100% Design Submi...|     5|
|       Queens|     In Construction|   788|
|     Brooklyn|100% Design Submi...|   659|
|        Bronx|90% Design Submitted|     1|
|    Manhattan|100% Design Accepted|     6|
|       Queens|Constructed (In G...|  1663|
|        Bronx|100% Design Accepted|    14|
|       Queens|Constructed (Full...|  4142|
|     Brooklyn|100% Design Accepted|   684|
|    Manhattan|     In Construction|     2|
|     Brooklyn|Constructed (In G...|    28|
|    Manhattan|         Construc

In [5]:
# Per-borough average of BBL ("valor_medio") and total of Asset_leng
# ("area_edificiosverdes").
# NOTE: the original author flagged this statistic as still wrong.
# NOTE(review): BBL looks like a parcel identifier rather than a monetary
# value, so its average is probably not a meaningful lot value — confirm which
# column actually holds the lot value. Likewise Asset_leng is summed under an
# "area" alias; verify that it really is an area.
media_valor_gold = (
    edificiosverdes_gold
    .groupBy("borough")
    .agg(
        avg(edificiosverdes_gold.BBL).alias("valor_medio"),
        sum(edificiosverdes_gold.Asset_leng).alias("area_edificiosverdes"),
    )
    # Only the summed column is cast; valor_medio keeps the type avg() produced
    # (the former self-assignment of valor_medio was a no-op and is dropped).
    .withColumn("area_edificiosverdes", col("area_edificiosverdes").cast(IntegerType()))
)

media_valor_gold.show()

+-------------+--------------------+--------------------+
|      borough|         valor_medio|area_edificiosverdes|
+-------------+--------------------+--------------------+
|       Queens|3.7936124765423098E9|               36503|
|     Brooklyn|2.8256555587891145E9|               27176|
|Staten Island| 4.654986778883938E9|                 213|
|    Manhattan| 9.445894957021874E8|                 259|
|        Bronx|1.9086224215276945E9|                6112|
+-------------+--------------------+--------------------+



In [6]:
# Persist the tree counts in Delta format, overwriting any previous contents.
(
    numero_arvores_gold
    .select("borough", "Tree_Commo", "numero_arvores")
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/edificiosverdes/numero_arvores")
)

In [7]:
# Persist the construction-status counts in Delta format, overwriting any
# previous contents.
(
    estado_construcao_gold
    .select("borough", "Status", "estado")
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/edificiosverdes/estado_construcao")
)

In [8]:
# Persist the per-borough average lot value and summed area in Delta format,
# overwriting any previous contents.
(
    media_valor_gold
    .select("borough", "valor_medio", "area_edificiosverdes")
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/edificiosverdes/valor_medio_lote_e_area")
)