In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import when, col, concat, lit, substring, avg, sum, count, countDistinct, split

# Default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

# Session builder: local master with 2 worker threads, Hive metastore access,
# and the Delta Lake SQL extension + catalog.
# NOTE(review): configure_spark_with_delta_pip (below) may set
# "spark.jars.packages" itself -- confirm whether the explicit pin here
# still takes effect.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip is not imported by name in this file;
# presumably it is provided by `from delta import *`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Read the catalog table AreasVerdes.GreenStreets as a DataFrame; it is the
# input of the density computation in the next cell.
gs_gold = spark.table("AreasVerdes.GreenStreets")

In [None]:
# Borough with the highest density of GreenStreets

# Divisor applied to `hectares` for each Borough code, listed in the order
# the conditions are evaluated.
# NOTE(review): the output column name suggests these are borough areas in
# km2 -- confirm the source of the figures.
borough_divisors = [
    ("Q", 281.5),
    ("B", 179.7),
    ("R", 148.9),
    ("M", 58.8),
    ("X", 109.3),
]

# Cast `hectares` to float so it can be used in the division below.
gs_dens_bairro = gs_gold.withColumn("hectares", col("hectares").cast(FloatType()))

# Build one CASE WHEN branch per borough code. There is no otherwise(), so
# rows whose Borough is not listed get NULL in the new column.
density_expr = None
for borough_code, divisor in borough_divisors:
    matches = gs_dens_bairro.Borough == borough_code
    share = gs_dens_bairro.hectares / divisor
    density_expr = when(matches, share) if density_expr is None else density_expr.when(matches, share)

gs_dens_bairro_final = (
    gs_dens_bairro
    # New column: each row's hectares divided by the divisor of its borough.
    .withColumn("Hectares_por_km2", density_expr)
    # Keep only the rows whose FeatureStatus is 'Active'.
    .filter(col("FeatureStatus") == "Active")
    # Cast these columns to strings for easier analysis in Tableau.
    .withColumn("CommunityBoard", col("CommunityBoard").cast(StringType()))
    .withColumn("Precinct", col("Precinct").cast(StringType()))
    .withColumn("ZipCode", col("ZipCode").cast(StringType()))
)



In [None]:
# Save the columns selected for analysis as a Delta dataset, overwriting
# whatever already exists at the target path.
gold_columns = ["Hectares", "CommunityBoard", "Precinct", "ZipCode", "Borough", "Hectares_por_km2"]
gold_path = "hdfs://hdfs-nn:9000/AreasVerdes/Gold/gs_gold"

(
    gs_dens_bairro_final
    .select(*gold_columns)
    .write
    .format("delta")
    .mode("overwrite")
    .save(gold_path)
)

In [None]:
# Stop the Spark session created in the first cell; `spark` cannot be used
# for further queries after this call.
spark.stop()