In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit, substring, avg, sum, count, countDistinct

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [11]:
goldBlock = spark.table("AreasVerdes.BlockLot_Table")

In [12]:
goldLocQuintais = spark.table("AreasVerdes.LocQuintais_Table")

In [13]:
# join to make a flat table with product information
flat_LocQuintais = goldLocQuintais \
    .join(goldBlock, goldLocQuintais.parksid1 == goldBlock.parksid)

In [14]:
flat_LocQuintais = flat_LocQuintais.select("assemblydist","borough","communityboard","congressionaldist","coundist","gardenname", "parksid1", "statesenatedist", "status","zipcode", "lotsize", "lotnum", "areacovered")

In [15]:
flat_LocQuintais.toPandas()

Unnamed: 0,assemblydist,borough,communityboard,congressionaldist,coundist,gardenname,parksid1,statesenatedist,status,zipcode,lotsize,lotnum,areacovered
0,79,Bronx,201,15,17,St. Ann's Block Garden Association,X345-GT001,32,Active,10455,70,104.0,Full Lot
1,79,Bronx,201,15,17,St. Ann's Block Garden Association,X345-GT001,32,Active,10455,16,16.0,Full Lot
2,79,Bronx,201,15,17,St. Ann's Block Garden Association,X345-GT001,32,Active,10455,1119,17.0,Full Lot
3,60,Brooklyn,316,9,42,Gethsemane Garden,B436-GT001,19,Inactive (Group Forming),11212,0,30.0,Unknown
4,56,Brooklyn,303,8,36,Hart to Hart Community Garden,B474-GT001,25,Active,11206,0,123.0,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,29,Queens,412,5,27,Senior Citizen Garden Club,Q448-GT001,14,Not GreenThumb,11434,0,180.0,Unknown
1037,56,Brooklyn,308,9,36,Garden Kitchen Lab at St. John's Rec.,B245-GT001,25,Not GreenThumb,11213,900,1.0,Partial Lot
1038,56,Brooklyn,303,8,36,Von King Park and Cultural Center Garden,B088-GT001,25,Not GreenThumb,11216,0,1.0,Unknown
1039,69,Manhattan,107,10,6,Garden People,M071-GT001,31,Not GreenThumb,10024,0,10.0,Unknown


In [16]:
park_bairro = flat_LocQuintais \
    .groupBy("borough") \
    .agg(
        count(flat_LocQuintais.parksid1).alias("number_park"),
        avg(flat_LocQuintais.lotsize).alias("avg_lotsize"),
    )

park_bairro= park_bairro.withColumn("number_park",col("number_park").cast(IntegerType())) 
park_bairro= park_bairro.withColumn("avg_lotsize",col("avg_lotsize").cast(IntegerType())) 
park_bairro.toPandas()

Unnamed: 0,borough,number_park,avg_lotsize
0,Queens,74,1317
1,Brooklyn,483,1418
2,Staten Island,21,0
3,Manhattan,247,505
4,Bronx,216,594


In [17]:
park_bairro \
    .select("borough", "number_park", "avg_lotsize") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/park_number_lotsizeAVG")

In [18]:
spark.sql(
    """
    SELECT *
    FROM AreasVerdes.park_number_lotsizeAVG
    """
).show()

+-------------+-----------+-----------+
|      borough|number_park|avg_lotsize|
+-------------+-----------+-----------+
|       Queens|         74|       1317|
|     Brooklyn|        483|       1418|
|Staten Island|         21|          0|
|    Manhattan|        247|        505|
|        Bronx|        216|        594|
+-------------+-----------+-----------+



In [19]:
status = flat_LocQuintais \
    .groupBy ("borough", "status") \
    .agg(
        count(flat_LocQuintais.status).alias("numberStatus")
)
status = status.withColumn("numberStatus",col("numberStatus").cast(IntegerType()))
status.toPandas()

Unnamed: 0,borough,status,numberStatus
0,Bronx,Active (Unlicensed),2
1,Brooklyn,Inactive (No Group),2
2,Staten Island,Active,7
3,Brooklyn,Not GreenThumb,18
4,Queens,Inactive (No Group),5
5,Queens,Not GreenThumb,6
6,Staten Island,Closed (Other),9
7,Manhattan,Closed (Construction),3
8,Queens,Inactive (Group Forming),2
9,Brooklyn,Inactive (Group Forming),3


In [20]:
status \
    .select("borough", "status", "numberStatus") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/status")

In [21]:
areacovered = flat_LocQuintais \
    .groupBy( "borough","areacovered") \
    .agg(
        count(flat_LocQuintais.parksid1).alias("number_park"), 
    )

areacovered= areacovered.withColumn("number_park",col("number_park").cast(IntegerType()))
areacovered.toPandas()

Unnamed: 0,borough,areacovered,number_park
0,Bronx,Full Lot,17
1,Brooklyn,Full Lot,152
2,Queens,Full Lot,5
3,Queens,Partial Lot,6
4,Staten Island,Unknown,21
5,Queens,Unknown,63
6,Brooklyn,Partial Lot,15
7,Brooklyn,Unknown,316
8,Bronx,Partial Lot,10
9,Bronx,Unknown,189


In [22]:
areacovered \
    .select("borough", "areacovered", "number_park") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/areacovered")