In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
hdfs_path = "hdfs://hdfs-nn:9000/AreasVerdes/bronze/LocQuintais_csv/LocQuintais.csv"

In [4]:
# Create a DataFrame from JSON data (automatically infer schema and data types)
# There are other file formats you can read from (e.g., csv, orc, parquet)
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#data-sources

# Read Sillicon valley episodes data
customSchema = StructType([
    StructField("assemblydist", IntegerType(), True),
    StructField("borough", StringType(), True),
    StructField("communityboard", IntegerType(), True),
    StructField("congressionaldist", IntegerType(), True),
    StructField("coundist", IntegerType(), True),
    StructField("gardenname", StringType(), True),
    StructField("juris", StringType(), True),
    StructField("multipolygon", StringType(), True),
    StructField("openhrsf", StringType(), True),
    StructField("openhrsm", StringType(), True),
    StructField("openhrssa", StringType(), True),
    StructField("openhrssu", StringType(), True),
    StructField("openhrsth", StringType(), True),
    StructField("openhrstu", StringType(), True),
    StructField("openhrsw", StringType(), True),
    StructField("parksid1", StringType(), True),
    StructField("policeprecinct", StringType(), True),
    StructField("statesenatedist", IntegerType(), True),
    StructField("status", StringType(), True),
    StructField("zipcode", IntegerType(), True)
])

LocQuintais = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)


In [5]:
# Escolho as colunas que quero
LocQuintais = LocQuintais.select("assemblydist","borough","communityboard","congressionaldist","coundist","gardenname", "juris", "multipolygon", "openhrsf", "openhrsm", "openhrssa", "openhrssu", "openhrsth", "openhrstu", "openhrsw", "parksid1","policeprecinct","statesenatedist","status","zipcode")

In [24]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("juris", when(LocQuintais.juris == None,"Unknown") \
      .when(LocQuintais.juris == "None","Unknown") \
      .otherwise(LocQuintais.juris))

In [25]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("openhrsf", when(LocQuintais.openhrsf == None,"Unknown") \
      .when(LocQuintais.openhrsf == "None","Unknown") \
      .otherwise(LocQuintais.openhrsf))

In [26]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("openhrsm", when(LocQuintais.openhrsm == None,"Unknown") \
      .when(LocQuintais.openhrsm == "None","Unknown") \
      .otherwise(LocQuintais.openhrsm))

In [27]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("openhrssa", when(LocQuintais.openhrssa == None,"Unknown") \
      .when(LocQuintais.openhrssa == "None","Unknown") \
      .otherwise(LocQuintais.openhrssa))

In [28]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("openhrssu", when(LocQuintais.openhrssu == None,"Unknown") \
      .when(LocQuintais.openhrssu == "None","Unknown") \
      .otherwise(LocQuintais.openhrssu))

In [29]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("openhrsth", when(LocQuintais.openhrsth == None,"Unknown") \
      .when(LocQuintais.openhrsth == "None","Unknown") \
      .otherwise(LocQuintais.openhrsth))

In [30]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("openhrstu", when(LocQuintais.openhrstu == None,"Unknown") \
      .when(LocQuintais.openhrstu == "None","Unknown") \
      .otherwise(LocQuintais.openhrstu))

In [7]:
#substituir a informação das linhas para Unknown

LocQuintais = LocQuintais.withColumn("openhrsw", when(LocQuintais.openhrsw == None,"Unknown") \
      .when(LocQuintais.openhrsw == "None","Unknown") \
      .otherwise(LocQuintais.openhrsw))

In [32]:
#substituir a informação das linhas para Unknown
LocQuintais = LocQuintais.withColumn("policeprecinct", when(LocQuintais.policeprecinct == None,"Unknown") \
      .when(LocQuintais.policeprecinct == "None","Unknown") \
      .otherwise(LocQuintais.policeprecinct))

In [33]:
#substituição das iniciais pelos nomes dos respetivos bairros
LocQuintais = LocQuintais.withColumn("borough", when(LocQuintais.borough == "X","Bronx") \
      .when(LocQuintais.borough == "B","Brooklyn") \
      .when(LocQuintais.borough == "M","Manhattan") \
      .when(LocQuintais.borough == "Q", "Queens") \
      .when(LocQuintais.borough == "R", "Staten Island") \
      .otherwise(LocQuintais.borough))

In [34]:
#ver os possiveis resultados na coluna borough
LocQuintais.createOrReplaceTempView("LocQuintais")

sqlized_df = spark.sql(
    """
    SELECT Distinct borough
    FROM LocQuintais
    """
)

sqlized_df.show()

+-------------+
|      borough|
+-------------+
|       Queens|
|     Brooklyn|
|Staten Island|
|    Manhattan|
|        Bronx|
+-------------+



In [35]:
#ver os possiveis resultados na coluna gardenname
LocQuintais.createOrReplaceTempView("LocQuintais")

sqlized_df = spark.sql(
    """
    SELECT Distinct gardenname
    FROM LocQuintais
    """
)

sqlized_df.show()

+--------------------+
|          gardenname|
+--------------------+
|     Bayview Habitat|
|Oko Farms Aquapon...|
|Family Community ...|
|NYCHA - Hylan - H...|
|St. John's Place ...|
| The People's Garden|
|Morris Jumel Comm...|
|      Garden of Love|
|Neighbors of Vega...|
| Bette's Rose Garden|
|Roy Wilkins Commu...|
| Wyckoff Bond Garden|
|     Greene Garden .|
|       Garden People|
|        Dolly's Park|
|St. Luke's Commun...|
|      Garden of Eden|
|Lower East Side E...|
|   Relaxation Garden|
|Fred McLeod Commu...|
+--------------------+
only showing top 20 rows



In [17]:
LocQuintais.toPandas()

Unnamed: 0,assemblydist,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw,parksid1,policeprecinct,statesenatedist,status,zipcode
0,79,Bronx,201,15,17,St. Ann's Block Garden Association,Unknown,MULTIPOLYGON (((-73.91110273250362 40.81723264...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,X345-GT001,Unknown,32,Active,10455
1,60,Brooklyn,316,9,42,Gethsemane Garden,Unknown,MULTIPOLYGON (((-73.90957411598448 40.65888700...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,B436-GT001,Unknown,19,Inactive (Group Forming),11212
2,56,Brooklyn,303,8,36,Hart to Hart Community Garden,Unknown,MULTIPOLYGON (((-73.94706962955922 40.69329106...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,B474-GT001,Unknown,25,Active,11206
3,53,Brooklyn,301,7,34,El Puente: Espiritu Tierra Community Garden - ...,Unknown,MULTIPOLYGON (((-73.95967599680367 40.71246980...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,B424-GT001,Unknown,18,Active,11211
4,56,Brooklyn,308,9,36,Eden’s Community Garden,Unknown,MULTIPOLYGON (((-73.93250525668881 40.67469872...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,B540-GT001,Unknown,25,Active,11213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,29,Queens,412,5,27,Senior Citizen Garden Club,Unknown,MULTIPOLYGON (((-73.77112451557377 40.68779304...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Q448-GT001,Unknown,14,Not GreenThumb,11434
602,56,Brooklyn,308,9,36,Garden Kitchen Lab at St. John's Rec.,Unknown,MULTIPOLYGON (((-73.93500081856017 40.67391829...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,B245-GT001,Unknown,25,Not GreenThumb,11213
603,56,Brooklyn,303,8,36,Von King Park and Cultural Center Garden,Unknown,MULTIPOLYGON (((-73.9469681111235 40.689049288...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,B088-GT001,Unknown,25,Not GreenThumb,11216
604,69,Manhattan,107,10,6,Garden People,Unknown,MULTIPOLYGON (((-73.97899093812129 40.79309028...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,M071-GT001,Unknown,31,Not GreenThumb,10024


In [36]:
#write df to hive deltalake_table
#.select("customer_name","month", "sales", "year")  reorder columns to match parquet table sequence
LocQuintais \
    .select("assemblydist","borough","communityboard","congressionaldist","coundist","gardenname", "juris","multipolygon","openhrsf", "openhrsm", "openhrssa", "openhrssu", "openhrsth", "openhrstu", "openhrsw", "parksid1", "policeprecinct", "statesenatedist","status","zipcode") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/silver/LocQuintais")
    

In [94]:
LocQuintais.printSchema()

root
 |-- assemblydist: integer (nullable = true)
 |-- borough: string (nullable = true)
 |-- communityboard: integer (nullable = true)
 |-- congressionaldist: integer (nullable = true)
 |-- coundist: integer (nullable = true)
 |-- gardenname: string (nullable = true)
 |-- juris: string (nullable = false)
 |-- multipolygon: string (nullable = true)
 |-- openhrsf: string (nullable = false)
 |-- openhrsm: string (nullable = false)
 |-- openhrssa: string (nullable = false)
 |-- openhrssu: string (nullable = false)
 |-- openhrsth: string (nullable = false)
 |-- openhrstu: string (nullable = false)
 |-- openhrsw: string (nullable = false)
 |-- parksid1: string (nullable = true)
 |-- policeprecinct: string (nullable = false)
 |-- statesenatedist: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- zipcode: integer (nullable = true)



In [20]:
LocQuintais.show()

+------------+---------+--------------+-----------------+--------+--------------------+-------+--------------------+--------+--------+---------+---------+---------+---------+--------+----------+--------------+---------------+--------------------+-------+
|assemblydist|  borough|communityboard|congressionaldist|coundist|          gardenname|  juris|        multipolygon|openhrsf|openhrsm|openhrssa|openhrssu|openhrsth|openhrstu|openhrsw|   parksid|policeprecinct|statesenatedist|              status|zipcode|
+------------+---------+--------------+-----------------+--------+--------------------+-------+--------------------+--------+--------+---------+---------+---------+---------+--------+----------+--------------+---------------+--------------------+-------+
|          79|    Bronx|           201|               15|      17|St. Ann's Block G...|Unknown|MULTIPOLYGON (((-...| Unknown| Unknown|  Unknown|  Unknown|  Unknown|  Unknown| Unknown|X345-GT001|       Unknown|             32|          

In [76]:
spark.sql(
    """
    SELECT *
    FROM AreasVerdes.LocQuintais_Table
    """
).show()

+------------+---------+--------------+-----------------+--------+--------------------+-------+--------------------+--------+--------+---------+---------+---------+---------+--------+----------+--------------+---------------+--------------------+-------+
|assemblydist|  borough|communityboard|congressionaldist|coundist|          gardenname|  juris|        multipolygon|openhrsf|openhrsm|openhrssa|openhrssu|openhrsth|openhrstu|openhrsw|   parksid|policeprecinct|statesenatedist|              status|zipcode|
+------------+---------+--------------+-----------------+--------+--------------------+-------+--------------------+--------+--------+---------+---------+---------+---------+--------+----------+--------------+---------------+--------------------+-------+
|          79|    Bronx|           201|               15|      17|St. Ann's Block G...|Unknown|MULTIPOLYGON (((-...| Unknown| Unknown|  Unknown|  Unknown|  Unknown|  Unknown| Unknown|X345-GT001|       Unknown|             32|          

In [24]:
LocQuintais=LocQuintais.select("assemblydist","borough","communityboard","congressionaldist","coundist","gardenname","multipolygon","parksid","statesenatedist","status","zipcode")

In [26]:
LocQuintais.printSchema()
LocQuintais.toPandas()

root
 |-- assemblydist: integer (nullable = true)
 |-- borough: string (nullable = true)
 |-- communityboard: integer (nullable = true)
 |-- congressionaldist: integer (nullable = true)
 |-- coundist: integer (nullable = true)
 |-- gardenname: string (nullable = true)
 |-- multipolygon: string (nullable = true)
 |-- parksid: string (nullable = true)
 |-- statesenatedist: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- zipcode: integer (nullable = true)



Unnamed: 0,assemblydist,borough,communityboard,congressionaldist,coundist,gardenname,multipolygon,parksid,statesenatedist,status,zipcode
0,79,Bronx,201,15,17,St. Ann's Block Garden Association,MULTIPOLYGON (((-73.91110273250362 40.81723264...,X345-GT001,32,Active,10455
1,60,Brooklyn,316,9,42,Gethsemane Garden,MULTIPOLYGON (((-73.90957411598448 40.65888700...,B436-GT001,19,Inactive (Group Forming),11212
2,56,Brooklyn,303,8,36,Hart to Hart Community Garden,MULTIPOLYGON (((-73.94706962955922 40.69329106...,B474-GT001,25,Active,11206
3,53,Brooklyn,301,7,34,El Puente: Espiritu Tierra Community Garden - ...,MULTIPOLYGON (((-73.95967599680367 40.71246980...,B424-GT001,18,Active,11211
4,56,Brooklyn,308,9,36,Eden’s Community Garden,MULTIPOLYGON (((-73.93250525668881 40.67469872...,B540-GT001,25,Active,11213
...,...,...,...,...,...,...,...,...,...,...,...
601,29,Queens,412,5,27,Senior Citizen Garden Club,MULTIPOLYGON (((-73.77112451557377 40.68779304...,Q448-GT001,14,Not GreenThumb,11434
602,56,Brooklyn,308,9,36,Garden Kitchen Lab at St. John's Rec.,MULTIPOLYGON (((-73.93500081856017 40.67391829...,B245-GT001,25,Not GreenThumb,11213
603,56,Brooklyn,303,8,36,Von King Park and Cultural Center Garden,MULTIPOLYGON (((-73.9469681111235 40.689049288...,B088-GT001,25,Not GreenThumb,11216
604,69,Manhattan,107,10,6,Garden People,MULTIPOLYGON (((-73.97899093812129 40.79309028...,M071-GT001,31,Not GreenThumb,10024
