In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
hdfs_path = "hdfs://hdfs-nn:9000/AreasVerdes/bronze/LocQuintais_csv/Potencial.csv"

In [3]:
# Create a DataFrame from JSON data (automatically infer schema and data types)
# There are other file formats you can read from (e.g., csv, orc, parquet)
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#data-sources

# Read Sillicon valley episodes data
customSchema = StructType([
    StructField("Data_Created", StringType(), True),
    StructField("Borough", IntegerType(), True),
    StructField("Block", IntegerType(), True),
    StructField("Lot", IntegerType(), True),
    StructField("Address", StringType(), True),
    StructField("Parcel_Name", StringType(), True),
    StructField("Agency", StringType(), True),
    StructField("Total_Area", IntegerType(), True),
    StructField("Community_Board", IntegerType(), True),
    StructField("Council_District", IntegerType(), True),
    StructField("Coordinates", StringType(), True),
    StructField("Potencial_Urban_Ag", StringType(), True),
    StructField("Latitude", IntegerType(), True),
    StructField("Longitude", IntegerType(), True),
    StructField("BIN", IntegerType(), True),
    StructField("NTA", StringType(), True)
])

Potencial = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)

In [4]:
# Escolho as colunas que quero
Potencial = Potencial.select("Data_Created","Borough","Block","Lot","Address","Parcel_Name", "Agency", "Total_Area", "Community_Board", "Council_District", "Coordinates", "Potencial_Urban_Ag", "Latitude", "Longitude", "BIN", "NTA")

In [5]:
#substituição das iniciais pelos nomes dos respetivos bairros
Potencial = Potencial.withColumn("Borough", when(Potencial.Borough == 2,"Bronx") \
      .when(Potencial.Borough == 3,"Brooklyn") \
      .when(Potencial.Borough == 1,"Manhattan") \
      .when(Potencial.Borough == 4, "Queens") \
      .when(Potencial.Borough == 5, "Staten Island") \
      .otherwise(Potencial.Borough))

In [6]:
#substituir a informação das linhas para Unknown
Potencial = Potencial.withColumn("Latitude", when(Potencial.Latitude == None,"Unknown") \
      .when(Potencial.Latitude == "None","Unknown") \
      .otherwise(Potencial.Latitude))

In [7]:
#substituir a informação das linhas para Unknown
Potencial = Potencial.withColumn("Longitude", when(Potencial.Longitude == None,"Unknown") \
      .when(Potencial.Longitude == "None","Unknown") \
      .otherwise(Potencial.Longitude))

In [8]:
#substituir a informação das linhas para Unknown
Potencial = Potencial.withColumn("BIN", when(Potencial.BIN == None,"Unknown") \
      .when(Potencial.BIN == "None","Unknown") \
      .otherwise(Potencial.BIN))

In [9]:
#substituir a informação das linhas para Unknown
Potencial = Potencial.withColumn("NTA", when(Potencial.NTA == None,"Unknown") \
      .when(Potencial.NTA == "None","Unknown") \
      .otherwise(Potencial.NTA))

In [10]:
Potencial = Potencial.withColumn("Potencial_Urban_Ag", when(Potencial.Potencial_Urban_Ag == "Potentially Suitable 1 - Site is available for a 4 year renewable license through the Parks Department's GreenThumb program contingent on program capacity","Most Potential") \
      .when(Potencial.Potencial_Urban_Ag == "Potentially Suitable 3 - Site has specific instructions or conditions that limit its availability and/or use","Least Potencial") \
      .otherwise(Potencial.Potencial_Urban_Ag))

In [11]:
#write df to hive deltalake_table
#.select("customer_name","month", "sales", "year")  reorder columns to match parquet table sequence
Potencial \
    .select("Data_Created","Borough","Block","Lot","Address","Parcel_Name", "Agency", "Total_Area", "Community_Board", "Council_District", "Coordinates", "Potencial_Urban_Ag", "Latitude", "Longitude", "BIN", "NTA") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/AreasVerdes/silver/Potencial")

In [12]:
Potencial.toPandas()

Unnamed: 0,Data_Created,Borough,Block,Lot,Address,Parcel_Name,Agency,Total_Area,Community_Board,Council_District,Coordinates,Potencial_Urban_Ag,Latitude,Longitude,BIN,NTA
0,11/02/2017 12:00:00 AM,Queens,3916,35,5 AVENUE,,DCAS,15000,407,19.0,0/0,Least Potencial,,,,
1,11/02/2017 12:00:00 AM,Queens,6391,24,63 ROAD,,DCAS,4700,407,24.0,0/0,Least Potencial,,,,
2,11/02/2017 12:00:00 AM,Queens,14246,1189,DAVENPORT COURT,HAMILTON BEACH FIRE DEPT ACCES,DCAS,3200,410,32.0,0/0,Least Potencial,,,,
3,11/02/2017 12:00:00 AM,Queens,14254,1640,104 STREET,HAMILTON BEACH,DCAS,1600,410,32.0,0/0,Least Potencial,,,,
4,11/02/2017 12:00:00 AM,Queens,15622,78,BEACH 13 STREET,,DCAS,3400,414,31.0,0/0,Least Potencial,,,,
5,11/02/2017 12:00:00 AM,Queens,15622,180,NEW HAVEN AVENUE,,DCAS,4057,414,31.0,0/0,Least Potencial,,,,
6,11/02/2017 12:00:00 AM,Staten Island,5687,130,ARTHUR KILL ROAD,,DCAS,35206,503,51.0,0/0,Least Potencial,,,,
7,11/02/2017 12:00:00 AM,Queens,14228,62,102 STREET,,DCAS,100128,410,32.0,0/0,Least Potencial,,,,
8,11/02/2017 12:00:00 AM,Bronx,2875,67,PLIMPTON AVENUE,CROSS BX EXPWY-SEC-1,DOT,2573,205,14.0,0/0,Most Potential,,,,
9,11/02/2017 12:00:00 AM,Brooklyn,7074,1,1528 SURF AVENUE,CONEY ISLAND EAST,DOT,7469,313,47.0,988935/148775,Least Potencial,11224.0,,,3000000.0
