In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit, substring, avg, sum, count, countDistinct

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

# The chain is wrapped in parentheses instead of using backslash continuations,
# so inserting or removing a blank line or comment cannot break the statement.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Enable the Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip below presumably sets
    # spark.jars.packages itself, which would override this value - confirm
    # before relying on the pinned 1.0.0 artifact.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Create (or reuse) the session from the Delta-configured builder.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Load the Potencial_Table table from the AreasVerdes database as a DataFrame.
goldPot = spark.read.table("AreasVerdes.Potencial_Table")

In [3]:
# Keep only the columns used downstream, in this explicit order.
goldPot = goldPot.select(
    "Data_Created",
    "Borough",
    "Block",
    "Lot",
    "Address",
    "Parcel_Name",
    "Agency",
    "Total_Area",
    "Community_Board",
    "Council_District",
    "Coordinates",
    "Potencial_Urban_Ag",
    "Latitude",
    "Longitude",
    "BIN",
    "NTA",
)

In [4]:
# Preview a bounded sample only: calling toPandas() on the full table would
# collect every row to the driver just to display it.
goldPot.limit(10).toPandas()

Unnamed: 0,Data_Created,Borough,Block,Lot,Address,Parcel_Name,Agency,Total_Area,Community_Board,Council_District,Coordinates,Potencial_Urban_Ag,Latitude,Longitude,BIN,NTA
0,11/02/2017 12:00:00 AM,Queens,3916,35,5 AVENUE,,DCAS,15000,407,19.0,0/0,Least Potencial,,,,
1,11/02/2017 12:00:00 AM,Queens,6391,24,63 ROAD,,DCAS,4700,407,24.0,0/0,Least Potencial,,,,
2,11/02/2017 12:00:00 AM,Queens,14246,1189,DAVENPORT COURT,HAMILTON BEACH FIRE DEPT ACCES,DCAS,3200,410,32.0,0/0,Least Potencial,,,,
3,11/02/2017 12:00:00 AM,Queens,14254,1640,104 STREET,HAMILTON BEACH,DCAS,1600,410,32.0,0/0,Least Potencial,,,,
4,11/02/2017 12:00:00 AM,Queens,15622,78,BEACH 13 STREET,,DCAS,3400,414,31.0,0/0,Least Potencial,,,,
5,11/02/2017 12:00:00 AM,Queens,15622,180,NEW HAVEN AVENUE,,DCAS,4057,414,31.0,0/0,Least Potencial,,,,
6,11/02/2017 12:00:00 AM,Staten Island,5687,130,ARTHUR KILL ROAD,,DCAS,35206,503,51.0,0/0,Least Potencial,,,,
7,11/02/2017 12:00:00 AM,Queens,14228,62,102 STREET,,DCAS,100128,410,32.0,0/0,Least Potencial,,,,
8,11/02/2017 12:00:00 AM,Bronx,2875,67,PLIMPTON AVENUE,CROSS BX EXPWY-SEC-1,DOT,2573,205,14.0,0/0,Most Potential,,,,
9,11/02/2017 12:00:00 AM,Brooklyn,7074,1,1528 SURF AVENUE,CONEY ISLAND EAST,DOT,7469,313,47.0,988935/148775,Least Potencial,11224.0,,,3000000.0


In [5]:
# Aggregate the parcels per borough and urban-agriculture potential label.
potential = (
    goldPot
    .groupBy("Borough", "Potencial_Urban_Ag")
    .agg(
        # Count of non-null potential labels in the group, cast to integer.
        count(col("Potencial_Urban_Ag")).cast(IntegerType()).alias("Potencial"),
        # Average Total_Area of the group, cast to integer.
        avg(col("Total_Area")).cast(IntegerType()).alias("Total_Area"),
    )
)

# Display the aggregated result.
potential.toPandas()


Unnamed: 0,Borough,Potencial_Urban_Ag,Potencial,Total_Area
0,Brooklyn,Most Potential,1,2500
1,Queens,Least Potencial,9,15720
2,Bronx,Most Potential,3,2699
3,Queens,Most Potential,10,6353
4,Staten Island,Least Potencial,4,19763
5,Brooklyn,Least Potencial,2,7134


In [6]:
# Persist the aggregated result as a Delta table, replacing any existing data
# at the target location.
(
    potential
    .select("Borough", "Potencial_Urban_Ag", "Potencial", "Total_Area")
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/AreasVerdes/gold/potential")
)
