In [15]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import when, col, concat, lit

# Default HDFS location for managed databases and tables; handed to Spark
# below through the spark.sql.warehouse.dir setting.
warehouse_location = 'hdfs://hdfs-nn:9000/AreasVerdes/warehouse'

# Session builder: local master with two threads, Hive support enabled, and the
# Delta Lake SQL extension and catalog configured.
# The chain is wrapped in parentheses rather than using backslash
# continuations, so no continuation can dangle onto whatever line follows.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# NOTE(review): configure_spark_with_delta_pip presumably sets
# spark.jars.packages itself, based on the pip-installed delta package;
# confirm whether the explicit 1.0.0 pin above still takes effect.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [16]:
# HDFS location of the source BlockLot CSV file (under the bronze folder).
hdfs_path = "hdfs://hdfs-nn:9000/AreasVerdes/bronze/LocQuintais_csv/BlockLot.csv"

In [17]:
# Explicit schema for the BlockLot CSV, so column types are not inferred.
# Every column is nullable; parksid and areacovered are strings, the other
# three columns are integers.
customSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("parksid", StringType()),
        ("block", IntegerType()),
        ("lotnum", IntegerType()),
        ("lotsize", IntegerType()),
        ("areacovered", StringType()),
    )
])

# Read the comma-delimited file at hdfs_path, treating its first line as a
# header and applying customSchema.
BlockLot = (
    spark.read
    .schema(customSchema)
    .option("header", "true")
    .option("delimiter", ",")
    .csv(hdfs_path)
)

In [18]:
# Select the columns to keep, in this order.
BlockLot = BlockLot.select(
    ["parksid", "block", "lotnum", "lotsize", "areacovered"]
)

In [19]:
# Replace null values in the lotsize column with 0.
BlockLot = BlockLot.fillna(0, subset=["lotsize"])

In [20]:
# Replace null values in the areacovered column with the text "Unknown".
BlockLot = BlockLot.fillna("Unknown", subset=["areacovered"])

In [21]:
# Inspect the distinct values present in the parksid column.
# The frame is first registered as a temporary view so it can be queried by
# name from SQL.
distinct_parksid_query = """
    SELECT Distinct parksid
    FROM BlockLot
    """

BlockLot.createOrReplaceTempView("BlockLot")
sqlized_df = spark.sql(distinct_parksid_query)

sqlized_df.show()

+----------+
|   parksid|
+----------+
|B505-GT001|
|M333-GT001|
|    QGT008|
|    MGT038|
|X339-GT001|
|    MGT039|
|Q507-GT001|
|    MGT091|
|M315-GT001|
|    XGT031|
|B490-GT001|
|B507-GT002|
|B444-GT001|
|    BGT060|
|B471-GT001|
|X333-GT001|
|B508-GT001|
|    BGT056|
|X361-GT001|
|Q484-GT001|
+----------+
only showing top 20 rows



In [22]:
# Drop rows that are exact duplicates across all columns (dropDuplicates is
# called without a column subset, so it is not limited to parksid), then print
# how many rows remain and display them without truncating cell values.
# NOTE(review): the result is stored in `parksid`, while the write cell visible
# later in this notebook saves `BlockLot`, so this de-duplication does not
# appear to reach the saved data; confirm that is intended.
parksid = BlockLot.dropDuplicates()
print(f"Distinct count: {parksid.count()}")
parksid.show(truncate=False)

Distinct count: 1043
+----------+-----+------+-------+-----------+
|parksid   |block|lotnum|lotsize|areacovered|
+----------+-----+------+-------+-----------+
|MGT008    |1788 |31    |0      |Unknown    |
|BGT014    |3043 |14    |0      |Unknown    |
|X235-GT001|2625 |17    |0      |Unknown    |
|MGT018    |1631 |2     |0      |Unknown    |
|B504-GT001|8173 |66    |2000   |Full Lot   |
|M328-GT001|1953 |22    |0      |Unknown    |
|X276-GT001|2616 |29    |0      |Unknown    |
|M321-GT001|439  |10    |8225   |Partial Lot|
|B424-GT001|2407 |30    |0      |Unknown    |
|X285-GT001|2267 |62    |0      |Unknown    |
|B166-GT001|8312 |1     |6250   |Partial Lot|
|BGT173    |3245 |39    |5000   |Full Lot   |
|B523-GT001|7247 |240   |0      |Unknown    |
|X364-GT001|3540 |1     |0      |Unknown    |
|M387-GT001|1923 |20    |0      |Unknown    |
|X317-GT001|2406 |28    |0      |Unknown    |
|B562-GT001|7052 |56    |30369  |Full Lot   |
|M362-GT001|1722 |5     |0      |Unknown    |
|X358-GT001|2

In [23]:
# Save the BlockLot frame in Delta format under the silver folder on HDFS,
# overwriting whatever is already stored at that path. The select pins the
# column order of the written data.
(
    BlockLot
    .select("parksid", "block", "lotnum", "lotsize", "areacovered")
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/AreasVerdes/silver/BlockLot")
)

In [14]:
# Display the rows of AreasVerdes.BlockLot_Table.
# NOTE(review): no visible cell creates this table; presumably it is
# registered elsewhere over the silver/BlockLot data. Confirm, and re-run this
# cell after the write so its output is current.
blocklot_table_preview = spark.sql(
    """
    SELECT *
    FROM AreasVerdes.BlockLot_Table
    """
)
blocklot_table_preview.show()

+-----------+-----+------+-------+-----------+
|    parksid|block|lotnum|lotsize|areacovered|
+-----------+-----+------+-------+-----------+
| B024-GT001| 2912|     1|      0|    Unknown|
| M071-GT001| 1254|    10|      0|    Unknown|
| B088-GT001| 1790|     1|      0|    Unknown|
| B245-GT001| 1353|     1|    900|    Unknown|
| Q448-GT001|12406|   180|      0|    Unknown|
|X179A-GT001| 2685|    30|      0|    Unknown|
|X179A-GT001| 2685|    78|      0|    Unknown|
|     MGT043| 1080|     1|      0|    Unknown|
|     MGT086| 1205|     6|      0|    Unknown|
|     MGT087| 1220|    17|      0|    Unknown|
|     MGT088|  259|    44|      0|    Unknown|
| M105-GT002|  418|     1|      0|    Unknown|
|     MGT091|  431|    14|      0|    Unknown|
|     RGT005|  569|   245|      0|    Unknown|
| M278-GT001|  606|     1|      0|    Unknown|
| Q470-GT001|  638|    37|      0|    Unknown|
|     BGT123| 1552|    21|      0|    Unknown|
|     BGT129| 1670|    49|      0|    Unknown|
|     MGT051|