# 1. Imports

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import col, trim

---
# 2. Leitura

In [0]:
# Explicit schema applied when reading the geolocation CSV, so column types are
# fixed up front rather than inferred.
# Each .add(name, type, nullable) call appends one column, in file order.
schema = (
    StructType()
    # Zip-code prefix is read as a string and is the only column declared
    # non-nullable (presumably to keep leading zeros — TODO confirm).
    .add("geolocation_zip_code_prefix", StringType(), False)
    # Latitude / longitude are read as doubles.
    .add("geolocation_lat", DoubleType(), True)
    .add("geolocation_lng", DoubleType(), True)
    # City and state are read as nullable strings.
    .add("geolocation_city", StringType(), True)
    .add("geolocation_state", StringType(), True)
)

In [0]:
# Bronze-layer directory that is scanned for the geolocation CSV.
# The scheme is written as "dbfs:/" (with the colon) so the directory that is
# listed is the same one named in the error messages below; the previous
# literal 'dbfs/FileStore/...' did not match them.
BRONZE_DIR = 'dbfs:/FileStore/Datum/KaggleOlistBrData/bronze'

# Paths of every entry in BRONZE_DIR whose lower-cased file name contains
# '_geolocation_'.
geolocation_file_path = [
    file.path
    for file in dbutils.fs.ls(BRONZE_DIR)
    if '_geolocation_' in str(file.name).lower()
]

# No matching file: there is nothing to read, so fail loudly.
if len(geolocation_file_path) == 0:
    raise ValueError(f'Sem arquivo geolocation no dir "{BRONZE_DIR}/"')

# More than one matching file: do not guess which one to load for the
# geolocation table; stop so the directory can be reviewed manually.
if len(geolocation_file_path) > 1:
    raise ValueError(f'Mais de um arquivo geolocation no dir "{BRONZE_DIR}/"')

# Exactly one matching file: read it as CSV with a header row, using the
# explicit `schema` defined earlier in the notebook.
df_geolocation = spark.read.csv(geolocation_file_path[0], header=True, schema=schema)

---
# 3. Delta Lake

Os dados são gravados com `format("delta")` para aproveitar as vantagens do Delta Lake (transações ACID, versionamento dos dados e validação de schema).

In [0]:
%sql

-- Make `datum` the current catalog so that the unqualified schema/table names
-- used in the following SQL cells resolve inside it.
USE CATALOG datum

In [0]:
%sql

-- Make `silver` the current database (schema) so that the unqualified table
-- name used in the CREATE TABLE cell below resolves inside it.
USE DATABASE silver

In [0]:
%sql

-- Create the Delta table `o_geolocation` in the current catalog/database if it
-- does not exist yet. The table is bound to an explicit storage LOCATION, which
-- is the same abfss path the final Python cell of this notebook writes to.
-- Column names and types mirror the schema used to read the source CSV
-- (zip-code prefix as a NOT NULL string, lat/lng as doubles, city/state as strings).
CREATE TABLE IF NOT EXISTS o_geolocation
(
  geolocation_zip_code_prefix STRING NOT NULL,
  geolocation_lat             DOUBLE,
  geolocation_lng             DOUBLE,
  geolocation_city            STRING, 
  geolocation_state           STRING
)
USING DELTA
LOCATION 'abfss://unity-datum@datumunity.dfs.core.windows.net/silver/olist_geolocation'

In [0]:
# Overwrite the Delta data at the silver location (the same path used as the
# LOCATION of the `o_geolocation` table) with the freshly read DataFrame.
#
# The None check comes first: the previous order called .count() before testing
# for None, so a None DataFrame would raise AttributeError instead of being
# skipped. Emptiness is tested with head(1), which only needs to fetch a single
# row rather than counting the whole dataset.
if df_geolocation is not None and len(df_geolocation.head(1)) > 0:
    df_geolocation.write.format('delta').mode("overwrite").save('abfss://unity-datum@datumunity.dfs.core.windows.net/silver/olist_geolocation')