In [None]:
import time, json, requests
from datetime import datetime
from zoneinfo import ZoneInfo
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, to_date, upper
from delta.tables import DeltaTable
from delta import configure_spark_with_delta_pip
from pyspark.sql.types import *

In [None]:
# Spark session builder with the Delta Lake SQL extension and Delta catalog enabled
builder = (
    SparkSession.builder
    .appName("LabDemo")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish the builder setup, enable Hive support,
# then create the session (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

## Consultando dados da API

In [None]:
# The API accepts results=N; change n to request more users per call
n = 1
url = f"https://randomuser.me/api/?results={n}&inc=gender,name,location,email,login,registered,dob,phone,cell,id,picture,nat"

# 30-second timeout; raise on any HTTP error status before parsing the body
r = requests.get(url, timeout=30)
r.raise_for_status()

# Keep only the "results" entry of the JSON response
result = r.json()["results"]
print(result)

## Enviando mensagem para o Kafka

In [None]:
topic = "bronze-user"
bootstrap_servers = "kafka-broker:29092"
data = result
print(data)

# Serialize each record to a JSON string, then wrap every string in a
# one-element tuple so it becomes a row of a single-column ('value') DataFrame.
json_data = list(map(json.dumps, data))
df = spark.createDataFrame([(payload,) for payload in json_data], ["value"])
print(df)

# Write the rows to the Kafka topic
(
    df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrap_servers)
    .option("topic", topic)
    .save()
)

In [None]:
# Root locations of the data lake layers (logical partitions).
# Plain string literals: there is nothing to interpolate, so no f-prefix is needed.
BRONZE_ROOT = "s3a://bronze/randomuser"
SILVER_ROOT = "s3a://silver/randomuser"
GOLD_ROOT   = "s3a://gold/randomuser"

In [None]:
# Current time in the America/Sao_Paulo timezone; it drives both the
# ingestion_date partition value and the timestamp in the bronze file name.
saopaulo_tz = ZoneInfo("America/Sao_Paulo")
now_sp = datetime.now(tz=saopaulo_tz)

ingestion_date = now_sp.date()
ts = now_sp.strftime("%Y%m%dT%H%M%S")

# Bronze output location (this path is written with the JSON writer, not as a Delta table)
bronze_path = f"{BRONZE_ROOT}/ingestion_date={ingestion_date}/randomuser_{ts}.json"

# Silver Delta table location
# NOTE(review): the path embeds ingestion_date, so every day resolves to a
# different location and the upsert can only match rows written the same day.
# Confirm this is intended.
silver_path = f"{SILVER_ROOT}/ingestion_date={ingestion_date}"

### Ingestão camada Bronze

In [None]:
# Distribute the records as JSON strings, one string per RDD element
rdd_json = spark.sparkContext.parallelize(list(map(json.dumps, result)))

# Parse with the JSON reader so nested structs are inferred; samplingRatio is set to 1.0
df = (
    spark.read
    .option("samplingRatio", 1.0)
    .json(rdd_json)
)

# Save as JSON through a single partition, appending at the bronze path
(
    df.coalesce(1)
    .write
    .mode("append")
    .json(bronze_path)
)
print("Bronze gravado em:", bronze_path)

### Lendo camada Bronze

In [None]:
# Read the bronze JSON back from bronze_path with multiLine disabled
df_bronze = (
    spark.read
    .option("multiLine", False)
    .json(bronze_path)
)

In [None]:
# printSchema() prints the schema tree itself and returns None, so wrapping it
# in print() would add a stray "None" line to the cell output.
df_bronze.printSchema()

### Iniciando preparação para camada Silver

In [29]:
# Flatten the nested bronze structs into top-level silver columns.
# The withColumn order is kept because it determines the output column order.
df_transformed = (
    df_bronze
    .withColumn(
        "full_name",
        concat_ws(" ", col("name").getField("first"), col("name").getField("last")),
    )
    .withColumn("country", col("location").getField("country"))
    .withColumn("state", col("location").getField("state"))
    .withColumn("city", col("location").getField("city"))
    # Upper-cased email; the merge cell matches rows on this column
    .withColumn("email", upper(col("email")))
    .withColumn("birthdate", to_date(col("dob").getField("date")))
    .withColumn("registered_date", to_date(col("registered").getField("date")))
    # Remove the source structs that were flattened above, plus the unused ones
    .drop("location", "name", "dob", "registered", "login", "picture", "id")
)


### Realizando o merge

In [None]:
# Upsert the transformed batch into the silver Delta table, keyed on email.
#
# Non-key columns copied from the incoming batch on both update and insert.
# gender, phone and cell are included because df_transformed keeps them and
# the first-run write below stores them. Leaving them out of the merge maps
# would insert NULLs for new rows and never refresh them for existing rows.
merge_columns = [
    "full_name",
    "country",
    "state",
    "city",
    "birthdate",
    "registered_date",
    "nat",
    "gender",
    "phone",
    "cell",
]
update_set = {name: f"bronze.{name}" for name in merge_columns}
# Inserts also need the merge key itself
insert_values = {**update_set, "email": "bronze.email"}

if not DeltaTable.isDeltaTable(spark, silver_path):
    # First run for this path: create the table, partitioned by nationality
    print("⚙️ Silver ainda não existe. Criando nova tabela Delta...")
    df_transformed.write.format("delta").partitionBy("nat").save(silver_path)
else:
    print("🔁 Silver já existe. Realizando merge (upsert)...")
    delta_table = DeltaTable.forPath(spark, silver_path)

    # NOTE(review): Delta rejects a merge when several source rows match the
    # same target row; consider de-duplicating the batch on email if the
    # source can contain repeated addresses.
    (
        delta_table.alias("silver")
        .merge(
            df_transformed.alias("bronze"),
            "silver.email = bronze.email",
        )
        .whenMatchedUpdate(set=update_set)
        .whenNotMatchedInsert(values=insert_values)
        .execute()
    )