## Configuração

In [1]:
import os

from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession
from pyspark.sql import Window
from IPython.core.display import HTML
# Render <pre> output without wrapping so wide `show()` tables stay on one
# line per row.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Jars needed for the s3a:// filesystem. spark.jars takes a comma-separated
# list, so the paths are joined explicitly instead of relying on a multi-line
# string literal with embedded newlines and indentation.
JARS_DIR = "/home/jovyan/jars"
S3A_JARS = ",".join(
    f"{JARS_DIR}/{jar_name}"
    for jar_name in (
        "aws-java-sdk-core-1.11.534.jar",
        "aws-java-sdk-dynamodb-1.11.534.jar",
        "aws-java-sdk-s3-1.11.534.jar",
        "hadoop-aws-3.2.2.jar",
    )
)

# Endpoint and credentials can be overridden through environment variables so
# they do not have to live in the notebook. The fallbacks are the values this
# notebook has always used, so an unconfigured environment behaves as before.
S3A_ENDPOINT = os.environ.get("S3A_ENDPOINT", "http://minio:9000")
S3A_ACCESS_KEY = os.environ.get("S3A_ACCESS_KEY", "aulafia")
S3A_SECRET_KEY = os.environ.get("S3A_SECRET_KEY", "aulafia@123")

spark = (SparkSession.builder
         .config("spark.jars", S3A_JARS)
         .config("spark.hadoop.fs.s3a.endpoint", S3A_ENDPOINT)
         .config("spark.hadoop.fs.s3a.access.key", S3A_ACCESS_KEY)
         .config("spark.hadoop.fs.s3a.secret.key", S3A_SECRET_KEY)
         # Passed as a string so the value does not depend on how the
         # installed PySpark version stringifies a Python bool.
         .config("spark.hadoop.fs.s3a.path.style.access", "true")
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

## ETL Trust

### Lendo a Camada Context

In [3]:
# Bucket/prefix holding the Context layer of the Loft data.
path = 's3a://context/loft'

# Load the whole prefix as a single parquet-backed DataFrame.
df_context = spark.read.load(path, format='parquet')

# Preview the first 10 rows without truncating long cell values.
df_context.show(n=10, truncate=False)

+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------+--------------+--------+------------------+----------+-----------+-------+------+-------+---------+-------+---------+-------------+---------+----------+
|Link_Apto                                                                                                           |Endereco                                                                                  |Bairro        |Valor_RS|Andares           |Tamanho_m2|Valor_RS/m2|Quartos|Suites|Garagem|Banheiros|Varanda|Mobiliado|Portaria     |Metro    |Refdate   |
+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------+--------------+--------+------------------+----------+-----------+-

In [4]:
# Distinct Refdate values present in the Context layer, in ascending order.
(df_context
 .select(fn.col("Refdate"))
 .distinct()
 .sort(fn.asc("Refdate"))
 .show()
)

+----------+
|   Refdate|
+----------+
|2023-04-17|
|2023-05-02|
|2023-05-17|
|2023-06-03|
|2023-06-17|
|2023-07-02|
+----------+



In [5]:
# Number of rows per Link_Apto, largest counts first (top 20, untruncated).
(df_context
 .groupBy("Link_Apto")
 .agg(fn.count("*").alias("count"))
 .sort(fn.desc("count"))
 .show(n=20, truncate=False)
)

+----------------------------------------------------------------------------------------------------------+-----+
|Link_Apto                                                                                                 |count|
+----------------------------------------------------------------------------------------------------------+-----+
|https://loft.com.br/imovel/apartamento-jua-mirim-jose-bonifacio-sao-paulo-2-quartos-36m2/yaqi4a           |344  |
|https://loft.com.br/imovel/apartamento-jua-mirim-jose-bonifacio-sao-paulo-2-quartos-42m2/z194kv           |341  |
|https://loft.com.br/imovel/apartamento-alvaro-de-carvalho-republica-sao-paulo-1-quarto-23m2/xqslbi        |331  |
|https://loft.com.br/imovel/apartamento-dona-paulina-se-sao-paulo-1-quarto-21m2/zf57g1                     |331  |
|https://loft.com.br/imovel/apartamento-mauro-de-araujo-ribeiro-jaragua-sao-paulo-2-quartos-50m2/stp5gu    |325  |
|https://loft.com.br/imovel/apartamento-pascal-campo-belo-sao-paulo-1-quarto-35m

In [6]:
# Number of distinct (Endereco, Valor_RS) combinations seen for each Link_Apto.
# No pre-sort by Refdate is needed here: the count is the same whichever of
# the duplicate rows dropDuplicates keeps, so sorting would only add a global
# sort with no effect on the result.
(df_context
 .dropDuplicates(["Link_Apto", "Endereco", "Valor_RS"])
 .groupBy(fn.col("Link_Apto"))
 .count()
 .orderBy("count", ascending=False)
).show(10, False)

+-----------------------------------------------------------------------------------------------------------------+-----+
|Link_Apto                                                                                                        |count|
+-----------------------------------------------------------------------------------------------------------------+-----+
|https://loft.com.br/imovel/apartamento-rua-henrique-fausto-lancellotti-campo-belo-sao-paulo-1-quarto-29m2/19oqvlp|4    |
|https://loft.com.br/imovel/apartamento-rua-goncalves-ledo-ipiranga-sao-paulo-3-quartos-118m2/13ugre              |3    |
|https://loft.com.br/imovel/apartamento-avenida-carioca-ipiranga-sao-paulo-2-quartos-52m2/134johq                 |3    |
|https://loft.com.br/imovel/casa-rua-desembargador-do-vale-vila-romana-sao-paulo-3-quartos-180m2/12dr31p          |3    |
|https://loft.com.br/imovel/apartamento-avenida-paulista-jardim-america-sao-paulo-1-quarto-27m2/wxpxq4            |3    |
|https://loft.com.br/imo

In [7]:
# Preview of the de-duplicated rows for a single listing.
#
# For each (Link_Apto, Endereco, Valor_RS) key the row with the most recent
# Refdate is selected explicitly with row_number() over a window. Sorting and
# then calling dropDuplicates does not guarantee which duplicate survives, so
# it cannot be relied on to keep the latest row.
latest_first = (Window
                .partitionBy("Link_Apto", "Endereco", "Valor_RS")
                .orderBy(fn.col("Refdate").desc()))

(df_context
 # Link_Apto is part of the partition key, so filtering first gives the same
 # rows while ranking far less data.
 .filter(fn.col("Link_Apto") == "https://loft.com.br/imovel/apartamento-rua-henrique-fausto-lancellotti-campo-belo-sao-paulo-1-quarto-29m2/19oqvlp")
 .withColumn("_rn", fn.row_number().over(latest_first))
 .filter(fn.col("_rn") == 1)
 .drop("_rn")
 # Most recent version first, so the displayed order is stable.
 .orderBy("Refdate", ascending=False)
).show(10, False)

+-----------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------+----------+--------+------------------+----------+-----------+-------+------+-------+---------+-------+---------+-------------+---------+----------+
|Link_Apto                                                                                                        |Endereco                                                     |Bairro    |Valor_RS|Andares           |Tamanho_m2|Valor_RS/m2|Quartos|Suites|Garagem|Banheiros|Varanda|Mobiliado|Portaria     |Metro    |Refdate   |
+-----------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------+----------+--------+------------------+----------+-----------+-------+------+-------+---------+-------+---------+-------------+---------+----------+
|https://loft.com.br/i

In [8]:
# Build the Trust-layer frame: one row per (Link_Apto, Endereco, Valor_RS),
# keeping the row with the most recent Refdate, plus a surrogate `id` column.
#
# The latest row is chosen with row_number() over a window ordered by Refdate
# descending. Sorting and then calling dropDuplicates does not guarantee which
# of the duplicate rows is kept, so it could silently retain an older snapshot.
# If several rows share the same key and the same Refdate, one of them is
# picked arbitrarily.
latest_first = (Window
                .partitionBy("Link_Apto", "Endereco", "Valor_RS")
                .orderBy(fn.col("Refdate").desc()))

df_trust = (df_context
            .withColumn("_rn", fn.row_number().over(latest_first))
            .filter(fn.col("_rn") == 1)
            .drop("_rn")
            .orderBy("Refdate", ascending=False)
            # Ids are unique and increase along the sort order, but they are
            # not guaranteed to be consecutive.
            .withColumn("id", fn.monotonically_increasing_id())
            # Put the id and reference date first; remaining columns keep
            # their original relative order.
            .select("id", "Refdate", "Link_Apto", "Endereco", "Bairro", "Valor_RS",
                    "Andares", "Tamanho_m2", "Valor_RS/m2", "Quartos", "Suites",
                    "Garagem", "Banheiros", "Varanda", "Mobiliado", "Portaria",
                    "Metro")
           )

df_trust.show(10, False)

+---+----------+----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+---------------+--------+------------------+----------+-----------+-------+------+-------+---------+-------+---------+-------------+---------+
|id |Refdate   |Link_Apto                                                                                                 |Endereco                                                                                            |Bairro         |Valor_RS|Andares           |Tamanho_m2|Valor_RS/m2|Quartos|Suites|Garagem|Banheiros|Varanda|Mobiliado|Portaria     |Metro    |
+---+----------+----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+---------------+--------+-----------------

### Testando Drop Duplicates

In [10]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Three-row toy frame: the 2nd and 3rd rows share the same (b, c) pair but
# differ in a, d and e.
df = spark.createDataFrame(
    [
        (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3., 'string3', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (3, 3., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
    ],
    schema=['a', 'b', 'c', 'd', 'e'],
)

df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string3|2000-02-01|2000-01-02 12:00:00|
|  3|3.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [11]:
# Sort by `a` descending, then keep a single row per (b, c) pair.
# NOTE(review): this relies on dropDuplicates keeping the first row of the
# preceding sort; that is presumably not a documented guarantee - confirm
# against the Spark docs before depending on it for real data.
(df
 .sort(fn.desc("a"))
 .dropDuplicates(subset=["b", "c"])
 .show()
)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  3|3.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

