# **Preparando o Ambiente**

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=83e6dc4f874e9d7d6834d5d7ca199d550aae7bbfe72ea3ae44a26fffe5b351bc
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master('local[*]') \
    .appName("Challenge da Alura") \
    .getOrCreate()

spark

Montando o Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dados = spark.read.parquet('/content/drive/MyDrive/Challenge_Pyspark_Alura/parquet/part-00000-67f0ea53-49e1-482e-954e-8c385eabc181-c000.snappy.parquet')

In [5]:
dados

DataFrame[id: string, andar: bigint, area_total: string, area_util: string, banheiros: bigint, caracteristicas: array<string>, quartos: bigint, suites: bigint, tipo_anuncio: string, tipo_unidade: string, tipo_uso: string, vaga: bigint, bairro: string, zona: string, condominio: string, iptu: string, tipo: string, valor: string]

In [6]:
dados.show()

+--------------------+-----+----------+---------+---------+--------------------+-------+------+------------+------------+-----------+----+--------------------+------------+----------+----+-----+------+
|                  id|andar|area_total|area_util|banheiros|     caracteristicas|quartos|suites|tipo_anuncio|tipo_unidade|   tipo_uso|vaga|              bairro|        zona|condominio|iptu| tipo| valor|
+--------------------+-----+----------+---------+---------+--------------------+-------+------+------------+------------+-----------+----+--------------------+------------+----------+----+-----+------+
|03a386b6-7ab8-4ef...|    0|        43|       43|        1|[Churrasqueira, A...|      2|  null|       Usado| Apartamento|Residencial|   1|            Realengo|  Zona Oeste|       285|null|Venda| 22999|
|1fe78d41-b8e0-4d2...|    0|        44|       44|        1|                  []|      2|     0|       Usado| Apartamento|Residencial|   0|               Irajá|  Zona Norte|       170|   0|Vend

# Seleção de Freatures

Algumas colunas possuem apenas um único valor. Essas colunas estão dessa forma devido aos filtros que foram realizados na base de dados durante a semana 1.

In [7]:
dados.printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: long (nullable = true)
 |-- area_total: string (nullable = true)
 |-- area_util: string (nullable = true)
 |-- banheiros: long (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: long (nullable = true)
 |-- suites: long (nullable = true)
 |-- tipo_anuncio: string (nullable = true)
 |-- tipo_unidade: string (nullable = true)
 |-- tipo_uso: string (nullable = true)
 |-- vaga: long (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: string (nullable = true)
 |-- iptu: string (nullable = true)
 |-- tipo: string (nullable = true)
 |-- valor: string (nullable = true)



In [8]:
dados.select("tipo").distinct().show(5)

+-----+
| tipo|
+-----+
|Venda|
+-----+



In [9]:
dados.select("tipo_uso").distinct().show(5)

+-----------+
|   tipo_uso|
+-----------+
|Residencial|
+-----------+



In [10]:
dados.select("tipo_unidade").distinct().show(5)

+------------+
|tipo_unidade|
+------------+
| Apartamento|
+------------+



In [11]:
dados.select("tipo_anuncio").distinct().show(5)

+------------+
|tipo_anuncio|
+------------+
|       Usado|
+------------+



In [12]:
dados.select("area_total", "area_util").show()

+----------+---------+
|area_total|area_util|
+----------+---------+
|        43|       43|
|        44|       44|
|      null|       55|
|      null|       55|
|      null|       50|
|        47|       47|
|     17089|       45|
|        48|       48|
|      null|       55|
|        70|       70|
|        50|       48|
|        40|       40|
|        65|       65|
|        50|       50|
|        45|       45|
|        48|       48|
|        44|       44|
|        64|       64|
|        55|       55|
|        60|       60|
+----------+---------+
only showing top 20 rows



In [13]:
from pyspark.sql import functions as f

dados_nulos = dados\
        .select('area_util', 'area_total')\
        .na\
        .drop()

quantidade_total = dados_nulos.count()
quantidades_iguais = dados_nulos.filter(f.col('area_util') == f.col('area_total')).count()
quantidades_diferentes = quantidade_total - quantidades_iguais

In [14]:
print(f'Quantidade total de dados: {quantidade_total}')
print(f'Quantidade de dados iguais nas duas colunas: {quantidades_iguais}')
print(f'Quantidades de dados diferentes nas duas colunas: {quantidades_diferentes}')

Quantidade total de dados: 57368
Quantidade de dados iguais nas duas colunas: 55384
Quantidades de dados diferentes nas duas colunas: 1984


In [15]:
colunas = ["area_util", "area_total"]

dados\
    .select([f.count(f.when(dados[c].isNull(), True)).alias(c) for c in colunas])\
    .show()

+---------+----------+
|area_util|area_total|
+---------+----------+
|        0|      9194|
+---------+----------+



In [16]:
dados\
  .drop("area_total", "tipo_anuncio", "tipo_unidade", "tipo_uso", "tipo")\
  .show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+----+------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|iptu| valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+----+------+
|03a386b6-7ab8-4ef...|    0|       43|        1|[Churrasqueira, A...|      2|  null|   1|            Realengo|  Zona Oeste|       285|null| 22999|
|1fe78d41-b8e0-4d2...|    0|       44|        1|                  []|      2|     0|   0|               Irajá|  Zona Norte|       170|   0|110000|
|1fa1c1e5-e98c-433...|    4|       55|        1|                  []|      2|     0|   1|              Cosmos|  Zona Oeste|      null|null|115000|
|a6ab01ae-3d40-40e...|    2|       55|        1|                  []|      2|     0|   0|        Tomás Coelho|  Zona N

In [17]:
dataset = dados\
              .drop("area_total", "tipo_anuncio", "tipo_unidade", "tipo_uso", "tipo")
dataset.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+----+------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|iptu| valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+----+------+
|03a386b6-7ab8-4ef...|    0|       43|        1|[Churrasqueira, A...|      2|  null|   1|            Realengo|  Zona Oeste|       285|null| 22999|
|1fe78d41-b8e0-4d2...|    0|       44|        1|                  []|      2|     0|   0|               Irajá|  Zona Norte|       170|   0|110000|
|1fa1c1e5-e98c-433...|    4|       55|        1|                  []|      2|     0|   1|              Cosmos|  Zona Oeste|      null|null|115000|
|a6ab01ae-3d40-40e...|    2|       55|        1|                  []|      2|     0|   0|        Tomás Coelho|  Zona N

# Convertendo os tipos das colunas

In [18]:
dataset.printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: long (nullable = true)
 |-- area_util: string (nullable = true)
 |-- banheiros: long (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: long (nullable = true)
 |-- suites: long (nullable = true)
 |-- vaga: long (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: string (nullable = true)
 |-- iptu: string (nullable = true)
 |-- valor: string (nullable = true)



## Colunas numéricas para inteiro ("andar", "banheiros", "suites" e "quartos")

In [19]:
from pyspark.sql.types import IntegerType

In [20]:
dataset\
        .withColumn("andar", dataset["andar"].cast(IntegerType()))\
        .withColumn("banheiros", dataset["banheiros"].cast(IntegerType()))\
        .withColumn("suites", dataset["suites"].cast(IntegerType()))\
        .withColumn("quartos", dataset["quartos"].cast(IntegerType()))\
        .printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: integer (nullable = true)
 |-- area_util: string (nullable = true)
 |-- banheiros: integer (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: integer (nullable = true)
 |-- suites: integer (nullable = true)
 |-- vaga: long (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: string (nullable = true)
 |-- iptu: string (nullable = true)
 |-- valor: string (nullable = true)



In [21]:
dataset = dataset\
        .withColumn("andar", dataset["andar"].cast(IntegerType()))\
        .withColumn("banheiros", dataset["banheiros"].cast(IntegerType()))\
        .withColumn("suites", dataset["suites"].cast(IntegerType()))\
        .withColumn("quartos", dataset["quartos"].cast(IntegerType()))

# Colunas Strings para DoubleType ("area_util", "condominio", "iptu" e "valor")

In [22]:
from pyspark.sql.types import DoubleType

In [23]:
dataset\
      .withColumn("area_util", dataset["area_util"].cast(DoubleType()))\
      .withColumn("condominio", dataset["condominio"].cast(DoubleType()))\
      .withColumn("iptu", dataset["iptu"].cast(DoubleType()))\
      .withColumn("valor", dataset["valor"].cast(DoubleType()))\
      .printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: integer (nullable = true)
 |-- area_util: double (nullable = true)
 |-- banheiros: integer (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: integer (nullable = true)
 |-- suites: integer (nullable = true)
 |-- vaga: long (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: double (nullable = true)
 |-- iptu: double (nullable = true)
 |-- valor: double (nullable = true)



In [24]:
dataset = dataset\
      .withColumn("area_util", dataset["area_util"].cast(DoubleType()))\
      .withColumn("condominio", dataset["condominio"].cast(DoubleType()))\
      .withColumn("iptu", dataset["iptu"].cast(DoubleType()))\
      .withColumn("valor", dataset["valor"].cast(DoubleType()))

In [25]:
dataset.printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: integer (nullable = true)
 |-- area_util: double (nullable = true)
 |-- banheiros: integer (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: integer (nullable = true)
 |-- suites: integer (nullable = true)
 |-- vaga: long (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: double (nullable = true)
 |-- iptu: double (nullable = true)
 |-- valor: double (nullable = true)



# Tratamento da coluna "caracteristicas"


In [26]:
dataset.select(f.explode("caracteristicas")).distinct().show()

+------------------+
|               col|
+------------------+
|Condomínio fechado|
|        Playground|
| Portão eletrônico|
|           Piscina|
|Animais permitidos|
|      Portaria 24h|
|          Elevador|
|          Academia|
|   Salão de festas|
|     Churrasqueira|
+------------------+



In [27]:
dataset.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|  iptu|   valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|  null|   1|            Realengo|  Zona Oeste|     285.0|  null| 22999.0|
|1fe78d41-b8e0-4d2...|    0|     44.0|        1|                  []|      2|     0|   0|               Irajá|  Zona Norte|     170.0|   0.0|110000.0|
|1fa1c1e5-e98c-433...|    4|     55.0|        1|                  []|      2|     0|   1|              Cosmos|  Zona Oeste|      null|  null|115000.0|
|a6ab01ae-3d40-40e...|    2|     55.0|        1|                  []|      2|     0|   0|     

In [28]:
dataset = dataset.withColumn('caracteristicas', f.when((f.size(f.col('caracteristicas')) == 0), f.lit(None)).otherwise(f.col('caracteristicas')))
dataset.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|  iptu|   valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|  null|   1|            Realengo|  Zona Oeste|     285.0|  null| 22999.0|
|1fe78d41-b8e0-4d2...|    0|     44.0|        1|                null|      2|     0|   0|               Irajá|  Zona Norte|     170.0|   0.0|110000.0|
|1fa1c1e5-e98c-433...|    4|     55.0|        1|                null|      2|     0|   1|              Cosmos|  Zona Oeste|      null|  null|115000.0|
|a6ab01ae-3d40-40e...|    2|     55.0|        1|                null|      2|     0|   0|     

# Tratando dados faltantes


In [29]:
dataset.printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: integer (nullable = true)
 |-- area_util: double (nullable = true)
 |-- banheiros: integer (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: integer (nullable = true)
 |-- suites: integer (nullable = true)
 |-- vaga: long (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: double (nullable = true)
 |-- iptu: double (nullable = true)
 |-- valor: double (nullable = true)



In [30]:
dataset\
      .select([f.count(f.when(dataset[c].isNull(), True)).alias(c) for c in dataset.columns])\
      .show()

+---+-----+---------+---------+---------------+-------+------+----+------+----+----------+----+-----+
| id|andar|area_util|banheiros|caracteristicas|quartos|suites|vaga|bairro|zona|condominio|iptu|valor|
+---+-----+---------+---------+---------------+-------+------+----+------+----+----------+----+-----+
|  0|    0|        0|        0|          12736|      0|  5554|3017|     0|   0|      2371|7199|    0|
+---+-----+---------+---------+---------------+-------+------+----+------+----+----------+----+-----+



In [31]:
'''
dataset\
    .select([f.count(f.when(f.isnan(c) | f.isnull(c), True)).alias(c) for c in dataset.columns if c != 'caracteristicas' ])\
    .show()
'''

"\ndataset    .select([f.count(f.when(f.isnan(c) | f.isnull(c), True)).alias(c) for c in dataset.columns if c != 'caracteristicas' ])    .show()\n"

In [32]:
dataset\
    .select('*')\
    .na\
    .fill(0)\
    .show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|  iptu|   valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|     0|   1|            Realengo|  Zona Oeste|     285.0|   0.0| 22999.0|
|1fe78d41-b8e0-4d2...|    0|     44.0|        1|                null|      2|     0|   0|               Irajá|  Zona Norte|     170.0|   0.0|110000.0|
|1fa1c1e5-e98c-433...|    4|     55.0|        1|                null|      2|     0|   1|              Cosmos|  Zona Oeste|       0.0|   0.0|115000.0|
|a6ab01ae-3d40-40e...|    2|     55.0|        1|                null|      2|     0|   0|     

In [33]:
dataset = dataset\
    .select('*')\
    .na\
    .fill(0)

In [34]:
dataset\
    .select('zona')\
    .groupBy('zona')\
    .count()\
    .show()

+------------+-----+
|        zona|count|
+------------+-----+
|  Zona Norte|11897|
|  Zona Oeste|32979|
|Zona Central| 1144|
|    Zona Sul|20531|
|            |   11|
+------------+-----+



In [35]:
dataset = dataset\
    .where(f.col('zona') != '')

# Preparação dos dados para ML


Para conseguirmos utilizar nossos dados em modelos de Machine Learning, é necessário realizarmos algumas técnicas, como a transformação de variáveis categóricas em binárias.

## Variáveis Dummy


In [36]:
dataset.printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: integer (nullable = true)
 |-- area_util: double (nullable = false)
 |-- banheiros: integer (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: integer (nullable = true)
 |-- suites: integer (nullable = true)
 |-- vaga: long (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: double (nullable = false)
 |-- iptu: double (nullable = false)
 |-- valor: double (nullable = false)



In [37]:
dataset.select("*", f.explode("caracteristicas").alias("caracteristica")).show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+------------+----------+----------+----+--------+------------------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|      bairro|      zona|condominio|iptu|   valor|    caracteristica|
+--------------------+-----+---------+---------+--------------------+-------+------+----+------------+----------+----------+----+--------+------------------+
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|     0|   1|    Realengo|Zona Oeste|     285.0| 0.0| 22999.0|     Churrasqueira|
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|     0|   1|    Realengo|Zona Oeste|     285.0| 0.0| 22999.0|          Academia|
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|     0|   1|    Realengo|Zona Oeste|     285.0| 0.0| 22999.0|        Playground|
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Chu

In [38]:
dataset\
    .select("*", f.explode("caracteristicas").alias("cara"))\
    .groupBy("id")\
    .pivot('cara')\
    .agg(f.lit(1))\
    .na\
    .fill(0)\
    .show(truncate=False)

+------------------------------------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|id                                  |Academia|Animais permitidos|Churrasqueira|Condomínio fechado|Elevador|Piscina|Playground|Portaria 24h|Portão eletrônico|Salão de festas|
+------------------------------------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|fd96bbd5-d631-416a-9f84-29091cefe14c|1       |1                 |1            |1                 |1       |1      |1         |1           |1                |1              |
|bfffedfe-99e7-4aed-9a9f-f766225a0f1f|0       |1                 |1            |1                 |1       |0      |1         |0           |1                |1              |
|fcb67af3-5601-415f-9d1f-75280923a4e3|1       |1                 |1            |0                 |0       |1      |1        

In [39]:
caracteristicas = dataset\
    .select("*", f.explode("caracteristicas").alias("cara"))\
    .groupBy("id")\
    .pivot('cara')\
    .agg(f.lit(1))

In [40]:
zona = dataset\
    .groupBy("id")\
    .pivot('zona')\
    .agg(f.lit(1))

In [41]:
dataset.show(10, truncate=False)

+------------------------------------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------+-------+------+----+-----------------------+------------+----------+------+--------+
|id                                  |andar|area_util|banheiros|caracteristicas                                                                                                                |quartos|suites|vaga|bairro                 |zona        |condominio|iptu  |valor   |
+------------------------------------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------+-------+------+----+-----------------------+------------+----------+------+--------+
|03a386b6-7ab8-4eff-891d-f8a16efc1989|0    |43.0     |1        |[Churrasqueira, Academia, Playground, Salão de festas, Condomínio fechado, Portão eletrônico, Portaria 24

In [42]:
linhas_originais = dataset.count()

In [43]:
# juntando os dados utilizando o metodo join
dataset = dataset\
    .join(zona, 'id', how='inner')\
    .join(caracteristicas, 'id', how='left')\
    .drop('zona', 'cara', 'caracteristicas')\
    .na\
    .fill(0)

In [44]:
# comparando se o join funcionou corretamente e ainda temos a mesma quantidade de linhas do dataset original
print("Quantidade de linhas antes: ", linhas_originais)
print("Quantidade de linhas depois: ", dataset.count())
print("Quantidade de linhas perdidas: ", linhas_originais - dataset.count())

Quantidade de linhas antes:  66551
Quantidade de linhas depois:  66551
Quantidade de linhas perdidas:  0


In [45]:
dataset.show(5)

+--------------------+-----+---------+---------+-------+------+----+-----------+----------+------+---------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|                  id|andar|area_util|banheiros|quartos|suites|vaga|     bairro|condominio|  iptu|    valor|Zona Central|Zona Norte|Zona Oeste|Zona Sul|Academia|Animais permitidos|Churrasqueira|Condomínio fechado|Elevador|Piscina|Playground|Portaria 24h|Portão eletrônico|Salão de festas|
+--------------------+-----+---------+---------+-------+------+----+-----------+----------+------+---------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|02fba6ef-a691-442...|    3|     64.0|        1|      2|     2|   1|Jacarepaguá|     784.0|  80.0| 380000.0|           0|         0| 

# Salvando os dados em parquet


In [46]:
dataset.write.parquet(
    '/content/drive/MyDrive/Challenge_Pyspark_Alura/Data/Dataset_ML_Regressao', 
    mode='overwrite')

# Vetorizando os dados


Para utiliz os dados em modelos do PySpark, precisa utilizar a técnica de vetorização dos dados, uutilizando a classe **VectorAssembler** da biblioteca **pyspark.ml.feature**.

In [47]:
from pyspark.ml.feature import VectorAssembler

In [48]:
dataset = dataset.withColumnRenamed('valor','label')

In [49]:
input_col = [i for i in dataset.columns if i not in ['label', 'id', 'bairro']]

In [50]:
assembler = VectorAssembler(inputCols = input_col, outputCol = 'features')

In [51]:
dataset.show(5)

+--------------------+-----+---------+---------+-------+------+----+-----------+----------+------+---------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|                  id|andar|area_util|banheiros|quartos|suites|vaga|     bairro|condominio|  iptu|    label|Zona Central|Zona Norte|Zona Oeste|Zona Sul|Academia|Animais permitidos|Churrasqueira|Condomínio fechado|Elevador|Piscina|Playground|Portaria 24h|Portão eletrônico|Salão de festas|
+--------------------+-----+---------+---------+-------+------+----+-----------+----------+------+---------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|02fba6ef-a691-442...|    3|     64.0|        1|      2|     2|   1|Jacarepaguá|     784.0|  80.0| 380000.0|           0|         0| 

In [52]:
dataset_prep = assembler.transform(dataset).select('features', 'label')

In [53]:
dataset_prep.show(5)

+--------------------+---------+
|            features|    label|
+--------------------+---------+
|(22,[1,2,3,4,5,6,...|4600000.0|
|(22,[0,1,2,3,5,6,...| 360000.0|
|(22,[1,2,3,4,5,10...|1200000.0|
|(22,[1,2,3,6,7,11...| 750000.0|
|(22,[1,2,3,5,6,7,...|1025000.0|
+--------------------+---------+
only showing top 5 rows



# Criação dos modelos de regressão

## Random Forest

In [54]:
from pyspark.ml.regression import RandomForestRegressor

In [55]:
treino, teste = dataset_prep.randomSplit([0.7, 0.3], seed=101)

In [56]:
rfr = RandomForestRegressor(seed=100, maxDepth=13, numTrees=13)

In [57]:
modelo_rfr = rfr.fit(treino)

In [58]:
previsoes_rfr_treino = modelo_rfr.transform(treino)

In [59]:
previsoes_rfr_treino.show()

+--------------------+---------+------------------+
|            features|    label|        prediction|
+--------------------+---------+------------------+
|(22,[0,1,2,3,4,5,...| 445000.0|353749.82709296077|
|(22,[0,1,2,3,4,5,...| 539000.0|  402984.195057041|
|(22,[0,1,2,3,4,5,...| 262000.0|470215.75994919706|
|(22,[0,1,2,3,4,5,...| 460000.0| 336086.3240018727|
|(22,[0,1,2,3,4,5,...| 540000.0| 612086.2377825807|
|(22,[0,1,2,3,4,5,...|1047795.0| 667489.4120075089|
|(22,[0,1,2,3,4,5,...| 750000.0| 676780.9080635932|
|(22,[0,1,2,3,4,5,...| 795000.0| 782285.8340253829|
|(22,[0,1,2,3,4,5,...| 391000.0|379210.16681507626|
|(22,[0,1,2,3,4,5,...| 473400.0| 485153.7257598934|
|(22,[0,1,2,3,4,5,...| 659900.0| 514647.4615972986|
|(22,[0,1,2,3,4,5,...| 797204.0| 598542.4073768324|
|(22,[0,1,2,3,4,5,...| 499000.0| 617244.4348080646|
|(22,[0,1,2,3,4,5,...| 550000.0| 662771.8504180672|
|(22,[0,1,2,3,4,5,...|1400000.0|1403538.8029024499|
|(22,[0,1,2,3,4,5,...|2450000.0| 2396153.769230769|
|(22,[0,1,2,

## Avaliação dos modelos

In [60]:
from pyspark.ml.evaluation import RegressionEvaluator

In [61]:
evaluator = RegressionEvaluator()

In [62]:
# Métricas do Treino

print(f'Métricas r2:', evaluator.evaluate(previsoes_rfr_treino, {evaluator.metricName: "r2"} ))
print(f'Métricas rmse:', evaluator.evaluate(previsoes_rfr_treino, {evaluator.metricName: "rmse"}))

Métricas r2: 0.9170089757352828
Métricas rmse: 424991.2660603118


In [63]:
previsoes_rfr_teste = modelo_rfr.transform(teste)
previsoes_rfr_teste.show()

+--------------------+---------+------------------+
|            features|    label|        prediction|
+--------------------+---------+------------------+
|(22,[0,1,2,3,4,5,...| 349000.0| 359032.1452246784|
|(22,[0,1,2,3,4,5,...| 400000.0|425192.59633974294|
|(22,[0,1,2,3,4,5,...| 318000.0| 429199.8590124885|
|(22,[0,1,2,3,4,5,...| 335000.0| 564686.2141512452|
|(22,[0,1,2,3,4,5,...| 950000.0|1127167.5035606509|
|(22,[0,1,2,3,4,5,...|1250000.0|1309392.5827927727|
|(22,[0,1,2,3,4,5,...| 355000.0|402918.19132837123|
|(22,[0,1,2,3,4,5,...| 300000.0|   465857.97559377|
|(22,[0,1,2,3,4,5,...|1540000.0|1596806.8461538462|
|(22,[0,1,2,3,4,5,...| 290000.0| 432471.0476398443|
|(22,[0,1,2,3,4,5,...| 750000.0| 631085.7483401328|
|(22,[0,1,2,3,4,5,...| 852288.0| 514647.4615972986|
|(22,[0,1,2,3,4,5,...| 940000.0| 766183.7312596486|
|(22,[0,1,2,3,4,5,...| 650000.0|  846480.047278669|
|(22,[0,1,2,3,4,5,...| 750000.0| 429897.5105673917|
|(22,[0,1,2,3,4,5,...| 650000.0| 551056.0241813188|
|(22,[0,1,2,

In [64]:
print('Random Forest Regression')
print("="*30)
print("Dados de Treino")
print("="*30)
print("R²: %f" % evaluator.evaluate(previsoes_rfr_treino, {evaluator.metricName: "r2"}))
print("RMSE: %f" % evaluator.evaluate(previsoes_rfr_treino, {evaluator.metricName: "rmse"}))
print("")
print("="*30)
print("Dados de Teste")
print("="*30)
print("R²: %f" % evaluator.evaluate(previsoes_rfr_teste, {evaluator.metricName: "r2"}))
print("RMSE: %f" % evaluator.evaluate(previsoes_rfr_teste, {evaluator.metricName: "rmse"}))

Random Forest Regression
Dados de Treino
R²: 0.917009
RMSE: 424991.266060

Dados de Teste
R²: 0.829703
RMSE: 604038.468647


# Extra: Otimização

In [65]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [66]:
rfr = RandomForestRegressor()

In [67]:
grid = ParamGridBuilder() \
    .addGrid(rfr.numTrees, [10, 20, 30]) \
    .addGrid(rfr.maxDepth, [5, 10]) \
    .addGrid(rfr.maxBins, [10, 32, 45]) \
    .build()

In [68]:
evaluator = RegressionEvaluator()

In [69]:
rfr_cv = CrossValidator(
    estimator=rfr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3
)

In [70]:
modelo_rfr_cv = rfr_cv.fit(treino)

In [71]:
# Quais os melhores parâmetros para o modelo?

modelo_rfr_cv.bestModel

RandomForestRegressionModel: uid=RandomForestRegressor_b51d5d07855d, numTrees=30, numFeatures=22

In [72]:
previsoes_rfr_cv_teste = modelo_rfr_cv.transform(teste)

In [73]:
print('Random Forest')
print("="*30)
print("Sem Cross Validation")
print("="*30)
print("R²: %f" % evaluator.evaluate(previsoes_rfr_teste, {evaluator.metricName: "r2"}))
print("RMSE: %f" % evaluator.evaluate(previsoes_rfr_teste, {evaluator.metricName: "rmse"}))
print("")
print("="*30)
print("Com Cross Validation")
print("="*30)
print("R²: %f" % evaluator.evaluate(previsoes_rfr_cv_teste, {evaluator.metricName: "r2"}))
print("RMSE: %f" % evaluator.evaluate(previsoes_rfr_cv_teste, {evaluator.metricName: "rmse"}))

Random Forest
Sem Cross Validation
R²: 0.829703
RMSE: 604038.468647

Com Cross Validation
R²: 0.832582
RMSE: 598911.193272


# Extra: Implementando o Gradient-boosted tree regression


In [74]:
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor()

In [75]:
modelo_gbt = gbt.fit(treino)

In [76]:
previsoes_gbt_treino = modelo_gbt.transform(treino);

In [77]:
previsoes_gbt_treino.select("prediction", "label", "features").show(5)

+-----------------+--------+--------------------+
|       prediction|   label|            features|
+-----------------+--------+--------------------+
|268023.3007638289|445000.0|(22,[0,1,2,3,4,5,...|
|268023.3007638289|539000.0|(22,[0,1,2,3,4,5,...|
|477777.5025796987|262000.0|(22,[0,1,2,3,4,5,...|
|281965.9687764651|460000.0|(22,[0,1,2,3,4,5,...|
|607186.1243671293|540000.0|(22,[0,1,2,3,4,5,...|
+-----------------+--------+--------------------+
only showing top 5 rows



In [78]:
previsoes_gbt_teste = modelo_gbt.transform(teste);

In [79]:
print("RMSE nos dados de teste: %g" % evaluator.evaluate(previsoes_gbt_teste, {evaluator.metricName: "rmse"}))
print("R2 nos dados de teste: %g" % evaluator.evaluate(previsoes_gbt_teste, {evaluator.metricName: "r2"}))


RMSE nos dados de teste: 654487
R2 nos dados de teste: 0.800069
