In [1]:
import os
import findspark

os.environ["SPARK_HOME"] = "C:\spark"
findspark.init()

# To display scroll:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
    .appName('Challenge 3')\
    .getOrCreate()
spark

In [4]:
df = spark.read.parquet('dataset_ml_parquet')

In [5]:
df.show(5)

+--------------------+-----+---------+---------+-------+------+----+------------+----------+-----+--------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|                  id|andar|area_util|banheiros|quartos|suites|vaga|      bairro|condominio| iptu|   valor|Zona Central|Zona Norte|Zona Oeste|Zona Sul|Academia|Animais permitidos|Churrasqueira|Condomínio fechado|Elevador|Piscina|Playground|Portaria 24h|Portão eletrônico|Salão de festas|
+--------------------+-----+---------+---------+-------+------+----+------------+----------+-----+--------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+
|00002dd9-cc74-480...|    2|       35|        1|      1|   0.0| 0.0|Santo Cristo|     100.0|100.0|245000.0|           1|         0|     

# Transforming data

In [12]:
from pyspark.ml.feature import StringIndexer

In [15]:
# TODO: Try exploding into different columns
indexed = StringIndexer(inputCol='bairro', outputCol='bairro_ind').fit(df).transform(df)

In [16]:
indexed.show(5)

+--------------------+-----+---------+---------+-------+------+----+------------+----------+-----+--------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+----------+
|                  id|andar|area_util|banheiros|quartos|suites|vaga|      bairro|condominio| iptu|   valor|Zona Central|Zona Norte|Zona Oeste|Zona Sul|Academia|Animais permitidos|Churrasqueira|Condomínio fechado|Elevador|Piscina|Playground|Portaria 24h|Portão eletrônico|Salão de festas|bairro_ind|
+--------------------+-----+---------+---------+-------+------+----+------------+----------+-----+--------+------------+----------+----------+--------+--------+------------------+-------------+------------------+--------+-------+----------+------------+-----------------+---------------+----------+
|00002dd9-cc74-480...|    2|       35|        1|      1|   0.0| 0.0|Santo Cristo|     100.0|100.0|24500

In [17]:
from pyspark.ml.feature import VectorAssembler

In [20]:
X = [col for col in indexed.columns if col not in ['id', 'bairro']]
X

['andar',
 'area_util',
 'banheiros',
 'quartos',
 'suites',
 'vaga',
 'condominio',
 'iptu',
 'valor',
 'Zona Central',
 'Zona Norte',
 'Zona Oeste',
 'Zona Sul',
 'Academia',
 'Animais permitidos',
 'Churrasqueira',
 'Condomínio fechado',
 'Elevador',
 'Piscina',
 'Playground',
 'Portaria 24h',
 'Portão eletrônico',
 'Salão de festas',
 'bairro_ind']

In [22]:
vectorized = VectorAssembler(inputCols=X, outputCol='features')\
    .transform(indexed).select(['features'])

In [24]:
vectorized.show()

+--------------------+
|            features|
+--------------------+
|[2.0,35.0,1.0,1.0...|
|[1.0,84.0,2.0,2.0...|
|(24,[1,2,3,6,7,8,...|
|(24,[1,2,3,5,6,7,...|
|(24,[1,2,3,4,5,6,...|
|[0.0,200.0,6.0,4....|
|(24,[1,2,3,6,8,10...|
|(24,[0,1,2,3,4,5,...|
|(24,[1,2,3,4,5,8,...|
|[0.0,41.0,1.0,1.0...|
|[5.0,78.0,1.0,2.0...|
|[0.0,170.0,3.0,3....|
|[0.0,173.0,3.0,3....|
|(24,[1,2,3,4,5,6,...|
|[9.0,120.0,2.0,2....|
|[20.0,341.0,2.0,3...|
|[0.0,194.0,5.0,4....|
|(24,[1,2,3,8,10,2...|
|(24,[0,1,2,3,8,10...|
|[10.0,72.0,1.0,2....|
+--------------------+
only showing top 20 rows



# Scaling

In [25]:
from pyspark.ml.feature import StandardScaler

In [26]:
scaled = StandardScaler(inputCol='features', outputCol='scaled_features').fit(vectorized).transform(vectorized)

In [27]:
scaled.show()

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[2.0,35.0,1.0,1.0...|[0.13607726247524...|
|[1.0,84.0,2.0,2.0...|[0.06803863123762...|
|(24,[1,2,3,6,7,8,...|(24,[1,2,3,6,7,8,...|
|(24,[1,2,3,5,6,7,...|(24,[1,2,3,5,6,7,...|
|(24,[1,2,3,4,5,6,...|(24,[1,2,3,4,5,6,...|
|[0.0,200.0,6.0,4....|[0.0,2.2447697820...|
|(24,[1,2,3,6,8,10...|(24,[1,2,3,6,8,10...|
|(24,[0,1,2,3,4,5,...|(24,[0,1,2,3,4,5,...|
|(24,[1,2,3,4,5,8,...|(24,[1,2,3,4,5,8,...|
|[0.0,41.0,1.0,1.0...|[0.0,0.4601778053...|
|[5.0,78.0,1.0,2.0...|[0.34019315618810...|
|[0.0,170.0,3.0,3....|[0.0,1.9080543147...|
|[0.0,173.0,3.0,3....|[0.0,1.9417258614...|
|(24,[1,2,3,4,5,6,...|(24,[1,2,3,4,5,6,...|
|[9.0,120.0,2.0,2....|[0.61234768113858...|
|[20.0,341.0,2.0,3...|[1.36077262475241...|
|[0.0,194.0,5.0,4....|[0.0,2.1774266885...|
|(24,[1,2,3,8,10,2...|(24,[1,2,3,8,10,2...|
|(24,[0,1,2,3,8,10...|(24,[0,1,2,3,8,10...|
|[10.0,72.0,1.0,2....|[0.6803863

# PCA

In [30]:
from pyspark.ml.feature import PCA

In [122]:
k = 6

In [123]:
pca = PCA(k=k, inputCol='scaled_features', outputCol='pca_features')
model_pca = pca.fit(scaled)
df_pca = model_pca.transform(scaled)

In [124]:
model_pca.explainedVariance

DenseVector([0.2562, 0.1692, 0.0875, 0.0624, 0.0501, 0.0447])

In [125]:
import numpy as np

In [126]:
acc_variance = np.array([sum(model_pca.explainedVariance[0:i+1]) for i in range(k)])
acc_variance

array([0.25617114, 0.42541565, 0.51289371, 0.5753326 , 0.62539143,
       0.67006927])

In [121]:
k = np.where(acc_variance >= 0.7)[0][0]
k

6