In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=3e581bb4e2523632283165278be4b8f552722644df31edd92c464f830fbc2627
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [28]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import Vectors

In [3]:
# Iniciar a Spark session
spark = SparkSession.builder.appName("HouseRentAnalysis").getOrCreate()

In [4]:
# Carregar os dados no Spark DataFrame
df = spark.read.csv('/content/houses_to_rent_v2.csv', header=True, inferSchema=True)

In [5]:
# Verificar o schema dos dados
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- rooms: integer (nullable = true)
 |-- bathroom: integer (nullable = true)
 |-- parking spaces: integer (nullable = true)
 |-- floor: string (nullable = true)
 |-- animal: string (nullable = true)
 |-- furniture: string (nullable = true)
 |-- hoa (R$): integer (nullable = true)
 |-- rent amount (R$): integer (nullable = true)
 |-- property tax (R$): integer (nullable = true)
 |-- fire insurance (R$): integer (nullable = true)
 |-- total (R$): integer (nullable = true)



In [6]:
# Remover duplicatas com base em colunas específicas
df = df.dropDuplicates(subset=['area', 'rooms', 'bathroom', 'parking spaces', 'hoa (R$)','rent amount (R$)', 'property tax (R$)', 'fire insurance (R$)', 'total (R$)'])

In [7]:
# Filtrar linhas com 'hoa (R$)' <= 1.5 * 'rent amount (R$)'
df = df.filter(df['hoa (R$)'] <= 1.5 * df['rent amount (R$)'])

In [8]:
# Tratar a coluna 'floor' e converter valores '-' para 0 e o restante para int
df = df.withColumn('floor', when(col('floor') == '-', 0).otherwise(col('floor').cast('int')))

In [9]:
# Converter variáveis categóricas para valores numéricos
df = df.withColumn('animal', when(col('animal') == 'acept', 1).otherwise(0))
df = df.withColumn('furniture', when(col('furniture') == 'furnished', 1).otherwise(0))

In [10]:
# Filtrar apenas pela cidade de São Paulo
df = df.filter(df['city'] == 'São Paulo')

In [11]:
# Remover a coluna 'city'
df = df.drop('city')

In [12]:
# Definir as variáveis independentes (features) e a variável dependente (label)
feature_cols = ['area', 'rooms', 'bathroom', 'parking spaces', 'hoa (R$)','property tax (R$)', 'fire insurance (R$)', 'furniture', 'animal']

In [13]:
# Criar o assembler para combinar as colunas de características em um vetor
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [14]:
# Transformar o dataframe com o assembler
df = assembler.transform(df)

In [15]:
# Selecionar as colunas de interesse (features e label)
df = df.select(col("features"), col("rent amount (R$)").alias("label"))

In [16]:
# Dividir os dados em conjunto de treino (80%) e teste (20%)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

In [17]:
# Inicializar o modelo de Regressão Linear
lr = LinearRegression(featuresCol='features', labelCol='label')

In [18]:
# Treinar o modelo nos dados de treino
lr_model = lr.fit(train_df)

In [19]:
# Avaliar o modelo nos dados de teste
predictions = lr_model.transform(test_df)

In [20]:
# Exibir sumário do modelo
print(f"Coeficientes: {lr_model.coefficients}")
print(f"Intercepto: {lr_model.intercept}")

Coeficientes: [-0.049386822694035964,-59.29291238797612,-21.209105157080227,-49.23718918090523,0.28752651799766477,-0.0012488179920979044,71.23857776045958,119.41910303113252,-8.932901933863656]
Intercepto: 178.24994500588377


In [21]:
# Avaliar o desempenho com métricas de regressão
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print(f"R²: {r2}")

R²: 0.9899588383222807


In [22]:
#Calcular o RMSE (Root Mean Squared Error)
rmse_evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

RMSE: 355.5815061505836


In [23]:
#Validação cruzada com 5 folds
paramGrid = ParamGridBuilder().build()
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

In [24]:
#Ajustar o modelo usando validação cruzada
cv_model = crossval.fit(train_df)

In [25]:
#Avaliar o modelo de validação cruzada nos dados de teste
cv_predictions = cv_model.transform(test_df)
r2_cv = evaluator.evaluate(cv_predictions)
rmse_cv = rmse_evaluator.evaluate(cv_predictions)

In [26]:
print(f"R² com Validação Cruzada: {r2_cv}")
print(f"RMSE com Validação Cruzada: {rmse_cv}")

R² com Validação Cruzada: 0.9899588383222807
RMSE com Validação Cruzada: 355.5815061505836


In [29]:
#Simulação de um novo imóvel com as mesmas características
new_data = Row(area=200, rooms=4, bathroom=3, parking_spaces=3,
               hoa_R=2400, property_tax_R=0, fire_insurance_R=82,
               furniture=0, animal=0)

#Converter o novo imóvel para um DataFrame do Spark
new_df = spark.createDataFrame([new_data])

#Criar um VectorAssembler para as novas entradas (com as mesmas colunas usadas no treinamento)
assembler = VectorAssembler(inputCols=['area', 'rooms', 'bathroom', 'parking_spaces',
                                       'hoa_R', 'property_tax_R', 'fire_insurance_R',
                                       'furniture', 'animal'], outputCol='features')

#Transformar os dados de entrada para o formato correto
new_df = assembler.transform(new_df)

#Fazer a previsão usando o modelo treinado
new_predictions = lr_model.transform(new_df)

#Exibir o valor previsto
predicted_total_rent = new_predictions.select("prediction").collect()[0][0]
print(f"Valor previsto do aluguel total: R$ {predicted_total_rent:,.2f}")

Valor previsto do aluguel total: R$ 6,251.49


In [31]:
#Consulta no dataset utilizando os mesmos valores para simulacao
df.filter(
    (col('area') == 200) &
    (col('rooms') == 4) &
    (col('bathroom') == 3) &
    (col('parking spaces') == 3) &
    (col('hoa (R$)') == 2400) &
    (col('property tax (R$)') == 0) &
    (col('fire insurance (R$)') == 82) &
    (col('furniture') == 0) &
    (col('animal') == 0)
).show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[200.0,4.0,3.0,3....| 6400|
+--------------------+-----+

