# Recursos Necessários

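O programa abaixo foi executado no Google Colab. Os arquivos da pasta "pof_ipm_nm" deste repositório devem estar disponíveis no Google Drive montado em `/content/drive`, nos caminhos usados na célula de leitura dos dados.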

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the spreadsheet paths read later in
# this notebook live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=5915da3afc81cc390e8b91e1ffa24b852491b3fa357540f90aed2a42e67c4164
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
import findspark

In [4]:
# Workbooks to load, one per survey folder (2008_2009 and 2017_2018).
# NOTE(review): these are absolute paths into one specific Google Drive
# layout; anyone re-running the notebook must reproduce it or edit the list.
pof6_sources = [
    '/content/drive/MyDrive/T4M8G2/ESSEMBLE/POF_IPM_NM/2008_2009/Tabela 6a.xlsx',
    '/content/drive/MyDrive/T4M8G2/ESSEMBLE/POF_IPM_NM/2017_2018/Tabela 6b.xlsx',
]

# Read each workbook into its own frame, in the order listed above.
pof6_2008_2009, pof6_2017_2018 = map(pd.read_excel, pof6_sources)

# Stack both frames row-wise and renumber the index from 0.
pof6 = pd.concat([pof6_2008_2009, pof6_2017_2018], ignore_index=True)

In [5]:
# Print a summary of the combined frame: index range, column names,
# non-null counts and dtypes.
pof6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 10 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Unidades da Federação                               56 non-null     object 
 1   Proporção de pessoas das famílias residentes (%)    56 non-null     float64
 2   Proporção de pessoas com algum grau de pobreza (%)  56 non-null     float64
 3   IPM-NM                                              56 non-null     float64
 4   Moradia                                             56 non-null     float64
 5   Acesso aos serviços de utilidade pública            56 non-null     float64
 6   Saúde e alimentação                                 56 non-null     float64
 7   Educação                                            56 non-null     float64
 8   Acesso a serviços financeiros e padrão de vida      56 non-null     float64
 9   T

In [6]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import findspark

# Base Spark configuration: request 2 GB of executor memory.
# NOTE(review): no master is configured in this cell; if the session ends up
# in local mode this executor setting may have no effect — confirm whether
# spark.driver.memory was the intended knob.
conf = SparkConf().set("spark.executor.memory", "2g")

# Make the installed Spark distribution discoverable from this kernel.
findspark.init()

# Create the session used by every later cell, or reuse one that already exists.
# NOTE(review): the driver port is pinned to 7077; confirm that a fixed port
# is actually required in this environment.
spark = (
    SparkSession.builder
    .appName("RandomFlorestIPMNMSpark")
    .config("spark.driver.port", "7077")
    .config(conf=conf)
    .getOrCreate()
)

In [7]:
# Columns kept for modelling: six dimension columns plus 'IPM-NM', which later
# cells use as the regression label.
selected_columns = [
    'Moradia',
    'Acesso aos serviços de utilidade pública',
    'Saúde e alimentação',
    'Educação',
    'Acesso a serviços financeiros e padrão de vida',
    'Transporte e lazer',
    'IPM-NM',
]

# Convert the pandas frame to a Spark DataFrame, then project onto those columns.
pof6_spark = spark.createDataFrame(pof6)
df_spark = pof6_spark.select(*selected_columns)

In [8]:
from pyspark.ml.feature import VectorAssembler

# Input columns that are packed into the single "features" vector column.
feature_cols = ['Moradia', 'Acesso aos serviços de utilidade pública', 'Saúde e alimentação',
                'Educação', 'Acesso a serviços financeiros e padrão de vida', 'Transporte e lazer']
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Drop any "features" column left over from a previous run of this cell before
# assembling. DataFrame.drop is a no-op when the column is absent, so the first
# run is unchanged, while a re-run no longer fails because the output column
# already exists.
df_spark = vector_assembler.transform(df_spark.drop("features"))

# Random ~80/20 train/test split with a fixed seed.
train_data, test_data = df_spark.randomSplit([0.8, 0.2], seed=42)

In [9]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor: 100 trees, fixed seed, reading the "features"
# vector column and predicting the "IPM-NM" column.
rf_model = RandomForestRegressor(
    featuresCol="features",
    labelCol="IPM-NM",
    numTrees=100,
    seed=42,
)

# Fit on the training split, then score the held-out split.
model = rf_model.fit(train_data)
predictions = model.transform(test_data)

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the test-set predictions once per metric. A fresh evaluator is
# built for each metric, so after the loop `evaluator` is the "r2" one.
metric_values = {}
for metric_name in ("mse", "rmse", "mae", "r2"):
    evaluator = RegressionEvaluator(
        labelCol="IPM-NM",
        predictionCol="prediction",
        metricName=metric_name,
    )
    metric_values[metric_name] = evaluator.evaluate(predictions)

# Expose each metric under the name the reporting cell expects.
mse_rf = metric_values["mse"]
rmse_rf = metric_values["rmse"]
mae_rf = metric_values["mae"]
r2_rf = metric_values["r2"]

In [11]:
# Report the four test-set metrics, one per line.
print(
    f"Mean Squared Error (MSE): {mse_rf}",
    f"Root Mean Squared Error (RMSE): {rmse_rf}",
    f"Mean Absolute Error (MAE): {mae_rf}",
    f"R-squared (R²): {r2_rf}",
    sep="\n",
)

# Shut down the Spark session now that the results have been reported.
spark.stop()

Mean Squared Error (MSE): 3.939212816363966
Root Mean Squared Error (RMSE): 1.9847450255294674
Mean Absolute Error (MAE): 1.533741693640499
R-squared (R²): 0.7684213380529044
