In [1]:
import pandas as pd       # para operar sobre dataframes
import matplotlib.pyplot as plt
import numpy as np        # para manipular vetores e realizar operações
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler # para escalonar os dados
from sklearn.metrics import mean_absolute_error, mean_squared_error # para obter métricas do modelo

from sklearn.model_selection import train_test_split

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Regressão múltipla com estimator na prática

Aqui vamos aplicar regressão múltipla com estimators em um banco de dados obtido pela kaggle, chamada "House Sales in King County, USA". Pode ser obtida pelo link: https://www.kaggle.com/harlfoxem/housesalesprediction

Ou seja, vamos utilizar quase todos atributos para predizer o preço da casa. 

O foco dessa atividade é trabalhar mais ativamente com as funções do pandas.


In [2]:
# Lendo o banco de dados
base = pd.read_csv("house_price.csv")

# Para confirmarmos se está tudo certo com o banco olhamos o começinho dele.
base.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Encontrando as dimensões do banco de dados
base.shape

(21613, 21)

In [4]:
# Consultando os nomes de todas colunas do dataset
base.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [5]:
# Vamos selecionar apenas as de interesse
colunas_usadas = ['price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long']
print(colunas_usadas)

['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long']


In [6]:
# Agora vamos carregar novamente o banco apenas com as colunas que desejamos
base = pd.read_csv("house_price.csv", usecols=colunas_usadas)
base.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045


Até então estavamos utilizando a padronização, mas agora vamos usar a normalização onde os valores ficam entre 0 e 1.
Notar que na padronização podemos ter números negativos e ela pode ser mais indicada quando o dataset possui outliers.

In [7]:
# Normalizando os dados
scaler_atributos = MinMaxScaler()

# notar que retiramos o price, pois é nosso objetivo
base_normalizada = base
base_normalizada[[ 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 
                  'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 
                  'long']] = scaler_atributos.fit_transform(base[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
                                             'floors', 'waterfront', 'view', 'condition', 'grade',
                                             'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 
                                             'zipcode', 'lat', 'long']])
base_normalizada.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,221900.0,0.090909,0.125,0.06717,0.003108,0.0,0.0,0.0,0.5,0.5,0.097588,0.0,0.478261,0.0,0.893939,0.571498,0.217608
1,538000.0,0.090909,0.28125,0.172075,0.004072,0.4,0.0,0.0,0.5,0.5,0.20614,0.082988,0.443478,0.988089,0.626263,0.908959,0.166113
2,180000.0,0.060606,0.125,0.036226,0.005743,0.0,0.0,0.0,0.5,0.416667,0.052632,0.0,0.286957,0.0,0.136364,0.936143,0.237542
3,604000.0,0.121212,0.375,0.126038,0.002714,0.0,0.0,0.0,1.0,0.5,0.083333,0.188797,0.565217,0.0,0.681818,0.586939,0.104651
4,510000.0,0.090909,0.25,0.104906,0.004579,0.0,0.0,0.0,0.5,0.583333,0.152412,0.0,0.756522,0.0,0.368687,0.741354,0.393688


Podemos notar que a coluna price não está normalizada, então precisamos normalizar o preço...



In [8]:
scaler_preco = MinMaxScaler()
base_normalizada[['price']] = scaler_preco.fit_transform(base_normalizada[['price']])
base_normalizada.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,0.019266,0.090909,0.125,0.06717,0.003108,0.0,0.0,0.0,0.5,0.5,0.097588,0.0,0.478261,0.0,0.893939,0.571498,0.217608
1,0.060721,0.090909,0.28125,0.172075,0.004072,0.4,0.0,0.0,0.5,0.5,0.20614,0.082988,0.443478,0.988089,0.626263,0.908959,0.166113
2,0.01377,0.060606,0.125,0.036226,0.005743,0.0,0.0,0.0,0.5,0.416667,0.052632,0.0,0.286957,0.0,0.136364,0.936143,0.237542
3,0.069377,0.121212,0.375,0.126038,0.002714,0.0,0.0,0.0,1.0,0.5,0.083333,0.188797,0.565217,0.0,0.681818,0.586939,0.104651
4,0.057049,0.090909,0.25,0.104906,0.004579,0.0,0.0,0.0,0.5,0.583333,0.152412,0.0,0.756522,0.0,0.368687,0.741354,0.393688


In [9]:
# Obtendo os atributos de interesse
X = base_normalizada.drop('price', axis=1) # obtem a tabela, exceto a coluna 'price'
Y = base_normalizada.price

# Visualizando os resultados
print("Atributos : \n",X)
print("\n \nLabels : \n", Y)

Atributos : 
        bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
0      0.090909    0.12500     0.067170  0.003108     0.0         0.0   0.0   
1      0.090909    0.28125     0.172075  0.004072     0.4         0.0   0.0   
2      0.060606    0.12500     0.036226  0.005743     0.0         0.0   0.0   
3      0.121212    0.37500     0.126038  0.002714     0.0         0.0   0.0   
4      0.090909    0.25000     0.104906  0.004579     0.0         0.0   0.0   
...         ...        ...          ...       ...     ...         ...   ...   
21608  0.090909    0.31250     0.093585  0.000370     0.8         0.0   0.0   
21609  0.121212    0.31250     0.152453  0.003206     0.4         0.0   0.0   
21610  0.060606    0.09375     0.055094  0.000503     0.4         0.0   0.0   
21611  0.090909    0.31250     0.098868  0.001132     0.4         0.0   0.0   
21612  0.060606    0.09375     0.055094  0.000337     0.4         0.0   0.0   

       condition     grade  sqft_abov

 Temos outras formas de visualização também... 

In [10]:
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,0.090909,0.125,0.06717,0.003108,0.0,0.0,0.0,0.5,0.5,0.097588,0.0,0.478261,0.0,0.893939,0.571498,0.217608
1,0.090909,0.28125,0.172075,0.004072,0.4,0.0,0.0,0.5,0.5,0.20614,0.082988,0.443478,0.988089,0.626263,0.908959,0.166113
2,0.060606,0.125,0.036226,0.005743,0.0,0.0,0.0,0.5,0.416667,0.052632,0.0,0.286957,0.0,0.136364,0.936143,0.237542
3,0.121212,0.375,0.126038,0.002714,0.0,0.0,0.0,1.0,0.5,0.083333,0.188797,0.565217,0.0,0.681818,0.586939,0.104651
4,0.090909,0.25,0.104906,0.004579,0.0,0.0,0.0,0.5,0.583333,0.152412,0.0,0.756522,0.0,0.368687,0.741354,0.393688


In [11]:
Y.head()

0    0.019266
1    0.060721
2    0.013770
3    0.069377
4    0.057049
Name: price, dtype: float64

In [12]:
# Observando que os tipos são do pandas e não do numpy
print("Tipo de X: ", type(X))
print("Tipo de Y: ", type(Y))

Tipo de X:  <class 'pandas.core.frame.DataFrame'>
Tipo de Y:  <class 'pandas.core.series.Series'>


Agora vamos definir as colunas que vamos utilizar como atributos que estão no dataset. Nesse caso não podemos incluir a coluna price pois é o que queremos prever.

In [13]:
colunas_para_prever = colunas_usadas[1:17] # tira o price
colunas_para_prever

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long']

In [14]:
# Agora teremos que criar um feature_column para cada coluna, para isso utilizamos um for dentro da função
colunas = [tf.feature_column.numeric_column(key=c) for c in colunas_para_prever ]

# Verificando se deu certo
print(colunas[0])
print(colunas[1])
print(colunas[2])

NumericColumn(key='bedrooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)
NumericColumn(key='bathrooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)
NumericColumn(key='sqft_living', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)


Agora vamos separar os dados em treinamento e teste.

In [29]:
# Aqui estamos separando 30% do dataset para testarmos e 70% para treinarmos o modelo
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

# Analisando as dimensões
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("Y_train: ", Y_train.shape)
print("Y_test: ", Y_test.shape)

X_train:  (15129, 16)
X_test:  (6484, 16)
Y_train:  (15129,)
Y_test:  (6484,)


Agora vamos fazer as funções de input para 'injetar' os dados no tensorflow depois. Nesse caso estamos usando pandas e não o numpy. **Não esquecer.**


In [16]:
funcao_train = tf.estimator.inputs.pandas_input_fn(x = X_train, y = Y_train,
                                                  batch_size=32,# pega em lotes de 32
                                                  num_epochs=None, # quantidade de vezes que vai rodar
                                                  shuffle=True) # pega ordem aleatória

funcao_test = tf.estimator.inputs.pandas_input_fn(x = X_test, y = Y_test,
                                                  batch_size=32,# pega em lotes de 32
                                                  num_epochs=10000, # quantidade de vezes que vai rodar
                                                  shuffle=False) # pega em ordem 

In [17]:
# Agora vamos definir o regressor
regressor = tf.estimator.LinearRegressor(feature_columns=colunas)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Natielle\\AppData\\Local\\Temp\\tmpavy31dao', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001C42FA45BE0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [37]:
# Agora vamos treinar o modelo
regressor.train(input_fn= funcao_train, steps = 10000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Natielle\AppData\Local\Temp\tmpavy31dao\model.ckpt-10000
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 10000 into C:\Users\Natielle\AppData\Local\Temp\tmpavy31dao\model.ckpt.
INFO:tensorflow:loss = 0.049740076, step = 10001
INFO:tensorflow:global_step/sec: 228.898
INFO:tensorflow:loss = 0.008383008, step = 10101 (0.437 sec)
INFO:tensorflow:global_step/sec: 242.416
INFO:tensorflow:loss = 0.018427094, step = 10201 (0.418 sec)
INFO:tensorflow:global_step/sec: 277.207
INFO:tensorflow:loss = 0.04507032, step = 10301 (0.356 sec)
INFO:tensorflow:global_step/sec: 322.868
INFO:tensorflow:loss = 0.018763697, step = 10401 (0.314 sec)
INFO:tensorflow:gl

INFO:tensorflow:loss = 0.03214763, step = 17301 (0.347 sec)
INFO:tensorflow:global_step/sec: 249.528
INFO:tensorflow:loss = 0.0060807634, step = 17401 (0.385 sec)
INFO:tensorflow:global_step/sec: 249.236
INFO:tensorflow:loss = 0.017803678, step = 17501 (0.417 sec)
INFO:tensorflow:global_step/sec: 343.001
INFO:tensorflow:loss = 0.03984508, step = 17601 (0.281 sec)
INFO:tensorflow:global_step/sec: 367.205
INFO:tensorflow:loss = 0.01888584, step = 17701 (0.274 sec)
INFO:tensorflow:global_step/sec: 350.73
INFO:tensorflow:loss = 0.016874393, step = 17801 (0.279 sec)
INFO:tensorflow:global_step/sec: 309.74
INFO:tensorflow:loss = 0.025777198, step = 17901 (0.323 sec)
INFO:tensorflow:global_step/sec: 343.193
INFO:tensorflow:loss = 0.016657107, step = 18001 (0.295 sec)
INFO:tensorflow:global_step/sec: 294.904
INFO:tensorflow:loss = 0.017287657, step = 18101 (0.337 sec)
INFO:tensorflow:global_step/sec: 255.643
INFO:tensorflow:loss = 0.017356765, step = 18201 (0.389 sec)
INFO:tensorflow:global_st

<tensorflow_estimator.python.estimator.canned.linear.LinearRegressor at 0x1c42fa45128>

Após treinarmos, podemos observar algumas métricas.


In [38]:
metricas_train = regressor.evaluate(input_fn=funcao_train, steps=10000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-05-06T03:13:24Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Natielle\AppData\Local\Temp\tmpavy31dao\model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1000/10000]
INFO:tensorflow:Evaluation [2000/10000]
INFO:tensorflow:Evaluation [3000/10000]
INFO:tensorflow:Evaluation [4000/10000]
INFO:tensorflow:Evaluation [5000/10000]
INFO:tensorflow:Evaluation [6000/10000]
INFO:tensorflow:Evaluation [7000/10000]
INFO:tensorflow:Evaluation [8000/10000]
INFO:tensorflow:Evaluation [9000/10000]
INFO:tensorflow:Evaluation [10000/10000]
INFO:tensorflow:Finished evaluation at 2020-05-06-03:13:55
INFO:tensorflow:Saving dict for global step 20000: average_loss = 0.0007184091, global_step = 20000, label/mean = 0.061093275, loss = 0.02298909, prediction/mean = 0.062283684
INF

In [39]:
metricas_test = regressor.evaluate(input_fn=funcao_test, steps=10000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-05-06T03:13:57Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Natielle\AppData\Local\Temp\tmpavy31dao\model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1000/10000]
INFO:tensorflow:Evaluation [2000/10000]
INFO:tensorflow:Evaluation [3000/10000]
INFO:tensorflow:Evaluation [4000/10000]
INFO:tensorflow:Evaluation [5000/10000]
INFO:tensorflow:Evaluation [6000/10000]
INFO:tensorflow:Evaluation [7000/10000]
INFO:tensorflow:Evaluation [8000/10000]
INFO:tensorflow:Evaluation [9000/10000]
INFO:tensorflow:Evaluation [10000/10000]
INFO:tensorflow:Finished evaluation at 2020-05-06-03:14:29
INFO:tensorflow:Saving dict for global step 20000: average_loss = 0.000671062, global_step = 20000, label/mean = 0.060745265, loss = 0.021473983, prediction/mean = 0.061987665
INF

In [40]:
print("Métricas treinamento: \n", metricas_train)
print("Métricas teste: \n", metricas_test)

Métricas treinamento: 
 {'average_loss': 0.0007184091, 'label/mean': 0.061093275, 'loss': 0.02298909, 'prediction/mean': 0.062283684, 'global_step': 20000}
Métricas teste: 
 {'average_loss': 0.000671062, 'label/mean': 0.060745265, 'loss': 0.021473983, 'prediction/mean': 0.061987665, 'global_step': 20000}


**Iremos realizar a predição de alguns exemplos.**

In [41]:
funcao_previsao = tf.estimator.inputs.pandas_input_fn(x = X_test,
                                                  shuffle=False) # pega em ordem 

previsoes = regressor.predict(input_fn=funcao_previsao)
list(previsoes) # mostra os resultados normalizados

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Natielle\AppData\Local\Temp\tmpavy31dao\model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'predictions': array([0.03130377], dtype=float32)},
 {'predictions': array([0.12601155], dtype=float32)},
 {'predictions': array([0.01284308], dtype=float32)},
 {'predictions': array([0.04682913], dtype=float32)},
 {'predictions': array([0.0295601], dtype=float32)},
 {'predictions': array([0.04924708], dtype=float32)},
 {'predictions': array([0.01305094], dtype=float32)},
 {'predictions': array([0.07813163], dtype=float32)},
 {'predictions': array([0.06551817], dtype=float32)},
 {'predictions': array([0.03376254], dtype=float32)},
 {'predictions': array([0.02563023], dtype=float32)},
 {'predictions': array([0.11356162], dtype=float32)},
 {'predictions': array([0.02392885], dtype=float32)},
 {'predictions': array([0.10446045], dtype=float32)},
 {'predictions': array([0.05802169], dtype=float32)},
 {'predictions': array([0.04985686], dtype=float32)},
 {'predictions': array([0.01770943], dtype=float32)},
 {'predictions': array([0.08217108], dtype=float32)},
 {'predictions': array([0.089

In [42]:
# agora vamos visualizar os valores preditos ainda com a normalização
valores_previsao = []
for p in regressor.predict(input_fn=funcao_previsao):
    valores_previsao.append(p['predictions'])
    
valores_previsao


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Natielle\AppData\Local\Temp\tmpavy31dao\model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[array([0.03130377], dtype=float32),
 array([0.12601155], dtype=float32),
 array([0.01284308], dtype=float32),
 array([0.04682913], dtype=float32),
 array([0.0295601], dtype=float32),
 array([0.04924708], dtype=float32),
 array([0.01305094], dtype=float32),
 array([0.07813163], dtype=float32),
 array([0.06551817], dtype=float32),
 array([0.03376254], dtype=float32),
 array([0.02563023], dtype=float32),
 array([0.11356162], dtype=float32),
 array([0.02392885], dtype=float32),
 array([0.10446045], dtype=float32),
 array([0.05802169], dtype=float32),
 array([0.04985686], dtype=float32),
 array([0.01770943], dtype=float32),
 array([0.08217108], dtype=float32),
 array([0.08961725], dtype=float32),
 array([0.04416753], dtype=float32),
 array([0.02132387], dtype=float32),
 array([-6.7315996e-05], dtype=float32),
 array([0.06676462], dtype=float32),
 array([0.09719938], dtype=float32),
 array([0.07618383], dtype=float32),
 array([0.04731706], dtype=float32),
 array([0.10069418], dtype=float32)

**Agora vamos calcular os erros e para isso teremos que fazer algumas conversões de tipo.**

In [43]:
# arrumando o formato dos dados
valores_previsao = np.asarray(valores_previsao).reshape(-1,1)
valores_previsao

array([[0.03130377],
       [0.12601155],
       [0.01284308],
       ...,
       [0.02836864],
       [0.04796293],
       [0.06189564]], dtype=float32)

In [44]:
valores_previsao_final = scaler_preco.inverse_transform(valores_previsao)
valores_previsao_final # sem os dados normalizados

array([[ 313691.25],
       [1035838.06],
       [ 172928.48],
       ...,
       [ 291310.9 ],
       [ 440717.3 ],
       [ 546954.25]], dtype=float32)

In [45]:
# Temos que deixar a dimensão de Y_test (6484,1) e não (6484,)
print(Y_test.shape)
Y_test_values_norm = Y_test.values.reshape(-1,1)
print(Y_test_values_norm.shape)

(6484,)
(6484, 1)


In [46]:
# Desnormalizando
print("Y_test_values_norm:\n", Y_test_values_norm)
Y_test_values_real = scaler_preco.inverse_transform(Y_test_values_norm)
print("\n\nY_test_values_real:\n", Y_test_values_real)

Y_test_values_norm:
 [[0.02754098]
 [0.10295082]
 [0.01193443]
 ...
 [0.05245902]
 [0.04957377]
 [0.04131148]]


Y_test_values_real:
 [[285000.]
 [860000.]
 [166000.]
 ...
 [475000.]
 [453000.]
 [390000.]]


In [47]:
mae = mean_absolute_error(Y_test_values_real, valores_previsao_final)
mse = mean_squared_error(Y_test_values_real, valores_previsao_final)

print("Erro absoluto médio", mae)
print("Erro ao quadrado médio", mse)

Erro absoluto médio 130393.63335654502
Erro ao quadrado médio 44060195791.28518


In [None]:
Notar que quando usamos apenas um atributo o mae era cerca de 173 mil pra cima ou pra baixo.
E quando utilizamos todos os outros atributos pra prever, abaixou paara 130 mil a margem de erro. 
