In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df_sp = pd.read_csv('sao-paulo-properties-april-2019.csv')

In [3]:
df_rent = df_sp[df_sp['Negotiation Type'] == 'rent']

In [4]:
from sklearn.preprocessing import OneHotEncoder
cat_enconder = OneHotEncoder()

In [5]:
df_cleaned = df_rent.drop(['New', 'Negotiation Type','Property Type'], axis=1)

In [6]:
housing_cat_1hot = cat_enconder.fit_transform(df_cleaned[['District']])

In [7]:
housing_cat_1hot

<7228x94 sparse matrix of type '<class 'numpy.float64'>'
	with 7228 stored elements in Compressed Sparse Row format>

In [8]:
housing_cat_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
one_hot = pd.get_dummies(df_cleaned['District'])
one_hot

Unnamed: 0,Alto de Pinheiros/São Paulo,Anhanguera/São Paulo,Aricanduva/São Paulo,Artur Alvim/São Paulo,Barra Funda/São Paulo,Bela Vista/São Paulo,Belém/São Paulo,Bom Retiro/São Paulo,Brasilândia/São Paulo,Brooklin/São Paulo,...,Vila Jacuí/São Paulo,Vila Leopoldina/São Paulo,Vila Madalena/São Paulo,Vila Maria/São Paulo,Vila Mariana/São Paulo,Vila Matilde/São Paulo,Vila Olimpia/São Paulo,Vila Prudente/São Paulo,Vila Sônia/São Paulo,Água Rasa/São Paulo
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11205,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11206,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11207,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11208,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df = df_cleaned.drop('District', axis=1)

In [11]:
df = df.join(one_hot)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
Y = df['Price'] 
X = df.loc[:, df.columns != 'Price']

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) # Treino e, 70% e teste em 30% do nosso df.

In [17]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [19]:
rf_reg = RandomForestRegressor()
rf_reg.fit(x_train, y_train)

RandomForestRegressor()

In [20]:
predts = rf_reg.predict(x_train)

rf_mse = mean_squared_error(y_train, predts)

rf_mse = np.sqrt(rf_mse)

rf_mse

704.5746398058056

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
score = cross_val_score(rf_reg, x_train, y_train, scoring='neg_mean_squared_error', cv=10)

In [23]:
rf_rmse_scores = np.sqrt(-score)

In [24]:
def diskplay_score(scores):
    for a, b in enumerate(scores):
        print(f'Score {a + 1} : ', f'{b:.05f}')
    #print('Score :', scores)
    print()
    print('Média : ', scores.mean())
    print('Desvio padrão : ', scores.std())

In [25]:
diskplay_score(rf_rmse_scores)

Score 1 :  2160.58243
Score 2 :  1413.83721
Score 3 :  2245.80842
Score 4 :  1449.95227
Score 5 :  1476.63524
Score 6 :  2816.32803
Score 7 :  1604.15025
Score 8 :  1264.78126
Score 9 :  2201.18435
Score 10 :  1839.80263

Média :  1847.3062088746829
Desvio padrão :  469.3482678755495


In [26]:
alguns_dados = x_train.iloc[:5]
algumas_label = y_train.iloc[:5]

In [27]:
print(f'*'*25, ' Predições', '*'*25)
for a, b in enumerate(rf_reg.predict(alguns_dados)):
    print(a + 1,' : ', f'{b:.02f}')

*************************  Predições *************************
1  :  1852.53
2  :  1097.10
3  :  1565.55
4  :  2121.00
5  :  2320.00


In [28]:
print(f'*'*25, ' Labels ', '*'*25)
for a, b in enumerate(algumas_label.values):
    print(a + 1,' : ', f'{b:.02f}')

*************************  Labels  *************************
1  :  1850.00
2  :  1100.00
3  :  1550.00
4  :  1990.00
5  :  2500.00


In [39]:
from sklearn.model_selection import GridSearchCV

In [56]:
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

In [57]:
forest_reg = RandomForestRegressor()

In [58]:
grid_search = GridSearchCV(forest_reg, param_grid, cv = 5, scoring='neg_mean_squared_error')

In [59]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             scoring='neg_mean_squared_error')

In [60]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [61]:
final_model = grid_search.best_estimator_

In [62]:
final_model_predict = final_model.predict(x_test)

In [63]:
final_mse = mean_squared_error(y_test, final_model_predict)

In [64]:
print(np.sqrt(final_mse))

1526.9505704892313


In [65]:
import plotly.graph_objects as go

In [66]:
fig = go.Figure( data=[go.Scatter(y=y_test.values), go.Scatter(y=final_model_predict)])

fig.show()