In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df_sp = pd.read_csv('sao-paulo-properties-april-2019.csv')

In [3]:
df_rent = df_sp[df_sp['Negotiation Type'] == 'rent']

In [4]:
from sklearn.preprocessing import OneHotEncoder
cat_enconder = OneHotEncoder()

In [6]:
df_cleaned = df_rent.drop(['New', 'Negotiation Type','Property Type'], axis=1)

In [7]:
housing_cat_1hot = cat_enconder.fit_transform(df_cleaned[['District']])

In [8]:
housing_cat_1hot

<7228x94 sparse matrix of type '<class 'numpy.float64'>'
	with 7228 stored elements in Compressed Sparse Row format>

In [9]:
housing_cat_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
one_hot = pd.get_dummies(df_cleaned['District'])
one_hot

Unnamed: 0,Alto de Pinheiros/São Paulo,Anhanguera/São Paulo,Aricanduva/São Paulo,Artur Alvim/São Paulo,Barra Funda/São Paulo,Bela Vista/São Paulo,Belém/São Paulo,Bom Retiro/São Paulo,Brasilândia/São Paulo,Brooklin/São Paulo,...,Vila Jacuí/São Paulo,Vila Leopoldina/São Paulo,Vila Madalena/São Paulo,Vila Maria/São Paulo,Vila Mariana/São Paulo,Vila Matilde/São Paulo,Vila Olimpia/São Paulo,Vila Prudente/São Paulo,Vila Sônia/São Paulo,Água Rasa/São Paulo
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11205,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11206,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11207,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11208,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df = df_cleaned.drop('District', axis=1)

In [12]:
df = df.join(one_hot)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
Y = df['Price'] 
X = df.loc[:, df.columns != 'Price']

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) # Treino e, 70% e teste em 30% do nosso df.

In [16]:
# DecisionTreeRegressor (regressão de árvore de decisão.)
from sklearn.tree import DecisionTreeRegressor

In [17]:
tree_reg = DecisionTreeRegressor()

In [18]:
tree_reg.fit(x_train, y_train)

DecisionTreeRegressor()

In [19]:
from sklearn.metrics import mean_squared_error

In [26]:
predts = tree_reg.predict(x_train)

tree_mse = mean_squared_error(y_train, predts)

tree_mse = np.sqrt(tree_mse)

tree_mse

40.784044368127766

In [44]:
alguns_dados = x_train.iloc[:5]
algumas_label = y_train.iloc[:5]

In [45]:
print(f'*'*25, ' Predições', '*'*25)
for a, b in enumerate(tree_reg.predict(alguns_dados)):
    print(a + 1,' : ', f'{b:.02f}')

*************************  Predições *************************
1  :  2700.00
2  :  2450.00
3  :  970.00
4  :  1990.00
5  :  2000.00


In [46]:
print(f'*'*25, ' Labels ', '*'*25)
for a, b in enumerate(algumas_label.values):
    print(a + 1,' : ', f'{b:.02f}')

*************************  Labels  *************************
1  :  2700.00
2  :  2400.00
3  :  970.00
4  :  1990.00
5  :  2000.00


In [47]:
tree_reg.predict(x_train)

array([2700., 2450.,  970., ..., 1000., 1400.,  900.])

In [48]:
from sklearn.model_selection import cross_val_score

In [49]:
score = cross_val_score(tree_reg, x_train, y_train, scoring='neg_mean_squared_error', cv=10)

In [50]:
tree_rmse_scores = np.sqrt(-score)

In [69]:
def diskplay_score(scores):
    for a, b in enumerate(scores):
        print(f'Score {a + 1} : ', f'{b:.05f}')
    #print('Score :', scores)
    print()
    print('Média : ', scores.mean())
    print('Desvio padrão : ', scores.std())

In [113]:
diskplay_score(tree_rmse_scores)

Score 1 :  2242.70355
Score 2 :  1947.23635
Score 3 :  2023.53559
Score 4 :  2961.31325
Score 5 :  2435.56718
Score 6 :  2640.50163
Score 7 :  2688.30015
Score 8 :  3009.17150
Score 9 :  1914.03588
Score 10 :  2022.23489

Média :  2388.4599971406196
Desvio padrão :  397.1629325801242


In [115]:
import plotly.graph_objects as go

In [117]:
fig = go.Figure( data=[go.Scatter(y=y_test.values), go.Scatter(y=tree_reg.predict(x_test))])

fig.show()