In [6]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from dotenv import load_dotenv


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier


load_dotenv()

px.set_mapbox_access_token(open('mapbox_token').read())

In [7]:
df_data = pd.read_csv("sao-paulo-properties-april-2019.csv")
df_rent = df_data[df_data["Negotiation Type"] == "rent"]
df_cleaned = df_rent.drop(["New", "Property Type", "Negotiation Type"], axis=1)

one_hot = pd.get_dummies(df_cleaned["District"])

df = df_cleaned.drop('District', axis=1)
df = df.join(one_hot)

In [8]:
# Separando dados

# somente o id e valores
Y = df["Price"]

# todas colunas - Price
X = df.loc[:, df.columns != "Price"]

In [9]:
# Função de treino
# test_size=0.3, significa que vou fazer um treino com 70% e teste com 30%
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [19]:
# Regressão Linear
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)  # função custo para otimizar o modelo

In [None]:
# teste-01
# 5 primeiras linhas
alguns_dados = x_train.iloc[:5]
algumas_label = y_train.iloc[:5]

print("Prediçoes:", lin_reg.predict(alguns_dados))
print("Labels:", algumas_label.values)

In [None]:
# usando RMSE
# resultado é mais ou menos a variação sobre o valor real

# estamos comparando os valores de treino com os valores reais
# passamos o valor x para receber os valores de y
preds = lin_reg.predict(x_train)
# ai comparamos o valor recebido com y real
lin_mse = mean_squared_error(y_train, preds)

# raiz quadrada = root
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# regressão de arvores

tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train, y_train)  # função custo para otimizar o modelo

# usando RMSE
# resultado é mais ou menos a variação sobre o valor real

# estamos comparando os valores de treino com os valores reais
# passamos o valor x para receber os valores de y
preds = tree_reg.predict(x_train)
# ai comparamos o valor recebido com y real
tree_mse = mean_squared_error(y_train, preds)

# raiz quadrada = root
tree_rmse = np.sqrt(tree_mse)
tree_rmse

# cross validation
# so usamos os dados de treino
# método de avaliação, onde divide-se os dados em folds, training folds e test folds
# ele treina 1/10 dos dados e na sequencia ele teste 2/10 porem nao testando o anterior

In [None]:
# aqui usamos métricas de utilidades que é o contrario das métricas de custo, ambas inversamente correlativas
# modelo tree

scores = cross_val_score(tree_reg, x_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standart deviation:', scores.std())


display_scores(tree_rmse_scores)

In [None]:
# aqui usamos métricas de utilidades que é o contrario das métricas de custo, ambas inversamente correlativas
# modelo linear

scores = cross_val_score(lin_reg, x_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standart deviation:', scores.std())


display_scores(tree_rmse_scores)

In [10]:
# random forests

rf_reg = RandomForestRegressor()
rf_reg.fit(x_train, y_train)

In [11]:
# validação de teste

preds = rf_reg.predict(x_train)
rf_mse = mean_squared_error(y_train, preds)

rf_rmse = np.sqrt(rf_mse)
rf_rmse

724.3103990234526

In [8]:

scores = cross_val_score(rf_reg, x_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
rf_rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standart deviation:', scores.std())


display_scores(rf_rmse_scores)

Scores: [1225.54839138 2274.68846874 1611.55613952 1244.64883719 1655.48568423
 1530.15074628 2081.09339007 1410.56393103 1909.26787593 2076.17043959]
Mean: 1701.917390395239
Standart deviation: 348.6594822867156


In [16]:
# teste com AdaBoost

x_train, y_train = load_iris(return_X_y=True)

clf = AdaBoostClassifier(n_estimators=10000)

scores = cross_val_score(rf_reg, x_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
rf_rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standart deviation:', scores.std())


display_scores(rf_rmse_scores)

Scores: [0.         0.         0.         0.08694826 0.2733374  0.36296924
 0.02081666 0.30933261 0.34001961 0.11535453]
Mean: 0.15087783035555835
Standart deviation: 0.14540247696376893
