In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from dotenv import load_dotenv


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV


# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Read the Mapbox access token from the `mapbox_token` file in the working
# directory. The `with` block closes the file handle deterministically instead
# of leaving it open for the lifetime of the kernel, and `strip()` removes a
# trailing newline that would otherwise become part of the token.
with open('mapbox_token') as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

In [2]:
# Load the listings and keep only the rows whose negotiation type is "rent".
df_data = pd.read_csv("sao-paulo-properties-april-2019.csv")
df_rent = df_data[df_data["Negotiation Type"] == "rent"]

# "Negotiation Type" is constant after the filter above; "New" and
# "Property Type" are removed from the feature set as well.
df_cleaned = df_rent.drop(["New", "Property Type", "Negotiation Type"], axis=1)

# One-hot encode the district: one indicator column per distinct value.
one_hot = pd.get_dummies(df_cleaned["District"])

# Swap the raw "District" column for its indicator columns (aligned on index).
df = df_cleaned.drop('District', axis=1).join(one_hot)

# Target is the price; every other column is a feature.
Y = df["Price"]
X = df.loc[:, df.columns != "Price"]

# Hold out 30% of the rows for the final evaluation. The fixed random_state
# makes the split (and every metric derived from it) reproducible across
# "Restart & Run All"; without it each run shuffles the rows differently.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [3]:
# Testing combinations of hyper-parameters.
#
# Two sub-grids are searched:
#   1. default bootstrap setting: 3 x 4 = 12 combinations
#   2. bootstrap disabled:        2 x 3 =  6 combinations
# With cv=5 that is 18 * 5 = 90 model fits.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

# Seed the forest so that the cross-validation scores, and therefore the
# selected best_params_, are the same on every run of the notebook.
forest_reg = RandomForestRegressor(random_state=42)

# scikit-learn maximises the scoring function, hence the *negated* MSE.
grid_search = GridSearchCV(forest_reg, param_grid,
                           cv=5, scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)

In [4]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [5]:
# Estimator built with the best hyper-parameters found by the search
# (with GridSearchCV's default refit=True it is refit on the full training set).
grid_search.best_estimator_

In [6]:
# Score the model selected by the grid search on the held-out test split.
final_model = grid_search.best_estimator_
final_model_pred = final_model.predict(x_test)

# Mean squared error first, then its square root (RMSE), which is expressed
# in the same units as the target.
final_mse = mean_squared_error(y_true=y_test, y_pred=final_model_pred)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

1912.871064567689
