## 1 - Quais casas o CEO da House Rocket deveria comprar e por qual preço de compra?

## 2 - Uma vez a casa em posse da empresa, qual seria o preço da venda?

## 3 - A House Rocket deveria fazer uma reforma para aumentar o preço da venda? Quais seriam as sugestões de mudanças? Qual o incremento no preço dado por cada opção de reforma?

In [113]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Passo 1: Importar os dados e criar o modelo

In [114]:
tabela = pd.read_csv('kc_house_data.csv')
modelo = RandomForestRegressor()
tabela2 = tabela

# Passo 2: Verificar o estado dos dados

In [115]:
tabela.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

# Passo 3: Limpeza e Organização

In [116]:
tabela = tabela.drop(['date', 'id'], axis=1)
tabela.floors = tabela.floors.astype(int)
tabela.price = tabela.price.astype(int)
tabela.bathrooms = tabela.bathrooms.astype(int)
tabela.price = tabela.price.round(-3)

In [117]:
display(tabela)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,222000,3,1,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000,3,2,2570,7242,2,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,180000,2,1,770,10000,1,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000,4,3,1960,5000,1,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000,3,2,1680,8080,1,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,360000,3,2,1530,1131,3,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,400000,4,2,2310,5813,2,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,402000,2,0,1020,1350,2,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,400000,3,2,1600,2388,2,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


# Passo 4: Modelagem

In [118]:
X = tabela.drop('price', axis=1)
y = tabela['price']
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=52)

# Passo 5: Treinamento do algoritmo

In [119]:
modelo.fit(x_train, y_train)

RandomForestRegressor()

In [120]:
pred = modelo.predict(x_test)

In [121]:
r2_score(y_test, pred)

0.8539873089476007

# Passo 6: Exportando modelo

In [122]:
import joblib

In [123]:
joblib.dump(modelo, 'model2.pkl')

In [126]:
teste = np.array([[3,1,1180,5650,1,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650]])

In [127]:
modelo.predict(teste)

array([238070.])