## 1 - Quais casas o CEO da House Rocket deveria comprar e por qual preço de compra?

## 2 - Uma vez a casa em posse da empresa, qual seria o preço da venda?

## 3 - A House Rocket deveria fazer uma reforma para aumentar o preço da venda? Quais seriam as sugestões de mudanças? Qual o incremento no preço dado por cada opção de reforma?

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Passo 1: Importar os dados e criar o modelo
tabela = pd.read_csv('kc_house_data.csv')
rl = LinearRegression()
dtc = DecisionTreeClassifier()
dtr = DecisionTreeRegressor()

In [3]:
# Passo 2: Verificar o estado dos dados
tabela.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [4]:
# Passo 3: Limpeza e Organização
tabela = tabela.drop(['date'], axis=1)
tabela.floors = tabela.floors.astype(int)
tabela.price = tabela.price.astype(int)
tabela.bathrooms = tabela.bathrooms.astype(int)

In [5]:
# Passo 4: Modelagem
X = tabela.drop('price', axis=1)
y = tabela['price']
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=52)

In [6]:
from pycaret.regression import *

In [9]:
s = setup( tabela, target = 'price')

Unnamed: 0,Description,Value
0,session_id,8276
1,Target,price
2,Original Data,"(21613, 20)"
3,Missing Values,False
4,Numeric Features,14
5,Categorical Features,5
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(15129, 48)"


In [11]:
best = compare_models(fold = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,68887.2736,17359556756.1346,131002.2043,0.8757,0.1741,0.13,0.368
rf,Random Forest Regressor,71527.991,18741161300.1783,136388.6786,0.8653,0.1799,0.133,2.81
et,Extra Trees Regressor,71934.6617,19361351496.8125,138781.3811,0.8604,0.1818,0.1344,2.272
gbr,Gradient Boosting Regressor,80053.1758,20191991352.9204,141790.9018,0.8543,0.1968,0.1528,0.986
lr,Linear Regression,120126.1989,37594142883.9022,193652.5579,0.7275,0.311,0.2339,1.114
lasso,Lasso Regression,120120.8593,37592731215.2817,193648.2268,0.7275,0.311,0.2339,1.266
llar,Lasso Least Angle Regression,120061.1351,37593073377.5431,193647.8721,0.7275,0.3101,0.2337,0.044
br,Bayesian Ridge,120177.2904,37636426793.8618,193746.6612,0.7273,0.3113,0.2341,0.078
ridge,Ridge Regression,120332.7365,37690476572.3616,193859.4904,0.7271,0.3124,0.2346,0.042
dt,Decision Tree Regressor,104572.3429,39720985512.9451,198684.6821,0.7127,0.2576,0.1904,0.094


In [12]:
lightgbm = create_model('lightgbm', fold = 5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,65814.4309,13268219552.5185,115187.7578,0.9023,0.1678,0.1261
1,68809.1216,17949136482.4669,133974.3874,0.8682,0.1743,0.1284
2,68549.3968,14309835537.4073,119623.7248,0.8729,0.179,0.1341
3,69121.8793,17082335053.4486,130699.4072,0.8759,0.1743,0.1306
4,72141.5393,24188257154.8318,155525.7443,0.8593,0.1752,0.131
Mean,68887.2736,17359556756.1346,131002.2043,0.8757,0.1741,0.13
SD,2010.3689,3823491725.8171,14070.5088,0.0144,0.0036,0.0027


In [13]:
tuned_lightgbm = tune_model(lightgbm, fold = 5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,67622.5836,15485600992.5874,124441.1547,0.886,0.1723,0.1279
1,70870.2016,16663013545.7029,129085.2956,0.8777,0.1806,0.1326
2,67753.9225,13710256009.2265,117090.8024,0.8782,0.177,0.1323
3,68802.7247,16371501814.1441,127951.1696,0.881,0.1745,0.1293
4,72812.8135,22606833349.2602,150355.6894,0.8685,0.1786,0.134
Mean,69572.4492,16967441142.1842,129784.8224,0.8783,0.1766,0.1312
SD,1994.2725,3001855077.4064,11105.9004,0.0057,0.0029,0.0023


In [14]:
final_model = finalize_model(tuned_lightgbm)

In [15]:
save_model(final_model,'model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='price',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy='...
                                colsample_bytree=1.0, feature_fraction=0.6,
                                importance_type='split', learning_rate=0.15,
                                max_depth=-1, min_child_samples=21,
                                min_child_weight=0.001, min_split_gain=0.1,
                