In [1]:
#!pip install -U scikit-learn
#!pip install lightgbm

# Regressão linear - preços de aluguéis em Nova Iorque

Neste arquivo estão alguns testes de regressão com diferentes modelos afim de obter o melhor resultado possível. 
Note que, dado o tipo de problema, as métricas escolhidas aqui para maximizar o desempenho do modelo foram o Mean Absolute Error (MAE) e o Root Mean Square Error (RMSE). Ambas as métricas foram escolhidas devido ao fato de representarem medidas que se encontram na mesma unidade do valor medido (neste caso, preço em dólares).

Na MAE todos os erros contribuem igualmente, enquanto no RMSE erros pequenos são minimizados enquanto erros grandes são amplificados. O uso combinado das duas ferramentas dá uma ideia sobre a existência ou não de erros grandes, uma vez que quanto maior a diferença entre o MAE e o RMSE, maior a quantidade de erros grandes na predição.

## 1. Importando as bibliotecas 

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor

from sklearn.compose import ColumnTransformer
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer,make_column_selector, make_column_transformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, TargetEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler

import pickle

## 2. Inspecionando o _dataset_

In [3]:
df = pd.read_csv('teste_indicium_precificacao.csv',parse_dates=['ultima_review'])
df.head().T

Unnamed: 0,0,1,2,3,4
id,2595,3647,3831,5022,5099
nome,Skylit Midtown Castle,THE VILLAGE OF HARLEM....NEW YORK !,Cozy Entire Floor of Brownstone,Entire Apt: Spacious Studio/Loft by central park,Large Cozy 1 BR Apartment In Midtown East
host_id,2845,4632,4869,7192,7322
host_name,Jennifer,Elisabeth,LisaRoxanne,Laura,Chris
bairro_group,Manhattan,Manhattan,Brooklyn,Manhattan,Manhattan
bairro,Midtown,Harlem,Clinton Hill,East Harlem,Murray Hill
latitude,40.75362,40.80902,40.68514,40.79851,40.74767
longitude,-73.98377,-73.9419,-73.95976,-73.94399,-73.975
room_type,Entire home/apt,Private room,Entire home/apt,Entire home/apt,Entire home/apt
price,225,150,89,80,200


In [4]:
zerados = df[df['price'] == 0]
zerados

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
23160,18750597,"Huge Brooklyn Brownstone Living, Close to it all.",8993084,Kimberly,Brooklyn,Bedford-Stuyvesant,40.69023,-73.95428,Private room,0,4,1,2018-01-06,0.05,4,28
25432,20333471,★Hostel Style Room | Ideal Traveling Buddies★,131697576,Anisha,Bronx,East Morrisania,40.83296,-73.88668,Private room,0,2,55,2019-06-24,2.56,4,127
25633,20523843,"MARTIAL LOFT 3: REDEMPTION (upstairs, 2nd room)",15787004,Martial Loft,Brooklyn,Bushwick,40.69467,-73.92433,Private room,0,2,16,2019-05-18,0.71,5,0
25752,20608117,"Sunny, Quiet Room in Greenpoint",1641537,Lauren,Brooklyn,Greenpoint,40.72462,-73.94072,Private room,0,2,12,2017-10-27,0.53,2,0
25777,20624541,Modern apartment in the heart of Williamsburg,10132166,Aymeric,Brooklyn,Williamsburg,40.70838,-73.94645,Entire home/apt,0,5,3,2018-01-02,0.15,1,73
25793,20639628,Spacious comfortable master bedroom with nice ...,86327101,Adeyemi,Brooklyn,Bedford-Stuyvesant,40.68173,-73.91342,Private room,0,1,93,2019-06-15,4.28,6,176
25794,20639792,Contemporary bedroom in brownstone with nice view,86327101,Adeyemi,Brooklyn,Bedford-Stuyvesant,40.68279,-73.9117,Private room,0,1,95,2019-06-21,4.37,6,232
25795,20639914,Cozy yet spacious private brownstone bedroom,86327101,Adeyemi,Brooklyn,Bedford-Stuyvesant,40.68258,-73.91284,Private room,0,1,95,2019-06-23,4.35,6,222
26258,20933849,the best you can find,13709292,Qiuchi,Manhattan,Murray Hill,40.75091,-73.97597,Entire home/apt,0,3,0,NaT,,1,0
26840,21291569,Coliving in Brooklyn! Modern design / Shared room,101970559,Sergii,Brooklyn,Bushwick,40.69211,-73.9067,Shared room,0,30,2,2019-06-22,0.11,6,333


In [5]:
display(df.info())
display(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48894 entries, 0 to 48893
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   id                             48894 non-null  int64         
 1   nome                           48878 non-null  object        
 2   host_id                        48894 non-null  int64         
 3   host_name                      48873 non-null  object        
 4   bairro_group                   48894 non-null  object        
 5   bairro                         48894 non-null  object        
 6   latitude                       48894 non-null  float64       
 7   longitude                      48894 non-null  float64       
 8   room_type                      48894 non-null  object        
 9   price                          48894 non-null  int64         
 10  minimo_noites                  48894 non-null  int64         
 11  numero_de_revie

None

id                                   0
nome                                16
host_id                              0
host_name                           21
bairro_group                         0
bairro                               0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimo_noites                        0
numero_de_reviews                    0
ultima_review                    10052
reviews_por_mes                  10052
calculado_host_listings_count        0
disponibilidade_365                  0
dtype: int64

## 3. Preparando os dados para a regressão

In [6]:
def prepare_data(filename,droppedcolumns):
    df = pd.read_csv(filename,parse_dates=['ultima_review'])
    df = df[df['price'] != 0]
    #df.drop(zerados,axis=1,inplace=True)
    df.drop(droppedcolumns,axis=1,inplace=True)   
    return df
    
def evaluate_model(reg,xtrain,ytrain,xtest,ytest,cv = 5):
    #fit and check for the 
    reg.fit(xtrain,ytrain)
    train_set = np.mean(cross_val_score(reg,xtrain,ytrain,scoring='neg_mean_squared_error',cv = cv))

    ypred = reg.predict(xtest)
    test_set_rmse = root_mean_squared_error(ytest, ypred)
    test_set_mae = mean_absolute_error(ytest, ypred)

    print(f'############ {reg.steps[1][0]} #############################')
    print(f'Root mean square error in training set (cv = {cv}): {train_set}')
    print(f'Root mean square error in test set: {test_set_rmse}\n')
    print(f'Mean absolute error in test set: {test_set_mae}\n')


def select_Xy(df, ycolumn,axis=1):
    y = df[ycolumn]
    X = df.drop(ycolumn,axis = 1)
    return X,y
    

In [7]:
df.columns

Index(['id', 'nome', 'host_id', 'host_name', 'bairro_group', 'bairro',
       'latitude', 'longitude', 'room_type', 'price', 'minimo_noites',
       'numero_de_reviews', 'ultima_review', 'reviews_por_mes',
       'calculado_host_listings_count', 'disponibilidade_365'],
      dtype='object')

In [8]:
droppedcolumns = ['id', 'nome', 'host_id', 'host_name',
                  'latitude', 'longitude',
                  'ultima_review','reviews_por_mes']

df = prepare_data('teste_indicium_precificacao.csv',droppedcolumns=droppedcolumns)
df

Unnamed: 0,bairro_group,bairro,room_type,price,minimo_noites,numero_de_reviews,calculado_host_listings_count,disponibilidade_365
0,Manhattan,Midtown,Entire home/apt,225,1,45,2,355
1,Manhattan,Harlem,Private room,150,3,0,1,365
2,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,1,194
3,Manhattan,East Harlem,Entire home/apt,80,10,9,1,0
4,Manhattan,Murray Hill,Entire home/apt,200,3,74,1,129
...,...,...,...,...,...,...,...,...
48889,Brooklyn,Bedford-Stuyvesant,Private room,70,2,0,2,9
48890,Brooklyn,Bushwick,Private room,40,4,0,2,36
48891,Manhattan,Harlem,Entire home/apt,115,10,0,1,27
48892,Manhattan,Hell's Kitchen,Shared room,55,1,0,6,2


In [9]:
df.isnull().sum()

bairro_group                     0
bairro                           0
room_type                        0
price                            0
minimo_noites                    0
numero_de_reviews                0
calculado_host_listings_count    0
disponibilidade_365              0
dtype: int64

In [10]:
df_final,df_test = train_test_split(df,train_size=0.80,random_state=42)
print(df_final.shape[0]/df.shape[0])

0.7999918171961623


In [11]:
df_train,df_val = train_test_split(df_final,test_size=0.25,random_state=42)
print(f'Tamanho do teste: {df_test.shape[0]/df.shape[0]}')
print(f'Tamanho da validação: {df_val.shape[0]/df.shape[0]}')

Tamanho do teste: 0.20000818280383773
Tamanho da validação: 0.20000818280383773


In [12]:
X_train,y_train = select_Xy(df_train,'price')
X_train

Unnamed: 0,bairro_group,bairro,room_type,minimo_noites,numero_de_reviews,calculado_host_listings_count,disponibilidade_365
42262,Manhattan,Murray Hill,Entire home/apt,2,5,327,82
21553,Manhattan,Washington Heights,Private room,1,0,1,0
25358,Queens,Flushing,Entire home/apt,1,104,1,90
31752,Manhattan,Harlem,Entire home/apt,4,17,2,245
19670,Manhattan,Upper West Side,Private room,1,117,1,365
...,...,...,...,...,...,...,...
21419,Queens,Rockaway Beach,Entire home/apt,2,42,1,310
11718,Brooklyn,Williamsburg,Private room,6,0,1,0
10267,Manhattan,Nolita,Entire home/apt,5,9,1,0
2133,Manhattan,West Village,Entire home/apt,4,1,1,273


In [13]:
y_train

42262    231
21553     50
25358    120
31752    122
19670    119
        ... 
21419    145
11718     70
10267    225
2133     170
14433    160
Name: price, Length: 29329, dtype: int64

In [14]:
#Preprocessing for numerical e categorical features
num_pipeline = Pipeline([
    #("min_max_Scaler",MinMaxScaler()),
    ('Standard_Scaler',StandardScaler())

])

cat_pipeline = Pipeline(steps=[
    #('Target Encoder',TargetEncoder())
    #('Ordinal Encoder',OrdinalEncoder(handle_unknown='use_encoded_value')),
    ('One Hot Encoder',OneHotEncoder(handle_unknown=('ignore'))),
])

###### Linear Regression ####################
reg_lin = LinearRegression()

preprocessing_reg_lin = make_column_transformer(
    (num_pipeline,make_column_selector(dtype_include=np.number)),
    (cat_pipeline,make_column_selector(dtype_include=object)),
    remainder='passthrough'
)

pipe_reg_lin = Pipeline(steps=[
    ('preprocessor regression',preprocessing_reg_lin),
    ('Linear_regression',reg_lin)]
                  )

###### Random Forest ####################
ran_for = RandomForestRegressor(random_state=42)

preprocessing_rfr = make_column_transformer(
    (num_pipeline,make_column_selector(dtype_include=np.number)),
    (cat_pipeline,make_column_selector(dtype_include=object)),
    remainder='passthrough'
)

pipe_rfr = Pipeline(steps=[
    ('preprocessor_rfr',preprocessing_rfr),
    ('Random_forest_Regressor',ran_for)]
                  )

###### XGBoost ####################
xg_reg = XGBRegressor(random_state=42)

preprocessing_xgb = preprocessing_rfr

pipe_xgb = Pipeline(steps=[
    ('preprocessor_xgb',preprocessing_xgb),
    ('XGBoost_Regressor',xg_reg)
])

###### Light GBM ####################
l_gbm = LGBMRegressor(random_state=42,verbose=0)

preprocessing_lgb = preprocessing_rfr

pipe_lgbm = Pipeline(steps=[
    ('preprocessor_lgbm',preprocessing_lgb),
    ('LGBM',l_gbm)
])

In [15]:
X_test,y_test = select_Xy(df_test,'price')

evaluate_model(pipe_reg_lin,X_train,y_train,X_test,y_test)
evaluate_model(pipe_xgb,X_train,y_train,X_test,y_test)
evaluate_model(pipe_lgbm,X_train,y_train,X_test,y_test)
#evaluate_model(pipe_rfr,X_train,y_train,X_test,y_test)

############ Linear_regression #############################
Root mean square error in training set (cv = 5): -56168.3416809051
Root mean square error in test set: 228.95842820048478

Mean absolute error in test set: 70.73574618617543

############ XGBoost_Regressor #############################
Root mean square error in training set (cv = 5): -59456.090682346876
Root mean square error in test set: 238.38092976861557

Mean absolute error in test set: 66.21455822808375

############ LGBM #############################
Root mean square error in training set (cv = 5): -52227.876028513994
Root mean square error in test set: 225.86137121318086

Mean absolute error in test set: 66.42869866037088



## 4. Tunagem de hiperparâmetros

### 4.1 XGBoost

In [30]:
n_estimators = [int(x) for x in np.linspace(10,100,5)]
max_depth = [int(x) for x in np.linspace(2,50,5)]

param_grid = {
    'XGBoost_Regressor__n_estimators': n_estimators,
    'XGBoost_Regressor__max_depth': max_depth
}

cv_xgb = GridSearchCV(
    estimator = pipe_xgb,
    param_grid=param_grid,
    cv=4,
    scoring='neg_mean_absolute_error',
)

cv_xgb.fit(X_train,y_train)

In [33]:
cv_xgb.best_params_

{'XGBoost_Regressor__max_depth': 14, 'XGBoost_Regressor__n_estimators': 10}

In [35]:
pipe_xgb.set_params(XGBoost_Regressor__max_depth= 14,XGBoost_Regressor__n_estimators= 10).fit(X_final,y_final)

### 4.2 LGBM

In [75]:
n_estimators = [int(x) for x in np.linspace(10,100,5)]
max_depth = [int(x) for x in np.linspace(2,50,5)]
num_leaves = [int(y) for y in np.linspace(10,70,5)]

param_grid = {
    'LGBM__n_estimators': n_estimators,
    'LGBM__max_depth': max_depth,
    'LGBM__num_leaves':num_leaves
}

cv_lgbm = GridSearchCV(
    estimator = pipe_lgbm,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
)

cv_lgbm.fit(X_train,y_train)



In [76]:
cv_lgbm.best_params_

{'LGBM__max_depth': 14, 'LGBM__n_estimators': 32, 'LGBM__num_leaves': 70}

In [77]:
pipe_lgbm.set_params(LGBM__max_depth= 14,LGBM__n_estimators= 32,LGBM__num_leaves=70).fit(X_final,y_final)

In [78]:
print('############ - XGBOOST - ###############')
X_val,y_val = select_Xy(df_val,'price')
y_pred = pipe_xgb.predict(X_val)
print(f'Mean Absolute Error:{mean_absolute_error(y_val,y_pred)}')
print(f'Root Mean Squared Error:{root_mean_squared_error(y_val,y_pred)}')

print('############ - LGBM - ###############')
y_pred = pipe_lgbm.predict(X_val)
print(f'Mean Absolute Error:{mean_absolute_error(y_val,y_pred)}')
print(f'Root Mean Squared Error:{root_mean_squared_error(y_val,y_pred)}')

############ - XGBOOST - ###############
Mean Absolute Error:50.83255527789356
Root Mean Squared Error:130.19365329422615
############ - LGBM - ###############
Mean Absolute Error:64.76074323574755
Root Mean Squared Error:174.66162569125095


In [54]:
usedcolumns = pipe_xgb.feature_names_in_.tolist()

In [55]:
usedcolumns

['bairro_group',
 'bairro',
 'room_type',
 'minimo_noites',
 'numero_de_reviews',
 'calculado_host_listings_count',
 'disponibilidade_365']

## 5. Prevendo o valor do imóvel

In [79]:
cliente = {'id': 2595,
           'nome': 'Skylit Midtown Castle',
           'host_id': 2845,
           'host_name': 'Jennifer',
           'bairro_group': 'Manhattan',
           'bairro': 'Midtown',
           'latitude': 40.75362,
           'longitude': -73.98377,
           'room_type': 'Entire home/apt',
           'minimo_noites': 1,
           'numero_de_reviews': 45,
           'ultima_review': '2019-05-21',
           'reviews_por_mes': 0.38,
           'calculado_host_listings_count': 2,
           'disponibilidade_365': 355}



In [80]:
cliente = pd.DataFrame.from_dict(cliente,orient='index').T
#cliente.drop(droppedcolumns,axis=1,inplace=True)
cliente = cliente[usedcolumns]

In [81]:
X_final, y_final = select_Xy(df_final,'price')
pipe_lgbm.fit(X_final,y_final)
pipe_lgbm.predict(cliente)

array([287.29864424])

In [82]:
X_final, y_final = select_Xy(df_final,'price')
pipe_xgb.fit(X_final,y_final)
pipe_xgb.predict(cliente)

array([311.66986], dtype=float32)

## 6. Salvando o modelo

In [84]:
output_file = 'model.pkl'
f_out = open(output_file,'wb')
pickle.dump(pipe_xgb,f_out)
f_out.close()

### 6.1 Salvando o arquivo requirements

In [85]:
!pip freeze > requeriments.txt