In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from skopt import forest_minimize
import joblib as jb

pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


<h2>Índice</h2>
<font size=3>
<ol>
    <li><a href='#PRE'>Pré-processamento</a></li>
    <li><a href='#MODEL'>Modelo</a></li>
    <ul>
        <li><a href='#LR'>Regressão Logística</a></li>
        <li><a href='#RF'>Random Forest</a></li>
        <li><a href='#LGBM'>LightGBM</a></li>
        <li><a href='#ENSEN'>Ensemble</a></li>
    </ul>
    <li><a href='#SAV'>Salvando os modelos</a></li>
</ol>
</font>

<a name='PRE'></a>
<h2>Pré-processamento</h2>
<h3>Carregando o dataset</h3>

In [2]:
df = pd.read_csv('apartamentos.csv')
df.drop(['endereco', 'update_time', 'apartment_link', 'id'], axis=1, inplace=True)
df.head()

Unnamed: 0,y,quartos,area,custo,vagas,mobiliado,bairro,latitude,longitude,estacao,distancia,linha_amarela,linha_azul,linha_lilas,linha_prata,linha_verde,linha_vermelha
0,0,3,200,8702,2,1,Alto de Pinheiros,-23.541829,-46.718899,Vila Madalena,2.88,0,0,0,0,1,0
1,0,2,125,8130,2,0,Alto de Pinheiros,-23.55265,-46.719504,Pinheiros,2.41,1,0,0,0,0,0
2,0,3,105,4509,2,0,Alto da Lapa,-23.540726,-46.717298,Vila Madalena,2.75,0,0,0,0,1,0
3,0,3,77,3545,0,1,Alto de Pinheiros,-23.542408,-46.716614,Vila Madalena,2.64,0,0,0,0,1,0
4,0,3,63,4991,0,1,Alto de Pinheiros,-23.542408,-46.716614,Vila Madalena,2.64,0,0,0,0,1,0


<font size=3>Dividindo entre X e y.</font>

In [3]:
X = df.drop('y', axis=1)
y = df['y']
print('X: {}\ny: {}'.format(X.shape, y.shape))

X: (4764, 16)
y: (4764,)


<font size=3><br>Dividindo entre treino e teste.</font>

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=75)
print('X_train:{} \ny_train:{} \nX_test:\t{} \ny_test:\t{}'.format(X_train.shape,y_train.shape,X_test.shape,y_test.shape))

X_train:(3334, 16) 
y_train:(3334,) 
X_test:	(1430, 16) 
y_test:	(1430,)


In [5]:
X_train.head()

Unnamed: 0,quartos,area,custo,vagas,mobiliado,bairro,latitude,longitude,estacao,distancia,linha_amarela,linha_azul,linha_lilas,linha_prata,linha_verde,linha_vermelha
2354,3,138,3865,0,0,Mooca,-23.558846,-46.603011,Bresser Mooca,1.45,0,0,0,0,0,1
404,4,400,11942,3,1,Bela Vista,-23.562363,-46.646864,Brigadeiro,0.63,0,0,0,0,1,0
4601,1,38,3116,1,1,Vila Mariana,-23.586055,-46.63353,Vila Mariana,0.36,0,1,0,0,0,0
3285,3,253,8855,4,0,Santana,-23.50076,-46.625908,Santana,0.21,0,1,0,0,0,0
290,1,30,3495,1,1,Jardim Paulista,-23.567145,-46.654306,Trianon Masp,0.39,0,0,0,0,1,0


<font size=3>
    Com <i>OneHotEncoder</i> vamos transformar as variáveis <i>bairro</i> e <i>estacao</i> para númericas para que possamos utilizar nos nossos modelos.
</font>

In [6]:
cat_cols = ['bairro', 'estacao']
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
train_cat_feat = enc.fit_transform(X_train[cat_cols])
train_cat_feat = pd.DataFrame(train_cat_feat)
# train_cat_feat.columns = enc.get_feature_names(cat_cols)
train_cat_feat.index = X_train.index
train_num_feat = X_train.drop(cat_cols, axis=1)
X_train_feat = pd.merge(train_num_feat, train_cat_feat, left_index=True, right_index=True)

<font size=3>Replicando para os dados de teste.</font>

In [7]:
test_cat_feat = enc.transform(X_test[cat_cols])
test_cat_feat = pd.DataFrame(test_cat_feat)
# test_cat_feat.columns = enc.get_feature_names(cat_cols)
test_cat_feat.index = X_test.index
test_num_feat = X_test.drop(cat_cols, axis=1)
X_test_feat = pd.merge(test_num_feat, test_cat_feat, left_index=True, right_index=True)

<a name='MODEL'></a>
<h2>Modelo</h2>

<a name='LR'></a>
<h3>Regressão Logística</h3>

In [8]:
lr = LogisticRegression(random_state=75)
lr.fit(X_train_feat, y_train)
ypred_lr = lr.predict_proba(X_test_feat)[:,1]

In [9]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_lr)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_lr)))
print('log loss: {}'.format(log_loss(y_test, ypred_lr)))

average precision score: 0.30771393315031137
roc_auc_scode: 0.9567446043165466
log loss: 0.07495812020988335


<a name='RF'></a>
<h3>Random Forest</h3>

In [10]:
rf = RandomForestClassifier(n_estimators=1000, random_state=75, min_samples_leaf=2, class_weight='balanced')
rf.fit(X_train_feat, y_train)
ypred_rf = rf.predict_proba(X_test_feat)[:,1]

In [11]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_rf)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_rf)))
print('log loss: {}'.format(log_loss(y_test, ypred_rf)))

average precision score: 0.7897314535634665
roc_auc_scode: 0.99
log loss: 0.06671914890707406


<a name='LGBM'></a>
<h3>LightGBM</h3>

In [104]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    
    lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                          min_child_samples=min_child_samples, subsample=subsample,
                          colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                          class_weight="balanced", n_jobs=6)
    lgbm.fit(X_train_feat, y_train)
    
    ypred_lgbm = lgbm.predict_proba(X_test_feat)[:,1]
    
    print(roc_auc_score(y_test, ypred_lgbm))
    
    return -average_precision_score(y_test, ypred_lgbm)

space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 15), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000)] # n_estimators

res = forest_minimize(tune_lgbm, space, random_state=75, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.026573581158061813, 2, 9, 0.3602419777451103, 0.38907875080188686, 949]
0.989136690647482
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5541
Function value obtained: -0.7136
Current minimum: -0.7136
Iteration No: 2 started. Evaluating function at random point.
[0.01226172672492125, 8, 7, 0.4054213799661132, 0.9369030598583269, 337]
0.9878417266187051
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.8581
Function value obtained: -0.6841
Current minimum: -0.7136
Iteration No: 3 started. Evaluating function at random point.
[0.01683319796246901, 5, 5, 0.7513238533324651, 0.631305640343713, 568]
0.9881294964028777
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.3246
Function value obtained: -0.7282
Current minimum: -0.7282
Iteration No: 4 started. Evaluating function at random point.
[0.003248394018273504, 13, 11, 0.6324068503439444, 0.4933551902972303, 798]
0

In [105]:
res.x

[0.011585543281987907, 10, 6, 0.7665481816275143, 0.613769011471217, 102]

In [12]:
lgbm = LGBMClassifier(learning_rate=0.011585543281987907, num_leaves=2**10, max_depth=10,
                      min_child_samples=6, subsample=0.7665481816275143, colsample_bytree=0.613769011471217,
                      n_estimators=102, random_state=75, class_weight="balanced", n_jobs=6)
lgbm.fit(X_train_feat, y_train)
ypred_lgbm = lgbm.predict_proba(X_test_feat)[:,1]

In [13]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_lgbm)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_lgbm)))
print('log loss: {}'.format(log_loss(y_test, ypred_lgbm)))

average precision score: 0.7057988797789752
roc_auc_scode: 0.9897122302158273
log loss: 0.2274847801186301


<a name='ENSEN'></a>
<h3>Ensemble</h3>

In [14]:
vot = VotingClassifier(estimators=[('lr', lr), ('rf', rf), ('lgbm', lgbm)], voting='soft')
vot.fit(X_train_feat, y_train)
ypred_vot = vot.predict_proba(X_test_feat)[:,1]

In [15]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_vot)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_vot)))
print('log loss: {}'.format(log_loss(y_test, ypred_vot)))

average precision score: 0.7221515300193122
roc_auc_scode: 0.9908453237410071
log loss: 0.11139276472914478


In [16]:
lr_perc = 0.0
rf_perc = 0.5
lgbm_perc = 0.5
ypred_ens = lr_perc*ypred_lr + rf_perc*ypred_rf + lgbm_perc*ypred_lgbm
print('average precision score: {}'.format(average_precision_score(y_test, ypred_ens)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_ens)))
print('log loss: {}'.format(log_loss(y_test, ypred_ens)))

average precision score: 0.7897426624916278
roc_auc_scode: 0.9910251798561152
log loss: 0.14262059641240524


<a name='SAV'></a>
<h2>Salvando os modelos</h2>

In [17]:
jb.dump(lgbm, "./deploy/lgbm_20200720.pkl.z")
jb.dump(rf, "./deploy/random_forest_20200720.pkl.z")
jb.dump(enc, "./deploy/onehotvec_20200720.pkl.z")

['./deploy/onehotvec_20200720.pkl.z']

In [22]:
teste = X_test.copy()
teste['proba'] = ypred_ens
df_teste = pd.read_csv('apartamentos.csv')
teste = teste.merge(df_teste, how='left', left_index=True, right_index=True)
teste.sort_values('proba', ascending=False)[['id', 'quartos_x', 'area_x', 'custo_x', 'mobiliado_x',
                                             'estacao_x', 'distancia_x', 'proba', 'apartment_link']]

Unnamed: 0,id,quartos_x,area_x,custo_x,mobiliado_x,estacao_x,distancia_x,proba,apartment_link
841,892793354,1,45,2999,0,Higienopolis Mackenzie,0.60,0.878944,https://www.quintoandar.com.br/imovel/892793354
146,892830694,1,41,2719,0,Higienopolis Mackenzie,0.44,0.834483,https://www.quintoandar.com.br/imovel/892830694
309,893095426,1,36,3055,0,Trianon Masp,0.53,0.824940,https://www.quintoandar.com.br/imovel/893095426
519,893098742,1,56,2255,0,Sao Joaquim,0.27,0.818438,https://www.quintoandar.com.br/imovel/893098742
370,892851780,2,70,2845,0,Higienopolis Mackenzie,0.33,0.811248,https://www.quintoandar.com.br/imovel/892851780
...,...,...,...,...,...,...,...,...,...
64,892793750,2,66,3526,0,Palmeiras Barra Funda,1.05,0.076212,https://www.quintoandar.com.br/imovel/892793750
2056,893020837,3,82,10778,0,Pedro II,1.38,0.076164,https://www.quintoandar.com.br/imovel/893020837
2369,893098824,3,216,3740,0,Bresser Mooca,2.30,0.076114,https://www.quintoandar.com.br/imovel/893098824
2126,893093170,3,98,3470,0,Jardim Sao Paulo Ayrton Senna,2.85,0.076096,https://www.quintoandar.com.br/imovel/893093170
