In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from skopt import forest_minimize
import joblib as jb

pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
filterwarnings('ignore')

<h2>Índice</h2>
<font size=3>
<ol>
    <li><a href='#PRE'>Pré-processamento</a></li>
    <li><a href='#MODEL'>Modelo</a></li>
    <ul>
        <li><a href='#LR'>Regressão Logística</a></li>
        <li><a href='#RF'>Random Forest</a></li>
        <li><a href='#LGBM'>LightGBM</a></li>
        <li><a href='#ENSEN'>Ensemble</a></li>
    </ul>
    <li><a href='#SAV'>Salvando os modelos</a></li>
</ol>
</font>

<a name='PRE'></a>
<h2>Pré-processamento</h2>
<h3>Carregando o dataset</h3>

In [2]:
df = pd.read_csv('apartamentos.csv')
df.drop(['endereco', 'update_time', 'apartment_link', 'id'], axis=1, inplace=True)
df.head()

Unnamed: 0,y,quartos,area,custo,vagas,mobiliado,bairro,latitude,longitude,estacao,distancia,linha_amarela,linha_azul,linha_lilas,linha_prata,linha_verde,linha_vermelha
0,0,2,80,3455,1,0,Vila Madalena,-23.540695,-46.701796,Vila Madalena,1.26,0,0,0,0,1,0
1,0,3,120,5975,1,0,Alto de Pinheiros,-23.544324,-46.709045,Vila Madalena,1.85,0,0,0,0,1,0
2,0,2,68,3390,1,0,Alto de Pinheiros,-23.540833,-46.706647,Vila Madalena,1.7,0,0,0,0,1,0
3,0,3,88,5188,2,0,Vila Madalena,-23.541757,-46.701762,Vila Madalena,1.21,0,0,0,0,1,0
4,0,2,52,2289,0,1,Alto de Pinheiros,-23.542408,-46.716614,Vila Madalena,2.64,0,0,0,0,1,0


<font size=3>Dividindo entre X e y.</font>

In [3]:
X = df.drop('y', axis=1)
y = df['y']
print('X: {}\ny: {}'.format(X.shape, y.shape))

X: (6538, 16)
y: (6538,)


<font size=3><br>Dividindo entre treino e teste.</font>

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=75)
print('X_train:{} \ny_train:{} \nX_test:\t{} \ny_test:\t{}'.format(X_train.shape,y_train.shape,X_test.shape,y_test.shape))

X_train:(4576, 16) 
y_train:(4576,) 
X_test:	(1962, 16) 
y_test:	(1962,)


In [5]:
X_train.head()

Unnamed: 0,quartos,area,custo,vagas,mobiliado,bairro,latitude,longitude,estacao,distancia,linha_amarela,linha_azul,linha_lilas,linha_prata,linha_verde,linha_vermelha
2273,1,33,5504,1,1,Consolação,-23.55044,-46.654623,Higienopolis Mackenzie,0.29,1,0,0,0,0,0
1041,1,34,4145,2,1,Bela Vista,-23.567636,-46.646206,Brigadeiro,0.28,0,0,0,0,1,0
1318,2,104,3414,0,0,Santa Cecília,-23.538508,-46.645741,Santa Cecilia,0.34,0,0,0,0,0,1
2247,1,45,3826,1,0,Bela Vista,-23.561625,-46.650809,Trianon Masp,0.38,0,0,0,0,1,0
2265,2,110,4945,1,0,Jardim Paulista,-23.567925,-46.656107,Trianon Masp,0.53,0,0,0,0,1,0


<font size=3>
    Com <i>OneHotEncoder</i> vamos transformar as variáveis <i>bairro</i> e <i>estacao</i> para númericas para que possamos utilizar nos nossos modelos.
</font>

In [6]:
cat_cols = ['bairro', 'estacao']
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
train_cat_feat = enc.fit_transform(X_train[cat_cols])
train_cat_feat = pd.DataFrame(train_cat_feat)
# train_cat_feat.columns = enc.get_feature_names(cat_cols)
train_cat_feat.index = X_train.index
train_num_feat = X_train.drop(cat_cols, axis=1)
X_train_feat = pd.merge(train_num_feat, train_cat_feat, left_index=True, right_index=True)

<font size=3>Replicando para os dados de teste.</font>

In [7]:
test_cat_feat = enc.transform(X_test[cat_cols])
test_cat_feat = pd.DataFrame(test_cat_feat)
# test_cat_feat.columns = enc.get_feature_names(cat_cols)
test_cat_feat.index = X_test.index
test_num_feat = X_test.drop(cat_cols, axis=1)
X_test_feat = pd.merge(test_num_feat, test_cat_feat, left_index=True, right_index=True)

<a name='MODEL'></a>
<h2>Modelo</h2>

<a name='LR'></a>
<h3>Regressão Logística</h3>

In [8]:
lr = LogisticRegression(random_state=75)
lr.fit(X_train_feat, y_train)
ypred_lr = lr.predict_proba(X_test_feat)[:,1]

In [9]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_lr)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_lr)))
print('log loss: {}'.format(log_loss(y_test, ypred_lr)))

average precision score: 0.16968485627904073
roc_auc_scode: 0.9421481706120556
log loss: 0.060769089723026894


<a name='RF'></a>
<h3>Random Forest</h3>

In [10]:
rf = RandomForestClassifier(n_estimators=1000, random_state=75, min_samples_leaf=2, class_weight='balanced')
rf.fit(X_train_feat, y_train)
ypred_rf = rf.predict_proba(X_test_feat)[:,1]

In [11]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_rf)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_rf)))
print('log loss: {}'.format(log_loss(y_test, ypred_rf)))

average precision score: 0.5485563211695972
roc_auc_scode: 0.9908486005316017
log loss: 0.04966813134187631


<a name='LGBM'></a>
<h3>LightGBM</h3>

In [12]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    
    lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                          min_child_samples=min_child_samples, subsample=subsample,
                          colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                          class_weight="balanced", n_jobs=6)
    lgbm.fit(X_train_feat, y_train)
    
    ypred_lgbm = lgbm.predict_proba(X_test_feat)[:,1]
    
    print(roc_auc_score(y_test, ypred_lgbm))
    
    return -average_precision_score(y_test, ypred_lgbm)

space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 15), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000)] # n_estimators

res = forest_minimize(tune_lgbm, space, random_state=75, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.026573581158061813, 2, 9, 0.3602419777451103, 0.38907875080188686, 949]
0.9884581764989206
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5741
Function value obtained: -0.4831
Current minimum: -0.4831
Iteration No: 2 started. Evaluating function at random point.
[0.01226172672492125, 8, 7, 0.4054213799661132, 0.9369030598583269, 337]
0.9917227108122091
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.8359
Function value obtained: -0.5466
Current minimum: -0.5466
Iteration No: 3 started. Evaluating function at random point.
[0.01683319796246901, 5, 5, 0.7513238533324651, 0.631305640343713, 568]
0.9899209732950389
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.1757
Function value obtained: -0.5167
Current minimum: -0.5466
Iteration No: 4 started. Evaluating function at random point.
[0.003248394018273504, 13, 11, 0.6324068503439444, 0.4933551902972303, 798]


In [13]:
res.x

[0.002631058751802242, 7, 2, 0.8094065591181088, 0.6564247612511152, 203]

In [15]:
lgbm = LGBMClassifier(learning_rate=0.002631058751802242, num_leaves=2**7, max_depth=7,
                      min_child_samples=2, subsample=0.8094065591181088, colsample_bytree=0.6564247612511152,
                      n_estimators=203, random_state=75, class_weight="balanced", n_jobs=6)
lgbm.fit(X_train_feat, y_train)
ypred_lgbm = lgbm.predict_proba(X_test_feat)[:,1]

In [16]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_lgbm)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_lgbm)))
print('log loss: {}'.format(log_loss(y_test, ypred_lgbm)))

average precision score: 0.5763678266484014
roc_auc_scode: 0.992864405872594
log loss: 0.3765352518715935


<a name='ENSEN'></a>
<h3>Ensemble</h3>

In [17]:
vot = VotingClassifier(estimators=[('lr', lr), ('rf', rf), ('lgbm', lgbm)], voting='soft')
vot.fit(X_train_feat, y_train)
ypred_vot = vot.predict_proba(X_test_feat)[:,1]

In [18]:
print('average precision score: {}'.format(average_precision_score(y_test, ypred_vot)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_vot)))
print('log loss: {}'.format(log_loss(y_test, ypred_vot)))

average precision score: 0.5432782433600254
roc_auc_scode: 0.9892430918529355
log loss: 0.14067055868961295


In [19]:
lr_perc = 0.0
rf_perc = 0.5
lgbm_perc = 0.5
ypred_ens = lr_perc*ypred_lr + rf_perc*ypred_rf + lgbm_perc*ypred_lgbm
print('average precision score: {}'.format(average_precision_score(y_test, ypred_ens)))
print('roc_auc_scode: {}'.format(roc_auc_score(y_test, ypred_ens)))
print('log loss: {}'.format(log_loss(y_test, ypred_ens)))

average precision score: 0.5794405144412936
roc_auc_scode: 0.9919367786360311
log loss: 0.19835566252934994


<a name='SAV'></a>
<h2>Salvando os modelos</h2>

In [20]:
jb.dump(lgbm, "./deploy/lgbm_20211028.pkl.z")
jb.dump(rf, "./deploy/random_forest_20211028.pkl.z")
jb.dump(enc, "./deploy/onehotvec_20211028.pkl.z")

['./deploy/onehotvec_20211028.pkl.z']

In [21]:
teste = X_test.copy()
teste['proba'] = ypred_ens
df_teste = pd.read_csv('apartamentos.csv')
teste = teste.merge(df_teste, how='left', left_index=True, right_index=True)
teste.sort_values('proba', ascending=False)[['id', 'quartos_x', 'area_x', 'custo_x', 'mobiliado_x',
                                             'estacao_x', 'distancia_x', 'proba', 'apartment_link']]

Unnamed: 0,id,quartos_x,area_x,custo_x,mobiliado_x,estacao_x,distancia_x,proba,apartment_link
5408,893439730,1,40,1907,0,Japao Liberdade,0.37,0.826700,https://www.quintoandar.com.br/imovel/893439730
5207,893439739,1,40,2285,0,Japao Liberdade,0.37,0.809185,https://www.quintoandar.com.br/imovel/893439739
492,893437823,1,50,2397,0,Trianon Masp,0.40,0.796379,https://www.quintoandar.com.br/imovel/893437823
867,893252861,1,36,2488,0,Vergueiro,0.49,0.796055,https://www.quintoandar.com.br/imovel/893252861
350,892874927,1,52,2602,0,Consolacao,0.36,0.792344,https://www.quintoandar.com.br/imovel/892874927
...,...,...,...,...,...,...,...,...,...
3832,893000385,3,64,4153,0,Palmeiras Barra Funda,1.53,0.146360,https://www.quintoandar.com.br/imovel/893000385
5153,893420041,2,81,3442,0,Pedro II,1.08,0.146360,https://www.quintoandar.com.br/imovel/893420041
4563,893235837,3,78,4918,0,Marechal Deodoro,0.30,0.146359,https://www.quintoandar.com.br/imovel/893235837
3665,893414336,3,68,5444,1,Palmeiras Barra Funda,1.50,0.146359,https://www.quintoandar.com.br/imovel/893414336
