## 3. Machine Learning

### 3.1 Importar Dados Preprocessados

In [1]:
import pandas as pd
treino = pd.read_csv('https://raw.githubusercontent.com/WittmannF/awari-calculadora-imoveis-may-20/master/2-eda-preprocessamento/treino_preprocessado.csv')
teste = pd.read_csv('https://raw.githubusercontent.com/WittmannF/awari-calculadora-imoveis-may-20/master/2-eda-preprocessamento/teste_preprocessado.csv')

In [2]:
# Divisão X e y
X_train = treino.drop('preco', axis=1)
y_train = treino['preco']
X_test = teste.drop('preco', axis=1)
y_test = teste['preco']

### 3.2 Modelo "Baseline" (Ponto de Partida)

In [3]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.6236918447455909

In [4]:
from sklearn.dummy import DummyRegressor
reg = DummyRegressor(strategy='mean')
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

-0.0016538742649583327

### 3.3 Fazer triagem entre diferentes estimadores

In [5]:
from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LassoLars, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [6]:
reg_list = [RidgeCV(),
            LGBMRegressor(), 
            XGBRegressor(objective='reg:squarederror'),
            SVR(),
            KNeighborsRegressor(),
            RandomForestRegressor(),
            AdaBoostRegressor(),
            GradientBoostingRegressor(),
            MLPRegressor()
            ]

In [8]:
from sklearn.model_selection import cross_val_score
import numpy as np

for reg in reg_list:
    print(f'Treinando Modelo {reg.__class__.__name__}')
    reg.fit(X_train, y_train)
    
    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)
    
    print(f"R2 Score Train: {train_score}")
    print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R2 Score Test: {test_score}")
    print('='*80)

Treinando Modelo RidgeCV
R2 Score Train: 0.6012110923644094
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6240340341635653
Treinando Modelo LGBMRegressor
R2 Score Train: 0.7995456704936577
R2 Score Valid: 0.69 +- 0.08
R2 Score Test: 0.7269038788547185
Treinando Modelo XGBRegressor
R2 Score Train: 0.7960825942221591
R2 Score Valid: 0.73 +- 0.06
R2 Score Test: 0.7233996294302408
Treinando Modelo SVR
R2 Score Train: 0.7345972275266579
R2 Score Valid: 0.72 +- 0.07
R2 Score Test: 0.7132957760525864
Treinando Modelo KNeighborsRegressor
R2 Score Train: 0.7924410274499227
R2 Score Valid: 0.68 +- 0.05
R2 Score Test: 0.7052967841630859
Treinando Modelo RandomForestRegressor
R2 Score Train: 0.8674292439215541
R2 Score Valid: 0.68 +- 0.05
R2 Score Test: 0.728570755276619
Treinando Modelo AdaBoostRegressor
R2 Score Train: 0.6545582867007731
R2 Score Valid: 0.60 +- 0.06
R2 Score Test: 0.5664090693088373
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.8086388911889636
R2 Score Valid: 0.7



R2 Score Train: 0.7297894497444206
R2 Score Valid: 0.68 +- 0.13
R2 Score Test: 0.7068530427029853




In [31]:
# Bonus: Testando com todos os regressores do sklearn
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='regressor')

relatorio = {'nome':[],
             'train_score':[],
             'cv_scores_mean':[],
             'test_score':[],
             'estimador':[]
             }

ignore_list = ['IsotonicRegression',
 'MultiOutputRegressor',
 'ElasticNet',
 'MultiTaskElasticNet',
 'MultiTaskElasticNetCV',
 'MultiTaskLasso',
 'MultiTaskLassoCV',
 'RadiusNeighborsRegressor',
 'RegressorChain',
 'StackingRegressor',
 'VotingRegressor']


In [14]:
estimators.extend(
    [('LGBMRegressor', LGBMRegressor),
     ('XGBRegressor', XGBRegressor)]
)

In [16]:
for name, RegressorClass in estimators:
  if name not in ignore_list:
    print(f'Treinando Modelo {name}')
    reg = RegressorClass()
    reg.fit(X_train, y_train)

    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)

    print(f"R2 Score Train: {train_score}")
    print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R2 Score Test: {test_score}")
    print('='*80)

    relatorio['nome'].append(name)
    relatorio['train_score'].append(train_score)
    relatorio['cv_scores_mean'].append(np.mean(cv_scores))
    relatorio['test_score'].append(test_score)
    relatorio['estimador'].append(reg)

Treinando Modelo ARDRegression
R2 Score Train: 0.8596531163839972
R2 Score Valid: 0.67 +- 0.06
R2 Score Test: 0.7083100244395286
Treinando Modelo BayesianRidge
R2 Score Train: 0.6011944059402474
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6243998859423144
Treinando Modelo CCA
R2 Score Train: 0.28565336515321493
R2 Score Valid: 0.27 +- 0.18
R2 Score Test: 0.29150420597736526
Treinando Modelo DecisionTreeRegressor
R2 Score Train: 0.8848482542159023
R2 Score Valid: 0.61 +- 0.06
R2 Score Test: 0.6581463146862659
Treinando Modelo DummyRegressor
R2 Score Train: 0.0
R2 Score Valid: -0.01 +- 0.01
R2 Score Test: -0.0016538742649583327
Treinando Modelo ElasticNetCV




R2 Score Train: 0.6011995333793443
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6240946394886209
Treinando Modelo ExtraTreeRegressor
R2 Score Train: 0.8848482542159023
R2 Score Valid: 0.59 +- 0.06
R2 Score Test: 0.6484608808206791
Treinando Modelo ExtraTreesRegressor
R2 Score Train: 0.8848482542159023
R2 Score Valid: 0.66 +- 0.05
R2 Score Test: 0.7206912947237747
Treinando Modelo GaussianProcessRegressor
R2 Score Train: 0.8257158312785288
R2 Score Valid: -57814.52 +- 96367.83
R2 Score Test: -2189.230874800904
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.8086388911889636
R2 Score Valid: 0.73 +- 0.07
R2 Score Test: 0.7291125618488584
Treinando Modelo HistGradientBoostingRegressor
R2 Score Train: 0.8023321610668072
R2 Score Valid: 0.69 +- 0.08
R2 Score Test: 0.7282158996252834
Treinando Modelo HuberRegressor
R2 Score Train: 0.562906253127321
R2 Score Valid: 0.56 +- 0.16
R2 Score Test: 0.6163984066158077
Treinando Modelo KNeighborsRegressor
R2 Score Train: 0.79244102744992



R2 Score Train: 0.5526700156162401
R2 Score Valid: 0.55 +- 0.17
R2 Score Test: 0.6103166686905238
Treinando Modelo MLPRegressor




R2 Score Train: 0.7266737542048014
R2 Score Valid: 0.71 +- 0.07
R2 Score Test: 0.7036071450368079
Treinando Modelo MultiTaskElasticNetCV
Ignoring MultiTaskElasticNetCV
Treinando Modelo MultiTaskLasso
Ignoring MultiTaskLasso
Treinando Modelo MultiTaskLassoCV
Ignoring MultiTaskLassoCV
Treinando Modelo NuSVR
R2 Score Train: 0.737448269243369
R2 Score Valid: 0.72 +- 0.07
R2 Score Test: 0.7132661443478059
Treinando Modelo OrthogonalMatchingPursuit
R2 Score Train: 0.4625425224728076
R2 Score Valid: 0.45 +- 0.18
R2 Score Test: 0.523997377892827
Treinando Modelo OrthogonalMatchingPursuitCV
R2 Score Train: 0.6012210324248282
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6234364528295557
Treinando Modelo PLSCanonical
R2 Score Train: 0.2282505146399173
R2 Score Valid: 0.21 +- 0.19
R2 Score Test: 0.2234456315661583
Treinando Modelo PLSRegression
R2 Score Train: 0.5998225105341303
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6142050658850435
Treinando Modelo PassiveAggressiveRegressor
R2 Score Tra



R2 Score Train: 0.42507214599274884
R2 Score Valid: 0.46 +- 0.20
R2 Score Test: 0.5068553066227031
Treinando Modelo RadiusNeighborsRegressor
Ignoring RadiusNeighborsRegressor
Treinando Modelo RandomForestRegressor




R2 Score Train: 0.8664711010616786
R2 Score Valid: 0.68 +- 0.05
R2 Score Test: 0.7281766147419715
Treinando Modelo RegressorChain
Ignoring RegressorChain
Treinando Modelo Ridge
R2 Score Train: 0.601211092364409
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6240340341635531
Treinando Modelo RidgeCV
R2 Score Train: 0.6012110923644094
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6240340341635653
Treinando Modelo SGDRegressor
R2 Score Train: 0.5844211420431752
R2 Score Valid: 0.56 +- 0.17
R2 Score Test: 0.6118199902179546
Treinando Modelo SVR
R2 Score Train: 0.7345972275266579
R2 Score Valid: 0.72 +- 0.07
R2 Score Test: 0.7132957760525864
Treinando Modelo StackingRegressor
Ignoring StackingRegressor
Treinando Modelo TheilSenRegressor
R2 Score Train: 0.5214215837390783
R2 Score Valid: 0.52 +- 0.18
R2 Score Test: 0.5767903886653336
Treinando Modelo TransformedTargetRegressor
R2 Score Train: 0.6011775050836603
R2 Score Valid: 0.59 +- 0.12
R2 Score Test: 0.6236918447455909
Treinando Modelo Vo

In [17]:
relatorio = pd.DataFrame(relatorio).sort_values(by='cv_scores_mean', ascending=False)
relatorio.head(10)

Unnamed: 0,nome,train_score,cv_scores_mean,test_score,estimador
41,XGBRegressor,0.796083,0.733585,0.7234,"XGBRegressor(base_score=0.5, booster='gbtree',..."
11,GradientBoostingRegressor,0.808639,0.729663,0.729113,"([DecisionTreeRegressor(ccp_alpha=0.0, criteri..."
26,NuSVR,0.737448,0.724886,0.713266,"NuSVR(C=1.0, cache_size=200, coef0=0.0, degree..."
37,SVR,0.734597,0.721531,0.713296,"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3..."
25,MLPRegressor,0.726674,0.710701,0.703607,"MLPRegressor(activation='relu', alpha=0.0001, ..."
40,LGBMRegressor,0.799546,0.690786,0.726904,"LGBMRegressor(boosting_type='gbdt', class_weig..."
12,HistGradientBoostingRegressor,0.802332,0.688338,0.728216,HistGradientBoostingRegressor(l2_regularizatio...
14,KNeighborsRegressor,0.792441,0.684014,0.705297,"KNeighborsRegressor(algorithm='auto', leaf_siz..."
33,RandomForestRegressor,0.866471,0.68353,0.728177,"(DecisionTreeRegressor(ccp_alpha=0.0, criterio..."
2,BaggingRegressor,0.859653,0.670357,0.70831,"(DecisionTreeRegressor(ccp_alpha=0.0, criterio..."


### 3.4 TODO: Calibrar melhores estimadores usando GridSearchCV

In [None]:
## Usar GridSearchCV para calibrar os melhores estimadores

### 3.5 (Opcional) Combinar os melhores estimadores usando Stacking

In [21]:
from sklearn.ensemble import StackingRegressor

top_3_regs = relatorio[['nome', 'estimador']].values[:3]

reg = StackingRegressor(
    estimators=top_3_regs
)

reg.fit(X_train, y_train)

train_score = reg.score(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train)
test_score = reg.score(X_test, y_test)

print(f"R2 Score Train: {train_score}")
print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
print(f"R2 Score Test: {test_score}")
print('='*80)

R2 Score Train: 0.7850127684840529
R2 Score Valid: 0.74 +- 0.06
R2 Score Test: 0.7271540018261753


In [22]:
reg

StackingRegressor(cv=None,
                  estimators=array([['XGBRegressor',
        XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,...
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)],
       ['NuSVR',
        NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False)]],
      dtype=object),
                  final_estimator=None, n_jobs=None, passthrough=False,
                  verbose=0)

## 4. Exportar Modelo

In [33]:
import pickle
pickle.dump(reg, open('regressor.pkl', 'wb'), protocol=4)


In [None]:
ls

regressor.pkl  [0m[01;34msample_data[0m/


In [None]:
%reset -f

In [None]:
import pickle
reg = pickle.load(open('regressor.pkl', 'rb'))

In [24]:
reg

StackingRegressor(cv=None,
                  estimators=array([['XGBRegressor',
        XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,...
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)],
       ['NuSVR',
        NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False)]],
      dtype=object),
                  final_estimator=None, n_jobs=None, passthrough=False,
                  verbose=0)

In [None]:
import numpy as np
np.expm1(reg.predict([[0, 0, 0, 1, np.log1p(2), np.log1p(120)]]))

array([3148.651579])