## 3. Machine Learning

### 3.1 Importando dados preprocessados

In [3]:
import pandas as pd

treino = pd.read_csv('https://raw.githubusercontent.com/cassiasamp/jul-20-calculadora-imoveis/master/analise_e_preprocessamento/treino_preprocessado.csv')
teste = pd.read_csv('https://raw.githubusercontent.com/cassiasamp/jul-20-calculadora-imoveis/master/analise_e_preprocessamento/teste_preprocessado.csv')

In [4]:
# dividindo os dados de treino e teste (X e y)

X_train = treino.drop('precos', axis=1)
y_train = treino['precos']
X_test = teste.drop('precos', axis=1)
y_test = teste['precos']

### 3.2 Criando um modelo de base para comparar os resultados (baseline) 

In [5]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.04720566491073608

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math 

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)

mse = mean_squared_error(lr_y_pred, y_test)
mae = mean_absolute_error(lr_y_pred, y_test)

# olhando diferentes métricas
print('mse:', mse)
print('rmse:', math.sqrt(mse))
print('mae:', mae)

mse: 2.674883341799502
rmse: 1.635507059538265
mae: 0.9879827322299511


In [7]:
from sklearn.dummy import DummyRegressor

reg = DummyRegressor(strategy='mean')
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

-0.002007697364269978

### 3.3 Comparando diferentes modelos de regressão (estimadores)

In [8]:
from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LassoLars, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
# se quiser, instalar e usar LGBM e XGBoost de fora do sklearn
'''
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
'''

'\nfrom lightgbm import LGBMRegressor\nfrom xgboost import XGBRegressor\n'

In [9]:
reg_list = [RidgeCV(),
            SVR(),
            KNeighborsRegressor(),
            RandomForestRegressor(),
            AdaBoostRegressor(),
            GradientBoostingRegressor(),
            MLPRegressor(),
            # LGBMRegressor(), 
            # XGBRegressor(objective='reg:squarederror')
            ]

In [10]:
from sklearn.model_selection import cross_val_score
import numpy as np

for reg in reg_list:
    print(f'Treinando o modelo {reg.__class__.__name__}')
    reg.fit(X_train, y_train)
    
    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)
    
    print(f"R² treino: {train_score}")
    print(f"R² validação : {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R² teste: {test_score}")
    print('-'*70)

Treinando o modelo RidgeCV
R² treino: 0.058279130292815395
R² validação : 0.04 +- 0.04
R² teste: 0.04744439035071979
----------------------------------------------------------------------
Treinando o modelo SVR
R² treino: -0.07949786895442656
R² validação : -0.08 +- 0.07
R² teste: -0.14963259792443684
----------------------------------------------------------------------
Treinando o modelo KNeighborsRegressor
R² treino: 0.33214615383275614
R² validação : 0.05 +- 0.10
R² teste: 0.018964882727984245
----------------------------------------------------------------------
Treinando o modelo RandomForestRegressor
R² treino: 0.5015390336031905
R² validação : -0.11 +- 0.08
R² teste: 0.019797256519046957
----------------------------------------------------------------------
Treinando o modelo AdaBoostRegressor
R² treino: 0.15415907375077342
R² validação : -0.02 +- 0.13
R² teste: 0.16330499770729257
----------------------------------------------------------------------
Treinando o modelo Gradien



R² treino: 0.12403097262882612
R² validação : 0.06 +- 0.05
R² teste: 0.09582210449811279
----------------------------------------------------------------------




In [11]:
# Bonus: testando com todos os regressores do sklearn
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='regressor')

relatorio = {'nome':[],
             'train_score':[],
             'cv_scores_mean':[],
             'test_score':[],
             'estimador':[]
             }

ignore_list = ['IsotonicRegression',
 'MultiOutputRegressor',
 'ElasticNet',
 'MultiTaskElasticNet',
 'MultiTaskElasticNetCV',
 'MultiTaskLasso',
 'MultiTaskLassoCV',
 'RadiusNeighborsRegressor',
 'RegressorChain',
 'StackingRegressor',
 'VotingRegressor']


In [12]:
# Se quiser instalar e usar LGBM e XGBoost de fora do sklearn
'''
estimators.extend(
    [('LGBMRegressor', LGBMRegressor),
     ('XGBRegressor', XGBRegressor)]
)
'''

"\nestimators.extend(\n    [('LGBMRegressor', LGBMRegressor),\n     ('XGBRegressor', XGBRegressor)]\n)\n"

In [13]:
for name, RegressorClass in estimators:
    if name not in ignore_list:
        print(f'Treinando o modelo {name}')
        reg = RegressorClass()
        reg.fit(X_train, y_train)

        train_score = reg.score(X_train, y_train)
        cv_scores = cross_val_score(reg, X_train, y_train)
        test_score = reg.score(X_test, y_test)

        print(f"R² treino: {train_score}")
        print(f"R² validação: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
        print(f"R² teste: {test_score}")
        print('-'*70)

        relatorio['nome'].append(name)
        relatorio['train_score'].append(train_score)
        relatorio['cv_scores_mean'].append(np.mean(cv_scores))
        relatorio['test_score'].append(test_score)
        relatorio['estimador'].append(reg)

Treinando o modelo ARDRegression
R² treino: 0.05759197351645862
R² validação: 0.04 +- 0.04
R² teste: 0.04393513822413864
----------------------------------------------------------------------
Treinando o modelo AdaBoostRegressor
R² treino: 0.13665066294490025
R² validação: -0.03 +- 0.25
R² teste: 0.10792412614968094
----------------------------------------------------------------------
Treinando o modelo BaggingRegressor
R² treino: 0.46507680585289735
R² validação: -0.08 +- 0.09
R² teste: 0.04008234863960247
----------------------------------------------------------------------
Treinando o modelo BayesianRidge
R² treino: 0.0575518169013477
R² validação: 0.03 +- 0.04
R² teste: 0.046282645852678383
----------------------------------------------------------------------
Treinando o modelo CCA
R² treino: -0.14168611647313223
R² validação: -0.21 +- 0.13
R² teste: -0.08961699984651927
----------------------------------------------------------------------
Treinando o modelo DecisionTreeRegress



R² treino: 0.057951668406564516
R² validação: 0.03 +- 0.04
R² teste: 0.04652575743815024
----------------------------------------------------------------------
Treinando o modelo ExtraTreeRegressor
R² treino: 0.5358456344540199
R² validação: -0.36 +- 0.12
R² teste: -0.02790036252492989
----------------------------------------------------------------------
Treinando o modelo ExtraTreesRegressor
R² treino: 0.5358456344540199
R² validação: -0.30 +- 0.05
R² teste: 0.013903862182757187
----------------------------------------------------------------------
Treinando o modelo GammaRegressor
R² treino: 0.013244917964521763
R² validação: 0.00 +- 0.01
R² teste: 0.005748739568734562
----------------------------------------------------------------------
Treinando o modelo GaussianProcessRegressor
R² treino: 0.2999939142086606
R² validação: -41077.01 +- 71499.51
R² teste: -8984.963556033335
----------------------------------------------------------------------
Treinando o modelo GradientBoostingReg



R² treino: 0.11855612825435591
R² validação: 0.06 +- 0.05
R² teste: 0.10222860610164675
----------------------------------------------------------------------
Treinando o modelo NuSVR
R² treino: -0.05040758236035536
R² validação: -0.05 +- 0.06
R² teste: -0.12545548141175433
----------------------------------------------------------------------
Treinando o modelo OrthogonalMatchingPursuit
R² treino: 0.016657217623550546
R² validação: 0.00 +- 0.03
R² teste: 0.025977140149567868
----------------------------------------------------------------------
Treinando o modelo OrthogonalMatchingPursuitCV
R² treino: 0.05799513819889024
R² validação: 0.04 +- 0.04
R² teste: 0.04458537333858825
----------------------------------------------------------------------
Treinando o modelo PLSCanonical
R² treino: -0.5023405627469897
R² validação: -0.70 +- 0.45
R² teste: -0.3189214848941966
----------------------------------------------------------------------
Treinando o modelo PLSRegression




R² treino: 0.05722552436031314
R² validação: 0.03 +- 0.04
R² teste: 0.046126311570245604
----------------------------------------------------------------------
Treinando o modelo PassiveAggressiveRegressor
R² treino: -0.06780259778860609
R² validação: -0.35 +- 0.50
R² teste: -0.0511681458895632
----------------------------------------------------------------------
Treinando o modelo PoissonRegressor
R² treino: 0.039525582531012904
R² validação: 0.02 +- 0.02
R² teste: 0.027897447144874077
----------------------------------------------------------------------
Treinando o modelo RANSACRegressor
R² treino: -0.16886472416505893
R² validação: -0.13 +- 0.09
R² teste: -0.2197741805546194
----------------------------------------------------------------------
Treinando o modelo RandomForestRegressor
R² treino: 0.5011499827135462
R² validação: -0.09 +- 0.08
R² teste: 0.023441244497903524
----------------------------------------------------------------------
Treinando o modelo Ridge
R² treino: 0.0

In [14]:
relatorio = pd.DataFrame(relatorio).sort_values(by='cv_scores_mean', ascending=False)
relatorio.head(10)

Unnamed: 0,nome,train_score,cv_scores_mean,test_score,estimador
12,GradientBoostingRegressor,0.37538,0.102035,0.112552,([DecisionTreeRegressor(criterion='friedman_ms...
26,MLPRegressor,0.118556,0.062883,0.102229,MLPRegressor()
13,HistGradientBoostingRegressor,0.365299,0.059519,0.056813,HistGradientBoostingRegressor()
15,KNeighborsRegressor,0.332146,0.053958,0.018965,KNeighborsRegressor()
0,ARDRegression,0.057592,0.038585,0.043935,ARDRegression()
29,OrthogonalMatchingPursuitCV,0.057995,0.038539,0.044585,OrthogonalMatchingPursuitCV()
23,LassoLarsIC,0.05761,0.036215,0.046205,LassoLarsIC()
37,RidgeCV,0.058279,0.03568,0.047444,"RidgeCV(alphas=array([ 0.1, 1. , 10. ]))"
36,Ridge,0.058279,0.03568,0.047444,Ridge()
17,Lars,0.058304,0.035478,0.047588,Lars()


### 3.4 TBD: calibrando melhores estimadores usando GridSearchCV

In [15]:
## Usar GridSearchCV para calibrar os melhores estimadores

### 3.5 (Opcional) Combinando os melhores estimadores usando stacking

In [16]:
from sklearn.ensemble import StackingRegressor

top_3_regs = relatorio[['nome', 'estimador']].values[:3]

reg = StackingRegressor(
    estimators = top_3_regs
)

reg.fit(X_train, y_train)

train_score = reg.score(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train)
test_score = reg.score(X_test, y_test)

print(f"R² treino: {train_score}")
print(f"R² validação: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
print(f"R² teste: {test_score}")
print('-'*70)



R² treino: 0.3195963806254857
R² validação: 0.11 +- 0.03
R² teste: 0.11460479152161673
----------------------------------------------------------------------


In [17]:
reg

StackingRegressor(estimators=array([['GradientBoostingRegressor', GradientBoostingRegressor()],
       ['MLPRegressor', MLPRegressor()],
       ['HistGradientBoostingRegressor', HistGradientBoostingRegressor()]],
      dtype=object))

## 4. Salvando o modelo

In [18]:
import pickle

pickle.dump(reg, open('modelo.pkl', 'wb'), protocol=4)

In [24]:
# ls

In [25]:
# resetando o notebook para garantir que estamos carregando o modelo salvo
%reset -f

In [21]:
import pickle

reg = pickle.load(open('modelo.pkl', 'rb'))

In [22]:
reg

StackingRegressor(estimators=array([['GradientBoostingRegressor', GradientBoostingRegressor()],
       ['MLPRegressor', MLPRegressor()],
       ['HistGradientBoostingRegressor', HistGradientBoostingRegressor()]],
      dtype=object))

In [23]:
import numpy as np

dados_novos = [0, 0, 0, 1, np.log1p(2), np.log1p(120)]

np.expm1(reg.predict([dados_novos]))

array([3605.74974843])