# ***Modeling***

Após terminarmos todas as etapas de pré-processamento, podemos começar a modelagem dos dados. Finalmente, iremos utilizar os dados para treinar e comparar diversos modelos.

Além disso, faremos uso de GridSearch para encontrar os melhores hiperparâmetros para cada modelo e teste de hipótese para nos guiar a escolher o modelo mais adequado para o dataset.

In [1]:
import pickle #imports
import pathlib

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_SEED = 42

In [2]:
# Carrega o diretório de dados

DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

d:\machine-learning\ames\data


In [3]:
clean_data_path = DATA_DIR / 'processed' / 'ames_final.pkl'

In [4]:
with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

In [5]:
# Separa as features e o target

X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

In [6]:
X.head()

Unnamed: 0,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Lot.Shape,Land.Contour,Lot.Config,Land.Slope,Neighborhood,Bldg.Type,...,Total.Bathrooms,Has.Basement,Has.Garage,Has.Porch,Has.Pool,Was.Completed,Total.Overall,SqftPerRoom,Total.Rooms,Garage.Size
0,20,RL,141.0,10.366309,2,4,Corner,1.386294,NAmes,1Fam,...,2.0,True,True,True,False,True,11,553.555556,9.0,530.0
1,20,RH,80.0,9.360741,1,4,Inside,1.386294,NAmes,1Fam,...,1.0,True,True,True,False,True,11,587.333333,6.0,731.0
2,20,RL,81.0,9.565775,2,4,Corner,1.386294,NAmes,1Fam,...,1.5,True,True,True,False,True,12,578.0,7.5,313.0
3,20,RL,93.0,9.320181,1,4,Corner,1.386294,NAmes,1Fam,...,3.5,True,True,False,False,True,12,595.826087,11.5,524.0
4,60,RL,74.0,9.534668,2,4,Inside,1.386294,Gilbert,1Fam,...,2.5,True,True,True,False,True,10,553.176471,8.5,484.0


In [7]:
y.head()

0    5.332438
1    5.021189
2    5.235528
3    5.387390
4    5.278525
Name: SalePrice, dtype: float64

# **1. Modelos Lineares**

## A. Regressão Linear

In [8]:
X_model = pd.get_dummies(X, drop_first=True).copy()

In [9]:
numeric_cols = X_model.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_model.select_dtypes(exclude=np.number).columns.tolist()

In [10]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_model[numeric_cols] = scaler.fit_transform(X_model[numeric_cols])

In [11]:
X_model.shape

(2872, 184)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size=0.2, random_state=RANDOM_SEED)

In [14]:
from sklearn.model_selection import cross_val_score

def rsme(model, X, y):
    cv_scores = -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    return np.sqrt(cv_scores)

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

lr = LinearRegression()

errors = rsme(lr, X_train, y_train)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.05355963 0.05171333 0.04352572 0.04041464 0.17269205]
RMSE: 0.07238 +/- 0.05039


É possível perceber a instabilidade do modelo linear puro! Vamos testar suas variantes.

## B. Regressão Ridge

In [19]:
from sklearn.linear_model import Ridge

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1., 5., 10., 25.], 
    'max_iter': [50000]
}

ridge = GridSearchCV(Ridge(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
ridge.fit(X_train, y_train)
alpha = ridge.best_params_['alpha']

# Hone in
# Fonte do código: https://www.kaggle.com/code/leeclemmer/exploratory-data-analysis-of-housing-in-ames-iowa#Introduction
param_grid = {
    'alpha': [x/100. * alpha for x in range(50, 150, 5)],
    'max_iter': [50000]
}
ridge = GridSearchCV(Ridge(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
ridge.fit(X_train, y_train)
alpha = ridge.best_params_['alpha']
ridge = ridge.best_estimator_

print(f"Best alpha: {alpha}")

Best alpha: 6.75


In [20]:
errors = rsme(ridge, X_train, y_train)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.0533218  0.05054995 0.04248938 0.0403236  0.04758417]
RMSE: 0.04685 +/- 0.00485


É possível identificar uma melhora no modelo, mas ainda temos mais a testar.

## C. Regressão Lasso

In [21]:
from sklearn.linear_model import Lasso

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1., 5., 10., 25.], 
    'max_iter': [50000]
}

lasso = GridSearchCV(Lasso(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
lasso.fit(X_train, y_train)
alpha = lasso.best_params_['alpha']

# Hone in
# Fonte do código: https://www.kaggle.com/code/leeclemmer/exploratory-data-analysis-of-housing-in-ames-iowa#Introduction
param_grid = {
    'alpha': [x/100. * alpha for x in range(50, 150, 5)],
    'max_iter': [50000]
}
lasso = GridSearchCV(Lasso(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
lasso.fit(X_train, y_train)
alpha = lasso.best_params_['alpha']
lasso = lasso.best_estimator_

print(f"Best alpha: {alpha}")

Best alpha: 8e-05


In [24]:
errors = rsme(lasso, X_train, y_train)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.05342898 0.05036995 0.04223465 0.03985243 0.04766155]
RMSE: 0.04671 +/- 0.00503


## D. Regressão ElasticNet

In [23]:
from sklearn.linear_model import ElasticNet

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1., 5., 10., 25.], 
    'l1_ratio': [0.25, 0.5, 0.75],
    'max_iter': [50000]
}

elastic = GridSearchCV(ElasticNet(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
elastic.fit(X_train, y_train)
alpha = elastic.best_params_['alpha']

# Hone in
# Fonte do código: https://www.kaggle.com/code/leeclemmer/exploratory-data-analysis-of-housing-in-ames-iowa#Introduction
param_grid = {
    'alpha': [x/100. * alpha for x in range(50, 150, 5)],
    'l1_ratio': [0.25, 0.5, 0.75],
    'max_iter': [50000]
}

elastic = GridSearchCV(ElasticNet(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
elastic.fit(X_train, y_train)
alpha = elastic.best_params_['alpha']
l1_ratio = elastic.best_params_['l1_ratio']
elastic = elastic.best_estimator_

print(f"Best alpha: {alpha}")
print(f"Best l1_ratio: {l1_ratio}")

Best alpha: 0.000105
Best l1_ratio: 0.75


In [25]:
errors = rsme(elastic, X_train, y_train)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.05342392 0.0503745  0.04224583 0.03987216 0.04765639]
RMSE: 0.04671 +/- 0.00502


## E. Gradient Boosting

In [62]:
from sklearn.ensemble import GradientBoostingRegressor

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'max_features': ['sqrt', 'log2']
}

gbr = GridSearchCV(GradientBoostingRegressor(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
gbr.fit(X_train, y_train)

learning_rate = gbr.best_params_['learning_rate']
n_estimators = gbr.best_params_['n_estimators']
max_depth = gbr.best_params_['max_depth']

gbr = gbr.best_estimator_

print(f"Best learning_rate: {learning_rate}")
print(f"Best n_estimators: {n_estimators}")
print(f"Best max_depth: {max_depth}")

# Sugestão para teste rápido
# gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=3, random_state=RANDOM_SEED)

Best learning_rate: 0.05
Best n_estimators: 1000
Best max_depth: 3


In [67]:
errors = rsme(gbr, X_train, y_train)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.05609965 0.05369379 0.0442164  0.04144481 0.04597039]
RMSE: 0.04829 +/- 0.00564


# **2. Modelos Não-Lineares**

Agora, vamos testar modelos de funcionamento diferente para analisar se há melhora na performance.

## A. Random Forest

In [29]:
from sklearn.ensemble import RandomForestRegressor

In [30]:
""" param_grid = {
    'n_estimators': [100, 500, 1000],          # Número de árvores na floresta
    'max_features': ['auto', 'sqrt'],         # Número máximo de features a considerar em cada split
    'max_depth': [None, 10, 20, 30],           # Profundidade máxima da árvore
}
 """
# Para um treinamento ainda mais profundo
"""
param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],          # Número de árvores na floresta
    'max_features': ['sqrt'],         # Número máximo de features a considerar em cada split
    'max_depth': [None, 2, 10, 20, 30],           # Profundidade máxima da árvore
    'min_samples_split': [2, 5, 10],            # Número mínimo de amostras necessárias para splitar um nó
    'min_samples_leaf': [1, 2, 4]               # Número mínimo de amostras necessárias em uma folha
}
"""

# rf_model = GridSearchCV(RandomForestRegressor(random_state=RANDOM_SEED), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error', verbose=1)
# rf_model.fit(X_train, y_train)
# rf_model = rf_model.best_estimator_

# ESSA CÉLULA PODE DEMORAR! Após análise, escolhemos os seguintes parâmetros:

best_params = {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 1000}

rf_model = RandomForestRegressor(**best_params, random_state=RANDOM_SEED)

In [31]:
errors = rsme(rf_model, X_train, y_train)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.06297969 0.05483191 0.04983303 0.04735107 0.05736783]
RMSE: 0.05447 +/- 0.00553


## B. CatBoost

In [34]:
from catboost import CatBoostRegressor

""" param_grid = {
    'iterations': [100, 200, 300, 1000, 5000, 100000],  # Number of boosting iterations
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rate
    'depth': [6, 8, 10],  # Depth of the trees
    'l2_leaf_reg': [1, 3, 5],  # L2 regularization term
    'border_count': [32, 64, 128],  # Number of splits for numeric features
    'bagging_temperature': [0.6, 0.8, 1.0],  # Bagging temperature
    'random_strength': [0.6, 0.8, 1.0],  # Random strength
}


catreg = CatBoostRegressor(random_seed=RANDOM_SEED, verbose=True)

catreg = GridSearchCV(catreg, cv=5, param_grid=param_grid, scoring='neg_mean_squared_error', verbose=True)

catreg.fit(X_train, y_train)

print(catreg.best_params_)

catreg = catreg.best_estimator_ """

# Como essa célula pode demorar muito, fizemos uma análise e percebemos que o fator que mais impactou foi um número alto de iterações:

catreg = CatBoostRegressor(iterations=9000, random_seed=RANDOM_SEED)

In [35]:
errors = rsme(catreg, X_train, y_train)

Learning rate set to 0.007553
0:	learn: 0.1687061	total: 4.47ms	remaining: 40.2s
1:	learn: 0.1678453	total: 9.25ms	remaining: 41.6s
2:	learn: 0.1669570	total: 13.2ms	remaining: 39.7s
3:	learn: 0.1660442	total: 17ms	remaining: 38.2s
4:	learn: 0.1651865	total: 21.5ms	remaining: 38.7s
5:	learn: 0.1643387	total: 25.6ms	remaining: 38.3s
6:	learn: 0.1635059	total: 30.2ms	remaining: 38.8s
7:	learn: 0.1627212	total: 34.2ms	remaining: 38.4s
8:	learn: 0.1618661	total: 39.5ms	remaining: 39.5s
9:	learn: 0.1610109	total: 44.4ms	remaining: 39.9s
10:	learn: 0.1602127	total: 48.6ms	remaining: 39.7s
11:	learn: 0.1593420	total: 53.3ms	remaining: 39.9s
12:	learn: 0.1585403	total: 58.9ms	remaining: 40.8s
13:	learn: 0.1577039	total: 62.9ms	remaining: 40.4s
14:	learn: 0.1568482	total: 67ms	remaining: 40.2s
15:	learn: 0.1560389	total: 71.7ms	remaining: 40.2s
16:	learn: 0.1552465	total: 76.8ms	remaining: 40.6s
17:	learn: 0.1544943	total: 80.6ms	remaining: 40.2s
18:	learn: 0.1537436	total: 84.3ms	remaining: 39

In [36]:
print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.05372561 0.05093175 0.04236544 0.03938487 0.04677446]
RMSE: 0.04664 +/- 0.00528


# **3. Extras**

Para testarmos algumas mais possibilidades que pareciam interessantes, devido ao alto número de features, utilizaremos o PCA para reduzir a dimensionalidade dos dados e testar novamente os modelos, além de outro tipo de encoding para as variáveis categóricas.

## A. PCA

In [52]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9999, random_state=RANDOM_SEED)

X_model_pca = pca.fit_transform(X_model)

X_model_pca.shape

(2872, 54)

In [53]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_model_pca, y, test_size=0.2, random_state=RANDOM_SEED)

Vamos testar uma regressão linear simples.

In [54]:
lr_pca = LinearRegression()

errors = rsme(lr_pca, X_train_pca, y_train_pca)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.057045   0.05202552 0.04637    0.04592573 0.0521302 ]
RMSE: 0.05070 +/- 0.00414


Percebemos que o PCA não se comportou bem com nosso dataset, pois para diminuir nosso RMSE do PCA, precisávamos aumentar o número de componentes, o que acaba perdindo o sentido, pois o PCA é utilizado para diminuir a dimensionalidade dos dados.

## B. Hash Encoding

In [56]:
from category_encoders import HashingEncoder

X_model_hash = HashingEncoder(cols=categorical_cols, n_components=64).fit_transform(X_model)

X_model_hash.shape

  elif pd.api.types.is_categorical_dtype(cols):


(2872, 118)

In [57]:
scaler = RobustScaler()
X_model_hash[numeric_cols] = scaler.fit_transform(X_model_hash[numeric_cols])

In [58]:
X_train_hash, X_test_hash, y_train_hash, y_test_hash = train_test_split(X_model_hash, y, test_size=0.2, random_state=RANDOM_SEED)

In [61]:
lasso_hash = Lasso(alpha=alpha, max_iter=50000)

errors = rsme(lasso_hash, X_train_hash, y_train_hash)

print(errors)
print(f'RMSE: {errors.mean():.5f} +/- {errors.std():.5f}')

[0.05770066 0.05237053 0.04598285 0.04511239 0.05392126]
RMSE: 0.05102 +/- 0.00480


Também não apresentou bons resultados.

# ***Escolhendo o melhor modelo***

Nosso filtro primário para escolher o melhor modelo será o RMSE, pois é a métrica de perda do nosso caso. Com isso, selecionaremos os dois melhores modelos que minimzaram melhor o RMSE.

Após isso, vamos utilizar um teste de hipótese para verificar se há diferença significativa entre os modelos. Caso haja, escolheremos o melhor modelo. Caso contrário, escolheremos o modelo mais simples.

Os modelos com os menores RMSE são:

1. Catboost
2. Lasso & Elastic Net

Como Lasso e Elastic Net empataram, vamos utilizar o Lasso no teste de hipótese pela sua capacidade de zerar features não importantes, o que ajuda muito nesse desafio com número alto de features.

In [68]:
# Refazendo os erros de cada modelo

errors_lasso = rsme(lasso, X_train, y_train)
errors_catreg = rsme(catreg, X_train, y_train)

Learning rate set to 0.007553
0:	learn: 0.1687061	total: 20.3ms	remaining: 3m 3s
1:	learn: 0.1678453	total: 28.2ms	remaining: 2m 7s
2:	learn: 0.1669570	total: 36.2ms	remaining: 1m 48s
3:	learn: 0.1660442	total: 41ms	remaining: 1m 32s
4:	learn: 0.1651865	total: 44.7ms	remaining: 1m 20s
5:	learn: 0.1643387	total: 50.3ms	remaining: 1m 15s
6:	learn: 0.1635059	total: 55.5ms	remaining: 1m 11s
7:	learn: 0.1627212	total: 60.4ms	remaining: 1m 7s
8:	learn: 0.1618661	total: 66ms	remaining: 1m 5s
9:	learn: 0.1610109	total: 71.1ms	remaining: 1m 3s
10:	learn: 0.1602127	total: 75.7ms	remaining: 1m 1s
11:	learn: 0.1593420	total: 82ms	remaining: 1m 1s
12:	learn: 0.1585403	total: 87.1ms	remaining: 1m
13:	learn: 0.1577039	total: 91.3ms	remaining: 58.6s
14:	learn: 0.1568482	total: 98.4ms	remaining: 58.9s
15:	learn: 0.1560389	total: 105ms	remaining: 58.9s
16:	learn: 0.1552465	total: 111ms	remaining: 58.6s
17:	learn: 0.1544943	total: 117ms	remaining: 58.4s
18:	learn: 0.1537436	total: 124ms	remaining: 58.4s


Para o t-teste, nossas hipóteses são:

* $H_0$: Não há diferença significativa entre os modelos.  
* $H_1$: Há diferença significativa entre os modelos, no caso, Lasso performar melhor

Além disso, nosso nível de significância será de 5%.

In [69]:
# Hora de realizar t-teste

from scipy.stats import ttest_ind

significance = 0.05

t, p = ttest_ind(errors_lasso, errors_catreg)

print(f"t = {t:.5f}")
print(f"p-value = {p:.5f}")

t = 0.02004
p-value = 0.98450


Como nosso p-value é muito maior que o nível de significância, não podemos rejeitar a hipótese nula. Portanto, não há diferença significativa entre os modelos. Logo, podemos fazer a predição com os dois modelos e também comparar os resultados finais com o set de teste.

In [70]:
# Treinando os modelos

lasso.fit(X_train, y_train)
catreg.fit(X_train, y_train)

Learning rate set to 0.007824
0:	learn: 0.1700679	total: 16.2ms	remaining: 2m 25s
1:	learn: 0.1691466	total: 26.2ms	remaining: 1m 57s
2:	learn: 0.1682139	total: 33.5ms	remaining: 1m 40s
3:	learn: 0.1672834	total: 50.7ms	remaining: 1m 53s
4:	learn: 0.1664040	total: 56.7ms	remaining: 1m 42s
5:	learn: 0.1654518	total: 61.4ms	remaining: 1m 32s
6:	learn: 0.1645840	total: 81.2ms	remaining: 1m 44s
7:	learn: 0.1637070	total: 86.7ms	remaining: 1m 37s
8:	learn: 0.1628190	total: 90.8ms	remaining: 1m 30s
9:	learn: 0.1620026	total: 94.8ms	remaining: 1m 25s
10:	learn: 0.1611253	total: 100ms	remaining: 1m 21s
11:	learn: 0.1602413	total: 104ms	remaining: 1m 17s
12:	learn: 0.1593829	total: 108ms	remaining: 1m 14s
13:	learn: 0.1585243	total: 112ms	remaining: 1m 11s
14:	learn: 0.1577202	total: 119ms	remaining: 1m 11s
15:	learn: 0.1568819	total: 123ms	remaining: 1m 8s
16:	learn: 0.1560240	total: 127ms	remaining: 1m 7s
17:	learn: 0.1552349	total: 134ms	remaining: 1m 6s
18:	learn: 0.1544187	total: 138ms	rem

<catboost.core.CatBoostRegressor at 0x1a5c6e07b20>

## A. Lasso

In [71]:
y_pred_lasso = lasso.predict(X_test)

RMSE_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))

print(f'RMSE_lasso: {RMSE_lasso:.5f}')

RMSE_lasso: 0.03991


In [73]:
error_percent_lasso = 100 * (10**RMSE_lasso - 1)
print(f'Average error is {error_percent_lasso:.2f}%')

Average error is 9.62%


## B. CatBoost

In [74]:
y_pred_catreg = catreg.predict(X_test)

RMSE_catreg = np.sqrt(mean_squared_error(y_test, y_pred_catreg))

print(f'RMSE_catreg: {RMSE_catreg:.5f}')

RMSE_catreg: 0.04316


In [75]:
error_percent_catreg = 100 * (10**RMSE_catreg - 1)
print(f'Average error is {error_percent_catreg:.2f}%')

Average error is 10.45%


# ***O Grande Vencedor: LASSO!***

Mesmo com o CatBoost tendo um desempenho melhor no cross-validate, o Lasso acabou performando bem melhor no conjunto de teste. Isso pode ser explicado por exemplo, pela simplificação que o modelo faz das features, automaticamente zerando as features que não são importantes para o dataset.

Vamos analisar o resultado final do Lasso e a importância das features:

In [76]:
# Atributos do Lasso

from pprint import pprint

pprint(lasso.get_params())

{'alpha': 8e-05,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 50000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}


In [81]:
coefs = pd.DataFrame({'coefs':lasso.coef_,'Positive':lasso.coef_ > 0}, index=X_train.columns)

# pega as features que não foram dropadas
# Inspiração: https://www.kaggle.com/code/leeclemmer/exploratory-data-analysis-of-housing-in-ames-iowa#Introduction


# Top 10 coeficientes positivos
coefs[coefs.coefs > 0].sort_values('coefs', ascending=False).head(10)

Unnamed: 0,coefs,Positive
Functional,0.086385,True
Gr.Liv.Area,0.055184,True
Sale.Condition_AdjLand,0.050496,True
Sale.Condition_Alloca,0.038401,True
Neighborhood_Crawfor,0.037591,True
Total.Overall,0.035213,True
Sale.Condition_Normal,0.033656,True
Exterior_BrkFace,0.03021,True
Sale.Type_New,0.02829,True
Central.Air_Y,0.023102,True


In [80]:
# Top 10 coeficientes negativos
coefs[coefs.coefs < 0].sort_values('coefs', ascending=True).head(10)

Unnamed: 0,coefs,Positive
House.Age,-0.040875,False
Condition_RoadsAndRailroad,-0.033464,False
Neighborhood_MeadowV,-0.026368,False
MS.Zoning_RM,-0.023308,False
Condition_Roads,-0.022847,False
Was.Completed,-0.021517,False
MS.SubClass_90,-0.017205,False
Neighborhood_SawyerW,-0.016409,False
Neighborhood_Edwards,-0.015549,False
Neighborhood_NWAmes,-0.014446,False


In [82]:
print(f'LASSO dropou {sum(coefs.coefs == 0)} of {coefs.shape[0]} features.')

LASSO dropou 53 of 184 features.


In [83]:
# Quais features foram dropadas?

coefs[coefs.coefs == 0].index

Index(['Land.Slope', 'Overall.Cond', 'Bsmt.Unf.SF', 'X2nd.Flr.SF',
       'Bsmt.Half.Bath', 'Open.Porch.SF', 'HasShed', 'Total.Floor.SF',
       'Total.SF', 'Has.Basement', 'Has.Garage', 'Has.Pool', 'Total.Rooms',
       'Garage.Size', 'MS.SubClass_80', 'MS.SubClass_Other', 'MS.Zoning_RL',
       'Lot.Config_FR3', 'Neighborhood_SWISU', 'Neighborhood_Sawyer',
       'Neighborhood_Timber', 'Bldg.Type_2fmCon', 'Bldg.Type_TwnhsE',
       'House.Style_1.5Unf', 'House.Style_1Story', 'House.Style_2.5Fin',
       'House.Style_2Story', 'House.Style_SFoyer', 'House.Style_SLvl',
       'Roof.Style_Hip', 'Foundation_CBlock', 'Foundation_Other',
       'BsmtFin.Type.2_LwQ', 'Electrical_FuseF', 'Electrical_FuseP',
       'Electrical_Mix', 'Garage.Type_Basment', 'Garage.Type_NoGarage',
       'Fence_GdWo', 'Fence_MnWw', 'Mo.Sold_11.0', 'Mo.Sold_12.0',
       'Mo.Sold_4.0', 'Mo.Sold_8.0', 'Mo.Sold_9.0', 'Yr.Sold_2008.0',
       'Sale.Condition_Partial', 'Exterior_HdBoard', 'Exterior_Other',
       'Ex