## Carga dos dados

In [2]:
#libs
import pandas as pd
import numpy as np

from os import listdir, chdir
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

np.random.seed(595)

In [3]:
#Carga dos dados
chdir('../input')
files = listdir()

df_train = pd.read_csv(files[0])
df_test = pd.read_csv(files[1])
assert(df_train['SalePrice'].isnull().sum() == 0)

In [4]:
train = df_train.copy()

## Pré-processamento

In [5]:
df_train.shape

(1460, 81)

In [6]:
#Colunas por tipo
cat_vars = train.select_dtypes(include='object').columns
num_vars = train.select_dtypes(include=['int','float']).columns
assert(len(cat_vars) + len(num_vars) == df_train.shape[1])

In [7]:
#Cardinalidade das variáveis categóricas
card_cat_vars = train[cat_vars].nunique().sort_values()
high_card_cat_vars = card_cat_vars[card_cat_vars.values > 7]
high_card_cat_vars

RoofMatl         8
HouseStyle       8
Condition2       8
SaleType         9
Condition1       9
Exterior1st     15
Exterior2nd     16
Neighborhood    25
dtype: int64

In [8]:
#Presença de valores NA
na_cat_vars = train[cat_vars].isnull().sum()
na_cat_vars = na_cat_vars[na_cat_vars > 0]

na_num_vars = train[num_vars].isnull().sum()
na_num_vars = na_num_vars[na_num_vars > 0]

In [9]:
print(na_cat_vars)
print(na_num_vars)

Alley           1369
MasVnrType         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64


### Etapas

* Retirar variáveis categóricas com muitos valores NA
* Retirar variáveis categóricas de alta cardinalidade
* Retirar observações de variáveis categóricas que possuem poucas linhas NA
* Substituir variáveis Garage:
        + HasGarage: Binário, 0 quando GarageFinish = NA
        + GarageFinish: Binário, 0  quando GarageFinish = NA OU 'Unf'
* Imputar constante zero nos registros NA da variável *MasVnrArea*
* Imputar mediana nos registros NA das variáveis *LotFrontage*
* Imputar a moda nas variáveis categóricas que possuem poucas linhas NA

In [10]:
#Retirar variáveis categóricas com muitos valores NA
drop_cols_1_mostly_missing = train[na_cat_vars[na_cat_vars > 500].index].columns

In [11]:
#Retirar variáveis categóricas de alta cardinalidade
drop_cols_2_high_card = train[high_card_cat_vars.index].columns

In [12]:
#Substituir as variáveis Garage
replace_cols_1_garage = train.filter(like='Garage')

In [13]:
#Imputar constante zero nos registros NA da variável 'MasVnrArea'
impute_cols_1_constant_zero = train[['MasVnrArea']]

In [14]:
#Variáveis a imputar median em caso de missing
impute_cols_2_median = train[['LotFrontage']]

In [15]:
#Aplicar Label Encoding nas demais variáveis categóricas
encode_cols_1 = train[na_cat_vars[na_cat_vars < 500].index]

## Transformers

In [16]:
class GarageTransform(BaseEstimator, TransformerMixin):   
    '''
    Tarefas:
    * Criar coluna HasGarage: 0 se GarageFinish == NA, 1 caso contrário
    * Criar coluna GarageStatus: 0 se GarageFinish in [NA,'Unf'], 1 caso contrário
    * Remover demais colunas
    '''
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, dataFrame):
        output = dataFrame.copy()
        cols_to_drop = output.filter(like='Garage').columns
        
        #Criar coluna HasGarage
        output['HasGarage'] = output['GarageType'].isna()
        output['HasGarage'] *= 1
        
        #Criar coluna GarageStatus
        output['GarageStatus'] = 1
        index_garagefinish_isna = output[output['GarageFinish'].isna()].index
        index_garagefinish_unf = output[output['GarageFinish'] == 'Unf'].index
        output.loc[index_garagefinish_isna, 'GarageStatus'] = 0
        output.loc[index_garagefinish_unf, 'GarageStatus'] = 0

        #Remover demais colunas
        return output.drop(columns=cols_to_drop)
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

In [17]:
class ConstantImputer(BaseEstimator, TransformerMixin):
    '''
    Tarefas:
    * Imputar um valor constante
    '''
    def __init__(self, constant=0):
        self.constant = constant
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, dataFrame):
        output = dataFrame.copy()
        
        for col in dataFrame.columns:
            output[col] = SimpleImputer(strategy='constant', fill_value=self.constant).fit_transform(output[[col]])
            
        return output
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
            

In [18]:
class MedianImputer(BaseEstimator, TransformerMixin):
    '''
    Tarefas:
    * Imputar a mediana das variáveis numéricas
    '''
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, dataFrame):
        output = dataFrame.copy()
        
        for col in dataFrame.columns:
            output[col] = SimpleImputer(strategy='median').fit_transform(output[[col]])
            
        return output
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
            

In [19]:
class MultiLabelEncoder(BaseEstimator, TransformerMixin):
    '''
    Tarefas:
    * Codificar variáveis categóricas em numéricas
    '''
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, dataFrame):
        output = dataFrame.copy()
        
        for col in dataFrame.columns:
            output[col] = LabelEncoder().fit_transform(output[[col]].astype(str))
            
        return output
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
            

In [21]:
#Train/test Split
X_train, X_test, y_train, y_test = train_test_split(df_train.drop('SalePrice', axis=1), df_train['SalePrice'], train_size=0.7)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1021, 80) (439, 80) (1021,) (439,)


## Transformers, Pipeline e Model

In [22]:
#Preprocess Transformer
ct_prep = ColumnTransformer(transformers=[('Drop_missing_cols', 'drop', drop_cols_1_mostly_missing),
                                          ('Drop_high_card_cols', 'drop', drop_cols_2_high_card),
                                          ('Transmute_garage', GarageTransform(), replace_cols_1_garage.columns),
                                          ('Mutate_constant', ConstantImputer(constant=0), impute_cols_1_constant_zero.columns),
                                          ('Mutate_median', MedianImputer(), impute_cols_2_median.columns),
                                          ('Encode_labels', MultiLabelEncoder(), encode_cols_1.columns)],
                           remainder='drop')

In [28]:
#Model
xgb_model = XGBRegressor(objective='reg:squarederror',
                         booster='gbtree',
                         n_jobs=3,
                         verbose=3)

xgb_params = {'n_estimators': [200, 400, 600, 800, 1000],
              'gamma': [0.1, 0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1],
              'max_depth': [3, 4, 5, 6, 7, 8],
              'learning_rate': [0.01, 0.1, 1]}

xgb_random_cv = RandomizedSearchCV(xgb_model,
                                   param_distributions=xgb_params,
                                   cv=5,
                                   n_iter=100,
                                   scoring='neg_mean_squared_error')

In [29]:
pipe_model = Pipeline(steps=[('Prep', ct_prep),
                             ('Model', xgb_random_cv)])
pipe_model.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


Pipeline(memory=None,
     steps=[('Prep', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('Drop_missing_cols', 'drop', Index(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')), ('Drop_high_card_cols', 'drop', Index(['R...=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0))])

In [32]:
preds = pipe_model.predict(X_test)
score = np.sqrt(mean_squared_error(y_test, preds))
score

  y = column_or_1d(y, warn=True)


51773.995320366506

Adicionar novas features piorou bastante o resultado do modelo preditivo em relação aos resultados obtidos no Kaggle.

Vou gerar um arquivo com as previsões e verificar o resultado na plataforma.

### Submission file

In [33]:
chdir('../output')
submission_preds = pipe_model.predict(df_test)
df_submit = pd.DataFrame({'Id': df_test.Id,
                          'SalePrice': submission_preds})
df_submit.to_csv('01-pipeline-model_2.csv', index=False)

  y = column_or_1d(y, warn=True)


Score model_1: 33994