## Análise Inicial 

### Importação de bibliotecas, reconhecimento de data base e tratando valores faltantes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, norm
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
%matplotlib inline 

import warnings
warnings.filterwarnings(action = 'ignore')

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Criando um DataFrame sumarizando as colunas, seus valores faltantes e o percentual dos valores faltantes 
#sobre a totalidade do DataFrame original.

missings = pd.DataFrame(train.isnull().sum(), columns = ['total'])
missings['name'] = missings.index
missings['%'] = missings['total']/train.shape[0]
missings = missings[missings['total']>0].sort_values('total', ascending = False)
missings

Unnamed: 0,total,name,%
PoolQC,1453,PoolQC,0.995205
MiscFeature,1406,MiscFeature,0.963014
Alley,1369,Alley,0.937671
Fence,1179,Fence,0.807534
FireplaceQu,690,FireplaceQu,0.472603
LotFrontage,259,LotFrontage,0.177397
GarageType,81,GarageType,0.055479
GarageYrBlt,81,GarageYrBlt,0.055479
GarageFinish,81,GarageFinish,0.055479
GarageQual,81,GarageQual,0.055479


In [5]:
missing_col = missings.name[:5].tolist()
missing_col

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

In [6]:
train.drop(missing_col, inplace = True, axis= 1)

In [7]:
train.drop("Id", axis = 1, inplace= True)

In [8]:
#DataFrame de treino limpo
train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,112,0,0,0,0,4,2010,WD,Normal,142125


## Baseline

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

In [10]:
#Separando base de dados entre X e y

X = train.drop("SalePrice", axis = 1)
y = train.SalePrice

In [11]:
# Segregando X e y de teste e treino

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=99)

In [12]:
#Pipeline especificando estratégia para imputar valores faltantes das colunas pela mediana 
fill_null = Pipeline(steps=[('imputer',SimpleImputer(strategy='median'))
                           ])

#Pré processamento utilizando o processamento anterior para preencher os valores faltantes de colunas numéricas 
baseline = ColumnTransformer(transformers=[('num', fill_null, X_train.select_dtypes("number").columns.tolist())
                                         ])

#Pipeline do modelo com os steps anteriores e especificando o modelo como regressão linear 
baseline_pipe = Pipeline(steps=[('preprocessor', baseline),
                                ('modelo',LinearRegression())])

In [13]:
#Fit do modelo nos dados de treino
baseline_pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                               

In [14]:
#Especificando as metodologias de pontuação do modelo

scoring = {"R2": "r2", "NEG MAE":"neg_mean_absolute_error"}

In [15]:
# KFold com 5 splits 
kfold = KFold(n_splits = 5,
             shuffle = True,
             random_state = 99)

In [16]:
# Mostrando os resultados do modelo base: 5 splits sendo avaliados pelas metodologias de R2 e 
# Negative Mean Absolute Error (para facilitar o ranking de algoritmos quando utilizarmos o GridSearchCV, já que queremos
# minimizar o MAE)

results = pd.DataFrame(cross_validate(baseline_pipe, X=X_train, y = y_train,
                                     cv = kfold, scoring = scoring, n_jobs = -1)
                      )

results

Unnamed: 0,fit_time,score_time,test_R2,test_NEG MAE
0,0.020734,0.010487,0.830673,-21192.592003
1,0.024338,0.010487,0.797367,-23701.506814
2,0.03494,0.008192,0.812551,-22664.022527
3,0.023866,0.009614,0.786621,-24616.963313
4,0.032217,0.008837,0.456338,-25646.771428


In [17]:
#Média dos resultados dos 5 splits
results.mean()

fit_time            0.027219
score_time          0.009524
test_R2             0.736710
test_NEG MAE   -23564.371217
dtype: float64

In [18]:
#Erro do modelo quando testamos o modelo com a base de dados de teste
mean_absolute_error(y_test, baseline_pipe.predict(X_test))

22048.248578916726

## Modelo de Regressão 

### Tratamento adicional de dados e construção de um modelo mais complexo, com mais etapas

In [19]:
conda install -c conda-forge category_encoders

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [20]:
from category_encoders import OneHotEncoder

In [21]:
# Essas variáveis, apesar de serem numéricas, indicam categorias, portanto vamos tratar elas como strings que serão
# transformadas em dummys 

print(X_train.MSSubClass.unique())
print(X_train.YrSold.unique())
print(X_train.MoSold.unique())

[ 20 180  70 190  60  85  90  50 160 120  30  45  75  80  40]
[2008 2009 2010 2006 2007]
[10  6  8  5  2  4  3  7 12  9 11  1]


In [22]:
# Transformação em strings em ambas bases de dados

X_train.MSSubClass = X_train.MSSubClass.astype(str)
X_train.YrSold = X_train.YrSold.astype(str)
X_train.MoSold = X_train.MoSold.astype(str)

X_test.MSSubClass = X_train.MSSubClass.astype(str)
X_test.YrSold = X_train.YrSold.astype(str)
X_test.MoSold = X_train.MoSold.astype(str)

In [23]:
# Criando pipelines para transformação de dados nas colunas
num_transformer = Pipeline(steps=[(
    'imputer', SimpleImputer(strategy = 'median'))
])

cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value = 'MISSING'))
                                 ])

full_preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, X_train.select_dtypes("number").columns.tolist()),
    ('cat', cat_transformer, X_train.select_dtypes("object").columns.tolist())
])

In [24]:
#Pipeline do modelo com os passos anteriormente feitos, transformação de dummys e designação de modelo linear
pipe = Pipeline(steps = [
    ('preprocessor', full_preprocessor),
    ('ohe', OneHotEncoder()),
    ('model', LinearRegression())
])

In [25]:
#Fit do modelo
pipe.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'LowQualFinSF',
                                                   'GrLivArea', 'BsmtFullBath',
                                                   'BsmtHa...
                                  

In [26]:
pipe_results = pd.DataFrame(cross_validate(pipe, X=X_train, y=y_train,
                                          cv = kfold, scoring = scoring,
                                          n_jobs = -1))

pipe_results

Unnamed: 0,fit_time,score_time,test_R2,test_NEG MAE
0,24.272797,5.626673,0.848706,-20416.29081
1,24.453596,5.518919,0.815244,-21547.303312
2,24.173443,5.734457,0.827475,-22756.661869
3,22.640637,6.904807,0.82992,-22843.181273
4,23.190206,6.77248,0.790919,-23703.466466


In [27]:
pipe_results.mean()

fit_time           23.746136
score_time          6.111467
test_R2             0.822453
test_NEG MAE   -22253.380746
dtype: float64

In [28]:
mean_absolute_error(y_test,pipe.predict(X_test))

21697.763402194145

## Novas variáveis

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin

In [30]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        return self
    
    def transform(self, X, y=None):
        X=X.copy()
        X.SqFtPerRoom = X.GrLivArea / (X.TotRmsAbvGrd + X.FullBath + X.HalfBath + X.KitchenAbvGr)
        X.Total_Home_Quality = X.OverallQual + X.OverallCond
        X.Total_Bathrooms = X.FullBath + (0.5 * X.HalfBath) + X.BsmtFullBath + (0.5 * X.BsmtHalfBath)
        X.HighQualSF = X['1stFlrSF'] + X['2ndFlrSF']
        
        skewed_vars = X.select_dtypes('number').apply(lambda x: skew(x)).sort_values(ascending=False)
        high_skew = skewed_vars[skewed_vars > 0.5].index.tolist()
        X[high_skew] = np.log1p(X[high_skew])
        
        return X

In [31]:
final_pipe = Pipeline(steps=[
    
    ('feature_engineering', FeatureEngineering()),
    ('preprocessor', full_preprocessor),
    ('ohe', OneHotEncoder()),
    ('model', LinearRegression())
])

In [32]:
final_pipe.fit(X_train, y_train)

Pipeline(steps=[('feature_engineering', FeatureEngineering()),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'Low...
                                                   'RoofMatl', 'Exterior1st',
                                           

In [33]:
final_pipe_results = pd.DataFrame(cross_validate(final_pipe, X=X_train, y=y_train, 
                                                cv = kfold, scoring = scoring, n_jobs = -1))

final_pipe_results

Unnamed: 0,fit_time,score_time,test_R2,test_NEG MAE
0,25.229573,6.568529,0.852196,-19876.860048
1,25.279397,6.416049,0.808073,-21940.521696
2,24.966045,6.646132,0.828696,-22487.582702
3,24.665187,6.732503,0.798932,-25951.630409
4,24.969108,6.743968,0.789676,-23854.432027


In [34]:
final_pipe_results.mean()

fit_time           25.021862
score_time          6.621436
test_R2             0.815514
test_NEG MAE   -22822.205376
dtype: float64

In [35]:
mean_absolute_error(y_test, final_pipe.predict(X_test))

21670.682892064433

## Feature Selection

In [36]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

In [37]:
feature_selection = SelectFromModel(estimator=Lasso())

In [38]:
feature_pipe = Pipeline(steps=[
    
    ('feature_engineering', FeatureEngineering()),
    ('preprocessor', full_preprocessor),
    ('ohe', OneHotEncoder()),
    ('selector', feature_selection),
    ('model', LinearRegression())
])

In [39]:
feature_pipe.fit(X_train,y_train)

Pipeline(steps=[('feature_engineering', FeatureEngineering()),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'Low...
                                                   'ExterQual', 'ExterCond',
                                            

In [40]:
feature_pipe_results = pd.DataFrame(cross_validate(feature_pipe, X=X_train, y=y_train,
                                                  scoring = scoring, cv=kfold, n_jobs=-1)) 

In [41]:
feature_pipe_results

Unnamed: 0,fit_time,score_time,test_R2,test_NEG MAE
0,35.360896,6.461387,0.857264,-19851.974389
1,35.828838,6.091645,0.816916,-22366.548211
2,35.378609,6.241124,0.814463,-24168.058041
3,35.43087,6.248994,0.834081,-24689.101605
4,35.717994,6.338288,0.737959,-26482.138957


In [42]:
feature_pipe_results.mean()

fit_time           35.543441
score_time          6.276288
test_R2             0.812137
test_NEG MAE   -23511.564241
dtype: float64

In [43]:
mean_absolute_error(y_test, feature_pipe.predict(X_test))

24903.758709406193

## Gradient Boosting

In [44]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import roc_auc_score

In [45]:
gradient_pipe = Pipeline(steps=[
    
    ('feature_engineering', FeatureEngineering()),
    ('preprocessor', full_preprocessor),
    ('ohe', OneHotEncoder()),
    ('model', GradientBoostingRegressor())
])

In [46]:
gradient_pipe.fit(X_train,y_train)

Pipeline(steps=[('feature_engineering', FeatureEngineering()),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'Low...
                                                   'RoofMatl', 'Exterior1st',
                                           

In [47]:
kfold_grad = KFold(n_splits = 10, shuffle = True)

In [48]:
gradient_results = pd.DataFrame(cross_validate(gradient_pipe, X_train, y_train, 
                                              cv = kfold_grad, scoring = scoring,
                                              n_jobs = -1))

In [49]:
gradient_results.mean()

fit_time           52.075922
score_time          6.795347
test_R2             0.820734
test_NEG MAE   -21865.083238
dtype: float64

In [50]:
mean_absolute_error(y_test, gradient_pipe.predict(X_test))

21330.23808230684

#### Tuning hyperparameters

In [51]:
from sklearn.model_selection import GridSearchCV

In [52]:
param = {'n_estimators': [300, 250, 200, 150, 100], 'learning_rate': [0.2, 0.15, 0.10, 0.05]}

In [53]:
gradient_pipe_gs = Pipeline(steps=[
    
    ('feature_engineering', FeatureEngineering()),
    ('preprocessor', full_preprocessor),
    ('ohe', OneHotEncoder()),
    ('model', GridSearchCV(GradientBoostingRegressor(), param_grid = param, scoring = 'r2'))
])

In [54]:
gradient_pipe.fit(X_train, y_train)

Pipeline(steps=[('feature_engineering', FeatureEngineering()),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   '2ndFlrSF', 'Low...
                                                   'RoofMatl', 'Exterior1st',
                                           

In [55]:
gradient_gs_results = pd.DataFrame(cross_validate(gradient_pipe_gs, X_train, y_train, 
                                              cv = kfold_grad, scoring = scoring,
                                              n_jobs = -1))

In [56]:
gradient_gs_results.mean()

fit_time         2273.122002
score_time          4.436743
test_R2             0.808832
test_NEG MAE   -22438.429853
dtype: float64

## Resumo modelos 

In [57]:
# Modelo 01
pipe_results.mean()

fit_time           17.450440
score_time          3.523346
test_R2             0.822453
test_NEG MAE   -22253.380746
dtype: float64

In [58]:
# Modelo 02
final_pipe_results.mean()

fit_time           17.780241
score_time          3.496292
test_R2             0.815514
test_NEG MAE   -22822.205376
dtype: float64

In [59]:
mean_absolute_error(y_test, final_pipe.predict(X_test))

21670.682892064433

In [60]:
# Modelo 03
feature_pipe_results.mean()

fit_time           28.330832
score_time          3.559130
test_R2             0.812137
test_NEG MAE   -23511.564241
dtype: float64

In [61]:
mean_absolute_error(y_test, feature_pipe.predict(X_test))

24903.758709406193

In [66]:
test.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [67]:
X_test.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF'

In [68]:
teste = test[['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition']]

In [69]:
resultado_enviar = gradient_pipe.predict(teste)

In [70]:
resultado_enviar

array([114725.10840016, 145935.06259905, 179884.3019301 , ...,
       149299.55195459, 123942.72479109, 202417.22302475])

In [71]:
submission = pd.DataFrame(data=resultado_enviar, index = test["Id"], columns = ['SalePrice'])
submission.index.names=['Id']
submission

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,114725.108400
1462,145935.062599
1463,179884.301930
1464,195183.414628
1465,199468.343043
...,...
2915,98130.388986
2916,109335.685765
2917,149299.551955
2918,123942.724791


In [72]:
submission.to_csv('Submission_grad.csv')