# Preparation

In [3]:
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from feature_engine.imputation import CategoricalImputer
from sklearn.base import BaseEstimator,TransformerMixin

In [4]:
# load train and test data
train_raw = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv", index_col='Id')
test_raw = pd.read_csv("./house-prices-advanced-regression-techniques/test.csv", index_col='Id')
print("The shape of train: {}".format(train_raw.shape))
print("The shape of test: {}".format(test_raw.shape))

The shape of train: (1460, 80)
The shape of test: (1459, 79)


# Check Data

## Tools for showing missing data

In [5]:
## getColInfo returns a table listing each column's name, dtype and number of unique values.
## Use the `dtype` and `cardinality_threshold` parameters to restrict the table to the columns you want.
## It is handy for checking how many categories a column has when choosing
## a method for encoding categorical variables.
def getColInfo(train, dtype=None, cardinality_threshold=0):
    """Summarise the columns of a DataFrame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are described.
    dtype : optional
        When given, only rows whose 'Dtypes' entry compares equal to this
        value are kept (e.g. ``'object'``).
    cardinality_threshold : int, default 0
        Only columns with at least this many unique values are kept.

    Returns
    -------
    pandas.DataFrame
        One row per retained column with the fields 'Columns Names',
        'Dtypes' and 'Unique Value Numbers', sorted by ascending number of
        unique values.
    """
    res = pd.DataFrame({'Columns Names': train.columns,
                        'Dtypes': [train[col].dtype for col in train.columns],
                        'Unique Value Numbers': [train[col].nunique() for col in train.columns]})
    # identity test: `dtype != None` relies on the argument's own __ne__
    if dtype is not None:
        res = res.loc[res['Dtypes'] == dtype]
    res = res.loc[res['Unique Value Numbers'] >= cardinality_threshold]
    return res.sort_values(by=['Unique Value Numbers'], axis=0)

## getMissingData lists every column that contains missing values, with its dtype, the percentage of missing entries and the number of unique values
def getMissingData(train):
    """Report the columns of ``train`` that contain missing values.

    Returns a DataFrame indexed by column name, ordered by descending
    missing percentage, with the fields 'Dtypes', 'Missing Ratio'
    (a percentage, 0-100) and 'Unique Value Numbers'.  When no column has
    missing values a plain message string is returned instead.
    """
    na_percent = (train.isnull().sum() / len(train)) * 100
    # discard the columns that are fully populated, then rank the rest
    fully_populated = na_percent[na_percent == 0].index
    na_percent = na_percent.drop(fully_populated).sort_values(ascending=False)
    affected = na_percent.index
    summary = pd.DataFrame({'Dtypes': [train[name].dtype for name in affected],
                            'Missing Ratio': na_percent,
                            'Unique Value Numbers': train[affected].nunique()})
    if summary.shape[0] != 0:
        return summary
    return "no missing vlaue"

In [6]:
getColInfo(train_raw, dtype='object', cardinality_threshold=10)

Unnamed: 0,Columns Names,Dtypes,Unique Value Numbers
22,Exterior1st,object,15
23,Exterior2nd,object,16
11,Neighborhood,object,25


In [7]:
ob = getMissingData(train_raw)
type(ob.loc[ob['Missing Ratio']>5].index)

pandas.core.indexes.base.Index

# Preprocessing

## Split Training Data

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(train_raw.drop('SalePrice', axis=1), 
                                                      train_raw['SalePrice'], 
                                                      test_size=0.1, 
                                                      random_state=99)

In [9]:
X_train.shape

(1314, 79)

## Delete Unwanted Columns 

### Deletion Strategy

In [10]:
## define a class to filter unwanted columns
class ColumnFilter(BaseEstimator, TransformerMixin):
    """Drop columns whose percentage of missing values reaches a threshold.

    Parameters
    ----------
    threshold : float, default 0
        Missing-value percentage (0-100).  Columns that contain missing
        values and whose missing percentage is ``>= threshold`` are dropped.
        Columns without any missing value are never dropped, even when the
        threshold is 0.
    """

    def __init__(self, threshold=0):
        super().__init__()
        self.threshold = threshold
        # names of the columns selected for removal by the last call to fit
        self.col_names_deleted = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` have to be removed; returns ``self``."""
        missing_df = self.get_missing_data(X, self.threshold)
        # kept as a list so the attribute has the same type before and after fit
        self.col_names_deleted = list(missing_df.index)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the columns recorded by ``fit``."""
        return X.drop(columns=self.col_names_deleted)

    def get_missing_data(self, X, threshold=None):
        """Describe the columns of ``X`` whose missing percentage is ``>= threshold``.

        Parameters
        ----------
        X : pandas.DataFrame
        threshold : float, optional
            Missing-value percentage cut-off; falls back to ``self.threshold``
            when omitted.

        Returns
        -------
        pandas.DataFrame
            Indexed by column name, with the fields 'Dtypes', 'Missing Ratio'
            and 'Unique Value Numbers', ordered by descending missing ratio.
        """
        # the argument used to be ignored in favour of self.threshold
        if threshold is None:
            threshold = self.threshold
        all_data_na = (X.isnull().sum() / len(X)) * 100
        # fully populated columns are removed before the threshold is applied
        all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
        all_data_na_dtypes = [X[col].dtype for col in all_data_na.index]
        nuniques = X[all_data_na.index].nunique()
        missing_data = pd.DataFrame({'Dtypes': all_data_na_dtypes,
                                     'Missing Ratio': all_data_na,
                                     'Unique Value Numbers': nuniques})
        return missing_data.loc[missing_data['Missing Ratio'] >= threshold]
    

In [11]:
# test
# drop every column whose missing ratio is at least 5%
cf = ColumnFilter(5)
X_train_transformed = cf.fit_transform(X_train)
X_train_transformed.shape


(1314, 68)

In [12]:
X_train_transformed.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
147,30,RM,6120,Pave,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,...,0,0,0,0,0,0,11,2009,WD,Normal
1237,160,RL,2628,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,27,0,0,0,0,0,6,2010,WD,Normal
799,60,RL,13518,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,78,0,0,0,0,0,7,2009,New,Partial
254,80,RL,9350,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,0,0,10,2007,CWD,Normal
274,20,RL,9600,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,0,0,4,2009,WD,Normal


## Impute Missing Values

### Imputation Strategy

For quantitative (numeric) values, impute the mean value if there is no specified requirement.
For categorical values, impute the mode value if there is no specified requirement.

### Realization

In [13]:
## Below is a class you can use to realize customized imputation
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
class MyImputer(BaseEstimator, TransformerMixin):
    """Impute missing values column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the mean of the column.
    """

    def __init__(self):
        """The imputer has no hyper-parameters."""

    @staticmethod
    def _fill_value(column):
        """Return the replacement value learned from a single column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # an object column without any observed value has no mode:
            # fall back to NaN instead of failing on index[0]
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``; returns ``self``."""
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the learned values."""
        return X.fillna(self.fill)


In [14]:
# test
# impute X_train
my_imputer = MyImputer()
X_train_imputed = my_imputer.fit_transform(X_train_transformed)
# check missing data after imputation
getMissingData(X_train_imputed)

'no missing vlaue'

In [15]:
X_train_imputed.shape

(1314, 68)

## Encode Categorical Variables

### Encoding Strategy

We will encode all the categorical data using one-hot encoding with a specified threshold of cardinality.

In [16]:
class MyEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the object-dtype columns of a DataFrame.

    Only object columns whose number of unique values is strictly greater
    than ``cardinality_threshold`` are encoded; every other column is passed
    through unchanged.
    """

    def __init__(self, cardinality_threshold=0):
        self.cardinality_threshold=cardinality_threshold
        self.cat_cols = []
        self.one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Select the columns to encode and fit the one-hot encoder on them."""
        # candidates: every object-dtype column
        self.cat_cols = list(X.columns[X.dtypes == 'O'])
        # keep only the candidates above the cardinality threshold
        cardinality = X[self.cat_cols].nunique()
        self.cat_cols = list(cardinality.index[cardinality > self.cardinality_threshold])
        self.one_hot_encoder.fit(X[self.cat_cols])
        return self

    def transform(self, X, y=None):
        """Replace the selected columns of ``X`` by their one-hot indicators."""
        encoded_values = self.one_hot_encoder.transform(X[self.cat_cols]).toarray()
        encoded_names = self.one_hot_encoder.get_feature_names_out()
        encoded_frame = pd.DataFrame(encoded_values, index=X.index, columns=encoded_names)
        # the original categorical columns are superseded by the indicators
        remaining = X.drop(self.cat_cols, axis=1)
        return pd.concat([remaining, encoded_frame], axis=1)


In [21]:
my_encoder= MyEncoder()
my_encoder.fit(X_train)
X_encoded = my_encoder.transform(X_train)
X_encoded.shape

(1314, 302)

# Building Pipeline

In [294]:
pipe_ranf = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('model_ranf', RandomForestRegressor())]) 


In [296]:
grid_params_ranf = {
    'ColumnFilter_ranf__threshold':[5],
    'model_ranf__n_estimators':[500],
    'model_ranf__max_depth':[12,14,16],
    'model_ranf__criterion':['squared_error'],
    'model_ranf__min_samples_leaf':[5,10,20],
    'model_ranf__oob_score':[True],
    'model_ranf__random_state':[2021]
}
ranf_cv_tune = GridSearchCV(estimator=pipe_ranf,
            param_grid=grid_params_ranf,
            scoring='neg_mean_squared_error',
            cv=5)


In [297]:
# log transform y
ranf_cv_tune.fit(X_train, np.log1p(y_train))





GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ColumnFilter_ranf', ColumnFilter()),
                                       ('MyImputer_ranf', MyImputer()),
                                       ('MyEncoder_ranf', MyEncoder()),
                                       ('model_ranf',
                                        RandomForestRegressor())]),
             param_grid={'ColumnFilter_ranf__threshold': [5],
                         'model_ranf__criterion': ['squared_error'],
                         'model_ranf__max_depth': [12, 14, 16],
                         'model_ranf__min_samples_leaf': [5, 10, 20],
                         'model_ranf__n_estimators': [500],
                         'model_ranf__oob_score': [True],
                         'model_ranf__random_state': [2021]},
             scoring='neg_mean_squared_error')

In [306]:
ranf_best = ranf_cv_tune.best_estimator_
pred_valid = ranf_best.predict(X_valid)

In [307]:
def MSE(y_valid,y_pred):
    """Return the mean of the squared element-wise differences of the two inputs."""
    squared_errors = np.square(np.subtract(y_valid, y_pred))
    return squared_errors.mean()

In [308]:
MSE(np.log1p(y_valid), pred_valid)

0.015168034385216899

In [23]:
# test: run the preprocessing steps on their own so that the forest can be
# tuned directly on the transformed matrix
pipe_ranf2 = Pipeline([('ColumnFilter_ranf', ColumnFilter(5)),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),]) 

# bound to `X_transformed`, the name the following cells read
# (it was misspelled `X_tranransformed`, which left `X_transformed` undefined)
X_transformed = pipe_ranf2.fit_transform(X_train)

ranf2 = RandomForestRegressor()

In [None]:
grid_params_ranf2 = {
    'n_estimators':[500],
    'max_depth':[4,6,8,10,12,14],
    'criterion':['squared_error'],
    'min_samples_leaf':[5,10,20],
    'oob_score':[True],
    'random_state':[2021]
}
grid2 = GridSearchCV(estimator=ranf2,
            param_grid=grid_params_ranf2,
            scoring='neg_mean_squared_error',
            cv=5)


grid2.fit(X_transformed.values, np.log1p(y_train))

In [None]:
grid2.best_estimator_

In [None]:
ranf_best = RandomForestRegressor(
    n_estimators=500,
    max_depth=14,
    criterion='squared_error',
    min_samples_leaf=5,
    oob_score=True,
    random_state=2021
)
ranf_best.fit(X_transformed.values,  np.log1p(y_train))

In [None]:
X_valid_transformed = pipe_ranf2.transform(X_valid)
pred_value = ranf_best.predict(X_valid_transformed.values)

In [None]:
MSE_valid = np.square(np.subtract(np.log1p(y_valid),pred_value)).mean();MSE_valid

In [56]:
# define transform function for y
import math
def trans_Y(Y):
    """Map log1p-scaled predictions back to the original scale.

    The models in this notebook are fitted on ``np.log1p(y)``; this applies
    the inverse, ``exp(y) - 1``, to every element of ``Y``.

    Parameters
    ----------
    Y : iterable of float
        Values on the log1p scale.

    Returns
    -------
    list of float
        The back-transformed values, in the same order.
    """
    # expm1 is the exact inverse of log1p and stays accurate for small y,
    # where e**y - 1 cancels to 0.0
    return [math.expm1(y) for y in Y]

# Prediction

In [None]:
test_raw_transformed = pipe_ranf2.transform(test_raw)
pred_test = ranf_best.predict(test_raw_transformed.values)

In [None]:
pred_test_reversed = trans_Y(pred_test)

In [None]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission.csv', index=False)

# XGBoost

## Preprocessing

In [279]:
import xgboost as xgb
pipe_xgb = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('XGBoost', xgb.XGBRegressor())]) 

## Training

In [284]:
grid_params_xgb = {
    'ColumnFilter_ranf__threshold':[5],
    'XGBoost__n_estimators':[200,500],
    'XGBoost__max_depth':[2,4],
    'XGBoost__min_child_weight': [5,8],
    'XGBoost__learning_rate': [0.1,0.05],
    'XGBoost__gamma':[0.01,0.1],
    'XGBoost__reg_lambda': [0.1,0.5,2,5],
    'XGBoost__random_state': [2022]
}
xgb_cv_tune = GridSearchCV(estimator=pipe_xgb,
            param_grid=grid_params_xgb,
            scoring='neg_mean_squared_error',
            cv=5)

In [285]:
xgb_cv_tune.fit(X_train,np.log1p(y_train))

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ColumnFilter_ranf', ColumnFilter()),
                                       ('MyImputer_ranf', MyImputer()),
                                       ('MyEncoder_ranf', MyEncoder()),
                                       ('XGBoost',
                                        XGBRegressor(base_score=None,
                                                     booster=None,
                                                     colsample_bylevel=None,
                                                     colsample_bynode=None,
                                                     colsample_bytree=None,
                                                     enable_categorical=False,
                                                     gamma=None, gpu_id=None,
                                                     importance_type=None,
                                                     interaction_constr...
                                   

In [287]:
xgb_best = xgb_cv_tune.best_estimator_;xgb_best

Pipeline(steps=[('ColumnFilter_ranf', ColumnFilter(threshold=5)),
                ('MyImputer_ranf', MyImputer()),
                ('MyEncoder_ranf', MyEncoder()),
                ('XGBoost',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, enable_categorical=False,
                              gamma=0.01, gpu_id=-1, importance_type=None,
                              interaction_constraints='', learning_rate=0.1,
                              max_delta_step=0, max_depth=2, min_child_weight=5,
                              missing=nan, monotone_constraints='()',
                              n_estimators=500, n_jobs=4, num_parallel_tree=1,
                              predictor='auto', random_state=2022, reg_alpha=0,
                              reg_lambda=5, scale_pos_weight=1, subsample=1,
                              tree_method='exact', validat

In [291]:
pred_xgb = xgb_best.predict(X_valid)

In [293]:
MSE_valid = MSE(np.log1p(y_valid), pred_xgb);MSE_valid

0.010476437557894843

## Prediction

In [148]:
# xgb_best is the fitted pipeline (preprocessing + XGBoost) chosen by the grid
# search, so it is given the raw test frame; `xgb_regressor_refit` is not
# defined anywhere in this notebook
pred_test = xgb_best.predict(test_raw)
pred_test_reversed = trans_Y(pred_test)

In [151]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission_xgboost.csv', index=False)

# Lasso

In [155]:
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso

In [264]:
pipe_lasso = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('RobustScaler', RobustScaler()),
                      ('Lasso', Lasso())
                      ]) 


In [265]:
grid_params_lasso = {
    'ColumnFilter_ranf__threshold':[5],
    'Lasso__alpha':[0.01,0.05,0.1,1,2],
    'Lasso__random_state':[2021],
    'Lasso__selection':['random'],
    'Lasso__max_iter': [1000,1500,2000]
}
lasso_cv_tune = GridSearchCV(estimator=pipe_lasso,
            param_grid=grid_params_lasso,
            scoring='neg_mean_squared_error',
            cv=5)


In [266]:
lasso_cv_tune.fit(X_train, np.log1p(y_train))

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ColumnFilter_ranf', ColumnFilter()),
                                       ('MyImputer_ranf', MyImputer()),
                                       ('MyEncoder_ranf', MyEncoder()),
                                       ('RobustScaler', RobustScaler()),
                                       ('Lasso', Lasso())]),
             param_grid={'ColumnFilter_ranf__threshold': [5],
                         'Lasso__alpha': [0.01, 0.05, 0.1, 1, 2],
                         'Lasso__max_iter': [1000, 1500, 2000],
                         'Lasso__random_state': [2021],
                         'Lasso__selection': ['random']},
             scoring='neg_mean_squared_error')

In [267]:
lasso_best = lasso_cv_tune.best_estimator_

In [268]:
lasso_best.fit(X_train, np.log1p(y_train))

Pipeline(steps=[('ColumnFilter_ranf', ColumnFilter(threshold=5)),
                ('MyImputer_ranf', MyImputer()),
                ('MyEncoder_ranf', MyEncoder()),
                ('RobustScaler', RobustScaler()),
                ('Lasso',
                 Lasso(alpha=0.01, random_state=2021, selection='random'))])

In [269]:
lasso_pred = lasso_best.predict(X_valid)

In [270]:
MSE(np.log1p(y_valid), lasso_pred)

0.013214076923489826

# Ridge Regression

In [194]:
from sklearn.kernel_ridge import KernelRidge

In [196]:
pipe_ridge = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('RobustScaler', RobustScaler()),
                      ('KernelRidge', KernelRidge())
                      ]) 


In [258]:
grid_params_ridge = {
    'ColumnFilter_ranf__threshold':[5],
    'KernelRidge__alpha':[0.01,0.1,0.5],
    'KernelRidge__kernel':['polynomial','linear'],
    'KernelRidge__degree': [3],
    'KernelRidge__coef0': [2,2.5,3]
}
ridge_cv_tune = GridSearchCV(estimator=pipe_ridge,
            param_grid=grid_params_ridge,
            scoring='neg_mean_squared_error',
            cv=5)


In [259]:
ridge_cv_tune.fit(X_train, np.log1p(y_train))

  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coe

  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ColumnFilter_ranf',
                                        ColumnFilter(threshold=5)),
                                       ('MyImputer_ranf', MyImputer()),
                                       ('MyEncoder_ranf', MyEncoder()),
                                       ('RobustScaler', RobustScaler()),
                                       ('KernelRidge',
                                        KernelRidge(alpha=0.01, gamma=1))]),
             param_grid={'ColumnFilter_ranf__threshold': [5],
                         'KernelRidge__alpha': [0.01, 0.1, 0.5],
                         'KernelRidge__coef0': [2, 2.5, 3],
                         'KernelRidge__degree': [3],
                         'KernelRidge__kernel': ['polynomial', 'linear']},
             scoring='neg_mean_squared_error')

In [260]:
ridge_best = ridge_cv_tune.best_estimator_

In [324]:
ridge_cv_tune.scorer_

make_scorer(mean_squared_error, greater_is_better=False)

In [261]:
# evaluate the tuned pipeline returned by the grid search rather than the
# template pipeline that was only passed in as `estimator`
ridge_pred = ridge_best.predict(X_valid)
MSE(np.log1p(y_valid), ridge_pred)

0.015151176224024504

# Elastic Net Regression

In [233]:
from sklearn.linear_model import ElasticNet

In [234]:
pipe_elastic = Pipeline([('ColumnFilter_ranf', ColumnFilter()),
                      ('MyImputer_ranf', MyImputer()),
                      ('MyEncoder_ranf', MyEncoder()),
                      ('RobustScaler', RobustScaler()),
                      ('ElasticNet', ElasticNet())
                      ]) 

In [236]:
grid_params_elastic = {
    'ColumnFilter_ranf__threshold':[5],
    'ElasticNet__alpha':[0.01,0.1,0.5,0.6,0.8],
    'ElasticNet__l1_ratio':[0.1,0.3,0.5,0.7,1]
}
elastic_cv_tune = GridSearchCV(estimator=pipe_elastic,
            param_grid=grid_params_elastic,
            scoring='neg_mean_squared_error',
            cv=5)


In [237]:
elastic_cv_tune.fit(X_train, np.log1p(y_train))

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ColumnFilter_ranf', ColumnFilter()),
                                       ('MyImputer_ranf', MyImputer()),
                                       ('MyEncoder_ranf', MyEncoder()),
                                       ('RobustScaler', RobustScaler()),
                                       ('ElasticNet', ElasticNet())]),
             param_grid={'ColumnFilter_ranf__threshold': [5],
                         'ElasticNet__alpha': [0.01, 0.1, 0.5, 0.6, 0.8],
                         'ElasticNet__l1_ratio': [0.1, 0.3, 0.5, 0.7, 1]},
             scoring='neg_mean_squared_error')

In [251]:
elastic_best = elastic_cv_tune.best_estimator_

In [250]:
elastic_pred = elastic_best.predict(X_valid)
MSE(np.log1p(y_valid), elastic_pred)

0.009748359735627843

## Prediction

In [242]:
# predict with the tuned pipeline that was validated above, not the template
# pipeline handed to the grid search
pred_elastic = elastic_best.predict(test_raw)

In [243]:
pred_test_reversed = trans_Y(pred_elastic)

In [245]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission_elastic.csv', index=False)

# Ensemble

## Ensemble with average strategy

In [315]:
from sklearn.ensemble import VotingRegressor

In [331]:
votingR = VotingRegressor(estimators=[
    ('xgb', xgb_best),
    ('lasso',lasso_best),
    ('elastic', elastic_best)],                    
     n_jobs=4)

votingR = votingR.fit(X_train, np.log1p(y_train))

In [332]:
voting_pred = votingR.predict(X_valid)

In [333]:
MSE(np.log1p(y_valid), voting_pred)

0.008936755429089832

In [334]:
pred_test = votingR.predict(test_raw)
pred_test_reversed = trans_Y(pred_test)

In [335]:
output = pd.DataFrame({'Id': test_raw.index,
                       'SalePrice': pred_test_reversed})
output.to_csv('submission_ensemble_average.csv', index=False)