# In [1]:
"""
This is a top-ten submission on the private leaderboard of the
"Housing Prices Competition for Kaggle Learn Users" [1] as of February 2019.
Since then the competition has been restarted and currently shows only
public leaderboard results, which are highly overfitted.

[1] https://www.kaggle.com/c/home-data-for-ml-course

"""


import numpy as np 
import pandas as pd 

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib

# Raise pandas' display limits so that wide/long frames and long cell values
# are printed in full rather than truncated.
for _display_option in ('display.max_rows',
                        'display.max_colwidth',
                        'display.max_columns'):
    pd.set_option(_display_option, 1000)


def concat_squares(x):
    """Return ``x`` with its element-wise squares appended via ``np.hstack``.

    For a 2-D array of shape ``(n, m)`` the result has shape ``(n, 2 * m)``:
    the original columns followed by the squared columns. For a 1-D array
    the squared values are appended after the original values.
    """
    squared = x ** 2
    return np.hstack([x, squared])

def ml_pipe(regressor, num_cols, cat_cols):
    """Build the preprocessing + regression pipeline (unfitted).

    Numerical columns have missing values replaced by 0 and are then passed
    through a ``PowerTransformer``. Categorical columns have missing values
    replaced by a constant placeholder and are one-hot encoded, with
    categories unseen during fitting ignored at prediction time.

    Parameters
    ----------
    regressor : estimator
        Final estimator; stored under the step name ``'voting_regressor'``.
    num_cols : list
        Columns routed to the numerical transformer.
    cat_cols : list
        Columns routed to the categorical transformer.

    Returns
    -------
    Pipeline
        ``'preprocessing'`` (a ``ColumnTransformer``) followed by the
        regressor.
    """
    numeric_steps = [
        ('num_imputer', SimpleImputer(missing_values=np.nan,
                                      strategy='constant',
                                      fill_value=0)),
        ('power_transformer', PowerTransformer()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessing = ColumnTransformer([
        ('num_feat', Pipeline(numeric_steps), num_cols),
        ('cat_feat', Pipeline(categorical_steps), cat_cols),
    ])

    return Pipeline([
        ('preprocessing', preprocessing),
        ('voting_regressor', regressor),
    ])

def deploy_pipe(pl, X_tr, y_tr, X_valid, y_valid):
    """Fit ``pl`` on the training data and report its MAE on the validation data.

    Parameters
    ----------
    pl : estimator
        Object exposing ``fit`` and ``predict``.
    X_tr, y_tr
        Features and target used for fitting.
    X_valid, y_valid
        Features and target used to compute the mean absolute error.

    Returns
    -------
    tuple
        ``(pl, fit_result, score)`` where ``fit_result`` is whatever
        ``pl.fit`` returned and ``score`` is the mean absolute error, which
        is also printed.
    """
    fit_result = pl.fit(X_tr, y_tr)
    predictions = pl.predict(X_valid)

    score = mae(y_valid, predictions)
    print('\nMean absolute error = {}'.format(score))

    return pl, fit_result, score


# Load the training and test sets; the first CSV column becomes the index
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../input/home-data-for-ml-course/train.csv',
                     '../input/home-data-for-ml-course/test.csv')
)


# Feature engineering - ordinal encoding of selected categorical columns.
# Every category is replaced by its rank (1 = lowest) after ordering the
# column's categories by the median SalePrice seen in the training data.
ord_cols = ['MSZoning', 'Street', 'Alley', 'Neighborhood', 'ExterQual', 'BsmtQual', 'BsmtCond',
            'CentralAir', 'KitchenQual', 'GarageFinish', 'PoolQC']

# One Series per column: median SalePrice per category, sorted ascending
# (the Series index holds the category labels in rank order).
metrics = [df_train.groupby([col])['SalePrice'].median().sort_values()
           for col in ord_cols]

for col, medians in zip(ord_cols, metrics):
    # Iterate the index directly; the former `.index.ravel()` call was
    # redundant and its return type differs between pandas versions.
    ord_num = {category: rank for rank, category in enumerate(medians.index, 1)}
    # Values without an entry in `ord_num` (e.g. categories that occur only
    # in the test set) become NaN and are handled by the numeric imputer.
    df_train[col] = df_train[col].map(ord_num)
    df_test[col] = df_test[col].map(ord_num)

# Split the target off the training frame (the column is removed in place)
y = df_train.pop('SalePrice').values

# Numerical columns: int64/float64 dtypes (the ordinal-encoded columns are
# numeric at this point and therefore land here)
num_cols = [name for name, dtype in df_train.dtypes.items()
            if dtype in ['int64', 'float64']]

# Categorical columns: string columns with fewer than 20 distinct values
# that were not ordinal-encoded
cat_cols = [name for name in df_train.columns
            if df_train[name].dtype == 'object'
            and name not in ord_cols
            and df_train[name].nunique() < 20]

# LightGBM: 5000 very small trees (4 leaves) with a low learning rate,
# row bagging and feature subsampling, seeded for reproducibility.
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
    random_state=101,
    n_jobs=-1,
)
regr_1 = LGBMRegressor(**lgbm_params)
                      
                      
# XGBoost: 3460 shallow trees (depth 3) with a low learning rate and
# row/column subsampling.
# - 'reg:squarederror' is the current name of the deprecated 'reg:linear'
#   objective (same loss).
# - `nthread` was a duplicate of `n_jobs` (both -1) and is dropped.
# - The original passed both seed=27 and random_state=0; only one seed is
#   kept. NOTE(review): 27 is used on the assumption that the legacy `seed`
#   argument took precedence in the xgboost release this was written for -
#   confirm if exact reproducibility of the old submission matters.
regr_2 = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                      max_depth=3, min_child_weight=0,
                      gamma=0, subsample=0.7,
                      colsample_bytree=0.7,
                      objective='reg:squarederror',
                      scale_pos_weight=1,
                      reg_alpha=0.00006,
                      random_state=27,
                      n_jobs=-1)

# SVR with default settings, trained on a target mapped to a normal
# distribution by a 200-quantile transformer; TransformedTargetRegressor
# maps predictions back to the original target scale.
target_transformer = QuantileTransformer(n_quantiles=200,
                                         output_distribution='normal')
regr_3 = TransformedTargetRegressor(regressor=SVR(),
                                    transformer=target_transformer)

# Weighted average of the three models' predictions.
# NOTE(review): the label 'LGB_2' is attached to the XGBoost model (regr_2);
# it is left unchanged so parameter names and saved pipelines stay the same.
ensemble_members = [('LGB_1', regr_1),
                    ('LGB_2', regr_2),
                    ('SVR', regr_3)]
ensemble_weights = [0.3, 0.3, 0.4]

regressor_0 = VotingRegressor(estimators=ensemble_members,
                              weights=ensemble_weights,
                              n_jobs=-1)

# Model fitting.
# NOTE: the training data is also passed as the validation data, so the MAE
# printed by deploy_pipe is an in-sample error, not a generalisation estimate.
pl = ml_pipe(regressor_0, num_cols, cat_cols)
pl, pl_history, MAE = deploy_pipe(pl, X_tr=df_train, y_tr=y,
                                  X_valid=df_train, y_valid=y)

# Predict on the test set, then save the submission file and fitted pipeline.
# Predictions are assigned by position, which presumes the sample submission
# rows are in the same order as test.csv - verify if the inputs change.
df_submit = pd.read_csv('../input/home-data-for-ml-course/sample_submission.csv', index_col=0)
test_predictions = pl.predict(df_test)
df_submit['SalePrice'] = test_predictions
df_submit.to_csv('house_prices_voting_7.csv')
joblib.dump(pl, 'house_prices_voting_7.pkl')


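# Output of the notebook cell above. The first two lines appear to be the
# source-line echoes of NumPy runtime warnings whose messages were not captured.
#
#   x = um.multiply(x, x, out=x)
#   ret = umr_sum(x, axis, dtype, out, keepdims)
#
# Mean absolute error = 7790.640100490856
#
# ['house_prices_voting_7.pkl']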