In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Dataset

In [None]:
# Load the training and test tables, both indexed by the 'Id' column.
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '../input/home-data-for-ml-course/train.csv',
        '../input/home-data-for-ml-course/test.csv',
    )
)

# Dropping samples with missing target and dropping Target variable from training dataset

In [None]:
# Keep only rows that have a target value, then split the target column off
# from the feature table.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Splitting dataset into training and validation set

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)

In [None]:
# Object-typed columns with fewer than 10 distinct values in the training split.
cat_cols = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype == 'object' and X_train_full[col].nunique() < 10
]

# Select numerical columns

In [None]:
# Columns whose dtype is int64 or float64 in the training split.
num_cols = [
    col
    for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [None]:
# Restrict every split to the selected columns; copies keep the *_full frames
# independent of later changes.
my_cols = cat_cols + num_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

# Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numerical columns: impute missing values with the median.
num_trans = SimpleImputer(strategy='median')

# Categorical columns: impute with the most frequent value, then one-hot encode.
cat_steps = [
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
cat_trans = Pipeline(steps=cat_steps)

# Route each column group through its transformer.
# NOTE: the triple-'s' spelling of `preprocesssor` is kept because later cells
# reference this exact name.
preprocesssor = ColumnTransformer(
    transformers=[
        ('numerical', num_trans, num_cols),
        ('categorical', cat_trans, cat_cols),
    ]
)


# Hyperparameter Tuning

In [None]:
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Pipeline under tuning: shared preprocessing followed by an XGBoost regressor.
pipeline1=Pipeline(steps=[
    ('preprocessor',preprocesssor),
    ('model',XGBRegressor())
])
# Search space; the 'model__' prefix routes each entry to the 'model' step.
param_grid={
        'model__n_estimators':np.arange(300,1500,100),
        'model__max_depth':np.arange(3,10),
        'model__learning_rate':[0.05,0.1,0.15]
    }

# Randomized search over 10 sampled candidates with 5-fold CV.
# random_state pins which candidates get sampled, so a Restart & Run All
# evaluates the same hyperparameter combinations every time.
# NOTE: despite its name, `gbmodel` holds the XGBoost search.
gbmodel=RandomizedSearchCV(
    estimator=pipeline1,
    param_distributions=param_grid,
    n_iter=10,
    verbose=10,
    n_jobs=1,
    cv=5,
    random_state=0
)

gbmodel.fit(X_train,y_train)


In [None]:
# Display the best hyperparameter combination found by the XGBoost search
# (`gbmodel` wraps the XGBRegressor pipeline despite its name).
gbmodel.best_params_

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Pipeline under tuning: shared preprocessing followed by sklearn gradient boosting.
pipeline2=Pipeline(steps=[
    ('preprocessor',preprocesssor),
    ('model',GradientBoostingRegressor())
])
# Search space; the 'model__' prefix routes each entry to the 'model' step.
param_grid={
        'model__n_estimators':np.arange(300,1000,100),
        'model__max_depth':np.arange(3,10),
        'model__learning_rate':[0.05,0.1]
    }

# Randomized search over 10 sampled candidates with 5-fold CV.
# random_state pins which candidates get sampled, so a Restart & Run All
# evaluates the same hyperparameter combinations every time.
model=RandomizedSearchCV(
    estimator=pipeline2,
    param_distributions=param_grid,
    n_iter=10,
    verbose=10,
    n_jobs=1,
    cv=5,
    random_state=0
)

model.fit(X_train,y_train)

In [None]:
# Display the best hyperparameter combination found by the
# GradientBoostingRegressor search (`model`).
model.best_params_

In [None]:
# Pipeline under tuning: shared preprocessing followed by a CatBoost regressor.
# verbose=0 silences CatBoost's per-iteration training log, which would
# otherwise be printed for every one of the 50 CV fits.
pipeline3=Pipeline(steps=[
    ('preprocessor',preprocesssor),
    ('model',CatBoostRegressor(verbose=0))
])
# Search space; the 'model__' prefix routes each entry to the 'model' step.
param_grid={
        'model__n_estimators':[300,500,700,1000],
        'model__max_depth':np.arange(3,10,2),
        'model__learning_rate':[0.05,0.1]
    }

# Randomized search over 10 sampled candidates with 5-fold CV.
# random_state pins which candidates get sampled, so a Restart & Run All
# evaluates the same hyperparameter combinations every time.
model2=RandomizedSearchCV(
    estimator=pipeline3,
    param_distributions=param_grid,
    n_iter=10,
    verbose=10,
    n_jobs=1,
    cv=5,
    random_state=0
)

model2.fit(X_train,y_train)

In [None]:
# Display the best hyperparameter combination found by the CatBoost search (`model2`).
model2.best_params_

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 700 trees and a fixed seed, chained after the shared preprocessing.
rf = RandomForestRegressor(n_estimators=700, random_state=42)
rf_steps = [
    ('preprocessor', preprocesssor),
    ('rf', rf),
]
rf_pipeline = Pipeline(steps=rf_steps)

# Catboost

In [None]:
# CatBoost model for the "CB" member of the blends below.
# This must be a CatBoostRegressor: building `cb` from GradientBoostingRegressor
# would make every "CB" blend average two sklearn gradient-boosting models.
# verbose=0 silences CatBoost's per-iteration training log.
cb=CatBoostRegressor(n_estimators=300,max_depth=7,learning_rate=0.1,verbose=0)
cb_pipeline=Pipeline(steps=[
    ('preprocesssor',preprocesssor),
    ('cb',cb)
])

In [None]:
# Fit preprocessing + `cb` model on the training split.
cb_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Validation-split predictions from the `cb` pipeline, reused by the blends below.
cb_pred = cb_pipeline.predict(X=X_valid)

# Gradient Boosting

In [None]:

# sklearn gradient boosting (900 trees, depth 4, learning rate 0.1) chained
# after the shared preprocessing.
gb = GradientBoostingRegressor(n_estimators=900, max_depth=4, learning_rate=0.1)
gb_steps = [
    ('preprocesssor', preprocesssor),
    ('gb', gb),
]
gb_pipeline = Pipeline(steps=gb_steps)

In [None]:
# Fit the random-forest and gradient-boosting pipelines on the training split.
rf_pipeline.fit(X=X_train, y=y_train)
gb_pipeline.fit(X=X_train, y=y_train)

# Random forest + Gradient Boosting

In [None]:
# Validation predictions of both models, then their equal-weight average.
rf_pred, gb_pred = (
    fitted_pipe.predict(X_valid)
    for fitted_pipe in (rf_pipeline, gb_pipeline)
)
rfgb = (rf_pred + gb_pred) / 2

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
# RMSE of the RF + GB blend on the validation split.
rfgb_mse = mean_squared_error(y_valid, rfgb)
rfgb_error = np.sqrt(rfgb_mse)
print(rfgb_error)

# Catboost+Gradient boost

In [None]:
# Equal-weight blend of the `cb` and `gb` predictions, scored by validation RMSE.
cbgb = (cb_pred + gb_pred) / 2
cbgb_mse = mean_squared_error(y_valid, cbgb)
cbgb_error = np.sqrt(cbgb_mse)
print(cbgb_error)

# GB+RF+CB

In [None]:
# Equal-weight blend of `gb`, `rf` and `cb`, scored by validation RMSE.
# `p3` and `cxb_error` are reassigned by later blend cells.
p3 = (gb_pred + rf_pred + cb_pred) / 3
p3_mse = mean_squared_error(y_valid, p3)
cxb_error = np.sqrt(p3_mse)
print(cxb_error)

# XGBoost

In [None]:
from xgboost import XGBRegressor
# XGBoost regressor (up to 1400 trees, depth 7, learning rate 0.05, 4 threads)
# chained after the shared preprocessing.
xgb = XGBRegressor(
    n_estimators=1400,
    max_depth=7,
    learning_rate=0.05,
    n_jobs=4,
)
xgb_steps = [
    ('preprocesssor', preprocesssor),
    ('xgb', xgb),
]
xgb_pipeline = Pipeline(steps=xgb_steps)


**A pipeline's preprocessing steps are not applied to XGBoost's `eval_set`, so `X_valid` must be preprocessed separately before it is passed to `fit`. The easiest way is to reuse the pipeline without its model step: run the following code before fitting the full pipeline.**

In [None]:
# Preprocessing-only pipeline (no model step) used to build XGBoost's eval_set.
eval_set_pipe = Pipeline(steps=[('preprocesssor', preprocesssor)])
# Fit on the training split, then transform a copy of the validation split so
# the original X_valid is left untouched.
eval_set_pipe.fit(X_train, y_train)
X_valid_eval = eval_set_pipe.transform(X_valid.copy())

In [None]:
# Fit-time arguments forwarded to the 'xgb' step via the 'xgb__' prefix:
# early stopping against the preprocessed validation split, with logging off.
xgb_fit_params = {
    'xgb__early_stopping_rounds': 5,
    'xgb__eval_set': [(X_valid_eval, y_valid)],
    'xgb__verbose': False,
}
xgb_pipeline.fit(X_train, y_train, **xgb_fit_params)

# GB+XGB

In [None]:
# Validation predictions from XGBoost, blended equally with `gb` and scored by RMSE.
xgb_pred = xgb_pipeline.predict(X_valid)
gbx = (gb_pred + xgb_pred) / 2
gbx_mse = mean_squared_error(y_valid, gbx)
gbx_error = np.sqrt(gbx_mse)
print(gbx_error)

# CB+XGB

In [None]:
# Equal-weight blend of the `cb` and `xgb` predictions, scored by validation RMSE.
cxb = (cb_pred + xgb_pred) / 2
cxb_mse = mean_squared_error(y_valid, cxb)
cxb_error = np.sqrt(cxb_mse)
print(cxb_error)

# GB+XGB+CB

In [None]:
# Equal-weight blend of `gb`, `xgb` and `cb`, scored by validation RMSE.
# Reuses the names `p3` and `cxb_error` from earlier blend cells.
p3 = (gb_pred + xgb_pred + cb_pred) / 3
p3_mse = mean_squared_error(y_valid, p3)
cxb_error = np.sqrt(p3_mse)
print(cxb_error)

# GB+RF+CB

In [None]:
# Equal-weight blend of `gb`, `rf` and `cb`, scored by validation RMSE.
# Overwrites `p3` and `cxb_error` from the previous blend cell.
p3 = (gb_pred + rf_pred + cb_pred) / 3
p3_mse = mean_squared_error(y_valid, p3)
cxb_error = np.sqrt(p3_mse)
print(cxb_error)

# GB+CB+XGB+RF

In [None]:
# Equal-weight blend of all four models, scored by validation RMSE.
# The result is stored in `cxb_error`, replacing the previous blend's score.
p4 = (gb_pred + xgb_pred + cb_pred + rf_pred) / 4
p4_mse = mean_squared_error(y_valid, p4)
cxb_error = np.sqrt(p4_mse)
print(cxb_error)

In [None]:
# Test-set predictions from the XGBoost and gradient-boosting pipelines,
# averaged with equal weight for the submission.
pred1, pred2 = (
    fitted_pipe.predict(X_test)
    for fitted_pipe in (xgb_pipeline, gb_pipeline)
)
pred = (pred1 + pred2) / 2

In [None]:
# Write the blended test predictions to a two-column CSV (Id, SalePrice).
submission_columns = {'Id': X_test.index, 'SalePrice': pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)