## This is Prod notebook for house prices project from Kaggle, XGB

#### 1. Load environment and data

In [13]:
notebook_run_id = 1
# notebook_run_id is a digit, creating and deploying a new model every time this notebook is run. increment it by 1.
project_name = 'My First Project'
project_id = 'quantum-keep-360100'
regionn = 'us-central1'

ml_project_name = 'house_price'
model_name = 'XGB'

import numpy as np
import pandas as pd
import os, time, warnings, random, shap
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from scipy.special import inv_boxcox
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier, XGBRegressor
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """
    This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
    """
    # set df_pred to None if it does not exist
    if (cat_fill=='mode'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
            
    if (cat_fill=='missing'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value='missing')
        df_test[cat_features] = df_test[cat_features].fillna(value='missing')
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value='missing')
        
    if (num_fill=='median'):
        df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
        df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
        if (df_pred is not None):
            df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())    
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
    
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """
    This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])
    """
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """
    This function divides a continuous feature into quantile groups.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)
    """
    # set df_pred to None if it does not exist
    _,bin = pd.qcut(df_train[feature], ntiles, retbins = True, labels = False, duplicates = 'drop')
    df_train[feature+'Ntile'] = pd.cut(df_train[feature], 
                                       labels=False, 
                                       duplicates = 'drop', 
                                       bins = bin , 
                                       include_lowest = True)
    df_test[feature+'Ntile'] = pd.cut(df_test[feature], 
                                      labels=False, 
                                      duplicates = 'drop', 
                                      bins = bin , 
                                      include_lowest = True)
    if (df_pred is not None):
        df_pred[feature+'Ntile'] = pd.cut(df_pred[feature], 
                                          labels=False, 
                                          duplicates = 'drop', 
                                          bins = bin , 
                                          include_lowest = True)
    if (delete_feature==True):
        df_train.drop(columns=[feature], inplace=True)
        df_test.drop(columns=[feature], inplace=True)
        df_pred.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin)-1, ' bins')


def log_transformer_mp_i1(df_train, df_test, feature_subset=False, min_skew=3, df_pred=None):
    """
    This function divides a continuous feature into quantile groups.
    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)
    """
    # set df_pred to None if it does not exist
    if (feature_subset==False):
        features_totransform = df_train.columns
    else:
        features_totransform = feature_subset.copy()
    skewed_vars = list(df_train.skew()[abs(df_train.skew())>min_skew].index)
    for col in list(set(skewed_vars)&set(features_totransform)):
        df_train[col] = np.log1p(df_train[col])
        df_test[col] = np.log1p(df_test[col])
        if df_pred:
            df_pred[col] = np.log1p(df_pred[col])
    print('Skewed columns log-transformed: ', list(set(skewed_vars)&set(features_totransform)))
    
    
def add_dummyfeatures(df_train, df_test, feature_dict, df_pred=None):
    """
    This function adds dummy feature when some feature is equal to value, specified in a dictionary.
    Example: add_dummyfeatures(X_train, X_test, X_pred, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})
    """
    if df_pred:
        input_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    else:
        input_dimensions = np.array([df_train.shape[1], df_test.shape[1]])    
    for i in range(len(list(feature_dict.items()))):
        feature,value = list(feature_dict.keys())[i], list(feature_dict.values())[i]
        df_train.loc[df_train[feature]==value,(str(feature)+str(value))]=1
        df_train.loc[df_train[feature]!=value,(str(feature)+str(value))]=0
        df_test.loc[df_test[feature]==value,(str(feature)+str(value))]=1
        df_test.loc[df_test[feature]!=value,(str(feature)+str(value))]=0
        if df_pred:
            df_pred.loc[df_pred[feature]==value,(str(feature)+str(value))]=1
            df_pred.loc[df_pred[feature]!=value,(str(feature)+str(value))]=0
    if df_pred:
        output_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    else:
        output_dimensions = np.array([df_train.shape[1], df_test.shape[1]])
    print(output_dimensions-input_dimensions, ' variables created') 
    

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded    

time0 = time.time()

os.chdir('/home/jupyter/projects_data/house_price')
df = pd.read_csv('train.csv') 
pred=pd.read_csv('test.csv')

print(df.shape, pred.shape)

(1460, 81) (1459, 80)


#### 2. Data cleaning and EDA

In [14]:
# 2. pEDA #

cols_tokeep = ['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond', 
               'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea',  
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr', 'FullBath', 
               'HalfBath', 'MiscVal', 'LotFrontage', 
               'ExterQual', 'MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
               'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
               'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive',
               'SaleType', 'SaleCondition', 'BsmtQual', 'BsmtCond', 
               'BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
df = df[cols_tokeep]

df['MisGarage'] = df.GarageType.isnull().astype(int)
df['MisBsmt'] = df.BsmtCond.isnull().astype(int)
display(df.head())

ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
num_cols = ['LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
            'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 
           'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log']
cat_cols = list(set(df.columns)-set(num_cols)-set(ord_cols)-set(['SalePrice']))

df[ord_cols] = df[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])
pred[ord_cols] = pred[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])

test_size = 0.2
df.reset_index(inplace=True, drop=True)
#random.seed(2)
test_index = random.sample(list(df.index), int(test_size*df.shape[0]))
train = df.iloc[list(set(df.index)-set(test_index))]
test = df.iloc[test_index]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train.drop(columns=['id'],inplace=True, errors='ignore')
test.drop(columns=['id'],inplace=True, errors='ignore')
display(train.shape, test.shape, train.head(3))

# fill missing values

mis_col_mode = ['LotFrontage', 'Electrical']
mis_cat_cols = ['BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish']
mis_num_cols = ['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']

for col in mis_col_mode:
    train[col] = train[col].fillna(train[col].mode()[0])
    test[col] = test[col].fillna(train[col].mode()[0])

for col in mis_cat_cols:
    train[col] = train[col].fillna(value='missing')
    test[col] = test[col].fillna(value='missing')
    
for col in mis_num_cols:
    train[col] = train[col].fillna(value=-1)
    test[col] = test[col].fillna(value=-1)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,MisGarage,MisBsmt
0,208500,8450,7,5,2003,TA,706,0,856,Ex,856,854,1710,Gd,548,2,8,3,2,1,0,65.0,Gd,60,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,No,GLQ,Attchd,RFn,TA,TA,0,0
1,181500,9600,6,8,1976,TA,978,0,1262,Ex,1262,0,1262,TA,460,2,6,3,2,0,0,80.0,TA,20,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,MetalSd,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Gd,ALQ,Attchd,RFn,TA,TA,0,0
2,223500,11250,7,5,2001,TA,486,0,920,Ex,920,866,1786,Gd,608,2,6,3,2,1,0,68.0,Gd,60,RL,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Mn,GLQ,Attchd,RFn,TA,TA,0,0
3,140000,9550,7,5,1915,TA,216,0,756,Gd,961,756,1717,Gd,642,3,7,3,1,0,0,60.0,TA,70,RL,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,Gable,Wd Sdng,Wd Shng,BrkTil,GasA,Y,SBrkr,Typ,Y,WD,Abnorml,TA,Gd,No,ALQ,Detchd,Unf,TA,TA,0,0
4,250000,14260,8,5,2000,TA,655,0,1145,Ex,1145,1053,2198,Gd,836,3,9,4,2,1,0,84.0,Gd,60,RL,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Av,GLQ,Attchd,RFn,TA,TA,0,0


(1168, 53)

(292, 53)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,MisGarage,MisBsmt
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,2,1,0,65.0,4,60,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,No,GLQ,Attchd,RFn,3.0,3.0,0,0
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,2,0,0,80.0,3,20,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,MetalSd,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,Gd,ALQ,Attchd,RFn,3.0,3.0,0,0
2,140000,9550,7,5,1915,3,216,0,756,4,961,756,1717,4,642,3,7,3,1,0,0,60.0,3,70,RL,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,Gable,Wd Sdng,Wd Shng,BrkTil,GasA,Y,SBrkr,Typ,Y,WD,Abnorml,3.0,4.0,No,ALQ,Detchd,Unf,3.0,3.0,0,0


#### 3. Feature engineering

In [15]:
train['SalePrice'] = np.log(train['SalePrice'])
test['SalePrice'] = np.log(test['SalePrice'])

# do target encoding #

encoder = CrossFoldEncoder(MEstimateEncoder, m=10)
train_encoded = encoder.fit_transform(train, train.SalePrice, cols=cat_cols)
test_encoded = encoder.transform(test)

train.drop(columns=cat_cols, inplace=True)
test.drop(columns=cat_cols,  inplace=True)
train = pd.concat([train, train_encoded], axis = 1)
test = pd.concat([test, test_encoded], axis = 1)

display(train.shape, train.head())

add_dummyfeatures(train, test, {'OverallQual':1})
add_dummyfeatures(train, test, {'OverallQual':8})
add_dummyfeatures(train, test, {'OverallQual':9})
add_dummyfeatures(train, test, {'OverallQual':10})

X_train = train.copy()
y_train = X_train.pop('SalePrice')
X_test = test.copy()
y_test = X_test.pop('SalePrice')
print(X_train.shape, X_test.shape, y_train.shape)
display(X_train.head())

(1168, 53)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,MiscVal,LotFrontage,ExterQual,BsmtQual,BsmtCond,GarageQual,GarageCond,LotConfig_encoded,MisGarage_encoded,FullBath_encoded,BsmtExposure_encoded,MisBsmt_encoded,Neighborhood_encoded,Heating_encoded,LandContour_encoded,GarageFinish_encoded,Exterior2nd_encoded,MSZoning_encoded,HalfBath_encoded,GarageType_encoded,Electrical_encoded,BsmtFinType1_encoded,HouseStyle_encoded,BldgType_encoded,SaleType_encoded,Foundation_encoded,PavedDrive_encoded,SaleCondition_encoded,RoofStyle_encoded,MSSubClass_encoded,Exterior1st_encoded,CentralAir_encoded,LotShape_encoded,Condition1_encoded,Functional_encoded
0,12.247694,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,0,65.0,4,4.0,3.0,3.0,3.0,12.001917,12.055476,12.212567,11.950964,12.043047,12.151146,12.037089,12.025126,12.159433,12.214851,12.083115,12.199385,12.159779,12.063028,12.308323,12.18068,12.053157,11.996137,12.269314,12.063192,12.003515,11.980819,12.340168,12.21202,12.061371,11.935832,12.040312,12.046054
1,12.109011,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,0,80.0,3,4.0,3.0,3.0,3.0,12.041599,12.055476,12.212567,12.349353,12.043047,12.139689,12.037089,12.025126,12.159433,11.87419,12.083115,11.924987,12.159779,12.063028,11.97057,12.000454,12.053157,11.996137,11.879552,12.063192,12.003515,11.980819,12.059047,11.868101,12.061371,11.935832,11.882704,12.046054
2,11.849398,9550,7,5,1915,3,216,0,756,4,961,756,1717,4,642,3,7,3,0,60.0,3,3.0,4.0,3.0,3.0,12.05119,12.055476,11.782508,11.950964,12.043047,12.184662,12.037089,12.025126,11.823103,11.881341,12.083115,11.924987,11.773749,12.063028,11.97057,12.18068,12.053157,11.996137,11.744264,12.063192,11.85259,11.980819,11.996709,11.856655,12.061371,12.174842,12.040312,12.046054
3,12.429216,14260,8,5,2000,3,655,0,1145,5,1145,1053,2198,4,836,3,9,4,0,84.0,4,4.0,3.0,3.0,3.0,12.041599,12.055476,12.212567,12.165483,12.043047,12.564623,12.037089,12.025126,12.159433,12.214851,12.083115,12.199385,12.159779,12.063028,12.308323,12.18068,12.053157,11.996137,12.269314,12.063192,12.003515,11.980819,12.340168,12.21202,12.061371,12.174842,12.040312,12.046054
4,11.8706,14115,5,5,1993,3,732,0,796,5,796,566,1362,3,480,2,5,1,700,85.0,3,4.0,3.0,3.0,3.0,12.001917,12.055476,11.782508,11.950964,12.043047,11.915132,12.037089,12.025126,11.823103,12.214851,12.083115,12.199385,12.159779,12.063028,12.308323,11.844283,12.053157,11.996137,12.064623,12.063192,12.003515,11.980819,11.849986,12.21202,12.061371,12.174842,12.040312,12.046054


[1 1]  variables created
[1 1]  variables created
[1 1]  variables created
[1 1]  variables created
(1168, 56) (292, 56) (1168,)


Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,MiscVal,LotFrontage,ExterQual,BsmtQual,BsmtCond,GarageQual,GarageCond,LotConfig_encoded,MisGarage_encoded,FullBath_encoded,BsmtExposure_encoded,MisBsmt_encoded,Neighborhood_encoded,Heating_encoded,LandContour_encoded,GarageFinish_encoded,Exterior2nd_encoded,MSZoning_encoded,HalfBath_encoded,GarageType_encoded,Electrical_encoded,BsmtFinType1_encoded,HouseStyle_encoded,BldgType_encoded,SaleType_encoded,Foundation_encoded,PavedDrive_encoded,SaleCondition_encoded,RoofStyle_encoded,MSSubClass_encoded,Exterior1st_encoded,CentralAir_encoded,LotShape_encoded,Condition1_encoded,Functional_encoded,OverallQual1,OverallQual8,OverallQual9,OverallQual10
0,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,0,65.0,4,4.0,3.0,3.0,3.0,12.001917,12.055476,12.212567,11.950964,12.043047,12.151146,12.037089,12.025126,12.159433,12.214851,12.083115,12.199385,12.159779,12.063028,12.308323,12.18068,12.053157,11.996137,12.269314,12.063192,12.003515,11.980819,12.340168,12.21202,12.061371,11.935832,12.040312,12.046054,0.0,0.0,0.0,0.0
1,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,0,80.0,3,4.0,3.0,3.0,3.0,12.041599,12.055476,12.212567,12.349353,12.043047,12.139689,12.037089,12.025126,12.159433,11.87419,12.083115,11.924987,12.159779,12.063028,11.97057,12.000454,12.053157,11.996137,11.879552,12.063192,12.003515,11.980819,12.059047,11.868101,12.061371,11.935832,11.882704,12.046054,0.0,0.0,0.0,0.0
2,9550,7,5,1915,3,216,0,756,4,961,756,1717,4,642,3,7,3,0,60.0,3,3.0,4.0,3.0,3.0,12.05119,12.055476,11.782508,11.950964,12.043047,12.184662,12.037089,12.025126,11.823103,11.881341,12.083115,11.924987,11.773749,12.063028,11.97057,12.18068,12.053157,11.996137,11.744264,12.063192,11.85259,11.980819,11.996709,11.856655,12.061371,12.174842,12.040312,12.046054,0.0,0.0,0.0,0.0
3,14260,8,5,2000,3,655,0,1145,5,1145,1053,2198,4,836,3,9,4,0,84.0,4,4.0,3.0,3.0,3.0,12.041599,12.055476,12.212567,12.165483,12.043047,12.564623,12.037089,12.025126,12.159433,12.214851,12.083115,12.199385,12.159779,12.063028,12.308323,12.18068,12.053157,11.996137,12.269314,12.063192,12.003515,11.980819,12.340168,12.21202,12.061371,12.174842,12.040312,12.046054,0.0,1.0,0.0,0.0
4,14115,5,5,1993,3,732,0,796,5,796,566,1362,3,480,2,5,1,700,85.0,3,4.0,3.0,3.0,3.0,12.001917,12.055476,11.782508,11.950964,12.043047,11.915132,12.037089,12.025126,11.823103,12.214851,12.083115,12.199385,12.159779,12.063028,12.308323,11.844283,12.053157,11.996137,12.064623,12.063192,12.003515,11.980819,11.849986,12.21202,12.061371,12.174842,12.040312,12.046054,0.0,0.0,0.0,0.0


#### 4. Modeling

In [16]:
xgbb = XGBRegressor(n_estimators=200,
                   max_depth=5,
                   eta=0.06,
                   subsample=0.8,
                   colsample_bytree=0.6)
xgbb.fit(X_train, y_train)
print('xgb ', mean_squared_error(y_train, xgbb.predict(X_train))**0.5)

y_train = np.exp(y_train)
y_test = np.exp(y_test)

# performance evaluation: train set
print('Train: ')
print('XGB', np.sqrt(mean_squared_error(y_train, np.exp(xgbb.predict(X_train)))))

# performance evaluation: test set
print('Test: ')
print('XGB', np.sqrt(mean_squared_error(y_test, np.exp(xgbb.predict(X_test)))))

print('Total time for preprocessing/modeling part of the script: ', time.time() - time0)

xgb  0.04982778821350637
Train: 
XGB 9387.852640397205
Test: 
XGB 27597.13746196995
Total time for preprocessing/modeling part of the script:  2.272798776626587


#### 5. Deployment

In [19]:
deployment_time_start = time.time()

os.chdir('/home/jupyter/projects_gcp_cpu')

model_path = os.getcwd()+'/house_price/artifacts/model_xgb/'

artifact_filename = 'model.bst'
local_path = artifact_filename
xgbb.save_model(model_path+local_path)

In [20]:
# Upload model artifact to Cloud Storage
model_bucket = 'gs://pmykola-projectsgcp-artifacts/houseprices-xgb'
storage_path = os.path.join(model_bucket, artifact_filename)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(model_path+artifact_filename)

In [21]:
from google.cloud import aiplatform

aiplatform.init(project=project_id, location=regionn)

model = aiplatform.Model.upload(display_name = ml_project_name+model_name+str(notebook_run_id),
    description = ml_project_name+model_name+str(notebook_run_id),
    artifact_uri = model_bucket,
    serving_container_image_uri = 'us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-6:latest'
)

Creating Model
Create Model backing LRO: projects/234443118908/locations/us-central1/models/8182121331211370496/operations/3911430496038420480
Model created. Resource name: projects/234443118908/locations/us-central1/models/8182121331211370496@1
To use this Model in another session:
model = aiplatform.Model('projects/234443118908/locations/us-central1/models/8182121331211370496@1')


In [22]:
# optional code to create an endpoint
endpoint = aiplatform.Endpoint.create(display_name = ml_project_name+model_name+str(notebook_run_id), 
                                      project = project_id, 
                                      location = regionn)
endpoint_id = endpoint.resource_name[-19:0]

Creating Endpoint
Create Endpoint backing LRO: projects/234443118908/locations/us-central1/endpoints/1932400481909342208/operations/3089523564043304960
Endpoint created. Resource name: projects/234443118908/locations/us-central1/endpoints/1932400481909342208
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/234443118908/locations/us-central1/endpoints/1932400481909342208')


In [23]:
# if you do not specify the endpoint parameter, a new endpoint will be created
# this step is slow. On a weak machine it runs for 3-6 minutes.
# if the VM runs more jobs or the model is complex, it may be even longer.
model.deploy(endpoint = endpoint,
             machine_type = 'n1-standard-2')

Deploying model to Endpoint : projects/234443118908/locations/us-central1/endpoints/1932400481909342208
Deploy Endpoint model backing LRO: projects/234443118908/locations/us-central1/endpoints/1932400481909342208/operations/7701209582470692864
Endpoint model deployed. Resource name: projects/234443118908/locations/us-central1/endpoints/1932400481909342208


<google.cloud.aiplatform.models.Endpoint object at 0x7f080adddd10> 
resource name: projects/234443118908/locations/us-central1/endpoints/1932400481909342208

In [24]:
test_instance = [8.45000000e+03, 7.00000000e+00, 5.00000000e+00, 2.00300000e+03,
       3.00000000e+00, 7.06000000e+02, 0.00000000e+00, 8.56000000e+02,
       5.00000000e+00, 8.56000000e+02, 8.54000000e+02, 1.71000000e+03,
       4.00000000e+00, 5.48000000e+02, 2.00000000e+00, 8.00000000e+00,
       3.00000000e+00, 0.00000000e+00, 6.50000000e+01, 4.00000000e+00,
       4.00000000e+00, 3.00000000e+00, 3.00000000e+00, 3.00000000e+00,
       1.19149349e+01, 1.20212216e+01, 1.21360716e+01, 1.19869729e+01,
       1.20288848e+01, 1.23153000e+01, 1.19813544e+01, 1.20096449e+01,
       1.21917951e+01, 1.21961726e+01, 1.20256525e+01, 1.21707660e+01,
       1.20486887e+01, 1.20524108e+01, 1.19659164e+01, 1.21460374e+01,
       1.20469395e+01, 1.20002024e+01, 1.22384184e+01, 1.21763269e+01,
       1.20226458e+01, 1.20252396e+01, 1.22802531e+01, 1.20396120e+01,
       1.21757046e+01, 1.20681130e+01, 1.21573679e+01, 1.19473673e+01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]
display(endpoint.predict(instances=[test_instance]))
endpoint_id = endpoint.resource_name[-19:]
display(endpoint_id)

Prediction(predictions=[12.29288578033447], deployed_model_id='7849690587623063552', model_version_id='1', model_resource_name='projects/234443118908/locations/us-central1/models/8182121331211370496', explanations=None)

'1932400481909342208'

In [25]:
import json

payload = {'instances': [test_instance]}

# Parse JSON
with open('request.json', 'w') as outfile:
    json.dump(payload, outfile)

!gcloud ai endpoints predict $endpoint_id \
  --region=$regionn \
  --json-request=request.json

Using endpoint [https://us-central1-prediction-aiplatform.googleapis.com/]
[12.29288578033447]
