The model hyperparameters used below were optimized in a previous step; this notebook fits the tuned models and combines their predictions.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics
from xgboost.sklearn import XGBRegressor
from scipy.stats import norm, skew
from bayes_opt import BayesianOptimization
from sklearn.model_selection import  cross_val_score
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor as RFR
from mlxtend.regressor import StackingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
import lightgbm as lgb

In [4]:
# Read the cleaned training and test sets from the data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_clean.csv", "../data/test_clean.csv")
)

In [5]:
# Keep the 'Id' columns aside; they are used again as the index of the
# exported prediction files.
train_ID, test_ID = train['Id'], test['Id']

# 'Id' is not a predictor, so remove it from both frames (in place).
train.drop(columns="Id", inplace=True)
test.drop(columns="Id", inplace=True)

# Regression target used by every model fit below.
ytrain = train["SalePrice"]

In [6]:
# Stack train and test into a single frame so that every feature
# transformation below is applied to both at once.
train.drop(columns="Unnamed: 0", inplace=True)
test.drop(columns="Unnamed: 0", inplace=True)

# Row counts; ntrain is used later to split all_data back into train / test.
ntrain, ntest = len(train), len(test)

# The target must not be part of the feature matrix.
train.drop(columns="SalePrice", inplace=True)
all_data = pd.concat([train, test], ignore_index=True)

In [7]:
# MSSubClass is a category code, not a quantity: store it as a string.
all_data["MSSubClass"] = all_data["MSSubClass"].map(str)

In [8]:
# Total square footage: basement + first floor + second floor.
all_data['TotalSF'] = (
    all_data['TotalBsmtSF']
    .add(all_data['1stFlrSF'])
    .add(all_data['2ndFlrSF'])
)

In [9]:
def addlogs(res, ls):
    """Return a copy of ``res`` with a log-transformed column for each name in ``ls``.

    For every column name ``l`` in ``ls`` a column ``l + '_log'`` holding
    ``np.log(1.01 + res[l])`` is appended, in the order given by ``ls``.
    Columns are processed one after another, so a later entry may refer to a
    column created by an earlier one.

    Parameters
    ----------
    res : pandas.DataFrame
        Input frame; it is not modified.
    ls : iterable of str
        Names of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        New frame containing the original columns followed by the ``*_log``
        columns.

    Raises
    ------
    KeyError
        If a name in ``ls`` is not a column of the frame.
    """
    out = res.copy()
    for l in ls:
        # Assign under the final name directly. The earlier approach added a
        # placeholder column and then renamed it by writing into
        # ``columns.values``, which mutates the backing array of an immutable
        # Index and depends on the column's position.
        out[l + '_log'] = np.log(1.01 + out[l])
    return out

# Columns that receive an additional "<name>_log" copy via addlogs().
loglist = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'YearRemodAdd', 'TotalSF',
]

all_data = addlogs(res=all_data, ls=loglist)

In [10]:
def addSquared(res, ls):
    """Return a copy of ``res`` with a squared column for each name in ``ls``.

    For every column name ``l`` in ``ls`` a column ``l + '_sq'`` holding
    ``res[l] * res[l]`` is appended, in the order given by ``ls``.

    Parameters
    ----------
    res : pandas.DataFrame
        Input frame; it is not modified.
    ls : iterable of str
        Names of the columns to square.

    Returns
    -------
    pandas.DataFrame
        New frame containing the original columns followed by the ``*_sq``
        columns.

    Raises
    ------
    KeyError
        If a name in ``ls`` is not a column of the frame.
    """
    out = res.copy()
    for l in ls:
        # Assign under the final name directly rather than renaming a
        # placeholder column through ``columns.values`` (which mutates the
        # backing array of an immutable Index).
        out[l + '_sq'] = out[l] * out[l]
    return out

# Columns that receive an additional "<name>_sq" copy via addSquared().
sqpredlist = [
    'YearRemodAdd',
    'LotFrontage_log',
    'TotalBsmtSF_log',
    '1stFlrSF_log',
    '2ndFlrSF_log',
    'GrLivArea_log',
    'GarageCars_log',
    'GarageArea_log',
]
all_data = addSquared(res=all_data, ls=sqpredlist)

In [11]:
# All columns whose dtype is not "object".
all_non_object = all_data.dtypes[all_data.dtypes != "object"].index.tolist()
# Year, month and overall quality/condition columns are not treated as continuous.
year_month = ["YearBuilt", "YearRemodAdd","GarageYrBlt","MoSold","YrSold",
              "OverallQual","OverallCond"]
# Numeric features = non-object columns minus the exclusions above.
# Filtering with a comprehension keeps the DataFrame's column order, whereas
# list(set(...) - set(...)) returns the names in a hash-dependent order that
# can change from one run to the next.
_not_continuous = set(year_month)
numeric_features = [col for col in all_non_object if col not in _not_continuous]

In [12]:
# Skewness of every numeric feature (missing values ignored), largest first.
skewed_feats = (
    all_data[numeric_features]
    .apply(lambda col: skew(col.dropna()))
    .sort_values(ascending=False)
)
skewness = pd.DataFrame({'Skew': skewed_feats})

# Keep only the features whose absolute skew exceeds 0.75.
skewness = skewness[skewness['Skew'].abs() > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(len(skewness)))

from scipy.special import boxcox1p

# Replace each retained feature by its Box-Cox transform of (1 + x) with a fixed lambda.
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

There are 37 skewed numerical features to Box Cox transform


In [13]:
# Shared label -> integer scales.
qual_dict = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
bsmt_fin_dict = {"None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}

# (column, mapping) pairs, applied in this order. Each column is mapped to its
# integer scale and cast to int; the cast raises if a label is not in the mapping.
_ordinal_columns = [
    ("ExterQual", qual_dict),
    ("ExterCond", qual_dict),
    ("BsmtQual", qual_dict),
    ("BsmtCond", qual_dict),
    ("HeatingQC", qual_dict),
    ("KitchenQual", qual_dict),
    ("FireplaceQu", qual_dict),
    ("GarageQual", qual_dict),
    ("GarageCond", qual_dict),
    ("BsmtExposure", {"None": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}),
    ("BsmtFinType1", bsmt_fin_dict),
    ("BsmtFinType2", bsmt_fin_dict),
    ("Functional", {"None": 0, "Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4,
                    "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8}),
    ("GarageFinish", {"None": 0, "Unf": 1, "RFn": 2, "Fin": 3}),
    ("Fence", {"None": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}),
    ("PoolQC", qual_dict),
]
for _column, _mapping in _ordinal_columns:
    all_data[_column] = all_data[_column].map(_mapping).astype(int)

# Columns collapsed to a 0/1 flag: 1 when the value equals the listed label.
#   LandSlope  - gentle slope vs. anything else
#   LotShape   - regular vs. irregular (IR2 and IR3 are rare)
#   Electrical - standard circuit breakers vs. anything else
#   PavedDrive - paved vs. dirt/gravel or partial pavement
_binary_columns = [
    ("LandSlope", "Gtl"),
    ("LotShape", "Reg"),
    ("Electrical", "SBrkr"),
    ("PavedDrive", "Y"),
]
for _column, _label in _binary_columns:
    all_data[_column] = (all_data[_column] == _label) * 1

In [14]:
# One-hot encode the remaining categorical columns and report the frame size.
all_data = pd.get_dummies(data=all_data)
print(all_data.shape)

(2917, 272)


In [15]:
# split data
train = all_data[:ntrain]
test = all_data[ntrain:] #prediction data 

In [17]:
# test_size = 0.3
# X_train, X_test, y_train, y_test = train_test_split(train, 
#                                                     ytrain,
#                                                     train_size=1-test_size, 
#                                                     test_size=test_size, 
#                                                 random_state=0)

## Base models

The parameters of the base models should be tuned separately.

### XGBoost


In [16]:
# Tuned XGBoost hyperparameters, keyed by XGBoost's native parameter names.
params = {
    'max_depth': 2,
    'min_child_weight': 1.9038260686533586,
    'eta': 0.1,
    'subsample': 0.6005500786487102,
    'colsample_bytree': 0.3375793337385239,
    'gamma': 0.019079813666906587,
    'alpha': 0.07935229891255768,
    'lambda': 1.555371731160462,
    'objective': 'reg:linear',
}

In [19]:
# Build the XGBoost regressor from the tuned `params`, translating the native
# names (eta / alpha / lambda) to the scikit-learn wrapper's keyword arguments.
model_xgb = xgb.XGBRegressor(
    n_estimators=2200,
    learning_rate=params['eta'],
    max_depth=int(round(params['max_depth'])),
    min_child_weight=params['min_child_weight'],
    gamma=params['gamma'],
    subsample=params['subsample'],
    colsample_bytree=params['colsample_bytree'],
    reg_alpha=params['alpha'],
    reg_lambda=params['lambda'],
    silent=1,
    random_state=42,
    nthread=-1,
)

model_xgb.fit(train, ytrain)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3375793337385239, gamma=0.019079813666906587,
       learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=1.9038260686533586, missing=None,
       n_estimators=2200, n_jobs=1, nthread=-1, objective='reg:linear',
       random_state=42, reg_alpha=0.07935229891255768,
       reg_lambda=1.555371731160462, scale_pos_weight=1, seed=None,
       silent=1, subsample=0.6005500786487102)

## LASSO

In [20]:
# Lasso with a fixed regularisation strength, fitted on the full training set.
lassocv_alpha = 0.0006
model_lasso = linear_model.Lasso(alpha=lassocv_alpha).fit(train, ytrain)
model_lasso



Lasso(alpha=0.0006, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

## Ridge

In [21]:
# Ridge with a fixed regularisation strength, fitted on the full training set.
ridgecv_alpha = 30
model_ridge = linear_model.Ridge(alpha=ridgecv_alpha).fit(train, ytrain)
model_ridge

Ridge(alpha=30, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

## LightGBM


In [22]:
# LightGBM regressor with fixed hyperparameters, fitted on the full training set.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

model_lgb.fit(train, ytrain)


LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
       boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       feature_fraction=0.2319, feature_fraction_seed=9,
       learning_rate=0.05, max_bin=55, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=6, min_split_gain=0.0,
       min_sum_hessian_in_leaf=11, n_estimators=720, n_jobs=-1,
       num_leaves=5, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

## ElasticNet


In [23]:
# Tuned ElasticNet hyperparameters; the keys match the estimator's keyword
# arguments, so the dict is unpacked directly into the constructor.
elnParams = {'l1_ratio': 0.11226021096325794, 'alpha': 0.010576730829554074}

model_eln = linear_model.ElasticNet(**elnParams)

model_eln.fit(train, ytrain)




ElasticNet(alpha=0.010576730829554074, copy_X=True, fit_intercept=True,
      l1_ratio=0.11226021096325794, max_iter=1000, normalize=False,
      positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

## Stacked regression


In [38]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.preprocessing import RobustScaler

In [39]:
# Set up the base learners and the stacked ensemble.

# Shared cross-validation splitter for the *CV linear models.
kfolds = KFold(n_splits=10, shuffle=True, random_state=23)

# Candidate regularisation grids for ridge, lasso and elastic net. They were
# copied from a public kernel and have not been tuned for this data set.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [0.00005, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005,
           0.0006, 0.0007, 0.0008]
# NOTE: `alphas` is not passed to any estimator in this cell.
alphas = [0.00005, 0.0001, 0.0003, 0.0005, 0.0007,
          0.0009, 0.01]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# Every base learner is wrapped in a pipeline that applies RobustScaler first.
ridge = make_pipeline(RobustScaler(),
                      linear_model.RidgeCV(alphas=alphas_alt, cv=kfolds))

# max_iter is an iteration count and must be an integer; the float literal 1e7
# is rejected by scikit-learn's parameter validation in recent releases.
lasso = make_pipeline(RobustScaler(),
                      linear_model.LassoCV(max_iter=int(1e7), alphas=alphas2,
                                           random_state=42, cv=kfolds))

elasticnet = make_pipeline(RobustScaler(),
                           linear_model.ElasticNetCV(max_iter=int(1e7), alphas=e_alphas,
                                                     cv=kfolds, l1_ratio=e_l1ratio))

lightgbm = make_pipeline(RobustScaler(),
                         lgb.LGBMRegressor(objective='regression', num_leaves=5,
                                           learning_rate=0.05, n_estimators=720,
                                           max_bin=55, bagging_fraction=0.8,
                                           bagging_freq=5, feature_fraction=0.2319,
                                           feature_fraction_seed=9, bagging_seed=9,
                                           min_data_in_leaf=6,
                                           min_sum_hessian_in_leaf=11))

xgboost = make_pipeline(RobustScaler(),
                        XGBRegressor(learning_rate=0.01, n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=4,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006))

# Stack: five base regressors; the xgboost pipeline is also passed as the
# meta-regressor, and the original features are fed to it alongside the
# base-model predictions (use_features_in_secondary=True).
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                            xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

# The stack is fitted on plain NumPy arrays.
stackX = np.array(train)
stacky = np.array(ytrain)

In [41]:
# Fit the stacked ensemble on the array versions of the training data and keep
# the object returned by fit(); it is the one used for prediction below.
stack_gen_model = stack_gen.fit(stackX, stacky)

In [42]:
# Predict the test set with every fitted model.
eln_preds = model_eln.predict(test)
lasso_preds = model_lasso.predict(test)
ridge_preds = model_ridge.predict(test)
# The stacked model was fitted on a NumPy array (stackX = np.array(train)),
# so it is given the same representation here instead of a DataFrame.
stack_gen_preds = stack_gen_model.predict(np.array(test))
xgb_preds = model_xgb.predict(test)
lgbm_preds = model_lgb.predict(test)



In [44]:
# Equal-weight blend of the six models' test-set predictions.
_model_preds = [eln_preds, lasso_preds, ridge_preds, stack_gen_preds, xgb_preds, lgbm_preds]
average = sum(_model_preds) / len(_model_preds)

In [45]:
# Apply exp() to the averaged predictions and write them as a SalePrice column
# indexed by the saved test Ids.
# NOTE(review): the output location is an absolute, machine-specific path.
pred_df = pd.DataFrame(
    data=np.exp(average),
    columns=["SalePrice"],
    index=test_ID,
)
pred_df.to_csv(
    '/Users/holy/dsi/module1/averaged_6_models.csv',
    index_label='Id',
    header=True,
)

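## Looking at features

The table below puts the XGBoost feature importances and the LASSO coefficients side by side for each feature.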

In [73]:
# One row per feature: its name, XGBoost importance and Lasso coefficient.
feature_importance = pd.DataFrame(
    dict(
        features=train.columns.tolist(),
        xgb=list(model_xgb.feature_importances_),
        lasso=list(model_lasso.coef_),
    )
)

In [81]:
# Rows for the one-hot columns whose name starts with "Exterior2nd".
feature_importance.loc[feature_importance["features"].str.startswith("Exterior2nd")]

Unnamed: 0,features,xgb,lasso
211,Exterior2nd_AsbShng,0.000745,-0.0
212,Exterior2nd_AsphShn,0.0,-0.0
213,Exterior2nd_Brk Cmn,0.000372,-0.0
214,Exterior2nd_BrkFace,0.0,-0.0
215,Exterior2nd_CBlock,0.0,-0.0
216,Exterior2nd_CmentBd,0.0,0.0
217,Exterior2nd_HdBoard,0.001117,-0.0
218,Exterior2nd_ImStucc,0.0,0.0
219,Exterior2nd_MetalSd,0.001117,0.0
220,Exterior2nd_Other,0.0,-0.0


In [94]:
# Row for the squared log of first-floor area.
feature_importance.loc[feature_importance["features"].eq('1stFlrSF_log_sq')]

Unnamed: 0,features,xgb,lasso
89,1stFlrSF_log_sq,0.010052,0.008283


In [89]:
# (rows, columns) of the table: one row per feature, three columns.
feature_importance.shape

(272, 3)

## Optimizing weights using neural networks
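This section is an experiment: instead of averaging the models with equal weights, a small neural network is fitted on their predictions to learn how to combine them.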

In [126]:
from sklearn.neural_network import MLPRegressor

In [128]:
# One hidden unit with an identity activation: the network output is a linear
# function of its inputs (the individual models' predictions).
mlp = MLPRegressor(
    hidden_layer_sizes=(1,),
    activation='identity',
    solver='lbfgs',
    max_iter=100,
    random_state=1,
)

In [143]:
# Build the network's input matrices: the per-model prediction vectors are
# stacked as rows and then transposed.
# NOTE(review): apart from stack_gen_preds, none of the arrays used here
# (y_train_*, y_test_*, *_prediction, stack_gen_train, stack_gen_test) is
# created in the cells visible in this notebook; confirm where they come from
# before re-running.
mlp_X_train = np.vstack((
    y_train_eln, y_train_las, y_train_lgb,
    y_train_rdg, y_train_xgb, stack_gen_train,
)).T

mlp_X_test = np.vstack((
    y_test_eln, y_test_las, y_test_lgb,
    y_test_rdg, y_test_xgb, stack_gen_test,
)).T

mlp_X_pred = np.vstack((
    eln_prediction, las_prediction, lgb_prediction,
    rdg_prediction, xgb_prediction, stack_gen_preds,
)).T

In [140]:
# Fit the combining network on the stacked training predictions.
# NOTE(review): y_train is only assigned in the commented-out train/test split
# cell; confirm it is defined before running this cell.
mlp.fit(mlp_X_train, y_train)

MLPRegressor(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [144]:
# Network output for the training inputs, the hold-out inputs and the
# final prediction inputs, in that order.
y_train_mlp, y_test_mlp, mlp_prediction = (
    mlp.predict(features)
    for features in (mlp_X_train, mlp_X_test, mlp_X_pred)
)

In [149]:
# Apply exp() to the network's predictions and write them as a SalePrice column
# indexed by the saved test Ids.
# NOTE(review): the output location is an absolute, machine-specific path.
pred_df = pd.DataFrame(
    data=np.exp(mlp_prediction),
    columns=["SalePrice"],
    index=test_ID,
)
pred_df.to_csv(
    '/Users/holy/dsi/module1/weighted_ensemble_neural_network.csv',
    index_label='Id',
    header=True,
)
               header=True, index_label='Id')