In [38]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from pycaret.regression import setup, compare_models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score

In [39]:
# Load the preprocessed feature table. `suffix` is interpolated into the
# input path here and into the submission filename at the end of the notebook.
suffix = "4"
preprocessed_path = f'data/preprocessed{suffix}.csv'
df = pd.read_csv(preprocessed_path)
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,60,1.646667,2.30678,7,5,2.152099,2.152099,1.83788,706.0,0.0,...,0,0,1,0,0,0,0,1,0,208500.0
1,20,1.68537,2.319405,6,8,2.150521,2.150521,0.0,978.0,0.0,...,0,0,1,0,0,0,0,1,0,181500.0
2,60,1.655196,2.334879,7,5,2.151983,2.152041,1.807264,486.0,0.0,...,0,0,1,0,0,0,0,1,0,223500.0
3,70,1.63137,2.318892,7,5,2.146866,2.150167,0.0,216.0,0.0,...,0,0,1,1,0,0,0,0,0,140000.0
4,60,1.694266,2.357574,8,5,2.151925,2.151925,1.925822,655.0,0.0,...,0,0,1,0,0,0,0,1,0,250000.0


# store X and y

In [40]:
# The last column (SalePrice) is the target; every other column is a feature.
features, target = df.iloc[:, :-1], df.iloc[:, -1]
X = features.to_numpy()
y = target.to_numpy()
X.shape, y.shape

((2919, 302), (2919,))

# scale X

In [41]:
# Standardize every feature column. The scaler is fitted on all rows of X
# (train and test together) because the split happens in a later cell.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# split test and train

In [42]:
# Rows before train_idx are used for training; the remaining rows are the
# ones predictions are generated for.
train_idx = 1460
X_train, X_test = X[:train_idx], X[train_idx:]
# Only the training targets are needed. The tail of y is never used, so it is
# not sliced into a throwaway `_` (assigning to `_` would also override
# IPython's last-output variable).
y_train = y[:train_idx]

# take log of y train

In [43]:
# Train on the natural log of the target. The value is derived from the
# untouched `y` (not from `y_train` itself) so that re-running this cell
# cannot apply the log a second time.
y_train = np.log(y[:train_idx])

In [44]:
# Assemble one table: the scaled training features followed by the
# log-transformed target as the final column.
train_matrix = np.column_stack((X_train, y_train))
df_train = pd.DataFrame(train_matrix)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,293,294,295,296,297,298,299,300,301,302
0,0.067331,0.054405,-0.076545,0.646183,-0.507284,1.039973,0.895589,1.245516,0.581145,-0.293025,...,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,12.247694
1,-0.873616,0.656488,0.168496,-0.063185,2.188279,0.16231,-0.389569,-0.805949,1.178255,-0.293025,...,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,12.109011
2,0.067331,0.187085,0.46884,0.646183,-0.507284,0.975415,0.848336,1.211342,0.098189,-0.293025,...,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,12.317167
3,0.302568,-0.183557,0.158528,0.646183,-0.507284,-1.870828,-0.67782,-0.805949,-0.494529,-0.293025,...,-0.298629,-0.049029,0.394439,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693,11.849398
4,0.067331,0.794874,0.909324,1.355551,-0.507284,0.943109,0.753751,1.343678,0.469187,-0.293025,...,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,12.429216


# rename target col

In [45]:
# Label the final column 'target' (rebinding the frame rather than mutating it in place).
target_col = df_train.columns[-1]
df_train = df_train.rename(columns={target_col: 'target'})
df_train["target"]

0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
          ...    
1455    12.072541
1456    12.254863
1457    12.493130
1458    11.864462
1459    11.901583
Name: target, Length: 1460, dtype: float64

# setup pycaret

In [46]:
# Initialise the PyCaret regression experiment on the training table.
# session_id fixes the experiment's random seed so a fresh kernel reproduces
# the same run; 2314 is the session id reported by the recorded output.
s = setup(data=df_train, target='target', session_id=2314)

Unnamed: 0,Description,Value
0,Session id,2314
1,Target,target
2,Target type,Regression
3,Data shape,"(1460, 303)"
4,Train data shape,"(1021, 303)"
5,Test data shape,"(439, 303)"
6,Numeric features,302
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


# Compare different models

In [47]:
# Compare PyCaret's candidate regressors on the experiment configured by
# setup() and display the resulting leaderboard. The target column holds
# log(SalePrice), so the error metrics are measured on the log scale.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,0.0859,0.0194,0.133,0.8757,0.0102,0.0072,0.026
gbr,Gradient Boosting Regressor,0.0931,0.0195,0.1379,0.8746,0.0107,0.0078,0.143
lightgbm,Light Gradient Boosting Machine,0.0974,0.021,0.1434,0.8652,0.0111,0.0082,0.056
xgboost,Extreme Gradient Boosting,0.1001,0.0216,0.1461,0.8612,0.0113,0.0084,2.466
et,Extra Trees Regressor,0.1009,0.022,0.1475,0.8592,0.0114,0.0084,0.405
br,Bayesian Ridge,0.0904,0.0228,0.1417,0.8523,0.0109,0.0076,0.035
rf,Random Forest Regressor,0.1013,0.0229,0.1503,0.8521,0.0117,0.0085,0.425
ridge,Ridge Regression,0.0964,0.0268,0.1515,0.8268,0.0116,0.0081,12.444
ada,AdaBoost Regressor,0.1413,0.0354,0.1877,0.7703,0.0145,0.0118,0.099
knn,K Neighbors Regressor,0.1461,0.0429,0.206,0.7269,0.0159,0.0122,0.032


# train ensemble of best models 

In [48]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Averaging ensemble

In [49]:
# Regressors whose test-set predictions are averaged further down.
# A shared random_state makes the fits, the CV scores and the final
# submission repeatable across kernel restarts.
SEED = 42
models = {
    "gbr": GradientBoostingRegressor(verbose=0, random_state=SEED),
    "rf": RandomForestRegressor(random_state=SEED),
    "xgb": XGBRegressor(random_state=SEED),
    "lgbm": LGBMRegressor(random_state=SEED),
}

In [50]:
# Fit every model on the full training set; these fitted instances are the
# ones later asked for test-set predictions.
for model in models.values():
    model.fit(X_train, y_train)

# Evaluate

In [51]:
# 10-fold cross-validation per model. Each fold's negated MSE is turned into
# an RMSE on the log target and then exponentiated, so every entry is a
# multiplicative error factor (1.0 would be a perfect fit).
folds = KFold(n_splits=10)
results = {
    name: np.exp(np.sqrt(-cross_val_score(model,
                                          X_train,
                                          y_train,
                                          scoring="neg_mean_squared_error",
                                          cv=folds)))
    for name, model in models.items()
}

In [52]:
# Per-fold exp(RMSE) arrays, keyed by model name.
results

{'gbr': array([1.14347333, 1.11111311, 1.12108973, 1.17306463, 1.16667264,
        1.11248624, 1.14453767, 1.11028731, 1.12919863, 1.14535942]),
 'rf': array([1.15839452, 1.12870185, 1.14443896, 1.18102731, 1.18284001,
        1.12228006, 1.14687305, 1.12916903, 1.15579179, 1.16416433]),
 'xgb': array([1.13152644, 1.12182601, 1.12733266, 1.18556151, 1.17065609,
        1.12251374, 1.17658053, 1.13339906, 1.14559438, 1.15911957]),
 'lgbm': array([1.14592582, 1.11382559, 1.12628017, 1.177269  , 1.16726057,
        1.11373092, 1.1375605 , 1.11281295, 1.13415463, 1.15066525])}

In [53]:
# Report each model's mean score across the folds, to two decimals.
for name in results:
    result = results[name]
    print(f"{name}: {result.mean():.2f}")

gbr: 1.14
rf: 1.15
xgb: 1.15
lgbm: 1.14


# Combine predictions

In [54]:
# Equal-weight average of the models' predictions. Each model predicts the
# log target, so predictions are exponentiated before being averaged.
n_models = len(models)
pred = np.zeros(len(X_test))
for model in models.values():
    model_pred = np.exp(model.predict(X_test))
    pred = pred + model_pred / n_models
pred

array([123060.49252137, 160930.73091762, 182216.92860612, ...,
       160201.6431354 , 115108.09284144, 233558.9707985 ])

# Save submission

In [55]:
# Write the averaged predictions to CSV with an id column that counts up
# from 1461 to len(df).
# NOTE(review): the header is written as lowercase 'id' — confirm this casing
# matches what the submission target expects.
y_submission = pred
submission_ids = range(1461, len(df)+1)
submission = pd.DataFrame({'id': submission_ids, 'SalePrice': y_submission})
submission.to_csv(f'data/submission{suffix}.csv', index=False)