In [37]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from pycaret.regression import setup, compare_models
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold, cross_val_score

In [38]:
# `suffix` selects which preprocessed dataset variant to read; the same value
# is reused at the end of the notebook to name the submission file.
suffix = "5"
input_path = f'data/preprocessed{suffix}.csv'
df = pd.read_csv(input_path)
df.head()

Unnamed: 0,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,...,EnclosedPorch 3SsnPorch,EnclosedPorch ScreenPorch,EnclosedPorch PoolArea,3SsnPorch^2,3SsnPorch ScreenPorch,3SsnPorch PoolArea,ScreenPorch^2,ScreenPorch PoolArea,PoolArea^2,SalePrice
0,60,7,5,2003,2003,196.0,1.0,0.0,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,208500.0
1,20,6,8,1976,1976,0.0,0.0,1.0,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,181500.0
2,60,7,5,2001,2002,162.0,1.0,0.0,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,223500.0
3,70,7,5,1915,1970,0.0,1.0,0.0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140000.0
4,60,8,5,2000,2000,350.0,1.0,0.0,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,250000.0


# store X and y

In [39]:
# The last column of `df` (SalePrice in the preview above) is the target;
# every column before it is used as a feature.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X.shape, y.shape

((2919, 494), (2919,))

# scale X

In [40]:
# Standardise every feature column of X.
# NOTE(review): the scaler is fit on all rows of X, i.e. before the train/test
# split performed in the next cell, so test-row statistics feed into the scaling.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# split test and train

In [41]:
# Rows [0, train_idx) are the training set; the remaining rows are the test set.
train_idx = 1460
train_rows = slice(None, train_idx)
test_rows = slice(train_idx, None)

X_train, X_test = X[train_rows], X[test_rows]
# The target values of the test rows are not used, hence the throwaway name.
y_train, _ = y[train_rows], y[test_rows]

# take log of y train

In [42]:
# Train on log(price). The value is derived from the untouched `y` rather than
# from `y_train` itself, so re-running this cell cannot apply the log twice.
y_train = np.log(y[:train_idx])

In [43]:
# PyCaret takes a single frame holding features and target, so append the
# log-target as one extra trailing column to the scaled training features.
target_column = y_train[:, np.newaxis]
train_matrix = np.concatenate([X_train, target_column], axis=1)
df_train = pd.DataFrame(train_matrix)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,485,486,487,488,489,490,491,492,493,494
0,0.067331,0.646183,-0.507284,1.046258,0.896833,0.529034,1.087334,-0.249719,0.781366,1.232599,...,-0.028464,-0.073418,-0.026805,-0.080946,0.0,0.0,-0.218169,-0.020842,-0.057544,12.247694
1,-0.873616,-0.063185,2.188279,0.154764,-0.395604,-0.567016,-0.818929,3.822508,0.781366,-0.756321,...,-0.028464,-0.073418,-0.026805,-0.080946,0.0,0.0,-0.218169,-0.020842,-0.057544,12.109011
2,0.067331,0.646183,-0.507284,0.980221,0.848965,0.338903,1.087334,-0.249719,0.781366,1.232599,...,-0.028464,-0.073418,-0.026805,-0.080946,0.0,0.0,-0.218169,-0.020842,-0.057544,12.317167
3,0.302568,0.646183,-0.507284,-1.859351,-0.682812,-0.567016,1.087334,-0.249719,-1.027363,-0.756321,...,-0.028464,-0.073418,-0.026805,-0.080946,0.0,0.0,-0.218169,-0.020842,-0.057544,11.849398
4,0.067331,1.355551,-0.507284,0.947203,0.753229,1.390216,1.087334,-0.249719,0.781366,1.232599,...,-0.028464,-0.073418,-0.026805,-0.080946,0.0,0.0,-0.218169,-0.020842,-0.057544,12.429216


# rename target col

In [44]:
# Give the final column (the log-target appended above) the label 'target'.
last_column = df_train.columns[-1]
df_train = df_train.rename(columns={last_column: 'target'})
df_train["target"]

0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
          ...    
1455    12.072541
1456    12.254863
1457    12.493130
1458    11.864462
1459    11.901583
Name: target, Length: 1460, dtype: float64

# setup pycaret

In [45]:
# Register the training frame with PyCaret as a regression experiment.
# `session_id` pins the experiment's random seed so the internal train/hold-out
# split and the leaderboard are reproducible on a fresh kernel; 6036 is the id
# reported in the recorded setup summary below.
s = setup(data=df_train, target='target', session_id=6036)

Unnamed: 0,Description,Value
0,Session id,6036
1,Target,target
2,Target type,Regression
3,Data shape,"(1460, 495)"
4,Train data shape,"(1021, 495)"
5,Test data shape,"(439, 495)"
6,Numeric features,494
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


# Compare different models

In [46]:
# Benchmark PyCaret's candidate regressors on the experiment configured by
# setup(); the resulting leaderboard is the table shown below.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.0867,0.0156,0.1229,0.8992,0.0095,0.0073,0.39
lightgbm,Light Gradient Boosting Machine,0.0879,0.0171,0.129,0.8886,0.01,0.0074,0.201
et,Extra Trees Regressor,0.0923,0.0187,0.1347,0.8799,0.0105,0.0077,0.57
xgboost,Extreme Gradient Boosting,0.0947,0.0189,0.1366,0.8755,0.0106,0.0079,0.454
rf,Random Forest Regressor,0.0959,0.0202,0.1404,0.871,0.0109,0.008,0.73
ridge,Ridge Regression,0.1007,0.0269,0.1601,0.8318,0.0124,0.0084,0.02
br,Bayesian Ridge,0.0901,0.0263,0.1522,0.8286,0.0116,0.0076,0.037
omp,Orthogonal Matching Pursuit,0.0902,0.0268,0.1531,0.8259,0.0117,0.0076,0.021
ada,AdaBoost Regressor,0.1235,0.0274,0.1647,0.8228,0.0127,0.0103,0.195
knn,K Neighbors Regressor,0.1363,0.0384,0.1948,0.7507,0.0151,0.0114,0.024


# train ensemble of best models 

In [47]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Averaging ensemble

In [51]:
# Members of the ensemble, keyed by a short name. Every estimator gets an
# explicit seed so that fits, CV scores and the final predictions are
# reproducible across kernel restarts.
SEED = 42
models = {
    "gbr": GradientBoostingRegressor(verbose=0, random_state=SEED),
    # "rf": RandomForestRegressor(),
    # "xgb": XGBRegressor(),
    "lgbm": LGBMRegressor(random_state=SEED),
}

In [52]:
# Fit every ensemble member on the full training set (target is log-price).
for model in models.values():
    model.fit(X_train, y_train)

# Evaluate

In [53]:
# 10-fold CV (unshuffled) for each model. y_train is log(price), so the square
# root of the negated score is an RMSE in log units; exponentiating it turns
# that into a multiplicative error factor (1.0 would mean a perfect fit).
folds = KFold(n_splits=10)
results = {
    name: np.exp(
        np.sqrt(
            -cross_val_score(
                model,
                X_train,
                y_train,
                scoring="neg_mean_squared_error",
                cv=folds,
            )
        )
    )
    for name, model in models.items()
}

In [54]:
# Per-fold exp(RMSE of log-price) for each model, one array of 10 values per key.
results

{'gbr': array([1.13242582, 1.1183778 , 1.11948823, 1.1645894 , 1.16634442,
        1.12056568, 1.12477109, 1.12378201, 1.12428275, 1.14457844]),
 'lgbm': array([1.14682178, 1.12499375, 1.12413319, 1.16290092, 1.16469125,
        1.12373681, 1.12774981, 1.11588884, 1.14111731, 1.14408661])}

In [55]:
# Summarise each model by the mean of its per-fold scores.
for name in results:
    result = results[name]
    print(f"{name}: {result.mean():.2f}")

gbr: 1.13
lgbm: 1.14


# Combine predictions

In [56]:
# Equal-weight average of the models' test predictions. Each model predicts
# log-price, so predictions are exponentiated back to price before averaging.
n_models = len(models)
weighted_predictions = (
    np.exp(model.predict(X_test)) / n_models for model in models.values()
)
pred = sum(weighted_predictions, np.zeros(X_test.shape[0]))
pred

array([124179.44966818, 150906.72508822, 186741.73893939, ...,
       173701.62944509, 117232.73763622, 233012.26762875])

# Save submission

In [57]:
# Write one row per test sample: ids run from 1461 (train_idx + 1) through
# len(df), paired with the averaged price predictions.
# NOTE(review): the header is lowercase `id`; confirm the consumer of this
# file expects that exact spelling.
y_submission = pred
submission = pd.DataFrame({
    'id': range(1461, len(df) + 1),
    'SalePrice': y_submission,
})
submission.to_csv(f'data/submission{suffix}.csv', index=False)