In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, warnings
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, StackingClassifier
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge, BayesianRidge, LassoLarsIC
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder, TargetEncoder, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from fastcore.transform import Transform

from scipy.special import boxcox1p
import lightgbm as lgb, xgboost as xgb

# Filter useless warnings
warnings.filterwarnings("ignore")

train_path = '/Users/fuhan/Desktop/Kaggle/house-prices-advanced-regression-techniques/train.csv'
test_path = '/Users/fuhan/Desktop/Kaggle/house-prices-advanced-regression-techniques/test.csv'

In [2]:
def impute_missing(df):
    df["PoolQC"] = df["PoolQC"].fillna("None")
    df["MiscFeature"] = df["MiscFeature"].fillna("None")
    df["Alley"] = df["Alley"].fillna("None")
    df["Fence"] = df["Fence"].fillna("None")
    df["FireplaceQu"] = df["FireplaceQu"].fillna("None")
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        df[col] = df[col].fillna(0)

    for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
        df[col] = df[col].fillna('None')

    for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
        df[col] = df[col].fillna(0)
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        df[col] = df[col].fillna('None')
    df["MasVnrType"] = df["MasVnrType"].fillna("None")
    df["MasVnrArea"] = df["MasVnrArea"].fillna(0)
    df["MasVnrType"] = df["MasVnrType"].fillna("None")
    df["MasVnrArea"] = df["MasVnrArea"].fillna(0)
    df['MSZoning'] = df['MSZoning'].fillna(df['MSZoning'].mode()[0])
    df.drop(['Utilities'], axis=1,inplace=True)
    df["Functional"] = df["Functional"].fillna("Typ")
    df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])
    df['KitchenQual'] = df['KitchenQual'].fillna(df['KitchenQual'].mode()[0])
    df['Exterior1st'] = df['Exterior1st'].fillna('Other')
    df['Exterior2nd'] = df['Exterior2nd'].fillna('Other')
    df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])
    df['MSSubClass'] = df['MSSubClass'].fillna("None")
    df["LotFrontage"] = df.groupby("Neighborhood")['LotFrontage'].transform(
                                lambda x: x.fillna(x.median()))
    
def ordinal_encoder(feature_t_list, df_org, df_ed):
    Contrast = []
    df_con = df_org.copy(deep=True)
    for item in feature_t_list:
        df_temp = df_con.groupby(item)['SalePrice'].mean().reset_index()
        df_temp = df_temp.sort_values(by = 'SalePrice', ascending = True)
        df_temp['Rank'] = df_temp['SalePrice'].rank().astype(int)
        
        # Combine the original feature and rank
        correlation_list = [[i,j] for i,j in zip(df_temp[item],df_temp['Rank'])]        
        Contrast.append([[item,df_temp[item].nunique()],correlation_list])
        
    for i in Contrast:
        for value in i[1]:
            df_ed.loc[:, i[0][0]] = df_ed[i[0][0]].replace(value[0], int(value[1]))

    return Contrast

def feature_create(df):
    df['LivingQual'] = 4*df['HeatingQC'] + 5*df['KitchenQual']
    df['RoomNum'] = df['TotRmsAbvGrd'] + df['FullBath'] + 0.5*df['HalfBath'] + df['KitchenAbvGr']
    df['UsedYears'] = 7*(2010-df['YearRemodAdd'])+4*(2010-df['YearBuilt'])
    df['GarageYrUsed'] = 2010-df['GarageYrBlt']
    df['GarageQualArea'] = df['GarageFinish']*df['GarageCond']*df['GarageArea']
    df['InteriorArea'] = 0.2*df['BsmtCond']*df['TotalBsmtSF']+df['OverallQual']*df['GrLivArea']
    df['ExteriorArea'] = df['LotArea']-df['1stFlrSF']
    df['Penalty'] = df['OverallQual']*df['LowQualFinSF']

In [3]:
# One-hot encoding
one_hot = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig',
           'Condition1', 'Condition2', 'BldgType',
           'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
           'MasVnrType', 'Foundation', 'BsmtExposure', 'Heating',
           'CentralAir', 'Electrical', 'Functional', 'Neighborhood',
           'GarageType', 'PavedDrive',  'MiscFeature', 'SaleType',
           'SaleCondition']

# Ordinal_encode based on target
ordinal_encode = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
                  'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu','GarageQual', 
                  'GarageCond', 'PoolQC', 'Fence', 'GarageFinish',
                  'Alley', 'LandSlope']

## Loading

In [4]:
# Load Dataset
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_test_id = df_test['Id']

# Concate train and test dataset
df = pd.concat([df_train,df_test], axis=0, ignore_index=True)
df = df.drop('Id', axis=1)
train_size = df_train.shape[0]

# Check the datatype
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(exclude=['object']).columns

---

## Data Preprocessing

In [5]:
# Impute missing value
impute_missing(df)
impute_missing(df_train)

# Categorical Feature One-Hot Encoding
df = pd.get_dummies(df, columns=one_hot, drop_first=True)
# Ordinal_encode
contrast = ordinal_encoder(ordinal_encode,df_train,df)

# Skewness Fixing
skewed_feats = df[numerical_columns].apply(lambda x: skew(x.dropna()))
skewness = skewed_feats[skewed_feats>1]
skewness_tran = ['MSSubClass', 'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
               'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', 'LowQualFinSF', 'GrLivArea',
               'BsmtHalfBath', 'KitchenAbvGr', 'WoodDeckSF', 'OpenPorchSF',
               'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
skewed_features = skewness_tran
lam = 0.15
for feat in skewed_features:
    df[feat] += 1
    df[feat] = boxcox1p(df[feat], lam)
    

# Feature Engineering
feature_create(df)
    

# Split back to train and test
df_train = df[:train_size]
df_test = df[train_size:]

X = df_train.drop('SalePrice', axis=1)
y = df_train['SalePrice']

# y log transform
y = np.log1p(y)

---

## Training

In [6]:
# ----------------------Model Initialize--------------------------

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = make_pipeline(RobustScaler(), KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5))
RF = RandomForestRegressor()
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,verbose=-1)

# ----------------------Cross Validation--------------------------
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    rmlse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = kf))
    return(rmlse)

In [7]:
model_name_box = [['lasso',lasso], 
             ['ENet',ENet], 
             ['KRR',KRR], 
             ['RF',RF], 
             ['GBoost',GBoost], 
             ['model_xgb',model_xgb], 
             ['model_lgb',model_lgb]]
model_box = [lasso, ENet, KRR, RF, GBoost, model_xgb, model_lgb]

In [8]:
for model in model_name_box:
    score = rmsle_cv(model[1])
    print(model[0])
    print("Score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

lasso
Score: 0.1281 (0.0159)

ENet
Score: 0.1283 (0.0159)

KRR
Score: 0.1312 (0.0131)

RF
Score: 0.1446 (0.0101)

GBoost
Score: 0.1246 (0.0118)

model_xgb
Score: 0.1280 (0.0099)

model_lgb
Score: 0.1252 (0.0098)



In [9]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

In [10]:
averaged_models = AveragingModels(models = (lasso, ENet, GBoost, model_xgb, model_lgb))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 0.1210 (0.0120)



In [11]:
from sklearn.ensemble import StackingRegressor
base_models = [
    ('ENet', ENet),
    ('GBoost', GBoost),
    ('model_lgb', model_lgb)
]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the meta-learner (Gradient Boosting Classifier)
meta_model = lasso
stack_reg = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)
stack_reg.fit(X_train, y_train)
y_pred = stack_reg.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error: {rmse:.4f}")

Root Mean Squared Error: 0.1298


In [13]:
# Importing required libraries
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Simpler approach using StackingRegressor
# Assuming lasso, ENet, GBoost, model_xgb, model_lgb are regressors
stacking_regressor = StackingRegressor(estimators=[
    ('lasso', lasso),
    ('ENet', ENet),
    ('GBoost', GBoost),
    ('model_xgb', model_xgb),
    ('model_lgb', model_lgb)
])

def rmsle_cv(model):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

score = rmsle_cv(stacking_regressor)
print("Stacking Regressor score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Stacking Regressor score: 0.1268 (0.0218)



In [14]:
# Importing required libraries
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Simpler approach using VotingRegressor for averaging predictions
# Assuming lasso, ENet, GBoost, model_xgb, model_lgb are regressors
voting_regressor = VotingRegressor(estimators=[
    ('lasso', lasso),
    ('ENet', ENet),
    ('GBoost', GBoost),
    ('model_xgb', model_xgb),
    ('model_lgb', model_lgb)
])

def rmsle_cv(model):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

score = rmsle_cv(voting_regressor)
print("Voting Regressor (Averaging) score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Voting Regressor (Averaging) score: 0.1260 (0.0202)



In [16]:
# Final model emmbeded
final_model = voting_regressor.fit(X,y)
# Drop the target
# df_test = df_test.drop('SalePrice',axis=1)
# Get the result
result = final_model.predict(df_test)
# Transform back to og
result_org = np.expm1(result)

# Out put the sub file
sub = pd.DataFrame()
sub['Id'] = df_test_id
sub['SalePrice'] = result_org
sub.to_csv('submission.csv',index=False)

In [17]:
# Final model stack
final_model = stacking_regressor.fit(X,y)
# Drop the target
# df_test = df_test.drop('SalePrice',axis=1)
# Get the result
result = final_model.predict(df_test)
# Transform back to og
result_org = np.expm1(result)

# Out put the sub file
sub = pd.DataFrame()
sub['Id'] = df_test_id
sub['SalePrice'] = result_org
sub.to_csv('submission.csv',index=False)

In [19]:
# Final model lassao
final_model = lasso.fit(X,y)
# Drop the target
# df_test = df_test.drop('SalePrice',axis=1)
# Get the result
result = final_model.predict(df_test)
# Transform back to og
result_org = np.expm1(result)

# Out put the sub file
sub = pd.DataFrame()
sub['Id'] = df_test_id
sub['SalePrice'] = result_org
sub.to_csv('submission.csv',index=False)