# Import the Libraries

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [48]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error

In [49]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [3]:
# Config

random_state = 42
n_jobs = -1

# Load the Dataset

In [5]:
train = pd.read_csv('house-prices-dataset/train.csv')
test = pd.read_csv('house-prices-dataset/test.csv')
test_ids = test['Id']

In [6]:
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [7]:
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [15]:
# Target and Log Transform

y = np.log1p(train['SalePrice'])
train = train.drop(columns=['SalePrice'])

In [16]:
# Combining train + test for consistent preprocessing

all_data = pd.concat([train, test], sort=False).reset_index(drop=True)

In [17]:
all_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [19]:
all_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Feature Engineering

In [24]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        # Total square feet
        X['TotalSF'] = X['TotalBsmtSF'].fillna(0) + X['1stFlrSF'].fillna(0) + X['2ndFlrSF'].fillna(0)
        # Total bathrooms (count half bathrooms as 0.5)
        X['TotalBath'] = (
            X['FullBath'].fillna(0) + 0.5 * X['HalfBath'].fillna(0) + 
            X['BsmtFullBath'].fillna(0) + 0.5 * X['BsmtHalfBath'].fillna(0)
        )
        # Has pool, Has garage, Has basement etc
        X['HasPool'] = (X['PoolArea'].fillna(0) > 0).astype(int)
        X['HasGarage'] = (~X['GarageType'].isna()).astype(int)
        X['HasBsmt'] = (X['BsmtQual'].isna()).astype(int)
        # Age of House
        X['HouseAge'] = X['YrSold'] - X['YearBuilt']
        X['RemodAge'] = X['YrSold'] - X['YearRemodAdd']
        return X

In [25]:
fe = FeatureEngineering()
all_data = fe.transform(all_data)

In [26]:
all_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,YrSold,SaleType,SaleCondition,TotalSF,TotalBath,HasPool,HasGarage,HasBsmt,HouseAge,RemodAge
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,2008,WD,Normal,2566.0,3.5,0,1,0,5,5
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,2007,WD,Normal,2524.0,2.5,0,1,0,31,31
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,2008,WD,Normal,2706.0,3.5,0,1,0,7,6
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,2006,WD,Abnorml,2473.0,2.0,0,1,0,91,36
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,2008,WD,Normal,3343.0,3.5,0,1,0,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,2006,WD,Normal,1638.0,1.5,0,0,0,36,36
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,2006,WD,Abnorml,1638.0,1.5,0,1,0,36,36
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,2006,WD,Abnorml,2448.0,2.0,0,1,0,46,10
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,2006,WD,Normal,1882.0,1.5,0,0,0,14,14


In [27]:
# Identify Features

numeric_feat = all_data.select_dtypes(include=[np.number]).columns.tolist()
for x in ['Id']:
    if x in numeric_feat:
        numeric_feat.remove(x)

In [67]:
numeric_feat

['LotFrontage',
 'LotArea',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'TotalSF',
 'TotalBath',
 'HasPool',
 'HasGarage',
 'HasBsmt',
 'HouseAge',
 'RemodAge']

In [29]:
numeric_as_cat = ['MSSubClass', 'OverallQual', 'OverallCond', 'MoSold', 'YrSold']
for c in numeric_as_cat:
    if c in numeric_feat:
        numeric_feat.remove(c)

In [30]:
cat_feat = all_data.select_dtypes(include=['object']).columns.tolist()
cat_feat = cat_feat + [c for c in numeric_as_cat if c in all_data.columns]

In [33]:
# Drop Id column
if 'Id' in all_data.columns:
    all_data = all_data.drop(columns=['Id'])

In [34]:
# Numeric Transformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [36]:
# Categorical Transformer

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [37]:
# Ordinal mapping helper

ordinal_mappings = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'Missing':0}

In [38]:
class OrdinalMapper(BaseEstimator, TransformerMixin):
    def __init__(self, cols, mapping):
        self.cols = cols
        self.mapping = mapping
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        for c in X.columns:
            if c in X.columns:
                X[c] = X[c].fillna('Missing').map(self.mapping).astype(float)
        return X

In [39]:
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'GarageQual', 'GarageCond']

In [43]:
preprocessor = ColumnTransformer(transformers=[
    ('ordinal', Pipeline(steps=[('mapper', OrdinalMapper(ordinal_cols, ordinal_mappings)),
                                ('impute_num', SimpleImputer(strategy='median'))]), ordinal_cols),
    ('num', numeric_transformer, [c for c in numeric_feat if c not in ordinal_cols]),
    ('cat', categorical_transformer, cat_feat)
], remainder='drop', sparse_threshold=0)

In [44]:
# Split back into train/test

n_train = train.shape[0]
X_all = all_data
X = X_all.iloc[:n_train, :].copy()
X_test = X_all.iloc[n_train:, :].copy().reset_index(drop=True)
assert X.shape[0] == y.shape[0]

In [51]:
# Modelling

def cv_rmse(model, X, y, folds=5):
    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=kf, n_jobs=n_jobs)
    return -scores.mean()

In [52]:
rf_pipeline = Pipeline(steps=[('pre', preprocessor),
                              ('model', RandomForestRegressor(n_estimators=300, random_state=random_state, n_jobs=n_jobs))])
xgb_pipeline = Pipeline(steps=[('pre', preprocessor),
                               ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                                         n_estimators=1000,
                                                         learning_rate=0.05,
                                                         random_state=random_state,
                                                         n_jobs=8))])

In [53]:
print('Evaluating baseline RandomForest...')
rf_rmse = cv_rmse(rf_pipeline, X, y, folds=5)
print(f'RF CV RMSE (log-target space): {rf_rmse:.5f}')

Evaluating baseline RandomForest...
RF CV RMSE (log-target space): 0.14479


In [54]:
print('Evaluating baseline XGBoost...')
xgb_rmse = cv_rmse(xgb_pipeline, X, y, folds=5)
print(f'XGB CV RMSE (log-target space): {xgb_rmse:.5f}')

Evaluating baseline XGBoost...
XGB CV RMSE (log-target space): 0.13948


# HyperParameter Tuning 

In [55]:
xgb_param_dist = {
    'model__n_estimators': [300, 500, 800],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.4, 0.6, 0.8, 1.0],
    'model__reg_alpha': [0, 0.5, 1],
    'model__reg_lambda': [1, 2, 5]
}

In [56]:
rs = RandomizedSearchCV(xgb_pipeline, xgb_param_dist, n_iter=25, cv=3, scoring='neg_root_mean_squared_error',
                        random_state=random_state, n_jobs=1, verbose=1)

print('Starting randomized search for XGBoost...\n')
rs.fit(X, y)
print('Best XGB params:', rs.best_params_)
print('\nBest XGB CV RMSE:', -rs.best_score_)

Starting randomized search for XGBoost...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best XGB params: {'model__subsample': 0.6, 'model__reg_lambda': 2, 'model__reg_alpha': 0, 'model__n_estimators': 800, 'model__max_depth': 3, 'model__learning_rate': 0.05, 'model__colsample_bytree': 0.6}
Best XGB CV RMSE: 0.1253749526186794


In [57]:
best_xgb = rs.best_estimator_
best_xgb.fit(X, y)

0,1,2
,steps,"[('pre', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ordinal', ...), ('num', ...), ...]"
,remainder,'drop'
,sparse_threshold,0
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,cols,"['ExterQual', 'ExterCond', ...]"
,mapping,"{'Ex': 5, 'Fa': 2, 'Gd': 4, 'Missing': 0, ...}"

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6
,device,
,early_stopping_rounds,
,enable_categorical,False


In [58]:
# RandomForest small grid search

rf_grid = {
    'model__n_estimators': [200, 400],
    'model__max_features': ['sqrt', 0.3]
}

rg = GridSearchCV(rf_pipeline, rf_grid, cv=3, scoring='neg_root_mean_squared_error')
rg.fit(X, y)
print('Best RF params:', rg.best_params_)
print('Best RF CV RMSE:', rg.best_score_)
best_rf = rg.best_estimator_

Best RF params: {'model__max_features': 0.3, 'model__n_estimators': 400}
Best RF CV RMSE: -0.14247111290160827


# Ensemble Predictions

In [59]:
print('Creating Ensemble Predictions...')
preds_xgb_log = best_xgb.predict(X_test)
preds_rf_log = best_rf.predict(X_test)
preds_ensemble_log = 0.5 * preds_xgb_log + 0.5 * preds_rf_log
preds_ensemble = np.expm1(preds_ensemble_log)

Creating Ensemble Predictions...


# Save the Model

In [60]:
joblib.dump(best_xgb, 'best_xgb__pipeline.joblib')
joblib.dump(best_rf, 'best_rf__pipeline.joblib')
print('Saved best_xgb_pipeline.joblib and best_rf__pipeline.joblib')

Saved best_xgb_pipeline.joblib and best_rf__pipeline.joblib


In [61]:
# Submission File

submission = pd.DataFrame({'Id': test_ids, 'SalePrice': preds_ensemble})
submission.to_csv('submission_ensemble.csv', index=False)
print("Submission written './submission_ensemble.csv' ")

Submission written './submission_ensemble.csv' 


In [79]:
pre = best_xgb.named_steps["pre"]

# 1. Ordinal block
ordinal_feature_names = ordinal_cols

# 2. Numeric block
numeric_feature_names = [c for c in numeric_feat if c not in ordinal_cols]

# 3. Categorical block (OneHotEncoder)
cat_pipe = pre.named_transformers_["cat"]
onehot = cat_pipe.named_steps["onehot"]

In [80]:
if hasattr(onehot, "get_feature_names_out"):
    categorical_feature_names = list(onehot.get_feature_names_out(cat_feat))
else:
    categorical_feature_names = []

# Combine in EXACT ColumnTransformer order
feature_names = (
    ordinal_feature_names
    + numeric_feature_names
    + categorical_feature_names
)

In [81]:
# --- XGBOOST FEATURE IMPORTANCES ---
booster = best_xgb.named_steps["model"]

if hasattr(booster, "feature_importances_"):
    importances = booster.feature_importances_

    # Now these MUST match
    if len(importances) == len(feature_names):
        fi = pd.Series(importances, index=feature_names)
        fi = fi.sort_values(ascending=False).head(30)
        fi.to_csv("feature_importances.csv")
        print("Saved top feature importances to feature_importances.csv")
    else:
        print("Still mismatch:", len(importances), "vs", len(feature_names))
else:
    print("Model does not provide feature_importances_.")

Saved top feature importances to feature_importances.csv
