In [1]:
%pip install catboost

Collecting catboostNote: you may need to restart the kernel to use updated packages.

  Downloading catboost-1.2.8-cp313-cp313-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp313-cp313-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/102.4 MB ? eta -:--:--
    --------------------------------------- 1.8/102.4 MB 8.3 MB/s eta 0:00:13
   - -------------------------------------- 3.7/102.4 MB 8.3 MB/s eta 0:00:12
   - -------------------------------------- 5.0/102.4 MB 7.6 MB/s eta 0:00:13
   -- ------------------------------------- 5.8/102.4 MB 7.6 MB/s eta 0:00:13
   -- ------------------------------------- 6.8/102.4 MB 6.4 MB/s eta 0:00:16
   -- ------------------------------------- 7.6/102.4 MB 5.8 MB/s eta 0:00:17
   --- ------------------------------------ 8.1/102.4 MB 5.6 MB/s eta

In [4]:
# Install xgboost into the kernel's environment; the import cell below pulls
# XGBRegressor from it. NOTE(review): no version is pinned here - consider pinning.
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, RepeatedKFold, validation_curve
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt

In [6]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows its input down to a stored set of labels.

    Parameters
    ----------
    features : label or list of labels
        Kept unchanged on the instance and used to index ``X`` in
        ``transform``. In this notebook it is given lists of DataFrame
        column names.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Learn nothing; return the selector itself so it can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored ``features``."""
        wanted = self.features
        return X[wanted]

In [7]:
# Read the workbook, discard the 'Id' column, and keep only the rows that
# actually have a SalePrice value.
df = (
    pd.read_excel("C:\\Users\\singh\\House-Price-Pediction\\HousePricePrediction\\HousePricePrediction.xlsx")
    .drop(columns='Id')
    .dropna(subset=['SalePrice'])
)

In [8]:
# Target on a log scale: y = log(1 + SalePrice). The column is then removed
# from the feature frame.
y = np.log1p(df.loc[:, 'SalePrice'])
df = df.drop(columns='SalePrice')

# Partition the remaining column names by dtype: numeric vs. object.
num_features = df.select_dtypes(include='number').columns.tolist()
cat_features = df.select_dtypes(include='object').columns.tolist()

In [9]:
# Numeric branch: pick the numeric columns, fill gaps with the column median,
# then standardise.
num_pipeline = Pipeline(
    steps=[
        ('selector', DataFrameSelector(num_features)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: pick the object columns, fill gaps with the literal
# 'Missing', then one-hot encode into a dense array (categories unseen at fit
# time are ignored at transform time).
cat_pipeline = Pipeline(
    steps=[
        ('selector', DataFrameSelector(cat_features)),
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Both branches side by side; numeric output comes first, categorical second.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ]
)

# Fit both branches on the feature frame and get the combined feature matrix.
# NOTE(review): the imputers/scaler/encoder are fitted on every row of df here;
# no train/test split is visible before this point - confirm that is intended.
X_prepared = full_pipeline.fit_transform(df)

# Column labels follow the FeatureUnion order: numeric names first, then the
# names generated by the fitted one-hot encoder (second entry of the union).
onehot_encoder = full_pipeline.transformer_list[1][1].named_steps['onehot']
feature_names = num_features + list(onehot_encoder.get_feature_names_out(cat_features))

# Reuse df's row labels instead of a fresh 0..n-1 index: y was derived from df
# and keeps those labels, and remove_by_pvalue aligns the two with
# y.loc[X.index]. Without this, any gap left in df.index by the earlier dropna
# would make that lookup fail or pair rows with the wrong target.
X_df = pd.DataFrame(X_prepared, columns=feature_names, index=df.index)

def remove_by_vif(X, thresh=5.0, verbose=True):
    """Iteratively drop the column with the largest VIF until none exceeds ``thresh``.

    On every pass the variance inflation factor of each remaining column is
    computed with statsmodels' ``variance_inflation_factor`` on ``X.values``
    exactly as given (no intercept column is added here). The single column
    with the highest VIF is removed and the process repeats.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is not modified; each drop produces a new frame.
    thresh : float, default 5.0
        Columns are removed while the largest VIF is strictly greater than this.
    verbose : bool, default True
        Print one line per removed column.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the surviving columns (``X`` itself if nothing was
        dropped).
    """
    # A VIF regresses one column on the others, so it needs at least two columns.
    while X.shape[1] > 1:
        design = X.values  # materialise once per pass, not once per column
        # A perfectly collinear column has R^2 == 1, so statsmodels' 1 / (1 - R^2)
        # divides by zero. The resulting inf is exactly what we want (it is > thresh
        # and gets dropped), so only the RuntimeWarning noise is silenced.
        with np.errstate(divide='ignore'):
            vif = pd.Series(
                [variance_inflation_factor(design, i) for i in range(design.shape[1])],
                index=X.columns,
            )
        max_vif = vif.max()
        # Written as "not >" so a NaN maximum stops the loop, as a plain ">" test would.
        if not max_vif > thresh:
            break
        drop_col = vif.idxmax()
        if verbose:
            print(f"Removing '{drop_col}' due to high VIF: {max_vif:.2f}")
        X = X.drop(columns=[drop_col])
    return X

def remove_by_pvalue(X, y, thresh=0.05, verbose=True):
    """Backward elimination: drop the least significant column until all pass ``thresh``.

    On every pass an OLS model of ``y`` on ``X`` plus an intercept is fitted
    with statsmodels. The column with the largest coefficient p-value is
    removed, and the process repeats while that p-value is strictly greater
    than ``thresh``. The intercept is never a removal candidate.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is not modified; each drop produces a new frame.
    y : pandas.Series
        Target, looked up by ``X``'s row labels via ``y.loc[X.index]``.
    thresh : float, default 0.05
        Significance level above which a column is removed.
    verbose : bool, default True
        Print one line per removed column.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the surviving columns.

    Raises
    ------
    KeyError
        If ``y`` lacks some of ``X``'s row labels (raised by ``y.loc``).
    """
    # Dropping columns never changes the row labels, so align the target once.
    y_aligned = y.loc[X.index]
    while X.shape[1] > 0:
        X_const = sm.add_constant(X)
        model = sm.OLS(y_aligned, X_const).fit()
        # Exclude the intercept by name rather than by position: add_constant
        # adds nothing when X already holds a constant column, and slicing off
        # "the first entry" would then hide a real feature from elimination.
        pvalues = model.pvalues.drop(labels='const', errors='ignore')
        max_pval = pvalues.max()
        # Written as "not >" so a NaN maximum stops the loop, as a plain ">" test would.
        if not max_pval > thresh:
            break
        drop_col = pvalues.idxmax()
        if verbose:
            print(f"Removing '{drop_col}' due to high p-value: {max_pval:.4f}")
        X = X.drop(columns=[drop_col])
    return X

In [10]:
# Two-stage pruning: the VIF filter runs first, then p-value backward
# elimination runs on whatever columns survive it. Each stage gets its own copy.
X_df_clean = X_df.copy().pipe(remove_by_vif)
X_df_final = X_df_clean.copy().pipe(remove_by_pvalue, y)


  vif = 1. / (1. - r_squared_i)


Removing 'MSZoning_C (all)' due to high VIF: inf


  vif = 1. / (1. - r_squared_i)


Removing 'LotConfig_Corner' due to high VIF: inf


  vif = 1. / (1. - r_squared_i)


Removing 'BldgType_1Fam' due to high VIF: inf
Removing 'Exterior1st_VinylSd' due to high VIF: 57.09
Removing 'MSZoning_RL' due to high VIF: 7.46
Removing 'MSZoning_RH' due to high p-value: 0.8785
Removing 'LotConfig_FR3' due to high p-value: 0.8091
Removing 'Exterior1st_MetalSd' due to high p-value: 0.8059
Removing 'Exterior1st_CBlock' due to high p-value: 0.7691
Removing 'Exterior1st_WdShing' due to high p-value: 0.5309
Removing 'Exterior1st_AsphShn' due to high p-value: 0.5227
Removing 'Exterior1st_ImStucc' due to high p-value: 0.4998
Removing 'Exterior1st_Plywood' due to high p-value: 0.4538
Removing 'LotConfig_CulDSac' due to high p-value: 0.3888
Removing 'Exterior1st_AsbShng' due to high p-value: 0.3404
Removing 'Exterior1st_CemntBd' due to high p-value: 0.2967
Removing 'LotConfig_FR2' due to high p-value: 0.2353
Removing 'MSZoning_FV' due to high p-value: 0.1888
Removing 'Exterior1st_Wd Sdng' due to high p-value: 0.1793
Removing 'Exterior1st_Stucco' due to high p-value: 0.2350
Re