# Imports

In [1]:
# General libraries
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-learn model selection and metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

# Scikit-learn transformers and preprocessors & Feature Selection
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.pipeline import make_pipeline

# Scikit-learn regressors
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# External regressor
import xgboost as xgb

# Miscellaneous settings
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

In [2]:
houses = pd.read_csv('https://raw.githubusercontent.com/MerleSt/HousingPricePrediction/main/Data/housing_prices.csv')

In [3]:
houses.drop(columns=['Id'], inplace=True)

# Split Data

In [5]:
X = houses.copy()
y = X.pop('SalePrice')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# # Temporarily set the display options to show all rows
# pd.set_option('display.max_rows', None)

# # Display the NaN counts for all columns
# print(X_train.isna().sum())

# # Reset the display options back to default settings
# pd.reset_option('display.max_rows')

In [7]:
# Adjust pandas display settings
pd.set_option('display.float_format', '{:.4f}'.format)

# Selecting only numeric columns
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

df = pd.DataFrame({
    'Feature': numeric_cols,
    'Variance': X_train[numeric_cols].var().round(4),
    'Range': X_train[numeric_cols].max() - X_train[numeric_cols].min()
})

# Sorting the DataFrame by variance
df = df.sort_values(by='Variance', ascending=False)

print(df)

# Reset the display settings to default if needed
pd.reset_option('display.float_format')


                     Feature       Variance       Range
LotArea              LotArea 115763960.9755 213945.0000
MiscVal              MiscVal    305852.8917  15500.0000
GrLivArea          GrLivArea    275029.6424   5308.0000
BsmtFinSF1        BsmtFinSF1    210746.1622   5644.0000
BsmtUnfSF          BsmtUnfSF    199241.3123   2336.0000
TotalBsmtSF      TotalBsmtSF    194195.6281   6110.0000
2ndFlrSF            2ndFlrSF    193222.5519   2065.0000
1stFlrSF            1stFlrSF    149517.7629   4358.0000
GarageArea        GarageArea     44561.2565   1418.0000
MasVnrArea        MasVnrArea     29940.1555   1378.0000
BsmtFinSF2        BsmtFinSF2     25032.7771   1127.0000
WoodDeckSF        WoodDeckSF     16818.4428    857.0000
OpenPorchSF      OpenPorchSF      4821.0221    547.0000
EnclosedPorch  EnclosedPorch      3854.3271    552.0000
ScreenPorch      ScreenPorch      3122.7027    480.0000
LowQualFinSF    LowQualFinSF      2293.1749    572.0000
MSSubClass        MSSubClass      1808.9593    1

In [None]:
# Correlation matrix
corr_matrix = X_train.corr()

# Making heatmap larger
plt.figure(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

# Finding pairs with high correlation
high_corr_pairs = []
for feature1 in corr_matrix.columns:
    for feature2 in corr_matrix.columns:
        # We only consider pairs where feature1 < feature2 to avoid duplicate pairs and self-correlation
        if feature1 < feature2 and (corr_matrix.loc[feature1, feature2] > 0.8 or corr_matrix.loc[feature1, feature2] < -0.8):
            high_corr_pairs.append((feature1, feature2, corr_matrix.loc[feature1, feature2]))

# Print the high correlation pairs
print("\nPairs with high correlation:")
for pair in high_corr_pairs:
    print(f"Features: {pair[0]} - {pair[1]}, Correlation: {pair[2]:.2f}")

# Preprocess Data

In [8]:
num_features = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_features = X_train.select_dtypes(include=['object']).columns.tolist()

## Encoding

In [9]:
# Define orders
order_LandSlope = ['Sev', 'Mod', 'Gtl']
order_ExterQual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_ExterCond = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_BsmtQual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_BsmtCond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_BsmtExposure = ['NA', 'No', 'Mn', 'Av', 'Gd']
order_BsmtFinType1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
order_BsmtFinType2 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
order_HeatingQC = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_KitchenQual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
order_FireplaceQu = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_GarageFinish = ['NA', 'Unf', 'RFn', 'Fin']
order_GarageQual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_GarageCond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
order_PoolQC = ['NA', 'Fa', 'TA', 'Gd', 'Ex']
order_Fence = ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# Setup the encoder
ordinal_enc = OrdinalEncoder(
    categories=[
        order_LandSlope, order_ExterQual, order_ExterCond, order_BsmtQual, order_BsmtCond, 
        order_BsmtExposure, order_BsmtFinType1, order_BsmtFinType2, order_HeatingQC, 
        order_KitchenQual, order_FireplaceQu, order_GarageFinish, order_GarageQual, 
        order_GarageCond, order_PoolQC, order_Fence
    ],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)


In [10]:
cat_ordinal = [
    'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 
    'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 
    'GarageCond', 'PoolQC', 'Fence'
]
cat_onehot = [col for col in cat_features if col not in cat_ordinal]

## Pipelines

### Select From Model

In [11]:
num_pipeline = make_pipeline(
    SimpleImputer()
)

cat_ordinal_pipe = make_pipeline(
    SimpleImputer(strategy='constant'),
    ordinal_enc
)

cat_onehot_pipe = make_pipeline(
    SimpleImputer(strategy='constant'),
    OneHotEncoder(drop='first', handle_unknown='ignore')
)

In [12]:
preprocessor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipeline, num_features),
    ('onehot', cat_onehot_pipe, cat_onehot),
    ('ordinal', cat_ordinal_pipe, cat_ordinal)
])

In [27]:
pipeline = make_pipeline(preprocessor, SelectFromModel(RandomForestRegressor(random_state=42)), RandomForestRegressor(random_state=42))

In [28]:
pipeline.fit(X_train, y_train)

In [29]:
predictions = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions).round(2)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions, squared=False).round(2)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

model_df = pd.DataFrame({
    "MAE": [mae],
    "RMSE": [rmse],
    "MAPE": [mape],
    "R2": [r2]},
)
model_df

Unnamed: 0,MAE,RMSE,MAPE,R2
0,17675.84,28383.28,0.10898,0.894971


### Recursive Feature Elimination

In [16]:
num_pipeline = make_pipeline(
    SimpleImputer()
)

cat_ordinal_pipe = make_pipeline(
    SimpleImputer(strategy='constant'),
    ordinal_enc
)

cat_onehot_pipe = make_pipeline(
    SimpleImputer(strategy='constant'),
    OneHotEncoder(drop='first', handle_unknown='ignore')
)

In [17]:
preprocessor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipeline, num_features),
    ('onehot', cat_onehot_pipe, cat_onehot),
    ('ordinal', cat_ordinal_pipe, cat_ordinal)
])

In [32]:
rfe_tree = RFECV(RandomForestRegressor(random_state=42), n_jobs=-2, verbose=3)

In [33]:
pipeline = make_pipeline(preprocessor, rfe_tree, RandomForestRegressor(random_state=42))

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
predictions = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions, squared=False)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

model_df = pd.DataFrame({
    "MAE": [mae],
    "RMSE": [rmse],
    "MAPE": [mape],
    "R2": [r2]},
)
model_df

In [25]:
pipeline.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   

# Hypertuning

## Randomized Grid Search

In [None]:
param_grid = {
    'selectfrommodel__estimator__n_estimators': [50, 100, 200, 300],
    'selectfrommodel__estimator__max_depth': [None, 10, 20, 30, 40],
    'selectfrommodel__estimator__min_samples_split': [2, 5, 10],
    'selectfrommodel__estimator__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__n_estimators': [50, 100, 200, 300],
    'randomforestregressor__max_depth': [None, 10, 20, 30, 40],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4]
}

In [None]:
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=10,
    random_state=42,
    n_jobs=-2
)

In [None]:
search.fit(X_train, y_train)

In [None]:
search.best_params_

## Evaluate Model

In [None]:
predictions = search.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions).round(2)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions, squared=False).round(2)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions).round(2)
r2 = r2_score(y_true=y_test, y_pred=predictions).round(2)

model_df = pd.DataFrame({
    "MAE": [mae],
    "RMSE": [rmse],
    "MAPE": [mape],
    "R2": [r2]},
)
model_df

In [None]:
model_df

## Grid Search

In [None]:
param_grid = {
    'selectfrommodel__estimator__n_estimators': [90, 100, 110],
    'selectfrommodel__estimator__max_depth': [25, 30, 35],
    'selectfrommodel__estimator__min_samples_split': [2],
    'selectfrommodel__estimator__min_samples_leaf': [1, 2, 3],
    'randomforestregressor__n_estimators': [90, 100, 110],
    'randomforestregressor__max_depth': [None, 5, 10],
    'randomforestregressor__min_samples_split': [4, 5, 6],
    'randomforestregressor__min_samples_leaf': [1, 2]
}

In [None]:
search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    verbose=2,
    n_jobs=-2
)

In [None]:
search.fit(X_train, y_train)

In [None]:
search.best_params_

## Evaluate Model

In [None]:
predictions = search.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions).round(2)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions, squared=False).round(2)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=predictions).round(2)
r2 = r2_score(y_true=y_test, y_pred=predictions).round(2)

model_df = pd.DataFrame({
    "MAE": [mae],
    "RMSE": [rmse],
    "MAPE": [mape],
    "R2": [r2]},
)
model_df

# Principal Component Analysis

# The End

In [None]:
print('This is the End, you know...')