## Import libraries

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

## Loading data

##### CSV file loading

In [2]:
# Disable truncation so displayed frames show every column and every row.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)

# Read the pre-cleaned housing data and preview the first rows.
df = pd.read_csv('data/Housing_basic_cleaning.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,floors,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefneighbourhood,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(545, 13)

## Feature engineering

##### 1. Splitting the data first to prevent data leakage.

In [4]:
# The regression target is the base-10 logarithm of the price; it is stored
# on df as an extra 'log_price' column.
df['log_price'] = np.log10(df['price'])

y = df['log_price']
# Neither the raw price nor its logarithm may remain among the features.
X = df.drop(columns=['price', 'log_price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

##### 2. Fixing column data types and encoding categorical variables.

In [5]:
# Data type of every column, to spot the ones that still need encoding.
df.dtypes

price                  int64
area                   int64
bedrooms               int64
bathrooms              int64
floors                 int64
mainroad               int64
guestroom              int64
basement               int64
hotwaterheating        int64
airconditioning        int64
parking                int64
prefneighbourhood      int64
furnishingstatus      object
log_price            float64
dtype: object

In [6]:
# Print the distinct values of selected columns, addressed by position in df
# (the 0/1 flag columns and the text column 'furnishingstatus').
for position in (5, 6, 7, 8, 9, 11, 12):
    print(df.iloc[:, position].unique())

[1 0]
[0 1]
[0 1]
[0 1]
[1 0]
[1 0]
['furnished' 'semi-furnished' 'unfurnished']


In [7]:
# Fit the encoder on the training split only, so the test split never
# influences the learned categories.
encoder = OneHotEncoder(sparse_output=False, dtype=int)
encoder.fit(X_train[['furnishingstatus']])


def _encode_furnishingstatus(features):
    """Return ``features`` with 'furnishingstatus' replaced by its one-hot columns.

    Uses the already fitted module-level ``encoder``; the row index of
    ``features`` is preserved so the new columns align on concatenation.
    """
    dummies = pd.DataFrame(
        encoder.transform(features[['furnishingstatus']]),
        columns=encoder.get_feature_names_out(['furnishingstatus']),
        index=features.index,
    )
    remaining = features.drop('furnishingstatus', axis=1)
    return pd.concat([remaining, dummies], axis=1)


X_train = _encode_furnishingstatus(X_train)
X_test = _encode_furnishingstatus(X_test)

X_train.head()

Unnamed: 0,area,bedrooms,bathrooms,floors,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefneighbourhood,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
46,6000,3,2,4,1,0,0,0,1,1,0,1,0,0
93,7200,3,2,1,1,0,1,0,1,3,0,0,1,0
335,3816,2,1,1,1,0,1,0,1,2,0,1,0,0
412,2610,3,1,2,1,0,1,0,0,0,1,0,0,1
471,3750,3,1,2,1,0,0,0,0,0,0,0,0,1


In [8]:
# Shorter names for the one-hot columns produced from 'furnishingstatus'.
short_names = {
    'furnishingstatus_furnished': 'furnished',
    'furnishingstatus_semi-furnished': 'semifurnished',
    'furnishingstatus_unfurnished': 'unfurnished',
}

X_train = X_train.rename(columns=short_names)
X_test = X_test.rename(columns=short_names)

X_test.head(1)

Unnamed: 0,area,bedrooms,bathrooms,floors,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefneighbourhood,furnished,semifurnished,unfurnished
316,5900,4,2,2,0,0,1,0,0,1,0,0,0,1


In [9]:
# Confirm that every remaining feature column is numeric after encoding.
X_train.dtypes

area                 int64
bedrooms             int64
bathrooms            int64
floors               int64
mainroad             int64
guestroom            int64
basement             int64
hotwaterheating      int64
airconditioning      int64
parking              int64
prefneighbourhood    int64
furnished            int32
semifurnished        int32
unfurnished          int32
dtype: object

##### SUMMARY: *All categorical values are encoded.*

## Creating and training basic models. Metrics

##### Models Creation

In [10]:
# Baseline regressors, keyed by display name. Linear Regression and SVR are
# wrapped in a pipeline with a StandardScaler in front; the two tree
# ensembles are used directly with a fixed seed.
models = {}
models['Linear Regression'] = make_pipeline(StandardScaler(), LinearRegression())
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['XGBoost'] = XGBRegressor(random_state=42)
models['SVR'] = make_pipeline(StandardScaler(), SVR())

##### Models Training and Metrics

In [11]:
def evaluate_model(model, X_test, y_test_log, name):
    """Print MAE, RMSE and R2 for ``model`` on the given test data.

    Both the model's predictions and ``y_test_log`` are treated as base-10
    logarithms and converted back with ``10 ** x`` before the metrics are
    computed, so the printed errors are on the back-transformed scale.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features passed to ``model.predict``
    y_test_log : true target values on the log10 scale
    name : label printed above the metrics
    """
    predicted = 10 ** model.predict(X_test)
    actual = 10 ** y_test_log

    print(f'\n{name}:')
    mae = mean_absolute_error(actual, predicted)
    print(f'MAE:  {mae:0.0f}')
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    print(f'RMSE: {rmse:0.0f}')
    r2 = r2_score(actual, predicted)
    print(f'R2:   {r2:0.2f}')

# Fit each baseline model on the training split, save it under data/, and
# report its metrics on the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    # Use '/' as the separator: it is accepted on every OS, whereas a
    # backslash is only a separator on Windows and "\{" is an invalid escape
    # sequence inside a string literal.
    model_file = f"data/{name.lower().replace(' ', '_')}.pkl"
    joblib.dump(model, model_file)
    evaluate_model(model, X_test, y_test, name)


Linear Regression:
MAE:  958540
RMSE: 1313848
R2:   0.66

Random Forest:
MAE:  1021741
RMSE: 1427804
R2:   0.60

XGBoost:
MAE:  1010367
RMSE: 1409496
R2:   0.61

SVR:
MAE:  1042196
RMSE: 1434809
R2:   0.59


## Feature Selection/Extraction and New Models

##### Models Creation

In [12]:
# Number of features kept by RFE, and number of components kept by PCA.
n_features_to_select = 10

# Recursive feature elimination for the models that expose coefficients or
# feature importances.
rfe_lr = RFE(
    estimator=LinearRegression(),
    n_features_to_select=n_features_to_select,
)
rfe_rf = RFE(
    estimator=RandomForestRegressor(random_state=42),
    n_features_to_select=n_features_to_select,
)
rfe_xgb = RFE(
    estimator=XGBRegressor(random_state=42),
    n_features_to_select=n_features_to_select,
)
# SVR is paired with PCA (feature extraction) instead of RFE.
pca_svr = PCA(n_components=n_features_to_select)

models_fs = {}
models_fs['Linear Regression RFE'] = make_pipeline(StandardScaler(), rfe_lr, LinearRegression())
models_fs['Random Forest RFE'] = make_pipeline(rfe_rf, RandomForestRegressor(random_state=42))
models_fs['XGBoost RFE'] = make_pipeline(rfe_xgb, XGBRegressor(random_state=42))
models_fs['SVR PCA'] = make_pipeline(StandardScaler(), pca_svr, SVR())

##### Models Training and Metrics

In [13]:
# Fit each feature-selection/extraction pipeline, save it under data/, and
# report its metrics on the held-out split.
for name, model in models_fs.items():
    model.fit(X_train, y_train)
    # Use '/' as the separator: it is accepted on every OS, whereas a
    # backslash is only a separator on Windows and "\{" is an invalid escape
    # sequence inside a string literal.
    model_file = f"data/{name.lower().replace(' ', '_')}.pkl"
    joblib.dump(model, model_file)
    evaluate_model(model, X_test, y_test, name)


Linear Regression RFE:
MAE:  995063
RMSE: 1338047
R2:   0.65

Random Forest RFE:
MAE:  1046051
RMSE: 1450190
R2:   0.58

XGBoost RFE:
MAE:  1106911
RMSE: 1511326
R2:   0.55

SVR PCA:
MAE:  1044139
RMSE: 1418707
R2:   0.60


## Hyperparameter Tuning and New Models

##### Models and Parameters Creation

In [14]:
# Pipelines to tune. Step names are generated by make_pipeline from the
# lower-cased class names ('rfe', 'pca', 'svr', ...) and are used as prefixes
# in the parameter grids below.
pipe_lr = make_pipeline(
    StandardScaler(),
    RFE(estimator=LinearRegression()),
    LinearRegression(),
)
pipe_rf = make_pipeline(
    RFE(estimator=RandomForestRegressor(random_state=42)),
    RandomForestRegressor(random_state=42),
)
pipe_xgb = make_pipeline(
    RFE(estimator=XGBRegressor(random_state=42)),
    XGBRegressor(random_state=42),
)
pipe_svr = make_pipeline(StandardScaler(), PCA(), SVR())

# Search spaces, one per pipeline.
param_grid_lr = dict(
    rfe__n_features_to_select=[5, 10, 14],
)

param_grid_rf = dict(
    rfe__n_features_to_select=[5, 10, 14],
    randomforestregressor__n_estimators=[50, 100],
    randomforestregressor__max_depth=[None, 10, 20],
)

param_grid_xgb = dict(
    rfe__n_features_to_select=[5, 10, 14],
    xgbregressor__n_estimators=[50, 100],
    xgbregressor__max_depth=[3, 6],
    xgbregressor__learning_rate=[0.01, 0.1],
)

param_grid_svr = dict(
    pca__n_components=[5, 10, 14],
    svr__C=[0.1, 1, 10],
    svr__gamma=['scale', 'auto'],
)

# Display name -> (pipeline, grid) pairs consumed by the grid-search loop.
models_params = dict(zip(
    ['Linear Regression RFE CV', 'Random Forest RFE CV', 'XGBoost RFE CV', 'SVR PCA CV'],
    [
        (pipe_lr, param_grid_lr),
        (pipe_rf, param_grid_rf),
        (pipe_xgb, param_grid_xgb),
        (pipe_svr, param_grid_svr),
    ],
))

##### Model Training and Metrics

In [15]:
# Tuned pipelines, keyed by display name.
best_models = {}

for name, (pipeline, param_grid) in models_params.items():
    # Exhaustive 5-fold search over the grid, scored by (negated) mean
    # absolute error of the target the pipelines are fitted on.
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0, scoring='neg_mean_absolute_error')
    grid.fit(X_train, y_train)

    print(f"\nBest params for {name}: {grid.best_params_}")
    best_model = grid.best_estimator_

    # Use '/' as the separator: it is accepted on every OS, whereas a
    # backslash is only a separator on Windows and "\{" is an invalid escape
    # sequence inside a string literal.
    model_file = f"data/{name.lower().replace(' ', '_')}.pkl"
    joblib.dump(best_model, model_file)
    evaluate_model(best_model, X_test, y_test, name)
    best_models[name] = best_model


Best params for Linear Regression RFE CV: {'rfe__n_features_to_select': 10}

Linear Regression RFE CV:
MAE:  995063
RMSE: 1338047
R2:   0.65

Best params for Random Forest RFE CV: {'randomforestregressor__max_depth': 10, 'randomforestregressor__n_estimators': 100, 'rfe__n_features_to_select': 14}

Random Forest RFE CV:
MAE:  1020933
RMSE: 1423617
R2:   0.60

Best params for XGBoost RFE CV: {'rfe__n_features_to_select': 10, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 50}

XGBoost RFE CV:
MAE:  1035784
RMSE: 1404774
R2:   0.61

Best params for SVR PCA CV: {'pca__n_components': 5, 'svr__C': 0.1, 'svr__gamma': 'scale'}

SVR PCA CV:
MAE:  1010598
RMSE: 1447952
R2:   0.59


##### SUMMARY: *Feature selection/extraction and hyperparameter tuning do not significantly change the performance of the selected models or their metrics. This may be because the dataset has too few features, or because the available features have limited impact on price (e.g., the exact location and building age are missing). The Linear Regression model works best, although its accuracy is still unsatisfactory.*

## Input Prediction

In [16]:
# Feature values for a single house. The keys use the same names, in the same
# order, as the feature columns of X_train / X_test after encoding and renaming.
sample = {
    'area': 3000,
    'bedrooms': 3,
    'bathrooms': 2,
    'floors': 2,
    'mainroad': 1,
    'guestroom': 0,
    'basement': 0,
    'hotwaterheating': 0,
    'airconditioning': 1,
    'parking': 1,
    'prefneighbourhood': 0,
    # One-hot furnishing status: exactly one of the three flags is set.
    'furnished': 0,
    'semifurnished': 0,
    'unfurnished': 1
}

def predict_price_from_file(model_path: str, input_data_dictionary: dict):
    """Load a saved model and predict the price for one observation.

    Parameters
    ----------
    model_path : str
        Path to a model file readable by ``joblib.load``.
    input_data_dictionary : dict
        Mapping of feature name to value for a single observation.

    Returns
    -------
    float
        ``10 ** prediction`` rounded to two decimals; the model output is
        treated as a base-10 logarithm.

    Raises
    ------
    RuntimeError
        If the prediction step fails. The original exception is chained as
        ``__cause__`` so its traceback is not lost.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load model files from a trusted source.
    model = joblib.load(model_path)
    features = pd.DataFrame([input_data_dictionary])
    try:
        # Estimators fitted on a DataFrame reject input whose columns are in a
        # different order than during fit. A dict carries no meaningful order,
        # so align the columns when the names match exactly; any other
        # mismatch is left for predict() to report.
        expected = getattr(model, 'feature_names_in_', None)
        if expected is not None and set(features.columns) == set(expected):
            features = features[list(expected)]
        log_price = model.predict(features)[0]
        return round(float(10 ** log_price), 2)
    except Exception as e:
        raise RuntimeError(f'Prediction failed: {e}') from e

# Use '/' as the separator: it is accepted on every OS, whereas a backslash is
# only a separator on Windows and starts an escape sequence in a string literal.
pred_sample = predict_price_from_file(
    model_path='data/linear_regression_rfe_cv.pkl',
    input_data_dictionary=sample,
)
print(f'Predicted price: {pred_sample}')

Predicted price: 4583540.08
