In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib

from statsmodels.tools.eval_measures import aic, bic

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import median_absolute_error, r2_score

from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

import warnings

In [3]:
warnings.filterwarnings('ignore')

In [5]:
model_adaboost_best = joblib.load('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/2_models/model_adaboost_best.pkl')
model_bagging_best = joblib.load('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/2_models/model_bagging_best.pkl')
model_dt_best2 = joblib.load('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/2_models/model_dt_best2.pkl')
model_gbr_best = joblib.load('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/2_models/model_gbr_best.pkl')
model_rf_best = joblib.load('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/2_models/model_rf_best.pkl')
model_xgb_best = joblib.load('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/2_models/model_xgb_best.pkl')
model_stacking_best = joblib.load('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/2_models/model_stacking_best.pkl')

In [6]:
df_valid = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/Data/processed/target-encoded/df_valid.csv', sep=',')
df_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Dubai-Houses/Data/processed/target-encoded/df_test.csv', sep=',')

In [7]:
df_valid.head()

Unnamed: 0,price,beds,baths,average_rent,year_of_completion,total_parking_spaces,total_floors,total_building_area_sqft,elevators,price_y,year,month,quarter,price_category_te,type_te,furnishing_te,completion_status_te,building_name_te,area_name_te,city_te
0,699000,1,2,0,0,0,0,0,0,105.711255,2024,4,2,109.197607,122.459351,132.139714,135.142661,132.63834,117.654913,131.708092
1,2250000,2,3,130238,2017,294,7,296,8,142.037042,2023,12,4,138.174951,122.459351,132.139714,127.392917,131.427318,131.427318,133.688405
2,2899000,2,2,0,0,0,0,0,0,151.377403,2024,2,1,138.174951,122.459351,132.139714,135.142661,132.63834,138.375815,133.688405
3,2000000,5,8,171945,0,0,0,0,0,137.889249,2024,4,2,138.174951,149.132629,132.139714,127.392917,132.63834,138.198333,112.522653
4,2000000,3,4,120026,0,0,0,0,0,137.889249,2024,3,1,138.174951,145.199113,132.139714,127.392917,132.63834,130.704829,133.688405


In [8]:
X_valid = df_valid.drop(['price', 'price_y'], axis = 1)

y_valid_actual = df_valid['price_y']

In [9]:
# Bagging
y_pred_model_bagging = model_bagging_best.predict(X_valid)

In [10]:
# Decision Tree Regressor
y_pred_model_dt = model_dt_best2.predict(X_valid)

In [11]:
# Gradient Boosting Regressor
y_pred_model_gbr = model_gbr_best.predict(X_valid)

In [12]:
# Random Forest
y_pred_model_rf = model_rf_best.predict(X_valid)

In [13]:
# AdaBoost
y_pred_model_adaboost = model_adaboost_best.predict(X_valid)

In [14]:
# Extreme Gradient Boosting
y_pred_model_xgb = model_xgb_best.predict(X_valid)

In [15]:
# Ensemble Learning - Stacking
y_pred_model_stacking = model_stacking_best.predict(X_valid)

In [16]:
def regression_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)

    # return results in a form of a dictionary
    return {
        'RMSE': rmse,
        'MAE': mae,
        'MedAE': medae,
        'MAPE': mape,
        'R2': r2
    }

In [17]:
# model Bagging
regression_metrics(y_valid_actual, y_pred_model_bagging)

{'RMSE': np.float64(23.71696115963827),
 'MAE': 10.387094237128435,
 'MedAE': np.float64(4.048222567404203),
 'MAPE': np.float64(inf),
 'R2': 0.6126815893235369}

In [18]:
# model Decision Tree
regression_metrics(y_valid_actual, y_pred_model_dt)

{'RMSE': np.float64(23.831779204505228),
 'MAE': 10.681432183162212,
 'MedAE': np.float64(4.267711045375279),
 'MAPE': np.float64(inf),
 'R2': 0.6089223565342348}

In [19]:
# model Gradient Boosting
regression_metrics(y_valid_actual, y_pred_model_gbr)

{'RMSE': np.float64(23.659513537355668),
 'MAE': 10.752630892998342,
 'MedAE': np.float64(4.547160703538552),
 'MAPE': np.float64(inf),
 'R2': 0.6145556552138509}

In [20]:
# model Random Forest
regression_metrics(y_valid_actual, y_pred_model_rf)

{'RMSE': np.float64(24.431104400446376),
 'MAE': 10.89007803931039,
 'MedAE': np.float64(4.208813731042788),
 'MAPE': np.float64(inf),
 'R2': 0.5890052677670914}

In [22]:
# model AdaBoost
regression_metrics(y_valid_actual, y_pred_model_adaboost)

{'RMSE': np.float64(24.750680293420473),
 'MAE': 12.148239792451173,
 'MedAE': np.float64(6.533102927965913),
 'MAPE': np.float64(inf),
 'R2': 0.5781827488965287}

In [23]:
# model XGB
regression_metrics(y_valid_actual, y_pred_model_xgb)

{'RMSE': np.float64(23.450498373023294),
 'MAE': 10.306219604679724,
 'MedAE': np.float64(4.168821037658475),
 'MAPE': np.float64(inf),
 'R2': 0.6213358327575889}

In [24]:
# model Ensemble - Stacking
regression_metrics(y_valid_actual, y_pred_model_stacking)

{'RMSE': np.float64(23.524820880994316),
 'MAE': 10.281194675546399,
 'MedAE': np.float64(4.080756952800471),
 'MAPE': np.float64(inf),
 'R2': 0.6189318012628457}

As you can see, IQR method which removed outliers - eventually removed important data, hence my model did not perform well, ehh time wasted haha