### Main Data Engineering

In [None]:
import pandas as pd
import numpy as np
sale_df = pd.read_csv("/content/drive/MyDrive/rema/Sale_Listing.csv")
rent_df = pd.read_csv("/content/drive/MyDrive/rema/Rent_Listing.csv")
df = pd.read_csv("/content/drive/MyDrive/rema/REMA_Full_Data.csv")
#df = pd.concat([sale_df, rent_df], ignore_index=True)

In [None]:
#sale preprocessing

sale_df = sale_df[sale_df["price_clean"] >= 10000]
sale_df = sale_df.drop(columns=['description', 'header', 'rent_period', 'price','building_age'], errors='ignore')
split_location = sale_df['location'].str.split(',', n=1, expand=True)
sale_df['City'] = split_location[0].str.strip()
sale_df['Neighborhood'] = split_location[1].str.strip()
sale_df = sale_df.drop(columns=['location'])
sale_df = sale_df.dropna(subset=['price_clean', 'surface_area'])
sale_df['land_area'] = sale_df['land_area'].fillna(0)
sale_df['number_of_floors'] = sale_df['number_of_floors'].fillna(1)
sale_df['bathroom'] = sale_df['bathroom'].fillna(0)
sale_df['bedroom'] = sale_df['bedroom'].fillna(0)
sale_df['floor'] = sale_df['floor'].fillna(0)
sale_df['building_age_years'] = sale_df['building_age_years'].fillna(0)
sale_df['bedroom'] = sale_df['bedroom'].replace('studio', 0.5)
sale_df['furnishing'] = sale_df['furnishing'].fillna('unfurnished')


In [None]:
######### sale data

floor_dict = {
    "ground floor": 0,
    "first floor": 1,
    "second floor": 2,
    "third floor": 3,
    "semi ground floor": 0.5,
    # leave last floor out or map to np.nan
    "0": 0,
    "basement floor": -1,
    "fourth floor": 4,
    "fifth floor": 5,
    "sixth floor": 6,
    "seventh floor": 7,
    "eighth floor": 8,
    "ninth floor": 9,
    "more than 10 floors": 11.5
}

sale_df['floor_lower'] = sale_df['floor'].astype(str).str.lower().str.strip()
mask_last_roof = sale_df['floor_lower'] == 'last floor with roof'
sale_df['floor_numeric'] = sale_df['floor_lower'].map(floor_dict)

valid_floors = sale_df['floor_numeric'][(sale_df['floor_numeric'] >= 2)]
if valid_floors.dropna().size > 0:
    avg_floor = valid_floors.mean()
else:
    avg_floor = 3.0

sale_df.loc[mask_last_roof, 'floor_numeric'] = avg_floor + 0.5

type_map = {
    "apartment":0,
    "town house":1,
    "villas and palaces":2,
    "whole building":3,
    "farms and chalets":4
}
sale_df["type"] = sale_df["type"].map(type_map)

furnishing_map = {
    "unfurnished":0,
    "furnished":1,
    "semi furnished":0.5
}
sale_df["furnishing"] = sale_df["furnishing"].map(furnishing_map)

from sklearn.preprocessing import LabelEncoder
city_enc = LabelEncoder()
sale_df["City"] = city_enc.fit_transform(sale_df["City"])

enc = LabelEncoder()
sale_df["Neighborhood"] = enc.fit_transform(sale_df["Neighborhood"])

import re

def convert_floor(text):
    text = str(text).lower()

    if "basement" in text: return -1
    if "ground" in text and "semi" in text: return 0.5
    if "ground" in text: return 0

    match = re.search(r"(\d+)", text)
    if match: return int(match.group(1))

    if "more than" in text: return 11  # adjustable

    return None  # if unrecognized

sale_df["floor"] = sale_df["floor"].apply(convert_floor)


In [None]:
sale_df

Unnamed: 0,type,furnishing,surface_area,land_area,bedroom,bathroom,floor,number_of_floors,price_clean,building_age_years,listing,City,Neighborhood,floor_lower,floor_numeric
0,0,0.0,185.0,0.0,6,3.0,,1.0,87000.0,0.5,sale,2,35,last floor with roof,3.237349
1,0,0.0,215.0,0.0,6,3.0,10.0,1.0,95000.0,0.5,sale,2,35,more than 10 floors,11.500000
2,0,0.0,150.0,0.0,2,3.0,0.0,1.0,59500.0,7.5,sale,13,322,ground floor,0.000000
4,0,0.0,180.0,0.0,3,3.0,0.0,1.0,125000.0,0.5,sale,2,421,ground floor,0.000000
6,0,0.0,120.0,0.0,4,2.0,0.0,1.0,12500.0,0.5,sale,13,42,ground floor,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7198,3,0.0,370.0,1024.0,0,0.0,0.0,3.0,85000.0,7.5,sale,2,20,0,0.000000
7199,3,0.0,400.0,296.0,0,0.0,0.0,3.0,50000.0,20.0,sale,2,314,0,0.000000
7200,3,0.0,120.0,400.0,0,0.0,0.0,3.0,65000.0,7.5,sale,13,28,0,0.000000
7201,3,0.0,200.0,500.0,0,0.0,0.0,3.0,90000.0,20.0,sale,13,264,0,0.000000


In [None]:
#### rent data

split_location = rent_df['location'].str.split(',', n=1, expand=True)
rent_df['City'] = split_location[0].str.strip()
rent_df['Neighborhood'] = split_location[1].str.strip()

print(type(rent_df))
rent_df = rent_df.drop(columns=['location'])
print(type(rent_df))

rent_df = rent_df.drop(columns=['building_age'])
rent_df = rent_df.dropna(subset=['price_clean'])
rent_df = rent_df.drop(columns=['header', 'description','price'])
rent_df['rent_period'] = rent_df['rent_period'].fillna(rent_df['rent_period'].mode()[0])
rent_df = rent_df.dropna(subset=['building_age_years',
                                 'number_of_floors','floor',
                                 'bathroom','bedroom','surface_area',
                                 'furnishing']).reset_index(drop=True)

furnish_map = {
    "furnished" : 1,
    "semi furnished" : 0.5,
    "unfurnished" : 0
}
rent_df['furnishing'] = rent_df['furnishing'].str.lower().map(furnish_map)

period_map = {
    "daily":1,
    "weekly":7,
    "monthly":30,
    "yearly":365
}
rent_df['rent_period'] = rent_df['rent_period'].str.lower().map(period_map)


def bedroom_to_num(x):
    x=str(x).lower()
    if x=="studio":
        return 0.5
    else:
        return float(x)

rent_df['bedroom'] = rent_df['bedroom'].apply(bedroom_to_num)

import numpy as np

floor_map_final = {
    "ground floor": 0,
    "first floor": 1,
    "second floor": 2,
    "third floor": 3,
    "semi ground floor": 0.5,
    # leave last floor out or map to np.nan
    "0": 0,
    "basement floor": -1,
    "fourth floor": 4,
    "fifth floor": 5,
    "sixth floor": 6,
    "seventh floor": 7,
    "eighth floor": 8,
    "ninth floor": 9,
    "more than 10 floors": 11.5
}

# just keeping the original text
rent_df['floor_lower'] = rent_df['floor'].astype(str).str.lower().str.strip()

# masking the last roof
last_roof_mask = rent_df['floor_lower'] == 'last floor with roof'

# mapping to numeric into a new column (so we that we don't lose the text)
rent_df['floor_numeric'] = rent_df['floor_lower'].map(floor_map_final)

# computeing average for only from building floors >= 2
valid_floors = rent_df['floor_numeric'][(rent_df['floor_numeric'] >= 2)] #sale_df
if valid_floors.dropna().size > 0:
    avg_floor = valid_floors.mean()          # you can keep decimal
else:
    avg_floor = 3.0

rent_df.loc[last_roof_mask, 'floor_numeric'] = avg_floor + 0.5

rent_df[['floor_lower','floor_numeric']].loc[last_roof_mask].head()

rent_df['City'] = rent_df['City'].astype('category').cat.codes
rent_df['Neighborhood'] = rent_df['Neighborhood'].astype('category').cat.codes

type_dict = {
    "apartment":0,
    "villas and palaces":2,
    "farms and chalets":4
}
rent_df["type"] = rent_df["type"].replace(type_dict).astype(int)


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


  rent_df["type"] = rent_df["type"].replace(type_dict).astype(int)


In [None]:
rent_df

Unnamed: 0,type,furnishing,rent_period,surface_area,land_area,bedroom,bathroom,floor,number_of_floors,price_clean,building_age_years,listing,City,Neighborhood,floor_lower,floor_numeric
0,0,1.0,1,50.0,,0.5,1.0,first floor,1.0,30.0,0.5,rent,2,244,first floor,1.0
1,0,0.5,365,278.0,,3.0,4.0,first floor,1.0,19900.0,7.5,rent,2,11,first floor,1.0
2,0,0.0,365,105.0,,2.0,2.0,basement floor,1.0,4000.0,7.5,rent,2,135,basement floor,-1.0
3,0,0.0,365,110.0,,3.0,2.0,second floor,1.0,190.0,3.0,rent,2,245,second floor,2.0
4,0,1.0,365,130.0,,2.0,3.0,ground floor,1.0,13000.0,3.0,rent,2,10,ground floor,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4796,4,0.5,365,200.0,3000.0,6.0,2.0,ground floor,2.0,88000.0,7.5,rent,13,32,ground floor,0.0
4797,4,0.0,365,1.0,1.0,3.0,3.0,ground floor,2.0,67000.0,3.0,rent,13,32,ground floor,0.0
4798,4,1.0,365,220.0,500.0,6.0,3.0,ground floor,1.0,90000.0,3.0,rent,13,32,ground floor,0.0
4799,2,0.0,365,300.0,260.0,6.0,6.0,ground floor,2.0,50000.0,14.5,rent,2,206,ground floor,0.0


In [None]:
#rent_df['type'] = rent_df['type'].astype(str)
#sale_df['type'] = sale_df['type'].astype(str)

cols = ['bedroom','bathroom','number_of_floors','floor_numeric']
for c in cols:
    rent_df[c] = rent_df[c].astype(float)
    sale_df[c] = sale_df[c].astype(float)


full_df = pd.concat([rent_df, sale_df], axis=0, ignore_index=True)
full_df.drop(columns=['land_area','floor_lower','floor','rent_period'], inplace=True)


In [None]:
full_df

Unnamed: 0,type,furnishing,surface_area,bedroom,bathroom,number_of_floors,price_clean,building_age_years,listing,City,Neighborhood,floor_numeric
0,0,1.0,50.0,0.5,1.0,1.0,30.0,0.5,rent,2,244,1.0
1,0,0.5,278.0,3.0,4.0,1.0,19900.0,7.5,rent,2,11,1.0
2,0,0.0,105.0,2.0,2.0,1.0,4000.0,7.5,rent,2,135,-1.0
3,0,0.0,110.0,3.0,2.0,1.0,190.0,3.0,rent,2,245,2.0
4,0,1.0,130.0,2.0,3.0,1.0,13000.0,3.0,rent,2,10,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
11647,3,0.0,370.0,0.0,0.0,3.0,85000.0,7.5,sale,2,20,0.0
11648,3,0.0,400.0,0.0,0.0,3.0,50000.0,20.0,sale,2,314,0.0
11649,3,0.0,120.0,0.0,0.0,3.0,65000.0,7.5,sale,13,28,0.0
11650,3,0.0,200.0,0.0,0.0,3.0,90000.0,20.0,sale,13,264,0.0


In [None]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11652 entries, 0 to 11651
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   type                11652 non-null  int64  
 1   furnishing          11652 non-null  float64
 2   surface_area        11652 non-null  float64
 3   bedroom             11652 non-null  float64
 4   bathroom            11652 non-null  float64
 5   number_of_floors    11652 non-null  float64
 6   price_clean         11652 non-null  float64
 7   building_age_years  11652 non-null  float64
 8   listing             11652 non-null  object 
 9   City                11652 non-null  int64  
 10  Neighborhood        11652 non-null  int64  
 11  floor_numeric       11652 non-null  float64
dtypes: float64(8), int64(3), object(1)
memory usage: 1.1+ MB


MODELLING

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import *

import joblib
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
full_df = full_df.sample(frac=1).reset_index(drop=True)
full_df

Unnamed: 0,type,furnishing,surface_area,bedroom,bathroom,number_of_floors,price_clean,building_age_years,listing,City,Neighborhood,floor_numeric
0,0,0.5,179.0,3.0,3.0,1.0,128000.0,14.5,sale,2,58,3.237349
1,0,1.0,189.0,3.0,3.0,1.0,8500.0,7.5,rent,2,250,3.000000
2,0,0.0,204.0,3.0,5.0,1.0,215000.0,0.5,sale,2,214,0.000000
3,0,0.5,220.0,3.0,2.0,1.0,8000.0,20.0,rent,2,10,0.500000
4,0,0.0,450.0,3.0,3.0,1.0,12000.0,7.5,rent,2,18,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
11647,0,1.0,40.0,0.5,1.0,1.0,220.0,3.0,rent,2,9,3.000000
11648,0,0.0,130.0,3.0,3.0,1.0,80000.0,0.5,sale,2,24,1.000000
11649,0,1.0,100.0,3.0,2.0,1.0,900.0,3.0,rent,2,146,4.000000
11650,0,1.0,77.0,1.0,1.0,1.0,315000.0,3.0,sale,2,10,11.500000


In [None]:
X = full_df.drop(columns=['price_clean'])
y = np.log1p(full_df["price_clean"])

In [None]:
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist() #for numerical features
cat_cols = X.select_dtypes(include=['object']).columns.tolist() #for categorical features

print(num_cols)
print(cat_cols)

['type', 'furnishing', 'surface_area', 'bedroom', 'bathroom', 'number_of_floors', 'building_age_years', 'City', 'Neighborhood', 'floor_numeric']
['listing']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

X_train_num = scaler.fit_transform(X_train[num_cols])
X_test_num  = scaler.transform(X_test[num_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

X_train_cat = encoder.fit_transform(X_train[cat_cols])
X_test_cat  = encoder.transform(X_test[cat_cols])

X_train_final = np.hstack([X_train_num, X_train_cat])
X_test_final  = np.hstack([X_test_num, X_test_cat])

xgbr = XGBRegressor(n_estimators=500,  learning_rate=0.01,  max_depth=12)
xgbr.fit(X_train_final, y_train)

xgbr_preds = xgbr.predict(X_test_final)

pred_real = np.expm1(xgbr_preds)
y_test_real = np.expm1(y_test)

print("\n#####################################")
print("\n########### { XGBR MODEL } ##########")
print("\n#####################################\n")

print("MAE ==> ", mean_absolute_error(y_test_real, pred_real))
print("RMSE ==> ", np.sqrt(mean_squared_error(y_test_real, pred_real)))
print("R2 ==> ", r2_score(y_test, xgbr_preds))

joblib.dump(xgbr, "REMA_xgbr_model.pkl")

KeyError: "['floor_numeric'] not in index"

### Finalized REMA XGBR MODEL PIPELINE

In [None]:
#RESOURCE[1] ==> https://www.kaggle.com/code/haiderrasoolqadri/pipeline-mastery-best-model-selection#Some-Basic-Definitions
#RESOURCE[2] ==> https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
#RESOURCE[3] ==> https://www.kaggle.com/code/wojteksy/housing-prices-pipelines-custom-transformer

import pandas as pd
import numpy as np
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

#The start of the class of the REMA pipeline
class RealEstateFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        #####
        if 'City' not in X.columns and 'location' in X.columns:
            split_loc = X['location'].str.split(',', n=1, expand=True)
            X['City'] = split_loc[0].str.strip()
            if split_loc.shape[1] > 1:
                X['Neighborhood'] = split_loc[1].str.strip()
            else:
                X['Neighborhood'] = "Unknown"

        # double checking that listing exists in data just incase
        if 'listing' not in X.columns:
            X['listing'] = 'unknown'

        # roaming and cleaning and checking
        def clean_bedroom(val):
            s = str(val).lower()
            if 'studio' in s:
              return 0.5
            try:
              return float(s)
            except:
              return np.nan

        def clean_floor(val):
            val = str(val).lower()
            if "basement" in val:
              return -1
            if "ground" in val and "semi" in val:
              return 0.5
            if "ground" in val:
              return 0
            match = re.search(r"(\d+)", val)
            if match:
              return float(match.group(1))
            return 0

        if 'bedroom' in X.columns:
            X["bedroom"] = X["bedroom"].apply(clean_bedroom)

        if 'floor' in X.columns:
            X["floor_numeric"] = X["floor"].apply(clean_floor)
        else:
            X["floor_numeric"] = 0 # JUST IN case

        #manually cleaning and categorizing accordingly
        furnish_map = {"unfurnished": 0,
                       "semi furnished": 0.5,
                       "furnished": 1}
        if 'furnishing' in X.columns:
            X["furnishing"] = X["furnishing"].astype(str).str.lower().map(furnish_map).fillna(0)

        type_map = {
            "apartment": 0,
            "town house": 1,
            "villas and palaces": 2,
            "whole building": 3,
            "farms and chalets": 4
        }

        if 'type' in X.columns:
            X["type_numeric"] = X["type"].astype(str)
            X["type_numeric"] = X["type_numeric"].str.lower().map(type_map).fillna(0)


        # the finall chosen columns
        selected_cols = ["surface_area", "bedroom", "bathroom", "floor_numeric",
                        "furnishing", "type_numeric", "City", "Neighborhood", "listing"]

        # double checking existing columns to remove the empty, or unnecessary ones and keep the ones we
        # care mostly about in our model
        final_cols = [c for c in selected_cols if c in X.columns]
        return X[final_cols]

# preparring pipeline lastly

categorical_features = ["City", "Neighborhood"]
one_hot_features = ["listing"]
numeric_features = ["surface_area", "bedroom", "bathroom", "floor_numeric", "furnishing", "type_numeric"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat_ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_features),
        ("cat_onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_features),
    ],
    verbose_feature_names_out= False
)

pipeline = Pipeline(steps=[
    ("engineer", RealEstateFeatureEngineer()),
    ("preprocessor", preprocessor),
    ("model", XGBRegressor(n_estimators=500,learning_rate=0.05,max_depth=8))
])

###########################################################################
################ Data Loading & Execution (RUN IT!!) ################
###########################################################################

# reloading data in raw state/form
sale_df = pd.read_csv("/content/drive/MyDrive/rema/Sale_Listing.csv")
rent_df = pd.read_csv("/content/drive/MyDrive/rema/Rent_Listing.csv")

sale_df['listing'] = 'sale'
rent_df['listing'] = 'rent'

# make sure the pruce is sufficient and is following our guidelines and threshold
sale_df = sale_df[sale_df['price_clean'] >= 10000].dropna(subset=['price_clean', 'surface_area'])
rent_df = rent_df.dropna(subset=['price_clean', 'surface_area'])

full_df = pd.concat([sale_df, rent_df], ignore_index=True)
full_df = full_df.sample(frac=1).reset_index(drop=True)

X = full_df.drop(columns=["price_clean"])
y = np.log1p(full_df["price_clean"])

# training/testing split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Pipeline...")
pipeline.fit(X_train, y_train)

# preds in actual numbers
pred_log = pipeline.predict(X_test)
pred_real = np.expm1(pred_log)
y_real = np.expm1(y_test)

print("\n#####################################")
print("\n########### { XGBR MODEL } ##########")
print("\n#####################################\n")

print("MAE ==> ", mean_absolute_error(y_real, pred_real))
print("RMSE ==> ", np.sqrt(mean_absolute_error(y_real, pred_real)))
print("R2 ==> ", r2_score(y_test, pred_log))

### code optimization overall and input and output same great output with same strict input rules and good implementation executable .pkl pipeline
#for webiste + explain ss in doc + finalize codes and doc

Training Pipeline...

#####################################

########### { XGBR MODEL } ##########

#####################################

MAE ==>  30807.17699638547
RMSE ==>  175.51973392295656
R2 ==>  0.9040583371432461


In [None]:
import joblib
joblib.dump(pipeline, "REMA_pipeline.pkl")

['REMA_pipeline.pkl']

In [None]:
#testing executable .pkl pipeline with code and real data values
pipeline = joblib.load("REMA_pipeline.pkl")

data = {
    "City": ["Amman"],
    "type": ["apartment"],
    "surface_area": [165],
    "bedroom": [3],
    "bathroom": [3],
    "furnishing": ["unfurnished"],
    "floor": ["ground floor"],
    'Neighborhood': ['Umm Zuwaytinah'],
    'listing': ['sale']

}

prediction = pipeline.predict(pd.DataFrame(data))

In [None]:
#actual predicted listing price in JOD
np.expm1(prediction)

array([82218.53], dtype=float32)