In [41]:
DATASET_PATH = '../dataset/egypt_real_estate_listings_cleaned.csv'

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler , MultiLabelBinarizer 
from sklearn.manifold import TSNE
from statsmodels.api import OLS 
import statsmodels.api as sm
from statsmodels.api import add_constant
from scipy import stats
from sklearn.model_selection import train_test_split
import joblib

In [43]:
df = pd.read_csv(DATASET_PATH)

In [44]:
df.head()

Unnamed: 0,price,type,available_from,payment_method,governorate,city,district,compound,size_sqm,bedrooms_num,has_maid_room,bathrooms_num
0,12200000.0,Apartment,2023-06-14,Cash,North Coast,Al Alamein,New Alamein City,North Edge Towers,114.0,2,False,1
1,7000000.0,Apartment,2023-07-18,Cash,Cairo,Heliopolis - Masr El Gedida,Almazah,Almazah,135.0,3,False,2
2,12000000.0,Twin House,2023-08-03,Cash,Cairo,New Cairo City,The 5Th Settlement,Layan Residence,266.0,3,True,4
3,11000000.0,Apartment,2023-09-20,Cash,Cairo,New Cairo City,El Katameya,District 5 Residences,154.0,2,False,3
4,18500000.0,iVilla,2023-12-24,Cash,Cairo,New Cairo City,The 5Th Settlement,Mountain View Hyde Park,305.0,3,True,3


In [45]:
# === Numerical Features Hypothesis Test (Pearson Correlation) ===
num_features = df.select_dtypes(include=[np.number]).columns.drop('price').tolist()
print("📊 Numerical Features Hypothesis Test (Correlation with Price)")
for feature in num_features:
    corr, p_val = stats.pearsonr(df[feature], df['price'])
    print(f"{feature}: correlation = {corr:.3f}, p-value = {p_val:.4f}")
    if p_val < 0.05:
        print(" → Reject H₀: Significant relationship with price ✅\n")
    else:
        print(" → Fail to reject H₀: No significant relationship with price ❌\n")


📊 Numerical Features Hypothesis Test (Correlation with Price)
size_sqm: correlation = 0.460, p-value = 0.0000
 → Reject H₀: Significant relationship with price ✅

bedrooms_num: correlation = 0.483, p-value = 0.0000
 → Reject H₀: Significant relationship with price ✅

bathrooms_num: correlation = 0.528, p-value = 0.0000
 → Reject H₀: Significant relationship with price ✅



In [46]:
# === Categorical Features Hypothesis Test (ANOVA / t-test) ===
cat_features = df.select_dtypes(include=['object']).columns.tolist()
print("🧩 Categorical Features Hypothesis Test (Group Differences in Price)")
for feature in cat_features:
    groups = [df.loc[df[feature] == cat, 'price'] for cat in df[feature].unique()]
    
    # Skip if category has only one unique value
    if len(groups) < 2:
        continue

    # Choose test type
    if len(groups) == 2:
        stat, p_val = stats.ttest_ind(groups[0], groups[1], equal_var=False)
        test_type = "t-test"
    else:
        stat, p_val = stats.f_oneway(*groups)
        test_type = "ANOVA"
    
    print(f"{feature}: {test_type} p-value = {p_val:.4f}")
    if p_val < 0.05:
        print(" → Reject H₀: Mean price differs significantly between categories ✅\n")
    else:
        print(" → Fail to reject H₀: No significant mean difference ❌\n")

🧩 Categorical Features Hypothesis Test (Group Differences in Price)
type: ANOVA p-value = 0.0000
 → Reject H₀: Mean price differs significantly between categories ✅

available_from: ANOVA p-value = 0.0000
 → Reject H₀: Mean price differs significantly between categories ✅

payment_method: t-test p-value = 0.1991
 → Fail to reject H₀: No significant mean difference ❌

governorate: ANOVA p-value = 0.0000
 → Reject H₀: Mean price differs significantly between categories ✅

city: ANOVA p-value = 0.0000
 → Reject H₀: Mean price differs significantly between categories ✅

district: ANOVA p-value = 0.0000
 → Reject H₀: Mean price differs significantly between categories ✅

compound: ANOVA p-value = 0.0000
 → Reject H₀: Mean price differs significantly between categories ✅



In [47]:
new_df = df.drop(columns=['payment_method'])

In [48]:
significant_features = new_df.columns.tolist()

In [49]:
significant_features

['price',
 'type',
 'available_from',
 'governorate',
 'city',
 'district',
 'compound',
 'size_sqm',
 'bedrooms_num',
 'has_maid_room',
 'bathrooms_num']

In [50]:
cat_features = new_df.select_dtypes(include=['object']).columns.tolist()

In [51]:
cat_features

['type', 'available_from', 'governorate', 'city', 'district', 'compound']

In [52]:
df_encoded = pd.get_dummies(df[significant_features], drop_first=True)

In [53]:
df_encoded

Unnamed: 0,price,size_sqm,bedrooms_num,has_maid_room,bathrooms_num,type_Cabin,type_Chalet,type_Duplex,type_Full Floor,type_Hotel Apartment,...,compound_Zed Towers,compound_Zeid Ibn Sabet St.,compound_Zezenia,compound_Zizinia Al Mostakbal,compound_Zizinia St.,compound_Zoheira Abdeen St.,compound_Zomoroda,compound_Zomra East,compound_Zoya,compound_بوابة النعيم
0,12200000.0,114.0,2,False,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7000000.0,135.0,3,False,2,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,12000000.0,266.0,3,True,4,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,11000000.0,154.0,2,False,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,18500000.0,305.0,3,True,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18544,4018350.0,86.0,3,False,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18545,3505500.0,81.0,2,False,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18546,13400000.0,121.0,3,True,3,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18547,4100000.0,128.0,3,False,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [54]:
num_cols = df_encoded.select_dtypes(include=[np.number]).columns.tolist()

In [55]:
num_cols

['price', 'size_sqm', 'bedrooms_num', 'bathrooms_num']

In [57]:
scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

In [None]:
X = df_encoded
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)


In [62]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((14839, 2329), (3710, 2329), (14839,), (3710,))

In [64]:
X_train

Unnamed: 0,price,size_sqm,bedrooms_num,has_maid_room,bathrooms_num,type_Cabin,type_Chalet,type_Duplex,type_Full Floor,type_Hotel Apartment,...,compound_Zed Towers,compound_Zeid Ibn Sabet St.,compound_Zezenia,compound_Zizinia Al Mostakbal,compound_Zizinia St.,compound_Zoheira Abdeen St.,compound_Zomoroda,compound_Zomra East,compound_Zoya,compound_بوابة النعيم
10282,-0.430126,-1.074575,-1.687612,False,-1.508434,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7738,-0.095314,0.641360,0.907964,True,0.054473,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10653,-0.432613,0.453424,0.907964,True,0.835926,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3315,-0.320180,0.085724,0.042772,False,-0.726980,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6741,-0.374775,-0.780415,-0.822420,False,-0.726980,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,-0.558018,-0.976521,-1.687612,False,-1.508434,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
11964,-0.434775,-0.486254,-0.822420,False,0.054473,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5390,-0.278839,-0.625163,0.042772,False,-0.726980,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
860,-0.449910,0.167435,0.042772,False,0.054473,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [66]:
# Save preprocessors and model
joblib.dump(scaler, "../models/scaler.pkl")
joblib.dump(df_encoded.columns, "../models/model_columns.pkl")

['../models/model_columns.pkl']