***Imports and Data Loading***

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

df_encoded = pd.read_excel("housing_encoded.xlsx")

Target Creation (Price Classification)

In [25]:
# Create price classes: Low (0), Medium (1), High (2)
df_encoded["price_class"] = pd.qcut(
    df_encoded["purchase_price"],
    q=3,
    labels=[0, 1, 2]
)

***Feature Selection***

In [26]:
features = [
    'year_build', 'no_rooms', 'sqm', 'sqm_price',
    'year', 'quarter_num',
    'house_type_Farm', 'house_type_Summerhouse',
    'house_type_Townhouse', 'house_type_Villa',
    'region_Fyn & islands', 'region_Jutland',
    'region_Zealand',
    'sales_type_family_sale',
    'sales_type_other_sale',
    'sales_type_regular_sale'
]

X = df_encoded[features]
y = df_encoded["price_class"]

print(X.shape)
print(y.shape)

(99989, 16)
(99989,)


***Train-Test Split***

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

***Feature Scaling (for Logistic Regression)***

In [28]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

***Logistic Regression Classifier***

In [29]:
log_model = LogisticRegression(
    max_iter=5000,
    class_weight="balanced",
    random_state=42
)

log_model.fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

log_acc = accuracy_score(y_test, y_pred_log)
log_cm = confusion_matrix(y_test, y_pred_log)

print("Logistic Regression Accuracy:", log_acc)
print("Confusion Matrix:")
print(" LOW    MID  HIGH")
print(log_cm)

Logistic Regression Accuracy: 0.8802880288028803
Confusion Matrix:
 LOW    MID  HIGH
[[9230  704   65]
 [ 983 8429  752]
 [   0 1087 8747]]


***Decision Tree Classifier***

In [30]:
tree_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=20,
    class_weight="balanced",
    random_state=42
)

tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

tree_acc = accuracy_score(y_test, y_pred_tree)
tree_cm = confusion_matrix(y_test, y_pred_tree)

print("Decision Tree Accuracy:", tree_acc)
print("Confusion Matrix:")
print(" LOW    MID  HIGH")
print(tree_cm)

Decision Tree Accuracy: 0.9859319265259859
Confusion Matrix:
 LOW    MID  HIGH
[[ 9874   125     0]
 [   93 10000    71]
 [    0   133  9701]]


***Random Forest Classifier***

In [17]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=20,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

rf_acc = accuracy_score(y_test, y_pred_rf)
rf_cm = confusion_matrix(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_acc)
print("Confusion Matrix:\n", rf_cm)

Random Forest Accuracy: 0.9631629829649632
Confusion Matrix:
 [[9691  308    0]
 [ 278 9729  157]
 [   0  362 9472]]


***Ensemble Model (Hard Voting)***

In [18]:
voting_model = VotingClassifier(
    estimators=[
        ("logistic", log_model),
        ("tree", tree_model),
        ("rf", rf_model)
    ],
    voting="hard"
)

voting_model.fit(X_train, y_train)
y_pred_ensemble = voting_model.predict(X_test)

ensemble_acc = accuracy_score(y_test, y_pred_ensemble)
ensemble_cm = confusion_matrix(y_test, y_pred_ensemble)

print("Ensemble Accuracy:", ensemble_acc)
print("Confusion Matrix:\n", ensemble_cm)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Ensemble Accuracy: 0.9726972697269727
Confusion Matrix:
 [[9780  219    0]
 [ 204 9836  124]
 [   0  272 9562]]


***Model Comparison Table***

In [19]:
results = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "Decision Tree",
        "Random Forest",
        "Hard Voting Ensemble"
    ],
    "Accuracy": [
        log_acc,
        tree_acc,
        rf_acc,
        ensemble_acc
    ]
})

print(results)

                  Model  Accuracy
0   Logistic Regression  0.880288
1         Decision Tree  0.985932
2         Random Forest  0.963163
3  Hard Voting Ensemble  0.972697
