**Fall 2025 Machine Learning (CSCI-4734 - 10859)**  
Assignment 2

**ADA CSCI4734 2025F House Pricing**  
Predict the price of the houses.

In [16]:
# Import necessary libraries for data manipulation, modeling, and evaluation

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import GroupKFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG so draws made through np.random repeat across runs
np.random.seed(seed=42)

In [17]:
# Load training and test datasets from CSV files for model development and evaluation.
# The shapes are printed to verify successful loading and dataset sizes.

In [2]:
# LOAD DATA
# Read the training and test CSVs in one pass, then report their shapes as a sanity check
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/binaaz_train.csv", "/content/binaaz_test.csv")
)

print(f"Train: {train.shape}, Test: {test.shape}")

Train: (69315, 18), Test: (20000, 17)


In [18]:
# This function removes non-numeric characters from the area column and converts values to numeric.
# Invalid or missing values are coerced to NaN for safe processing.

In [3]:
# CLEAN AREA
def clean_area(x):
    """Convert a raw area value to a number.

    Strings are first stripped of every character other than ASCII digits and
    ".", then parsed; non-string inputs are parsed as-is. Anything that cannot
    be parsed becomes NaN.
    """
    candidate = re.sub(r"[^0-9.]", "", x) if isinstance(x, str) else x
    return pd.to_numeric(candidate, errors="coerce")

# Normalise the area column of both frames with the same cleaner
for frame in (train, test):
    frame["Sahə"] = frame["Sahə"].apply(clean_area)

In [19]:
# This function extracts the current floor and total number of floors from text values.
# If extraction fails, it assigns default values to avoid missing data issues.

In [4]:
# FLOOR EXTRACTOR
def extract_floor_info(x):
    """Parse a floor description into ``(floor, max_floor)``.

    Parsing strategy, in order:
      1. ``"<a>/<b>"`` with exactly one slash and both sides integer-like.
      2. Otherwise, the first two digit runs found anywhere in the string.
      3. Otherwise (non-string input or fewer than two numbers), the default
         ``(1, 5)``.

    Returns:
        tuple[int, int]: current floor and total number of floors.
    """
    default = (1, 5)
    if not isinstance(x, str):
        return default

    if "/" in x:
        parts = x.split("/")
        if len(parts) == 2:
            try:
                return int(parts[0]), int(parts[1])
            except ValueError:
                # Non-numeric text around the slash: fall through to the digit scan.
                # Only ValueError is caught so unrelated errors are not hidden.
                pass

    nums = re.findall(r"\d+", x)
    if len(nums) >= 2:
        return int(nums[0]), int(nums[1])
    return default

# Split the parsed (floor, max_floor) pairs into two columns on each frame
for frame in (train, test):
    floor_pairs = frame["Mərtəbə"].apply(extract_floor_info)
    frame["floor"], frame["max_floor"] = zip(*floor_pairs)

In [20]:
# This extracts the district as the second comma-separated component of the address (e.g. "City, District, ..." -> "District").
# If the address is missing or contains no comma, it assigns "unknown".

In [5]:
# DISTRICT PARSER
def parse_district(address):
    """Return the second comma-separated field of ``address``, stripped of whitespace.

    Returns "unknown" when ``address`` is not a string or contains no comma.
    """
    if not isinstance(address, str):
        return "unknown"
    parts = address.split(",")  # split once instead of once per check
    return parts[1].strip() if len(parts) > 1 else "unknown"


# Same parser for both frames so train and test districts are derived identically
train["district"] = train["Ünvan"].apply(parse_district)
test["district"] = test["Ünvan"].apply(parse_district)

In [21]:
# This converts latitude and longitude to numeric values and fills missing entries using district-level medians.
# If a district median is unavailable, the global median is used as a fallback.

In [6]:
# CLEAN LAT/LON with improved median imputation
for col in ["latitude", "longitude"]:
    train[col] = pd.to_numeric(train[col], errors="coerce")
    test[col] = pd.to_numeric(test[col], errors="coerce")

    # Statistics come from the training set only and are reused for the test set
    medians = train.groupby("district")[col].median()
    global_med = train[col].median()

    for frame in (train, test):
        # District-level median for every row; NaN when the district is unseen in
        # train OR when all of that district's training coordinates are missing.
        district_fill = frame["district"].map(medians)
        # First try the district median, then fall back to the global median so the
        # fallback also covers districts whose own median is NaN.
        frame[col] = frame[col].fillna(district_fill).fillna(global_med)

In [22]:
# This section creates interaction, ratio, and location-based features to capture spatial and structural patterns.
# Missing numeric values are filled with training-set medians to ensure model stability.

In [7]:
# FEATURE ENGINEERING (Expanding features)
baku_center = (40.4093, 49.8671)


def add_engineered_features(df, center=baku_center):
    """Add ratio, distance, rounding and interaction columns to ``df`` in place.

    Reads the columns "floor", "max_floor", "Sahə", "Otaq sayı", "latitude" and
    "longitude" and writes: floor_ratio, area_per_room, log_area_per_room,
    dist_center, lat_round2, lon_round2, floor_x_area, dist_x_floorratio.

    Args:
        df: frame to extend (mutated).
        center: (latitude, longitude) reference point for ``dist_center``.

    Returns:
        The same ``df`` object, for convenience.
    """
    # Zero denominators are replaced (0 floors -> 5, 0 rooms -> 1) to avoid division by zero
    df["floor_ratio"] = df["floor"] / df["max_floor"].replace(0, 5)
    df["area_per_room"] = df["Sahə"] / df["Otaq sayı"].replace(0, 1)
    df["log_area_per_room"] = np.log1p(df["area_per_room"])

    # Euclidean distance in raw degree space (not a geodesic distance)
    df["dist_center"] = np.sqrt((df["latitude"] - center[0])**2 +
                                (df["longitude"] - center[1])**2)

    df["lat_round2"] = df["latitude"].round(2)
    df["lon_round2"] = df["longitude"].round(2)

    df["floor_x_area"] = df["floor"] * df["area_per_room"]
    df["dist_x_floorratio"] = df["dist_center"] * df["floor_ratio"]
    return df


for frame in (train, test):
    add_engineered_features(frame)

numeric_cols = [
    "Sahə","latitude","longitude","floor","max_floor","floor_ratio",
    "area_per_room","log_area_per_room","dist_center","lat_round2","lon_round2",
    "floor_x_area","dist_x_floorratio"
]

# Impute with training-set medians. Assign the result back instead of using
# `df[c].fillna(..., inplace=True)`: that chained in-place form is deprecated and
# does not modify the frame when pandas Copy-on-Write is active.
for c in numeric_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c] = test[c].fillna(med)

In [23]:
# This removes extreme price outliers using the 1st and 99th percentiles to reduce noise.
# A conservative threshold is used to avoid discarding too much valid data.

In [8]:
# ENHANCED OUTLIER REMOVAL (safer)
# Keep only listings whose price lies inside the [1st, 99th] percentile band (bounds inclusive)
Q1, Q2 = train["price"].quantile([0.01, 0.99])
train = train[train["price"].between(Q1, Q2)]

In [24]:
# The target variable is log-transformed to reduce skewness and stabilize variance for regression.

In [9]:
# TARGET
# The models are fit on log1p(price)
y = train["price"].pipe(np.log1p)

# FEATURE SET
# Numeric base features shared by the train and test design matrices
base = [
    "Sahə", "Otaq sayı", "latitude", "longitude",
    "floor", "max_floor", "floor_ratio",
    "area_per_room", "log_area_per_room",
    "dist_center", "lat_round2", "lon_round2",
    "floor_x_area", "dist_x_floorratio",
]

X_train, X_test = (frame[base].copy() for frame in (train, test))

In [25]:
# This encodes categorical features using frequency encoding and label encoding to provide numeric representations.
# Frequency values are log-transformed for smoothing, and label encoding ensures consistent integer mapping.

In [10]:
# FREQUENCY + LABEL ENCODING
cat_cols = ["poster_type", "seher", "district"]

for c in cat_cols:
    # Category counts come from the training frame only; categories without a
    # count (unseen or missing) are treated as having a count of 1.
    freq = train[c].value_counts()

    # The label encoder is fitted on train and test values together (as strings)
    # so that transforming either frame never meets an unknown label.
    le = LabelEncoder().fit(pd.concat([train[c], test[c]]).astype(str))

    for source, target in ((train, X_train), (test, X_test)):
        # smoother freq encoding
        target[f"{c}_freq"] = np.log1p(source[c].map(freq).fillna(1))
        target[f"{c}_le"] = le.transform(source[c].astype(str))

In [26]:
# Standardize features to have zero mean and unit variance for better model performance.
# The same scaler fitted on the training set is applied to the test set to maintain consistency.

In [11]:
# SCALING
# Fit the scaler on the training features only, then apply it to both sets.
# The original row index is carried over so the scaled frames stay label-aligned
# with `y` / `train` (whose index is no longer contiguous after outlier removal).
scaler = StandardScaler()
X_train_sc = pd.DataFrame(scaler.fit_transform(X_train),
                          columns=X_train.columns, index=X_train.index)
X_test_sc = pd.DataFrame(scaler.transform(X_test),
                         columns=X_test.columns, index=X_test.index)

In [27]:
# This class trains several tree-based regressors (Random Forest, XGBoost, LightGBM) with grouped cross-validation and reports SLSQP-optimized blend weights for them.
# The returned predictions come from Ridge stacking on the out-of-fold predictions, which is intended to improve stability and reduce overall MAE.

In [12]:
# ADVANCED ENSEMBLE (with hyperparameter tuning)
class AdvancedOptimizedEnsemble:
    """Grouped-CV ensemble of RF / XGBoost / LightGBM regressors with a Ridge stacker.

    Every base model is trained fold by fold with ``GroupKFold``. Its out-of-fold
    (OOF) predictions feed a Ridge meta-model, and its per-fold test predictions
    are averaged. The target is expected on the log1p scale: all reported MAE
    values are computed after ``np.expm1`` and ``fit_predict`` returns
    ``np.expm1`` of the stacked prediction.

    Attributes:
        n_folds: number of GroupKFold splits.
        weights_: blend weights found by SLSQP (set by ``fit_predict``;
            diagnostic only, the returned prediction comes from the Ridge stacker).
    """

    def __init__(self, n_folds=15):
        self.n_folds = n_folds

    def create_models(self):
        """Return the list of ``(name, estimator)`` base models."""
        return [
            ("rf1", RandomForestRegressor(
                n_estimators=400, max_depth=35, min_samples_split=2,
                max_features=0.88, random_state=42, n_jobs=-1)),
            ("rf2", RandomForestRegressor(
                n_estimators=350, max_depth=40, min_samples_split=2,
                max_features=0.92, random_state=43, n_jobs=-1)),
            ("rf3", RandomForestRegressor(
                n_estimators=300, max_depth=32, min_samples_split=4,
                max_features=0.85, random_state=44, n_jobs=-1)),
            ("xgb", XGBRegressor(
                n_estimators=600, max_depth=8, learning_rate=0.05,
                subsample=0.95, colsample_bytree=0.95,
                reg_alpha=0.75, reg_lambda=0.85,
                gamma=0.15, min_child_weight=3, tree_method="hist",
                random_state=42, n_jobs=-1)),
            ("lgb", LGBMRegressor(
                n_estimators=700, max_depth=7, learning_rate=0.05,
                subsample=0.95, colsample_bytree=0.95,
                reg_alpha=0.8, reg_lambda=0.8,
                min_child_samples=30, num_leaves=45,
                random_state=42, n_jobs=-1))
        ]

    def fit_predict(self, X, y, X_test, groups=None):
        """Train all base models with grouped CV and return stacked test predictions.

        Args:
            X: training features; indexed positionally via ``.iloc``.
            y: log1p-scale target; indexed positionally via ``.iloc``.
            X_test: test features with the same columns as ``X``.
            groups: one group label per row of ``X`` for ``GroupKFold``. When
                omitted, the module-level ``train["district"]`` is used, which
                keeps existing three-argument calls working.

        Returns:
            numpy array of test predictions on the original price scale.

        Raises:
            ValueError: if ``groups`` does not have one entry per row of ``X``.
        """
        if groups is None:
            # Backward-compatible fallback to the notebook-level training frame.
            groups = train["district"]
        groups = np.asarray(groups)
        if len(groups) != len(X):
            raise ValueError(
                f"groups has {len(groups)} entries but X has {len(X)} rows")

        kf = GroupKFold(n_splits=self.n_folds)
        # The split depends only on the groups, so compute it once and reuse it
        # for every model instead of re-splitting inside the model loop.
        splits = list(kf.split(X, y, groups))
        y_price = np.expm1(y)  # target on the original scale, used by every MAE below

        models = self.create_models()
        oof = np.zeros((len(X), len(models)))
        preds = np.zeros((len(X_test), len(models)))

        print("Training models...")

        for i, (name, model) in enumerate(models):
            print(f" ▶ {name}")

            fold_preds = np.zeros((len(X_test), self.n_folds))
            fold_oof = np.zeros(len(X))

            for f, (tr, val) in enumerate(splits):
                model.fit(X.iloc[tr], y.iloc[tr])
                fold_oof[val] = model.predict(X.iloc[val])
                fold_preds[:, f] = model.predict(X_test)

            oof[:, i] = fold_oof
            preds[:, i] = fold_preds.mean(axis=1)

            print(f"   OOF MAE = {mean_absolute_error(y_price, np.expm1(fold_oof)):.4f}")

        # Slightly stronger noise for improved ridge stability
        oof_noisy = oof + np.random.normal(0, 1.5e-4, oof.shape)

        def loss(w):
            return mean_absolute_error(y_price, np.expm1(oof @ w))

        # Convex blend: weights in [0, 1] that sum to 1, started from the uniform blend
        init = np.ones(len(models)) / len(models)
        bounds = [(0, 1)] * len(models)
        cons = {"type": "eq", "fun": lambda w: w.sum() - 1}

        res = minimize(
            loss, init, method="SLSQP",
            bounds=bounds, constraints=cons,
            options={"maxiter": 400, "ftol": 1e-12}
        )

        # Reported and stored for inspection; not used for the returned prediction
        self.weights_ = res.x
        print("Optimized weights:", self.weights_)

        # Ridge stacking (slightly stronger)
        ridge = Ridge(alpha=0.05)
        ridge.fit(oof_noisy, y)
        stack_pred = ridge.predict(preds)

        print("Stacking OOF:", mean_absolute_error(y_price,
                                                   np.expm1(ridge.predict(oof_noisy))))

        return np.expm1(stack_pred)

In [28]:
# Train the advanced ensemble on the scaled features and generate predictions for the test set.
# Then apply smart postprocessing to smooth extreme predictions based on room-specific price ranges and clip overall outliers.

In [13]:
# TRAIN & PREDICT
ensemble = AdvancedOptimizedEnsemble()
test_pred = ensemble.fit_predict(X_train_sc, y, X_test_sc)

# IMPROVED SMART POSTPROCESSING
# For each room count with more than 8 training listings, record the
# (10th, 90th) percentile of training prices as its expected price band.
room_ranges = {
    rooms: (prices.quantile(0.10), prices.quantile(0.90))
    for rooms, prices in train.groupby("Otaq sayı")["price"]
    if len(prices) > 8
}

final = test_pred.copy()

def smooth_pull(x, low, high, strength=0.15):
    """Nudge ``x`` toward the ``[low, high]`` band without hard-clipping it.

    Values inside the band are returned unchanged. Values outside it are moved
    toward the nearest bound by ``strength`` times their distance to that bound.
    """
    if x < low:
        bound = low
    elif x > high:
        bound = high
    else:
        return x
    return x + (bound - x) * strength

# Pull each prediction toward the price band of its room count, when a band exists.
# The room-count column is extracted once up front: `test.iloc[i][...]` would build
# a full row Series for every single prediction.
test_rooms = test["Otaq sayı"].to_numpy()
for i, r in enumerate(test_rooms):
    band = room_ranges.get(r)
    if band is not None:
        final[i] = smooth_pull(final[i], *band)

# Slightly tighter clipping
p1, p99 = train["price"].quantile([0.010, 0.990])
final = np.clip(final, p1, p99)

Training models...
 ▶ rf1
   OOF MAE = 20111.3451
 ▶ rf2
   OOF MAE = 20171.5900
 ▶ rf3
   OOF MAE = 20210.8143
 ▶ xgb
   OOF MAE = 25711.6224
 ▶ lgb
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 63444, number of used features: 20
[LightGBM] [Info] Start training from score 11.955161
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2827
[LightGBM] [Info] Number of data points in the train set: 63444, number of used features: 20
[LightGBM] [Info] Start training from score 11.952496
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013361 seconds.
You can set `force_col_wise=true` to remove the overhead.
[L

In [29]:
# Prepare the final submission file with test IDs and the postprocessed predicted prices.
# Save it as a CSV for upload or evaluation and display the first few rows to verify.

In [15]:
# SUBMISSION
# Pair each test id with its post-processed price, write the CSV, and preview it
submission = test[["_id"]].assign(price=final)
submission.to_csv("binaaz_rf_optimized_final_tuned.csv", index=False)
print(submission.head())


      _id          price
0   20886   73097.450430
1  117465  133078.483905
2  125220  519469.305021
3   77683  129357.980604
4   77672  296266.924731


**AI Usage Declaration:**    
This project was implemented with the assistance of AI (GPT-5-mini), primarily for code finetuning, optimization suggestions, and improving workflow efficiency. While AI helped streamline and refine the implementation, the overall understanding of the procedures, data processing steps, and model decisions was fully reviewed and accepted by the author.