## 1. Setup & Imports

In [1]:
# Cell 1: Setup & Imports
import os, gc, math, warnings
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge, RidgeCV, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Try LGB/XGB
LGB_AVAILABLE = False
XGB_AVAILABLE = False
try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except Exception:
    pass
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except Exception:
    pass

BASE_DIR = os.path.abspath(os.getcwd())
INPUT_DIR = os.path.join(BASE_DIR, "data")
OUT_DIR = os.path.join(BASE_DIR, "results", "advanced_v2_outputs")
os.makedirs(OUT_DIR, exist_ok=True)
SEED = 42
np.random.seed(SEED)

print("LGB:", LGB_AVAILABLE, "XGB:", XGB_AVAILABLE)


LGB: True XGB: True


## 2. Competition metric & utilities

In [2]:
# Cell 2: Two-stage metric and helper utilities

def two_stage_score_from_arrays(y_true, y_pred):
    eps = 1e-12
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    ape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), eps)
    frac_bad = np.mean(ape > 1.0)
    if frac_bad > 0.30:
        return 0.0
    good_mask = (ape <= 1.0)
    if good_mask.sum() == 0:
        return 0.0
    mape = np.mean(ape[good_mask])
    scaled = mape / good_mask.mean()
    return 1.0 - scaled

def evaluate_preds(y_true, y_pred, name="eval"):
    score = two_stage_score_from_arrays(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    frac_bad = np.mean(np.abs(y_pred - y_true) / np.maximum(np.abs(y_true),1e-12) > 1.0)
    print(f"{name} | two-stage score: {score:.6f} | MAE: {mae:.4f} | frac_bad: {frac_bad:.4f}")
    return {"score":score, "mae":mae, "frac_bad":frac_bad}

def save_df(df, fname):
    path = os.path.join(OUT_DIR, fname)
    df.to_csv(path, index=False)
    print("Saved:", path)
    return path


## 3. Load base datasets (raw CSVs)

In [3]:
# Cell 3: Load base files from INPUT_DIR
files_needed = [
 "train/new_house_transactions.csv",
 "train/new_house_transactions_nearby_sectors.csv",
 "train/pre_owned_house_transactions.csv",
 "train/pre_owned_house_transactions_nearby_sectors.csv",
 "train/land_transactions.csv",
 "train/land_transactions_nearby_sectors.csv",
 "train/sector_POI.csv",
 "train/city_search_index.csv",
 "train/city_indexes.csv",
 "test.csv",
 "sample_submission.csv"
]

data = {}
for f in files_needed:
    # Adjust path for train/ subdirectory
    if f.startswith('train/'):
        fp = os.path.join(INPUT_DIR, f)
    else:
        fp = os.path.join(INPUT_DIR, f)

    if os.path.exists(fp):
        print("Loading", f)
        data[f] = pd.read_csv(fp)
    else:
        # Fallback for files that might be in train subfolder
        fp_alt = os.path.join(INPUT_DIR, 'train', f)
        if os.path.exists(fp_alt):
            print("Loading", f, "from train/")
            data[f] = pd.read_csv(fp_alt)
        else:
            print("Missing", f, " — check path")
            data[f] = None

# quick shapes
for k,v in data.items():
    if v is None: continue
    print(k, v.shape)


Loading train/new_house_transactions.csv
Loading train/new_house_transactions_nearby_sectors.csv
Loading train/pre_owned_house_transactions.csv
Loading train/pre_owned_house_transactions_nearby_sectors.csv
Loading train/land_transactions.csv
Loading train/land_transactions_nearby_sectors.csv
Loading train/sector_POI.csv
Loading train/city_search_index.csv
Loading train/city_indexes.csv
Loading test.csv
Loading sample_submission.csv
train/new_house_transactions.csv (5433, 11)
train/new_house_transactions_nearby_sectors.csv (5360, 11)
train/pre_owned_house_transactions.csv (5360, 6)
train/pre_owned_house_transactions_nearby_sectors.csv (5427, 6)
train/land_transactions.csv (5896, 6)
train/land_transactions_nearby_sectors.csv (5025, 6)
train/sector_POI.csv (86, 142)
train/city_search_index.csv (4020, 4)
train/city_indexes.csv (7, 74)
test.csv (1152, 2)
sample_submission.csv (1152, 2)


## 4. Clean each base dataset (no merging)

In [4]:
# Cell 4: Clean base datasets individually and save cleaned copies (no merging)
cleaned = {}

# Transaction tables: numeric fill 0, strip strings
for key in ["train/new_house_transactions.csv","train/pre_owned_house_transactions.csv","train/land_transactions.csv",
            "train/new_house_transactions_nearby_sectors.csv","train/pre_owned_house_transactions_nearby_sectors.csv","train/land_transactions_nearby_sectors.csv"]:
    df = data.get(key)
    if df is None:
        cleaned[key] = None
        continue
    numcols = df.select_dtypes(include=[np.number]).columns.tolist()
    df[numcols] = df[numcols].fillna(0)
    for c in df.select_dtypes(include=['object']).columns:
        df[c] = df[c].astype(str).str.strip()
    cleaned[key] = df
    save_df(df, os.path.basename(key).replace(".csv","") + "_clean.csv")

# POI: drop >70% missing, fill rest 0
poi = data.get("train/sector_POI.csv")
if poi is not None:
    miss = poi.isna().mean()
    drop = miss[miss>0.7].index.tolist()
    poi_clean = poi.drop(columns=drop).fillna(0)
    cleaned["train/sector_POI.csv"] = poi_clean
    save_df(poi_clean, "sector_POI_clean.csv")
else:
    cleaned["train/sector_POI.csv"] = None

# city_search: aggregate if repeated keywords
csi = data.get("train/city_search_index.csv")
if csi is not None:
    if "search_volume" in csi.columns:
        csi_agg = csi.groupby("month", as_index=False)["search_volume"].sum().rename(columns={"search_volume":"city_search_volume"})
    else:
        # choose numeric candidate
        cand = [c for c in csi.columns if c.lower().count("search") or c.lower().count("volume")]
        if cand:
            csi_agg = csi.groupby("month", as_index=False)[cand[0]].sum().rename(columns={cand[0]:"city_search_volume"})
        else:
            csi_agg = csi.copy()
    cleaned["train/city_search_index.csv"] = csi_agg
    save_df(csi_agg, "city_search_index_clean.csv")
else:
    cleaned["train/city_search_index.csv"] = None

# city_indexes: drop >80% missing, ffill/bfill
ci = data.get("train/city_indexes.csv")
if ci is not None:
    missci = ci.isna().mean()
    dropci = missci[missci>0.8].index.tolist()
    ci_clean = ci.drop(columns=dropci).fillna(method='ffill').fillna(method='bfill')
    cleaned["train/city_indexes.csv"] = ci_clean
    save_df(ci_clean, "city_indexes_clean.csv")
else:
    cleaned["train/city_indexes.csv"] = None

# save test and sample submission
if data.get("test.csv") is not None:
    save_df(data["test.csv"], "test_clean.csv")
if data.get("sample_submission.csv") is not None:
    save_df(data["sample_submission.csv"], "sample_submission_clean.csv")


Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\new_house_transactions_clean.csv
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\pre_owned_house_transactions_clean.csv
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\land_transactions_clean.csv
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\new_house_transactions_nearby_sectors_clean.csv
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\pre_owned_house_transactions_nearby_sectors_clean.csv
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\land_transactions_nearby_sectors_clean.csv
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\sector_POI_clean.csv
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\ad

## 5. Aggregate base transaction tables to month+sector (for modeling)

In [5]:
# Cell 5: Aggregate (sum) numeric columns by month & sector per table
def agg_month_sector(df):
    if df is None or 'month' not in df.columns or 'sector' not in df.columns:
        return None
    numcols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numcols:
        return None
    return df.groupby(['month','sector'])[numcols].sum().reset_index()

agg_new = agg_month_sector(cleaned["train/new_house_transactions.csv"])
agg_pre = agg_month_sector(cleaned["train/pre_owned_house_transactions.csv"])
agg_land = agg_month_sector(cleaned["train/land_transactions.csv"])
agg_new_near = agg_month_sector(cleaned["train/new_house_transactions_nearby_sectors.csv"])
agg_pre_near = agg_month_sector(cleaned["train/pre_owned_house_transactions_nearby_sectors.csv"])
agg_land_near = agg_month_sector(cleaned["train/land_transactions_nearby_sectors.csv"])

for name, df in [("agg_new",agg_new),("agg_pre",agg_pre),("agg_land",agg_land),
                 ("agg_new_near",agg_new_near),("agg_pre_near",agg_pre_near),("agg_land_near",agg_land_near)]:
    if df is not None:
        save_df(df, f"{name}.csv")
        print(name, df.shape)


Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\agg_new.csv
agg_new (5433, 11)
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\agg_pre.csv
agg_pre (5360, 6)
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\agg_land.csv
agg_land (5896, 6)
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\agg_new_near.csv
agg_new_near (5360, 11)
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\agg_pre_near.csv
agg_pre_near (5427, 6)
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\agg_land_near.csv
agg_land_near (5025, 6)


## 6. Build modeling table by merging aggregates + POI + city search + city indexes (but built from base)

In [6]:
# Cell 6: Merge aggregates into modeling table (no use of prior merged files)
if agg_new is None:
    raise RuntimeError("new_house transactions empty - cannot proceed")

model = agg_new.copy()
def merge_pref(base, other, pref):
    if other is None: return base
    o = other.copy()
    for c in o.columns:
        if c not in ['month','sector']:
            o.rename(columns={c: f"{pref}_{c}"}, inplace=True)
    return base.merge(o, on=['month','sector'], how='left')

model = merge_pref(model, agg_pre, 'pre')
model = merge_pref(model, agg_land, 'land')
model = merge_pref(model, agg_new_near, 'new_near')
model = merge_pref(model, agg_pre_near, 'pre_near')
model = merge_pref(model, agg_land_near, 'land_near')

# merge city_search
if cleaned["train/city_search_index.csv" ] is not None:
    model = model.merge(cleaned["train/city_search_index.csv"], on='month', how='left')

# merge POI
if cleaned["train/sector_POI.csv"] is not None:
    poi = cleaned["train/sector_POI.csv"].copy()
    # ensure sector types align
    try:
        poi['sector'] = poi['sector'].astype(model['sector'].dtype)
    except:
        poi['sector'] = poi['sector'].astype(str)
        model['sector'] = model['sector'].astype(str)
    model = model.merge(poi, on='sector', how='left')

# merge city index by year if exists
ci = cleaned["train/city_indexes.csv"]
if ci is not None:
    if 'city_indicator_data_year' in ci.columns:
        ci2 = ci.rename(columns={'city_indicator_data_year':'year'})
        model['year'] = model['month'].astype(str).str[:4].astype(int)
        keep = ['year'] + [c for c in ci2.select_dtypes(include=[np.number]).columns if c!='year']
        ci2 = ci2[keep]
        ci2 = ci2.rename(columns={c: f"cityidx_{c}" for c in ci2.columns if c!='year'})
        model = model.merge(ci2, on='year', how='left')

# fill numeric NaNs with 0
numcols = model.select_dtypes(include=[np.number]).columns.tolist()
model[numcols] = model[numcols].fillna(0)

save_df(model, "modeling_table_from_base.csv")
print("Model table shape:", model.shape)


Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\modeling_table_from_base.csv
Model table shape: (6419, 251)


## 7. Feature engineering (lags, rolling, ratios) using the modeling table

In [7]:
# Cell 7: Feature engineering
m = model.sort_values(['sector','month']).reset_index(drop=True)
# month extraction
m['month_str'] = m['month'].astype(str)
m['month_num'] = m['month_str'].str.extract(r'-(\d{2})$')[0].fillna('0').astype(int)
m['season'] = ((m['month_num']%12)//3)+1

# key columns to create lags for
key_cols = [c for c in ['num_new_house_transactions','area_new_house_transactions','price_new_house_transactions','amount_new_house_transactions'] if c in m.columns]
for col in key_cols:
    for lag in [1,3,6]:
        m[f"{col}_lag{lag}"] = m.groupby('sector')[col].shift(lag).fillna(0)
    m[f"{col}_roll3"] = m.groupby('sector')[col].rolling(3, min_periods=1).mean().reset_index(level=0, drop=True)

# supply-demand ratio
if 'num_new_house_available_for_sale' in m.columns and 'num_new_house_transactions' in m.columns:
    m['supply_demand_ratio'] = m['num_new_house_available_for_sale'] / (m['num_new_house_transactions'] + 1)
    m['supply_demand_ratio'] = m['supply_demand_ratio'].replace(np.inf,0).fillna(0)

# log transforms for heavy-tail columns
num_cols = m.select_dtypes(include=[np.number]).columns.tolist()
for c in num_cols:
    if c in ['month_num','year']: continue
    m[f"{c}_log1p"] = np.log1p(m[c].clip(lower=0))

# log target
if 'amount_new_house_transactions' not in m.columns:
    # attempt to build from num * price
    if 'num_new_house_transactions' in m.columns and 'price_new_house_transactions' in m.columns:
        m['amount_new_house_transactions'] = m['num_new_house_transactions'] * m['price_new_house_transactions']
    else:
        m['amount_new_house_transactions'] = 0
m['y_log1p'] = np.log1p(m['amount_new_house_transactions'].clip(lower=0))

save_df(m, "modeling_table_engineered_from_base.csv")
print("Engineered shape:", m.shape)


Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\modeling_table_engineered_from_base.csv
Engineered shape: (6419, 538)


## 8. Feature selection (use combined ranking if available) and prepare X/y

In [8]:
# Cell 8: Feature selection
fr_path = os.path.join(INPUT_DIR, "results", "feature_ranking_combined.csv")
if os.path.exists(fr_path):
    fr = pd.read_csv(fr_path)
    cand_feats = [f for f in fr['feature'].tolist() if f in m.columns]
    print("Loaded external feature ranking, candidate features:", len(cand_feats))
else:
    cand_feats = [c for c in m.select_dtypes(include=[np.number]).columns if c not in ['amount_new_house_transactions','y_log1p','month_num','year']]
    print("Fallback numeric candidate features:", len(cand_feats))

# Keep features with >1 unique value
features = [f for f in cand_feats if m[f].nunique()>1]
# Prioritize lags and core new-house features
priority = [f for f in features if ('lag' in f or 'roll' in f or 'new_house' in f or 'new' in f or 'amount_new_house' in f or 'num_new_house' in f)]
others = [f for f in features if f not in priority]
final_features = (priority[:60] + others[:40])[:120]
print("Final features count:", len(final_features))
print("Sample features:", final_features[:20])

X = m[final_features].fillna(0)
y = m['amount_new_house_transactions'].values
y_log = m['y_log1p'].values

# encode object features if any
for c in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[c] = le.fit_transform(X[c].astype(str))


Loaded external feature ranking, candidate features: 294
Final features count: 87
Sample features: ['amount_new_house_transactions_log1p', 'amount_new_house_transactions_lag3', 'area_new_house_transactions_lag1', 'amount_new_house_transactions_lag3_log1p', 'amount_new_house_transactions_lag1', 'num_new_house_transactions', 'price_new_house_transactions_lag3_log1p', 'num_new_house_transactions_roll3', 'new_near_price_new_house_transactions_nearby_sectors_log1p', 'new_near_total_price_per_unit_new_house_transactions_nearby_sectors', 'price_new_house_transactions_lag3', 'price_new_house_transactions_lag1', 'area_new_house_transactions_lag3', 'num_new_house_transactions_lag1_log1p', 'price_new_house_transactions_lag1_log1p', 'new_near_total_price_per_unit_new_house_transactions_nearby_sectors_log1p', 'area_new_house_transactions', 'area_new_house_transactions_lag3_log1p', 'price_new_house_transactions_log1p', 'total_price_per_unit_new_house_transactions_log1p']


## 9. Rolling CV training (two-stage) and OOF evaluation (LightGBM + XGBoost stacking)

In [9]:
# ============================================================
# Cell 9: Rolling CV training for two-stage with stacking meta-model
# ============================================================

months = sorted(m['month'].unique().tolist())
n_months = len(months)
N_FOLDS = 5
fold_months = []

# --- build rolling time-based folds ---
for i in range(N_FOLDS):
    train_end = int((i + 1) * n_months / (N_FOLDS + 1))
    val_start = train_end
    val_end = min(train_end + max(1, n_months // (N_FOLDS + 1)), n_months)
    if train_end == 0 or val_start >= val_end:
        continue
    fold_months.append((months[:train_end], months[val_start:val_end]))
print("folds:", len(fold_months))

# --- allocate arrays ---
oof_preds = np.zeros(len(X))
oof_p_zero = np.zeros(len(X))
meta_train = []        # meta features per fold
meta_idx = []
test_meta_preds = []   # store test predictions per fold

# --- prepare test features aligned with final_features ---
test_df = data["test.csv"].copy()
for f in final_features:
    if f not in test_df.columns:
        test_df[f] = 0
X_test = test_df[final_features].fillna(0)

# ============================================================
# CV loop
# ============================================================
for fold, (train_ms, val_ms) in enumerate(fold_months):
    print(f"\n================ Fold {fold} ================")
    train_idx = m['month'].isin(train_ms)
    val_idx = m['month'].isin(val_ms)

    X_tr, X_val = X.loc[train_idx], X.loc[val_idx]
    y_tr_log, y_val_log = y_log[train_idx], y_log[val_idx]
    y_tr_orig, y_val_orig = y[train_idx], y[val_idx]

    # ------------------------------------------------------------
    # 1️⃣ Classifier: predict whether amount_new_house_transactions == 0
    # ------------------------------------------------------------
    y_clf_tr = pd.Series((y_tr_orig == 0).astype(int))
    y_clf_val = pd.Series((y_val_orig == 0).astype(int))

    if y_clf_tr.nunique() > 1 and LGB_AVAILABLE:
        dtr_clf = lgb.Dataset(X_tr, label=y_clf_tr)
        dval_clf = lgb.Dataset(X_val, label=y_clf_val, reference=dtr_clf)
        clf = lgb.train(
            {
                'objective': 'binary',
                'metric': 'auc',
                'learning_rate': 0.05,
                'num_leaves': 64,
                'verbosity': -1,
                'seed': SEED
            },
            dtr_clf,
            num_boost_round=1000,
            valid_sets=[dtr_clf, dval_clf],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(0)
            ]
        )
        p_zero_val = clf.predict(X_val, num_iteration=clf.best_iteration)
        p_zero_test = clf.predict(X_test, num_iteration=clf.best_iteration)

    elif y_clf_tr.nunique() > 1:
        lr = LogisticRegression(max_iter=1000, class_weight='balanced')
        lr.fit(X_tr.fillna(0), y_clf_tr)
        p_zero_val = lr.predict_proba(X_val.fillna(0))[:, 1]
        p_zero_test = lr.predict_proba(X_test.fillna(0))[:, 1]

    else:
        # Fallback: only one class present
        const_prob = float(y_clf_tr.iloc[0])
        print(f"⚠️  Fold {fold}: Only one class in training (all {const_prob}). Using constant fallback.")
        p_zero_val = np.full(len(X_val), const_prob, dtype=float)
        p_zero_test = np.full(len(X_test), const_prob, dtype=float)

    # ------------------------------------------------------------
    # 2️⃣ Regressor: predict transaction amount (log-scale)
    # ------------------------------------------------------------
    reg_train_mask = train_idx & (m['amount_new_house_transactions'] > 0)

    if reg_train_mask.sum() >= max(50, int(0.1 * train_idx.sum())) and LGB_AVAILABLE:
        dtr = lgb.Dataset(X.loc[reg_train_mask], label=y_log[reg_train_mask])
        dval = lgb.Dataset(X_val, label=y_val_log, reference=dtr)
        reg = lgb.train(
            {
                'objective': 'regression',
                'metric': 'mae',
                'learning_rate': 0.03,
                'num_leaves': 128,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'verbosity': -1,
                'seed': SEED
            },
            dtr,
            num_boost_round=2000,
            valid_sets=[dtr, dval],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(0)
            ]
        )
        p_reg_val_log = reg.predict(X_val, num_iteration=reg.best_iteration)
        p_reg_test_log = reg.predict(X_test, num_iteration=reg.best_iteration)

    elif XGB_AVAILABLE:
        dtr = xgb.DMatrix(X.loc[train_idx], label=y_log[train_idx])
        dval = xgb.DMatrix(X_val, label=y_val_log)
        reg = xgb.train(
            {
                'objective': 'reg:squarederror',
                'eta': 0.03,
                'max_depth': 7,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'seed': SEED
            },
            dtr,
            num_boost_round=1000,
            evals=[(dval, 'val')],
            early_stopping_rounds=50,
            verbose_eval=False
        )
        p_reg_val_log = reg.predict(xgb.DMatrix(X_val))
        p_reg_test_log = reg.predict(xgb.DMatrix(X_test))

    else:
        rf = RandomForestRegressor(n_estimators=400, random_state=SEED, n_jobs=-1)
        rf.fit(X_tr.fillna(0), y_log[train_idx])
        p_reg_val_log = rf.predict(X_val.fillna(0))
        p_reg_test_log = rf.predict(X_test.fillna(0))

    # Convert log predictions back to original scale
    p_reg_val = np.expm1(np.clip(p_reg_val_log, -20, 50))
    p_reg_test = np.expm1(np.clip(p_reg_test_log, -20, 50))

    # ------------------------------------------------------------
    # 3️⃣ Combine classifier + regressor for final predictions
    # ------------------------------------------------------------
    p_final_val = (1 - p_zero_val) * p_reg_val

    # store OOF and meta features
    oof_preds[val_idx] = p_reg_val
    oof_p_zero[val_idx] = p_zero_val
    meta_train.append(np.vstack([p_reg_val, p_zero_val]).T)
    meta_idx.append(np.where(val_idx)[0])
    test_meta_preds.append(np.vstack([p_reg_test, p_zero_test]).T)

    # evaluate fold
    evaluate_preds(y_val_orig, p_final_val, name=f"fold{fold}")

# ============================================================
# Build meta matrices for stacking
# ============================================================
meta_X = np.zeros((len(X), 2))  # [reg_pred, p_zero]
for idxs, arr in zip(meta_idx, meta_train):
    meta_X[idxs, :] = arr

# Average test meta predictions
test_meta_avg = np.mean(np.stack(test_meta_preds, axis=2), axis=2)  # (n_test, 2)

# Final OOF evaluation (combined)
oof_combined = (1 - oof_p_zero) * oof_preds
evaluate_preds(y, oof_combined, name="OOF")


folds: 5

⚠️  Fold 0: Only one class in training (all 0.0). Using constant fallback.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[366]	training's l1: 0.0125041	valid_1's l1: 0.0460823
fold0 | two-stage score: 0.954802 | MAE: 2627.1292 | frac_bad: 0.0011

⚠️  Fold 1: Only one class in training (all 0.0). Using constant fallback.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[383]	training's l1: 0.00704008	valid_1's l1: 0.0283713
fold1 | two-stage score: 0.972292 | MAE: 2667.5879 | frac_bad: 0.0000

⚠️  Fold 2: Only one class in training (all 0.0). Using constant fallback.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[529]	training's l1: 0.00350936	valid_1's l1: 0.0198539
fold2 | two-stage score: 0.980271 | MAE: 884.1121 | frac_bad: 0.0000

⚠️  Fold 3: Only one class in training (all 0.0). Using constant fallback.
Training until validati

{'score': 0.8324899331512069,
 'mae': 5145.548145114742,
 'frac_bad': 0.00015578750584203146}

## 10. Train stacking meta-model (Ridge) on meta features and produce final test predictions & submission

In [10]:
# Cell 10: Stacking meta-model (Ridge) and final submission
# Use only rows where we have meta features (all rows should have)
# meta_X columns: [reg_pred, p_zero] for each training row
meta_y = y  # original-scale target

# Fit Ridge on meta features; but target for meta should be original-scale target
ridge = RidgeCV(alphas=[0.1,1.0,10.0], cv=5)
ridge.fit(meta_X, meta_y)
meta_test_pred = ridge.predict(test_meta_avg)  # direct predicted amounts

# Alternatively, blend: final = (1 - mean p_zero) * mean reg_pred then adjust with ridge residual
# But using ridge on meta is straightforward
# Evaluate OOF after stacking (predict meta_X)
oof_stack = ridge.predict(meta_X)
evaluate_preds(y, oof_stack, name="OOF_stacked")

# Prepare test submission predictions
final_test_preds = meta_test_pred
# Postprocess: clip negative & floor tiny values to 0 to avoid huge percentage errors on zeros
final_test_preds = np.clip(final_test_preds, 0, None)
final_test_preds = np.where(final_test_preds < 1.0, 0.0, final_test_preds)

# Build submission respecting original test id format
test_original = data["test.csv"].copy()
if 'id' in test_original.columns:
    sub = pd.DataFrame({'id': test_original['id'], 'new_house_transaction_amount': final_test_preds})
else:
    if {'month','sector'}.issubset(test_original.columns):
        sub = pd.DataFrame({'id': test_original['month'].astype(str) + "_sector " + test_original['sector'].astype(str),
                            'new_house_transaction_amount': final_test_preds})
    else:
        sub = pd.DataFrame({'id': [f"sample_{i}" for i in range(len(final_test_preds))],
                            'new_house_transaction_amount': final_test_preds})

save_df(sub, "submission_advanced_v2_stacked.csv")
print("Submission saved. Upload to Kaggle.")


OOF_stacked | two-stage score: 0.592481 | MAE: 8009.7168 | frac_bad: 0.2014
Saved: C:\Users\Mitudru\Documents\ML Project\realestateprediction\results\advanced_v2_outputs\submission_advanced_v2_stacked.csv
Submission saved. Upload to Kaggle.
