In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time

In [2]:
# PART 1: DATA LOADING
# ============================================================================

geo_path = "/Users/user/Downloads/HDB_All_GeoInfo.csv"
resale_path = "/Users/user/Downloads/Resale flat prices based on registration date from Jan-2017 onwards.csv"
user_profiles_path = "/Users/user/Downloads/Singapore_Homebuyer_Profiles_Updated.xlsx"

geo = pd.read_csv(geo_path)
resale = pd.read_csv(resale_path)

def norm_street(s):
    if pd.isna(s): return s
    return " ".join(str(s).upper().split())

def norm_block(b):
    if pd.isna(b): return b
    return str(b).strip().upper()

geo["block_norm"] = geo["block"].apply(norm_block)
geo["street_norm"] = geo["street_nam"].apply(norm_street)
resale["block_norm"] = resale["block"].apply(norm_block)
resale["street_norm"] = resale["street_name"].apply(norm_street)
resale["month"] = pd.to_datetime(resale["month"], format="%Y-%m", errors="coerce")

In [3]:
grp = resale.groupby(["block_norm", "street_norm"], as_index=False)
agg_alltime = grp.agg(
    resale_txn_count=("resale_price", "size"),
    resale_price_median_alltime=("resale_price", "median"),
    resale_price_mean_alltime=("resale_price", "mean"),
    first_txn_month=("month", "min"),
    last_txn_month=("month", "max"),
)

last_month_map = agg_alltime.set_index(["block_norm", "street_norm"])["last_txn_month"]
resale_with_last = resale.merge(last_month_map.rename("group_last_month"),
                                on=["block_norm", "street_norm"], how="left")
latest_rows = resale_with_last[resale_with_last["month"] == resale_with_last["group_last_month"]]
latest_prices = latest_rows.groupby(["block_norm", "street_norm"], as_index=False).agg(
    latest_resale_price_median=("resale_price", "median"),
    latest_resale_price_mean=("resale_price", "mean"),
    latest_month=("month", "max"),
)

resale_summary = agg_alltime.merge(latest_prices, on=["block_norm", "street_norm"], how="left")
HDB_All_GeoInfo_enriched = geo.merge(resale_summary, on=["block_norm", "street_norm"], how="left")

orig_cols = [c for c in geo.columns if c not in ["block_norm", "street_norm"]]
enrich_cols = [
    "resale_txn_count", "first_txn_month", "last_txn_month",
    "resale_price_median_alltime", "resale_price_mean_alltime",
    "latest_month", "latest_resale_price_median", "latest_resale_price_mean",
]
final_cols = orig_cols + ["block_norm", "street_norm"] + enrich_cols
HDB_All_GeoInfo_enriched = HDB_All_GeoInfo_enriched[final_cols]

df = HDB_All_GeoInfo_enriched[~HDB_All_GeoInfo_enriched['latest_resale_price_mean'].isna()].copy()
df = df.loc[:, ~df.columns.duplicated(keep='first')]

print(f"Loaded {len(df)} properties")

Loaded 9659 properties


In [4]:
# PART 2: FEATURE PREPARATION
# ============================================================================

candidate_features = [
    'mrt_200', 'mrt_500', 'bus_200', 'bus_500',
    'HWKR_500M', 'MALL_500M', 'HOSP_1K', 'PK_500M_IN',
    'GP_SCH_1K', 'GP_SCH_2K',
]

features = [c for c in candidate_features if c in df.columns]
for c in features:
    df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0)

print(f"Using {len(features)} features")

Using 10 features


In [5]:
# PART 3: LOAD USER PROFILES
# ============================================================================

user_profiles = pd.read_excel(user_profiles_path, sheet_name='Homebuyers')
print(f"Loaded {len(user_profiles)} user profiles")

priority_cols = ['Priority_School_Proximity', 'Priority_Park_Access', 
                 'Priority_Affordability', 'Priority_MRT_Access', 
                 'Priority_Bus_Access', 'Priority_Amenities']  # ADDED Bus

for col in priority_cols:
    if col in user_profiles.columns:  # Check if column exists
        user_profiles[f'{col}_norm'] = (user_profiles[col] - 1) / 4.0
    else:
        print(f"Warning: {col} not found in user profiles")

Loaded 101 user profiles


In [6]:
# PART 4: PERSONALIZED WEAK LABELING
# ============================================================================

def personalized_weak_label(row, user_profile):
    score = 0.0
    budget = user_profile['Budget_SGD']
    
    # 1. Affordability
    affordability_weight = user_profile['Priority_Affordability_norm']
    if row['latest_resale_price_mean'] <= budget:
        score += 5.0 * affordability_weight
    elif row['latest_resale_price_mean'] <= 1.1 * budget:
        score += 2.0 * affordability_weight
    else:
        score -= 3.0 * affordability_weight
    
    # 2. MRT Access
    mrt_weight = user_profile['Priority_MRT_Access_norm']
    if user_profile['Has_Car'] == 'No':
        mrt_weight *= 1.3
    
    if row.get('mrt_200', 0) == 1:
        score += 3.0 * mrt_weight
    elif row.get('mrt_500', 0) == 1:
        score += 2.0 * mrt_weight
    
    # 3. BUS ACCESS (NEW)
    bus_weight = user_profile.get('Priority_Bus_Access_norm', 0)
    if user_profile['Has_Car'] == 'No':
        bus_weight *= 1.3
    
    bus_200 = row.get('bus_200', 0)
    bus_500 = row.get('bus_500', 0)
    
    if bus_200 >= 3:  # 3+ bus stops within 200m
        score += 2.5 * bus_weight
    elif bus_200 >= 1:  # 1-2 bus stops
        score += 1.5 * bus_weight
    elif bus_500 >= 5:  # 5+ bus stops within 500m
        score += 1.0 * bus_weight
    
    # 4. School Proximity
    school_weight = user_profile['Priority_School_Proximity_norm']
    score += row.get('GP_SCH_1K', 0) * 3.0 * school_weight
    score += row.get('GP_SCH_2K', 0) * 1.5 * school_weight
    
    # 5. Park Access
    park_weight = user_profile['Priority_Park_Access_norm']
    score += row.get('PK_500M_IN', 0) * 2.5 * park_weight
    
    # 6. Amenities
    amenity_weight = user_profile['Priority_Amenities_norm']
    score += row.get('HWKR_500M', 0) * 1.0 * amenity_weight
    score += row.get('MALL_500M', 0) * 0.8 * amenity_weight
    score += row.get('HOSP_1K', 0) * 0.5 * amenity_weight
    
    # 7. Price curve
    price_diff = (budget - row['latest_resale_price_mean']) / (0.25 * budget)
    score += np.tanh(price_diff) * 2.0
    
    return score

In [7]:
# PART 5: GENERATE TRAINING DATA
# ============================================================================

print("\nGenerating training data...")
rows = []
for idx, user in user_profiles.iterrows():
    budget_max = user['Budget_SGD'] * 1.2
    cand = df[df['latest_resale_price_mean'] <= budget_max].copy()
    
    n_samples = min(150, len(cand))
    if n_samples > 0:
        cand_sample = cand.sample(n=n_samples, random_state=idx)
        cand_sample['label'] = cand_sample.apply(
            lambda r: personalized_weak_label(r, user), axis=1
        )
        cand_sample['query_id'] = idx + 1
        rows.append(cand_sample)

train_df = pd.concat(rows, ignore_index=True)
print(f"Generated {len(train_df)} training examples")



Generating training data...
Generated 15150 training examples


In [8]:
# PART 6: TRAIN/TEST SPLIT
# ============================================================================

qids = train_df['query_id'].unique()
q_train, q_valid = train_test_split(qids, test_size=0.2, random_state=123)
train_mask = train_df['query_id'].isin(q_train)
valid_mask = train_df['query_id'].isin(q_valid)

X_train = train_df.loc[train_mask, features]
y_train = train_df.loc[train_mask, 'label']
X_valid = train_df.loc[valid_mask, features]
y_valid = train_df.loc[valid_mask, 'label']

print(f"\nTrain size: {len(X_train)}, Validation size: {len(X_valid)}")


Train size: 12000, Validation size: 3150


In [9]:
# PART 7: EVALUATION METRICS
# ============================================================================

def ndcg_at_k(labels, scores, k=10):
    order = np.argsort(-scores)
    gains = (2**labels[order] - 1)[:k]
    discounts = np.log2(np.arange(2, k + 2))
    dcg = np.sum(gains / discounts)
    ideal_order = np.argsort(-labels)
    ideal_gains = (2**labels[ideal_order] - 1)[:k]
    ideal_dcg = np.sum(ideal_gains / discounts)
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def evaluate_model(model, X_valid, y_valid, train_df_valid, model_name):
    """Evaluate model with multiple metrics"""
    predictions = model.predict(X_valid)
    
    # Regression metrics
    mae = mean_absolute_error(y_valid, predictions)
    rmse = np.sqrt(mean_squared_error(y_valid, predictions))
    
    # Ranking metric (NDCG@10 per query)
    val_df = train_df_valid.copy()
    val_df['pred'] = predictions
    
    ndcgs = []
    for qid in val_df['query_id'].unique():
        chunk = val_df[val_df['query_id'] == qid]
        ndcgs.append(ndcg_at_k(chunk['label'].values, chunk['pred'].values, 10))
    avg_ndcg = np.mean(ndcgs)
    
    return {
        'model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'NDCG@10': avg_ndcg
    }


In [10]:
# PART 8: TRAIN MULTIPLE MODELS
# ============================================================================

print("\n" + "="*80)
print("TRAINING AND COMPARING MULTIPLE MODELS")
print("="*80)

results = []
trained_models = {}

# MODEL 1: LightGBM (Gradient Boosting)
print("\n1. Training LightGBM...")
start = time.time()
lgb_model = lgb.LGBMRegressor(
    objective='regression_l1',
    learning_rate=0.05,
    n_estimators=200,
    num_leaves=63,
    min_child_samples=20,
    verbose=-1,
    random_state=42
)
lgb_model.fit(X_train, y_train)
lgb_time = time.time() - start
trained_models['LightGBM'] = lgb_model
result = evaluate_model(lgb_model, X_valid, y_valid, train_df.loc[valid_mask], 'LightGBM')
result['train_time'] = lgb_time
results.append(result)
print(f"   Completed in {lgb_time:.2f}s")

# MODEL 2: XGBoost (Gradient Boosting)
print("\n2. Training XGBoost...")
start = time.time()
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.05,
    n_estimators=200,
    max_depth=6,
    min_child_weight=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0
)
xgb_model.fit(X_train, y_train)
xgb_time = time.time() - start
trained_models['XGBoost'] = xgb_model
result = evaluate_model(xgb_model, X_valid, y_valid, train_df.loc[valid_mask], 'XGBoost')
result['train_time'] = xgb_time
results.append(result)
print(f"   Completed in {xgb_time:.2f}s")

# MODEL 3: Random Forest
print("\n3. Training Random Forest...")
start = time.time()
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
rf_time = time.time() - start
trained_models['RandomForest'] = rf_model
result = evaluate_model(rf_model, X_valid, y_valid, train_df.loc[valid_mask], 'RandomForest')
result['train_time'] = rf_time
results.append(result)
print(f"   Completed in {rf_time:.2f}s")


TRAINING AND COMPARING MULTIPLE MODELS

1. Training LightGBM...


[WinError 2] The system cannot find the file specified
  File "C:\Users\user\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\user\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

   Completed in 0.29s

2. Training XGBoost...
   Completed in 0.11s

3. Training Random Forest...
   Completed in 0.18s


In [11]:
# PART 9: COMPARE RESULTS
# ============================================================================

print("\n" + "="*80)
print("MODEL COMPARISON RESULTS")
print("="*80)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('NDCG@10', ascending=False)

print("\nPerformance Metrics (sorted by NDCG@10):")
print(results_df.to_string(index=False))

# Select best model
best_model_name = results_df.iloc[0]['model']
best_model = trained_models[best_model_name]

print(f"\n{'='*80}")
print(f"BEST MODEL: {best_model_name}")
print(f"{'='*80}")
print(f"NDCG@10: {results_df.iloc[0]['NDCG@10']:.4f}")
print(f"MAE: {results_df.iloc[0]['MAE']:.4f}")
print(f"RMSE: {results_df.iloc[0]['RMSE']:.4f}")
print(f"Training Time: {results_df.iloc[0]['train_time']:.2f}s")

# Feature importance for best model
if best_model_name in ['LightGBM', 'XGBoost', 'RandomForest', 'GradientBoosting']:
    print(f"\nFeature Importance ({best_model_name}):")
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': features,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        print(importance_df.to_string(index=False))


MODEL COMPARISON RESULTS

Performance Metrics (sorted by NDCG@10):
       model      MAE     RMSE  NDCG@10  train_time
RandomForest 2.875382 3.638341 0.391155    0.181684
     XGBoost 2.865324 3.613228 0.371465    0.114970
    LightGBM 2.884139 3.788974 0.347945    0.286501

BEST MODEL: RandomForest
NDCG@10: 0.3912
MAE: 2.8754
RMSE: 3.6383
Training Time: 0.18s

Feature Importance (RandomForest):
   feature  importance
 GP_SCH_1K    0.219178
   bus_500    0.150328
   mrt_500    0.139599
 GP_SCH_2K    0.117640
 HWKR_500M    0.106249
   bus_200    0.090294
PK_500M_IN    0.067740
 MALL_500M    0.063597
   mrt_200    0.030185
   HOSP_1K    0.015190


In [12]:
# PART 10: EXPLANATION GENERATION
# ============================================================================

def explain_recommendation(property_row, user_profile):

    explanations = []
    budget = user_profile['Budget_SGD']
    price = property_row['latest_resale_price_mean']
    
    # 1. Price/Affordability
    price_diff_pct = ((price - budget) / budget) * 100
    if price <= budget:
        explanations.append(f"✓ Within budget by ${budget - price:,.0f} ({abs(price_diff_pct):.1f}% under)")
    elif price <= budget * 1.1:
        explanations.append(f"⚠ Slightly over budget by ${price - budget:,.0f} ({price_diff_pct:.1f}% over)")
    
    # 2. MRT Access
    if user_profile['Priority_MRT_Access'] >= 4:
        if property_row.get('mrt_200', 0) == 1:
            explanations.append("✓ Excellent MRT access (within 200m) - matches your high priority")
        elif property_row.get('mrt_500', 0) == 1:
            explanations.append("✓ Good MRT access (within 500m) - matches your high priority")
        else:
            explanations.append("⚠ No nearby MRT - may not suit your preference")
    
    # 3. BUS ACCESS (NEW)
    if user_profile.get('Priority_Bus_Access', 0) >= 4:
        bus_200 = property_row.get('bus_200', 0)
        bus_500 = property_row.get('bus_500', 0)
        
        if bus_200 >= 3:
            explanations.append(f"✓ Excellent bus access ({int(bus_200)} stops within 200m) - matches your high priority")
        elif bus_200 >= 1:
            explanations.append(f"✓ Good bus access ({int(bus_200)} stops within 200m) - matches your high priority")
        elif bus_500 >= 5:
            explanations.append(f"✓ Decent bus coverage ({int(bus_500)} stops within 500m)")
        else:
            explanations.append("⚠ Limited bus access - may not suit your preference")
    
    # 4. School Proximity
    if user_profile['Priority_School_Proximity'] >= 4:
        school_1k = property_row.get('GP_SCH_1K', 0)
        if school_1k > 0:
            explanations.append(f"✓ {int(school_1k)} primary school(s) within 1km - ideal for families")
        else:
            explanations.append("⚠ No primary schools within 1km")
    
    # 5. Park Access
    if user_profile['Priority_Park_Access'] >= 4:
        if property_row.get('PK_500M_IN', 0) > 0:
            explanations.append("✓ Park within 500m - great for outdoor activities")
        else:
            explanations.append("⚠ No nearby parks")
    
    # 6. Amenities
    if user_profile['Priority_Amenities'] >= 4:
        hawkers = int(property_row.get('HWKR_500M', 0))
        malls = int(property_row.get('MALL_500M', 0))
        amenity_details = []
        if hawkers > 0:
            amenity_details.append(f"{hawkers} hawker centre(s)")
        if malls > 0:
            amenity_details.append(f"{malls} mall(s)")
        
        if amenity_details:
            explanations.append(f"✓ Good amenities: {', '.join(amenity_details)} within 500m")
        else:
            explanations.append("⚠ Limited nearby amenities")
    
    # 7. Overall match score
    match_reasons = len([e for e in explanations if e.startswith("✓")])
    warning_reasons = len([e for e in explanations if e.startswith("⚠")])
    
    if match_reasons >= 3:
        summary = f"Strong match ({match_reasons} key features align with your priorities)"
    elif match_reasons >= 2:
        summary = f"Good match ({match_reasons} features align with your priorities)"
    else:
        summary = f"Moderate match ({match_reasons} features align, {warning_reasons} areas to consider)"
    
    return {
        'summary': summary,
        'details': explanations
    }

def get_recommendations_with_explanations(model, df, features, user_profile, top_k=20):
    """
    Get recommendations with detailed explanations
    """
    cand = df[df['latest_resale_price_mean'] <= user_profile['Budget_SGD'] * 1.2].copy()
    
    # Check if no properties found
    if len(cand) == 0:
        print("\n" + "="*80)
        print("NO SUITABLE PROPERTIES FOUND")
        print("="*80)
        
        print(f"\nUnable to find properties matching your criteria.")
        print(f"\nYour Budget: ${user_profile['Budget_SGD']:,}")
        print(f"Search Range: Up to ${budget_threshold:,.0f} (150% of budget)")
        
        # Analyze why no matches
        reasons = []
        
        # Check budget constraints
        min_price = df['latest_resale_price_mean'].min()
        if budget_threshold < min_price:
            reasons.append(f"Budget constraint: Lowest available property is ${min_price:,.0f}, which exceeds your search range")
        
        # Check how many properties are within various budget multiples
        within_budget = len(df[df['latest_resale_price_mean'] <= user_profile['Budget_SGD']])
        within_120 = len(df[df['latest_resale_price_mean'] <= user_profile['Budget_SGD'] * 1.2])
        within_150 = len(df[df['latest_resale_price_mean'] <= user_profile['Budget_SGD'] * 1.5])
        within_200 = len(df[df['latest_resale_price_mean'] <= user_profile['Budget_SGD'] * 2.0])
        
        print(f"\nProperty Availability by Price Range:")
        print(f"  Within budget (${user_profile['Budget_SGD']:,}): {within_budget} properties")
        print(f"  Up to 120% (${user_profile['Budget_SGD']*1.2:,.0f}): {within_120} properties")
        print(f"  Up to 150% (${user_profile['Budget_SGD']*1.5:,.0f}): {within_150} properties")
        print(f"  Up to 200% (${user_profile['Budget_SGD']*2.0:,.0f}): {within_200} properties")
        
        if within_budget == 0 and within_120 == 0:
            reasons.append("Your budget may be too low for current market prices")
        
        # Show criteria that might be too restrictive
        print(f"\nYour Priority Requirements (4-5 out of 5):")
        high_priorities = []
        if user_profile['Priority_MRT_Access'] >= 4:
            high_priorities.append(f"MRT Access ({user_profile['Priority_MRT_Access']}/5)")
            mrt_props = len(df[(df['mrt_500'] == 1) | (df['mrt_200'] == 1)])
            print(f"  - MRT Access: {mrt_props} properties have MRT within 500m")
        
        if user_profile.get('Priority_Bus_Access', 0) >= 4:
            high_priorities.append(f"Bus Access ({user_profile['Priority_Bus_Access']}/5)")
            bus_props = len(df[df['bus_200'] >= 1])
            print(f"  - Bus Access: {bus_props} properties have bus stops within 200m")
        
        if user_profile['Priority_School_Proximity'] >= 4:
            high_priorities.append(f"School Proximity ({user_profile['Priority_School_Proximity']}/5)")
            school_props = len(df[df['GP_SCH_1K'] > 0])
            print(f"  - School Proximity: {school_props} properties have schools within 1km")
        
        if user_profile['Priority_Park_Access'] >= 4:
            high_priorities.append(f"Park Access ({user_profile['Priority_Park_Access']}/5)")
            park_props = len(df[df['PK_500M_IN'] > 0])
            print(f"  - Park Access: {park_props} properties have parks within 500m")
        
        if user_profile['Priority_Amenities'] >= 4:
            high_priorities.append(f"Amenities ({user_profile['Priority_Amenities']}/5)")
            amenity_props = len(df[(df['HWKR_500M'] > 0) | (df['MALL_500M'] > 0)])
            print(f"  - Amenities: {amenity_props} properties have hawkers/malls nearby")
        
        print(f"\n{'='*80}")
        print("SUGGESTIONS TO FIND PROPERTIES")
        print(f"{'='*80}")
        print("\n1. Increase your budget or search range")
        print(f"   Try searching up to ${user_profile['Budget_SGD']*2.0:,.0f} (200% of budget)")
        
        if len(high_priorities) > 2:
            print("\n2. Consider relaxing some priority requirements")
            print(f"   You have {len(high_priorities)} high priorities, which may be too restrictive")
        
        print("\n3. Consider different locations or HDB towns")
        print("   Some areas may offer better value for your budget")
        
        if user_profile['Has_Car'] == 'No' and user_profile['Priority_MRT_Access'] >= 4:
            print("\n4. Consider bus access as alternative to MRT")
            print("   Many areas have excellent bus connectivity without MRT stations")
        
        return pd.DataFrame()  # Return empty DataFrame
    
    cand_scores = model.predict(cand[features])
    cand['score'] = cand_scores
    
    if user_profile['Priority_MRT_Access'] >= 4:
        cand['score'] += cand['mrt_500'] * 0.5
    if user_profile['Priority_School_Proximity'] >= 4:
        cand['score'] += cand.get('GP_SCH_1K', 0) * 0.3
    
    result = cand.sort_values('score', ascending=False).head(top_k)
    
    # Add explanations
    explanations = []
    for idx, row in result.iterrows():
        exp = explain_recommendation(row, user_profile)
        explanations.append(exp)
    
    result['explanation_summary'] = [e['summary'] for e in explanations]
    result['explanation_details'] = [e['details'] for e in explanations]
    
    return result

# Legacy function for backwards compatibility
def get_recommendations_for_user(model, df, features, user_profile, top_k=20):
    return get_recommendations_with_explanations(model, df, features, user_profile, top_k)

print("\n" + "="*80)
print(f"AI Model recommendations WITH EXPLANATIONS lesgo - BEST MODEL: {best_model_name}")
print("="*80)

# Show detailed recommendations for first user only
user = user_profiles.iloc[100]

print(f"\nUser Profile:")
print(f"  ID: {user['Buyer_ID']}")
print(f"  Name: {user['Name']}")
print(f"  Type: {user['Household_Type']}")
print(f"  Budget: ${user['Budget_SGD']:,}")
print(f"  Has Car: {user['Has_Car']}")
print(f"  Work: {user['Work_Location']}")
print(f"  Top Priorities:")
print(f"    • Affordability: {user['Priority_Affordability']}/5")
print(f"    • MRT Access: {user['Priority_MRT_Access']}/5")
print(f"    • Bus Access: {user.get('Priority_Bus_Access', 'N/A')}/5")  # NEW
print(f"    • School Proximity: {user['Priority_School_Proximity']}/5")
print(f"    • Park Access: {user['Priority_Park_Access']}/5")
print(f"    • Amenities: {user['Priority_Amenities']}/5")

recs = get_recommendations_with_explanations(best_model, df, features, user, top_k=5)

if len(recs) > 0:
    print(f"\n{'='*80}")
    print("TOP 5 RECOMMENDED PROPERTIES")
    print(f"{'='*80}")
    
    for i, (idx, row) in enumerate(recs.iterrows(), 1):
        print(f"\n#{i} | {row['block']} {row['street_nam']}")
        print(f"    Price: ${row['latest_resale_price_mean']:,.0f} | Score: {row['score']:.2f}")
        print(f"\n    {row['explanation_summary']}")
        print(f"\n    Why this property?")
        for detail in row['explanation_details']:
            print(f"      {detail}")
        
        # Show property features
        features_list = []
        if row.get('mrt_200', 0) == 1:
            features_list.append("MRT <200m")
        elif row.get('mrt_500', 0) == 1:
            features_list.append("MRT <500m")
        if row.get('GP_SCH_1K', 0) > 0:
            features_list.append(f"{int(row['GP_SCH_1K'])} schools")
        if row.get('HWKR_500M', 0) > 0:
            features_list.append(f"{int(row['HWKR_500M'])} hawkers")
        if row.get('MALL_500M', 0) > 0:
            features_list.append(f"{int(row['MALL_500M'])} malls")
        
        if features_list:
            print(f"\n    Quick stats: {' | '.join(features_list)}")
        
        if i < len(recs):
            print(f"\n    {'-'*76}")

print("\n" + "="*80)
print("MODEL SELECTION COMPLETE")
print("="*80)
print(f"\nBest model '{best_model_name}' is ready for production use.")
print("\nTo use the best model for any user:")
print("  user = user_profiles.iloc[INDEX]")
print("  recs = get_recommendations_for_user(best_model, df, features, user)")


AI Model recommendations WITH EXPLANATIONS lesgo - BEST MODEL: RandomForest

User Profile:
  ID: HB0101
  Name: Thun Zhen Hong
  Type: Single
  Budget: $320,000
  Has Car: No
  Work: CBD (Raffles Place/Marina Bay)
  Top Priorities:
    • Affordability: 5/5
    • MRT Access: 5/5
    • Bus Access: 2/5
    • School Proximity: 1/5
    • Park Access: 1/5
    • Amenities: 4/5

TOP 5 RECOMMENDED PROPERTIES

#1 | 46 OWEN RD
    Price: $370,000 | Score: 12.10

    Good match (2 features align with your priorities)

    Why this property?
      ✓ Good MRT access (within 500m) - matches your high priority
      ✓ Good amenities: 1 hawker centre(s) within 500m

    Quick stats: MRT <500m | 1 schools | 1 hawkers

    ----------------------------------------------------------------------------

#2 | 118 LOR 1 TOA PAYOH
    Price: $350,000 | Score: 11.62

    Good match (2 features align with your priorities)

    Why this property?
      ⚠ Slightly over budget by $30,000 (9.4% over)
      ✓ Good MR