In [1]:
import pandas as pd
import numpy as np
import pickle
import json
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../Utils')
from tools import create_auxiliary_location_features
print("TEST DATASET PROCESSING PIPELINE")

print("\n[STEP 1] Loading data and saved artifacts...")


def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # Caution: unpickling can run arbitrary code, so only load artifacts
    # that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _read_json(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Raw test set that the remaining cells transform
test_df = pd.read_csv('../Dataset/test.csv')
print(f" Test data loaded: {test_df.shape}")

# Auxiliary tables: HDB block details plus one table per amenity type
df_hdb = pd.read_csv('../Dataset/auxiliary-data/sg-hdb-block-details.csv')
df_mrt = pd.read_csv('../Dataset/auxiliary-data/sg-mrt-stations.csv')
df_hawker = pd.read_csv('../Dataset/auxiliary-data/sg-gov-hawkers.csv')
df_primary = pd.read_csv('../Dataset/auxiliary-data/sg-primary-schools.csv')
df_secondary = pd.read_csv('../Dataset/auxiliary-data/sg-secondary-schools.csv')
df_malls = pd.read_csv('../Dataset/auxiliary-data/sg-shopping-malls.csv')
print(" Auxiliary data loaded")

# Saved artifacts, loaded in the same order as they are reported below
scaler = _read_pickle('../Dataset/features_standard_scaler.pkl')
print(" Scaler loaded")

train_geo_info = _read_pickle('../Dataset/train_geo_info.pkl')
print(" Train geo info loaded")

knn_feature_stats = _read_pickle('../Dataset/knn_feature_stats.pkl')
print(f" KNN feature statistics loaded ({len(knn_feature_stats)} features)")

numerical_features = _read_json('../Dataset/numerical_features.json')
print(f" Numerical features list loaded ({len(numerical_features)} features)")

all_final_features = _read_json('../Dataset/all_final_features.json')
print(f" All final features list loaded ({len(all_final_features)} features)")



TEST DATASET PROCESSING PIPELINE

[STEP 1] Loading data and saved artifacts...
 Test data loaded: (50000, 10)
 Auxiliary data loaded
 Scaler loaded
 Train geo info loaded
 KNN feature statistics loaded (8 features)
 Numerical features list loaded (36 features)
 All final features list loaded (89 features)


In [2]:
# STEP 2: Basic Feature Engineering (Same as Train)
# Works on a copy so the raw `test_df` stays available for later cells.
print("\n[STEP 2] Basic feature engineering...")

test_processed = test_df.copy()

# 2.1 Clean FLAT_TYPE: turn "<n> room" into "<n>-room".
# Both substitutions are case-insensitive, so the second pair finds nothing
# left to replace once the first has run; it is kept so behaviour is untouched.
for pattern, replacement in ((' room', '-room'), (' ROOM', '-ROOM')):
    test_processed['FLAT_TYPE'] = test_processed['FLAT_TYPE'].str.replace(
        pattern, replacement, case=False
    )
print(f"✓ FLAT_TYPE cleaned: {test_processed['FLAT_TYPE'].unique()}")

# 2.2 Time features: MONTH is sliced as 4 leading characters (year) and
# everything from position 5 onwards (month).
month_text = test_processed['MONTH'].str
test_processed['TRANSACTION_YEAR'] = month_text.slice(0, 4).astype(int)
test_processed['TRANSACTION_MONTH'] = month_text.slice(5).astype(int)
print("✓ Time features extracted")

# 2.3 REMAINING_AGE = 99 - (transaction year - lease commencement year)
transaction_year = test_processed['TRANSACTION_YEAR']
lease_start = test_processed['LEASE_COMMENCE_DATA']
test_processed['REMAINING_AGE'] = 99 - transaction_year + lease_start
print("✓ REMAINING_AGE calculated")

# 2.4 Floor level: FLOOR_RANGE is "<low> to <high>"; split once, reuse both parts
floor_bounds = test_processed['FLOOR_RANGE'].str.split(' to ')
test_processed['FLOOR_LEVEL_LOW'] = floor_bounds.str[0].astype(int)
test_processed['FLOOR_LEVEL_HIGH'] = floor_bounds.str[1].astype(int)
test_processed['FLOOR_LEVEL_MID'] = (
    test_processed['FLOOR_LEVEL_LOW'] + test_processed['FLOOR_LEVEL_HIGH']
) / 2
print("✓ Floor level features extracted")

# 2.5 String standardization: trim whitespace, then lower-case STREET/TOWN
# and upper-case BLOCK.
for column, recase in (('STREET', 'lower'), ('TOWN', 'lower'), ('BLOCK', 'upper')):
    trimmed = test_processed[column].str.strip()
    test_processed[column] = getattr(trimmed.str, recase)()
print(" String standardization completed")


[STEP 2] Basic feature engineering...
✓ FLAT_TYPE cleaned: ['3-room' '4-room' '5-room' 'executive' '2-room' '1-room'
 'multi generation']
✓ Time features extracted
✓ REMAINING_AGE calculated
✓ Floor level features extracted
 String standardization completed


In [3]:
print("\n[STEP 3] Categorical encoding...")


def _report_unseen_categories(values, known_categories, column_name):
    """Print a warning when `values` contains entries absent from `known_categories`.

    The one-hot encoders below use handle_unknown='ignore', which encodes such
    rows as all zeros without any message; this makes that situation visible.
    """
    unseen = sorted(set(values.dropna().unique()) - set(known_categories), key=str)
    if unseen:
        affected_rows = int(values.isin(unseen).sum())
        print(
            f"  WARNING: {affected_rows} rows have {column_name} values outside the "
            f"hard-coded category list (their one-hot columns are all 0): {unseen}"
        )


# 3.1 FLAT_TYPE - Ordinal Encoding
# Categories are fixed explicitly so the integer codes do not depend on which
# values happen to be present in the test set.
flat_type_order = [
    '1-room', '2-room', '3-room', '4-room', '5-room', 
    'executive', 'multi generation'
]
flat_type_encoder = OrdinalEncoder(categories=[flat_type_order])
test_processed['FLAT_TYPE_ENCODED'] = flat_type_encoder.fit_transform(test_processed[['FLAT_TYPE']])
print(f"✓ FLAT_TYPE encoded")

# 3.2 FLAT_MODEL - OneHot Encoding
# Hard-coded category list; presumably the values seen in training - TODO confirm.
# NOTE(review): FLAT_MODEL is not trimmed/lower-cased in Step 2 (only STREET,
# TOWN and BLOCK are), while this list is lowercase - the check below surfaces
# any mismatch instead of letting it pass silently.
train_flat_models = [
    '2 room', '3gen', 'adjoined flat', 'apartment', 'dbss', 'improved',
    'improved maisonette', 'maisonette', 'model a', 'model a maisonette',
    'model a2', 'multi generation', 'new generation', 'premium apartment',
    'premium apartment loft', 'premium maisonette', 'simplified',
    'standard', 'terrace', 'type s1', 'type s2'
]
_report_unseen_categories(test_processed['FLAT_MODEL'], train_flat_models, 'FLAT_MODEL')

encoder_flat_model = OneHotEncoder(sparse_output=False, categories=[train_flat_models], handle_unknown='ignore')
encoded_flat_model = encoder_flat_model.fit_transform(test_processed[['FLAT_MODEL']])
flat_model_cols = [f'FLAT_MODEL_{cat}' for cat in train_flat_models]
encoded_flat_model_df = pd.DataFrame(encoded_flat_model, columns=flat_model_cols, index=test_processed.index)
print(f"FLAT_MODEL encoded: {encoded_flat_model_df.shape[1]} columns")

# 3.3 TOWN - OneHot Encoding
# Same approach as FLAT_MODEL: fixed category list, unknown values become all zeros.
train_towns = [
    'ang mo kio', 'bedok', 'bishan', 'bukit batok', 'bukit merah',
    'bukit panjang', 'bukit timah', 'central area', 'choa chu kang',
    'clementi', 'geylang', 'hougang', 'jurong east', 'jurong west',
    'kallang/whampoa', 'marine parade', 'pasir ris', 'punggol',
    'queenstown', 'sembawang', 'sengkang', 'serangoon', 'tampines',
    'toa payoh', 'woodlands', 'yishun'
]
_report_unseen_categories(test_processed['TOWN'], train_towns, 'TOWN')

encoder_town = OneHotEncoder(sparse_output=False, categories=[train_towns], handle_unknown='ignore')
encoded_town = encoder_town.fit_transform(test_processed[['TOWN']])
town_cols = [f'TOWN_{cat}' for cat in train_towns]
encoded_town_df = pd.DataFrame(encoded_town, columns=town_cols, index=test_processed.index)
print(f"✓ TOWN encoded: {encoded_town_df.shape[1]} columns")

# Concatenate encoded features (the frames share test_processed's index)
test_processed = pd.concat([test_processed, encoded_flat_model_df, encoded_town_df], axis=1)
print(f" Encoded features concatenated. Shape: {test_processed.shape}")


[STEP 3] Categorical encoding...
✓ FLAT_TYPE encoded
FLAT_MODEL encoded: 21 columns
✓ TOWN encoded: 26 columns
 Encoded features concatenated. Shape: (50000, 64)


In [4]:
print("\n[STEP 4] Merging with HDB location data...")

# Prepare HDB data: keep only the columns used downstream and normalise the
# join keys the same way Step 2 normalised BLOCK / STREET on the test side.
important_cols = ['BLOCK', 'ADDRESS', 'LATITUDE', 'LONGITUDE', 'POSTAL_CODE', 'MAX_FLOOR']
df_hdb_processed = df_hdb[important_cols].copy()
df_hdb_processed['ADDRESS'] = df_hdb_processed['ADDRESS'].str.strip().str.lower()
df_hdb_processed['BLOCK'] = df_hdb_processed['BLOCK'].str.strip().str.upper()

# Merge (left join keeps every test row; unmatched rows get NaN coordinates)
rows_before_merge = len(test_processed)
test_processed = test_processed.merge(
    df_hdb_processed,
    left_on=['BLOCK', 'STREET'],
    right_on=['BLOCK', 'ADDRESS'],
    how='left'
)

# A left join duplicates a test row for every HDB row sharing its key. That
# would break the one-row-per-test-record layout the later cells rely on,
# so fail loudly instead of carrying extra rows forward.
if len(test_processed) != rows_before_merge:
    raise ValueError(
        f"HDB merge changed the row count from {rows_before_merge} to "
        f"{len(test_processed)}; check the HDB table for duplicate (BLOCK, ADDRESS) keys"
    )

# ADDRESS duplicates STREET after the join
test_processed = test_processed.drop(columns=['ADDRESS'])

missing_coords = test_processed[['LATITUDE', 'LONGITUDE']].isnull().any(axis=1).sum()
print(f"HDB data merged. Missing coordinates: {missing_coords}/{len(test_processed)}")


[STEP 4] Merging with HDB location data...
HDB data merged. Missing coordinates: 0/50000


In [5]:
# STEP 5: Floor Level Derived Features
# ============================================
print("\n[STEP 5] Creating floor level derived features...")

# FLOOR_LEVEL_RATIO: mid-point of the floor range relative to the block's MAX_FLOOR
floor_ratio = test_processed['FLOOR_LEVEL_MID'] / test_processed['MAX_FLOOR']
test_processed['FLOOR_LEVEL_RATIO'] = floor_ratio

# FLOOR_LEVEL_CATEGORY (temporary, only used to derive IS_HIGH_FLOOR)
# NOTE(review): ratios above 1.0 (or equal to 0) fall outside these bins, so
# pd.cut leaves them as NaN and they end up with IS_HIGH_FLOOR = 0.
# Confirm this matches the training pipeline before changing it.
ratio_bin_edges = [0, 0.33, 0.67, 1.0]
ratio_bin_labels = ['Low', 'Mid', 'High']
test_processed['FLOOR_LEVEL_CATEGORY'] = pd.cut(
    test_processed['FLOOR_LEVEL_RATIO'],
    bins=ratio_bin_edges,
    labels=ratio_bin_labels
)

# IS_HIGH_FLOOR: 1 when the unit falls in the 'High' bin, else 0
is_high_category = test_processed['FLOOR_LEVEL_CATEGORY'] == 'High'
test_processed['IS_HIGH_FLOOR'] = is_high_category.astype(int)

# IS_HIGH_FLOOR_IN_PREMIUM_TOWN: high floor AND town in the list below
high_premium_towns = ['sengkang', 'yishun', 'jurong west', 'tampines', 'hougang']
in_listed_town = test_processed['TOWN'].isin(high_premium_towns)
test_processed['IS_HIGH_FLOOR_IN_PREMIUM_TOWN'] = (
    test_processed['IS_HIGH_FLOOR'] & in_listed_town
).astype(int)

print(" Floor level features created")


[STEP 5] Creating floor level derived features...
 Floor level features created


In [6]:
# STEP 6: Amenity Proximity Features
# ============================================
print("\n[STEP 6] Creating amenity proximity features...")

test_coords = test_processed[['LATITUDE', 'LONGITUDE']].copy()

# One entry per amenity type:
# (label used in the progress message, feature prefix, auxiliary table, radii).
# The radii are passed straight to the helper; their unit is defined there.
amenity_specs = [
    ('MRT', 'MRT', df_mrt, [0.5, 1.0, 2.0]),
    ('Hawker', 'HAWKER', df_hawker, [0.5, 1.5, 3.0]),
    ('Primary school', 'PRIMARY', df_primary, [1.0, 2.0, 3.0]),
    ('Secondary school', 'SECONDARY', df_secondary, [1.0, 2.0, 3.0]),
    ('Mall', 'MALL', df_malls, [1.0, 2.0, 3.0]),
]

amenity_frames = []
for label, prefix, amenity_table, search_radii in amenity_specs:
    print(f"  Calculating {label} features...")
    amenity_frames.append(
        create_auxiliary_location_features(
            hdb_coords=test_coords,
            auxilliary_df=amenity_table,
            feature_prefix=prefix,
            radii=search_radii,
            batch_size=1000
        )
    )

# Keep the per-amenity frames available under their individual names
(
    mrt_features,
    hawker_features,
    primary_features,
    secondary_features,
    mall_features,
) = amenity_frames

# Concatenate all proximity features (index-aligned join onto the test frame)
test_processed = test_processed.join(amenity_frames)
print(f"Amenity features created. Shape: {test_processed.shape}")


[STEP 6] Creating amenity proximity features...
  Calculating MRT features...


Creating mall features:   0%|          | 0/50 [00:00<?, ?it/s]

  Calculating Hawker features...


Creating mall features:   0%|          | 0/50 [00:00<?, ?it/s]

  Calculating Primary school features...


Creating mall features:   0%|          | 0/50 [00:00<?, ?it/s]

  Calculating Secondary school features...


Creating mall features:   0%|          | 0/50 [00:00<?, ?it/s]

  Calculating Mall features...


Creating mall features:   0%|          | 0/50 [00:00<?, ?it/s]

Amenity features created. Shape: (50000, 92)


In [7]:
# STEP 7: KNN Geographic Features (CRITICAL!)
# ============================================
# For every test point, look up its nearest training points and summarise
# their prices (mean and standard deviation) for each neighbourhood size K.
print("\n[STEP 7] Creating KNN geographic features using train data...")

# Training-side inputs saved by the training pipeline
X_train_geo = train_geo_info['coordinates']  # Training set coordinates
y_train_price = train_geo_info['prices']      # Training set prices
K_VALUES = train_geo_info['k_values']         # [16, 32, 64, 128]

# Test coordinates, restricted to rows where both LATITUDE and LONGITUDE are present
coordinate_columns = ['LATITUDE', 'LONGITUDE']
test_has_coords = ~test_processed[coordinate_columns].isnull().any(axis=1)
X_test_geo = test_processed.loc[test_has_coords, coordinate_columns].values

# Pre-create every KNN column as NaN so rows without coordinates stay missing
geo_feature_names = [
    f'GEO_{stat}_PRICE_K{k}' for k in K_VALUES for stat in ('AVG', 'STD')
]
for feature_name in geo_feature_names:
    test_processed[feature_name] = np.nan

# Fit KNN on ENTIRE training set (no cross-validation for test)
# NOTE(review): scikit-learn's 'haversine' metric expects [latitude, longitude]
# in radians. The test coordinates above are used exactly as stored in the
# LATITUDE/LONGITUDE columns with no conversion in this notebook - confirm the
# units match how train_geo_info['coordinates'] was saved.
print(f"  Fitting KNN on {len(X_train_geo)} training points...")
largest_k = max(K_VALUES)
nn_model = NearestNeighbors(n_neighbors=largest_k, n_jobs=-1, metric='haversine')
nn_model.fit(X_train_geo)

# One neighbour query at the largest K; smaller K values reuse its leading columns
print(f"  Finding neighbors for {len(X_test_geo)} test points...")
distances, indices = nn_model.kneighbors(X_test_geo)

# Generate KNN features for each K value
for k in K_VALUES:
    prices_of_k_nearest = y_train_price[indices[:, :k]]

    test_processed.loc[test_has_coords, f'GEO_AVG_PRICE_K{k}'] = np.mean(prices_of_k_nearest, axis=1)
    test_processed.loc[test_has_coords, f'GEO_STD_PRICE_K{k}'] = np.std(prices_of_k_nearest, axis=1)

print(f"KNN features created for K values: {K_VALUES}")




[STEP 7] Creating KNN geographic features using train data...
  Fitting KNN on 162570 training points...
  Finding neighbors for 50000 test points...
KNN features created for K values: [16, 32, 64, 128]


In [8]:
# ============================================
# STEP 8: Drop Unnecessary Columns
# ============================================
print("\n[STEP 8] Dropping unnecessary columns...")

# Raw / intermediate columns that have been replaced by engineered features
columns_to_drop = [
    'MONTH', 'TOWN', 'FLAT_TYPE', 'BLOCK', 'STREET', 'FLOOR_RANGE',
    'FLAT_MODEL', 'ECO_CATEGORY', 'LEASE_COMMENCE_DATA',
    'FLOOR_LEVEL_LOW', 'FLOOR_LEVEL_HIGH', 'FLOOR_LEVEL_CATEGORY',
    'POSTAL_CODE'
]

# Only columns that are actually present get dropped (same tolerance as
# errors='ignore'), and the message reports that real count rather than the
# length of the wish list.
dropped_columns = [col for col in columns_to_drop if col in test_processed.columns]
test_processed = test_processed.drop(columns=dropped_columns)
print(f"✓ Dropped {len(dropped_columns)} unnecessary columns")
print(f"  Current shape: {test_processed.shape}")


[STEP 8] Dropping unnecessary columns...
✓ Dropped 13 unnecessary columns
  Current shape: (50000, 87)


In [9]:
# STEP 9: Align with Training Features
# ============================================
# Make the test frame carry exactly the training feature columns, in the
# training order, minus the target columns.
print("\n[STEP 9] Aligning with training feature set...")

# Target columns exist only in the training set, so they are excluded here
train_only_features = ['RESALE_PRICE', 'LOG_RESALE_PRICE']
expected_test_features = [
    name for name in all_final_features if name not in train_only_features
]

# Compare what is expected with what the test frame currently holds
expected_names = set(expected_test_features)
present_names = set(test_processed.columns)
missing_features = expected_names - present_names
extra_features = present_names - expected_names

if missing_features:
    print(f" Missing features: {missing_features}")
    # Expected columns absent from the test frame are created and filled with 0
    for feat in missing_features:
        test_processed[feat] = 0
        print(f" Added {feat} with default value 0")

if extra_features:
    print(f" Extra features (will be dropped): {extra_features}")
    test_processed = test_processed.drop(columns=list(extra_features))

# Put the columns in the training order (targets excluded)
test_processed = test_processed.loc[:, expected_test_features]

print("Feature alignment completed")
print(f"Final shape: {test_processed.shape}")
print(f"Expected: ({len(test_df)}, {len(expected_test_features)})")


[STEP 9] Aligning with training feature set...
Feature alignment completed
Final shape: (50000, 87)
Expected: (50000, 87)


In [10]:
# STEP 10: Final Validation and Save (without standardization)
# ============================================
print("\n[STEP 10] Final validation and saving...")

# Check for missing values
missing_summary = test_processed.isnull().sum()
missing_cols = missing_summary[missing_summary > 0]

if len(missing_cols) > 0:
    print(f"  WARNING: {len(missing_cols)} columns have missing values:")
    print(missing_cols)
    print("\n  Handling missing values...")
    
    # KNN (GEO_*) features: fill gaps with the median stored in knn_feature_stats
    knn_cols = [col for col in test_processed.columns if 'GEO_' in col]
    for col in knn_cols:
        if test_processed[col].isnull().any():
            # Note: the fill value is the saved training-set median, not a statistic of the test set!
            train_median = knn_feature_stats[col]['median']
            # Assign the result back to the frame. A chained
            # `test_processed[col].fillna(..., inplace=True)` acts on a temporary
            # Series under pandas Copy-on-Write and would leave the column unfilled.
            test_processed[col] = test_processed[col].fillna(train_median)
            print(f"    Filled {col} with train median: {train_median:.2f}")
    
    # For other features, fill with 0
    test_processed = test_processed.fillna(0)
    print(f"  Missing values handled")
else:
    print(f"   No missing values detected")

# Save processed test data for Tree-based Model (without standardization)
output_path = '../Dataset/test_data_for_modeling(no_standardization).csv'
test_processed.to_csv(output_path, index=False)
print(f"\n Test data saved to: {output_path}")


[STEP 10] Final validation and saving...
   No missing values detected

 Test data saved to: ../Dataset/test_data_for_modeling(no_standardization).csv


In [11]:
# STEP 11: Apply Standardization
# ============================================
# Transforms the numerical columns with the scaler loaded in Step 1.
# The columns are overwritten in place, so running this cell a second time
# without re-running the earlier cells would transform them twice.
print("\n[STEP 11] Applying standardization ...")

# Keep the order of `numerical_features`, limited to columns present in the test frame
numerical_features_in_test = [
    feature for feature in numerical_features if feature in test_processed.columns
]

print(f"Standardizing {len(numerical_features_in_test)} numerical features...")
scaled_values = scaler.transform(test_processed[numerical_features_in_test])
test_processed[numerical_features_in_test] = scaled_values

print("Standardization applied using saved scaler")



[STEP 11] Applying standardization ...
Standardizing 36 numerical features...
Standardization applied using saved scaler


In [12]:
# STEP 12: Final Validation and Save
# ============================================
# Same missing-value check as Step 10, applied to the frame after Step 11's scaling.
print("\n[STEP 12] Final validation and saving...")

# Check for missing values
missing_summary = test_processed.isnull().sum()
missing_cols = missing_summary[missing_summary > 0]

if len(missing_cols) > 0:
    print(f"  WARNING: {len(missing_cols)} columns have missing values:")
    print(missing_cols)
    print("\n  Handling missing values...")
    
    # KNN (GEO_*) features: fill gaps with the median stored in knn_feature_stats
    # NOTE(review): at this point the numerical columns have already been passed
    # through scaler.transform; if the GEO_* columns are among them, the stored
    # median (presumably in raw price units - confirm) is on a different scale.
    knn_cols = [col for col in test_processed.columns if 'GEO_' in col]
    for col in knn_cols:
        if test_processed[col].isnull().any():
            # Note: the fill value is the saved training-set median, not a statistic of the test set!
            train_median = knn_feature_stats[col]['median']
            # Assign the result back to the frame. A chained
            # `test_processed[col].fillna(..., inplace=True)` acts on a temporary
            # Series under pandas Copy-on-Write and would leave the column unfilled.
            test_processed[col] = test_processed[col].fillna(train_median)
            print(f"    Filled {col} with train median: {train_median:.2f}")
    
    # For other features, fill with 0
    test_processed = test_processed.fillna(0)
    print(f"  Missing values handled")
else:
    print(f"   No missing values detected")

# Save processed test data
output_path = '../Dataset/test_data_for_modeling.csv'
test_processed.to_csv(output_path, index=False)
print(f"\n Test data saved to: {output_path}")


[STEP 11] Final validation and saving...
   No missing values detected

 Test data saved to: ../Dataset/test_data_for_modeling.csv


In [13]:
def _count_columns_containing(*fragments):
    """Count columns of `test_processed` whose name contains any of `fragments`."""
    return len([
        name for name in test_processed.columns
        if any(fragment in name for fragment in fragments)
    ])


# Build the whole report first, then emit it in a single print call
actual_feature_count = test_processed.shape[1]
expected_feature_count = len(expected_test_features)

report_lines = [
    "\n" + "=" * 60,
    "TEST DATA PROCESSING COMPLETED SUCCESSFULLY!",
    f"Original test data:     {test_df.shape}",
    f"Processed test data:    {test_processed.shape}",
    f"Expected features:      {expected_feature_count}",
    f"Actual features:        {actual_feature_count}",
    f"Match:                  {actual_feature_count == expected_feature_count}",
    "\nFeature categories:",
    f"  - Numerical features:    {len(numerical_features_in_test)}",
    f"  - OneHot FLAT_MODEL:     {_count_columns_containing('FLAT_MODEL_')}",
    f"  - OneHot TOWN:           {_count_columns_containing('TOWN_')}",
    f"  - KNN features:          {_count_columns_containing('GEO_')}",
    f"  - Amenity features:      {_count_columns_containing('NEAREST_', 'COUNT_')}",
]
print("\n".join(report_lines))




TEST DATA PROCESSING COMPLETED SUCCESSFULLY!
Original test data:     (50000, 10)
Processed test data:    (50000, 87)
Expected features:      87
Actual features:        87
Match:                  True

Feature categories:
  - Numerical features:    36
  - OneHot FLAT_MODEL:     21
  - OneHot TOWN:           26
  - KNN features:          8
  - Amenity features:      20
