In [1]:
pip install category-encoders

Note: you may need to restart the kernel to use updated packages.


In [2]:
# ============================================================================
# nairobi_realestate_predictor/notebooks/02_feature_engineering.ipynb
# Advanced Feature Engineering for Nairobi Real Estate
# ============================================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import category_encoders as ce
import warnings
warnings.filterwarnings('ignore')

# Load data from EDA phase
df = pd.read_csv('../data/cleaned_nairobi_data.csv')
print(f"Initial dataset shape: {df.shape}")


# ==================== BUSINESS-DRIVEN FEATURE ENGINEERING ====================
print("\n" + "="*60)
print("BUSINESS-DRIVEN FEATURE ENGINEERING")
print("="*60)

# 1. CORE RATIO METRICS
df['PRICE_PER_SQM'] = df['PRICE_KSH'] / df['SIZE_SQM']
df['BEDROOMS_PER_SQM'] = df['BEDROOMS'] / df['SIZE_SQM']
df['ROOM_DENSITY'] = (df['BEDROOMS'] + df['BATHROOMS']) / df['SIZE_SQM']

# 2. LOCATION ENCODING STRATEGIES
print("\n1. Location Encoding Strategies...")

# a) Target Encoding (captures location premium)
location_target_mean = df.groupby('LOCATION')['PRICE_KSH'].transform('mean')
location_target_median = df.groupby('LOCATION')['PRICE_KSH'].transform('median')
df['LOCATION_MEAN_ENCODED'] = location_target_mean
df['LOCATION_MEDIAN_ENCODED'] = location_target_median

# b) Satellite town indicators[citation:1]
satellite_towns = ['Ruiru', 'Kitengela', 'Syokimau', 'Athi River', 'Thika', 'Ngong', 'Rongai']
df['IS_SATELLITE'] = df['LOCATION'].isin(satellite_towns).astype(int)

# c) Infrastructure corridor classification[citation:7]
thika_corridor = ['Ruiru', 'Thika', 'Juja']
mombasa_corridor = ['Kitengela', 'Syokimau', 'Athi River']
ngong_corridor = ['Ngong', 'Rongai']

df['CORRIDOR_THIKA'] = df['LOCATION'].isin(thika_corridor).astype(int)
df['CORRIDOR_MOMBASA'] = df['LOCATION'].isin(mombasa_corridor).astype(int)
df['CORRIDOR_NGONG'] = df['LOCATION'].isin(ngong_corridor).astype(int)

# 3. INTERACTION FEATURES
print("2. Creating Interaction Features...")

# Location-size interactions (premium for large properties in prime areas)
df['LOCATION_SIZE_INTERACTION'] = df['LOCATION_MEAN_ENCODED'] * df['SIZE_SQM']

# Satellite-specific features (value proposition in emerging areas)
df['SATELLITE_SIZE_VALUE'] = df['IS_SATELLITE'] * df['SIZE_SQM']

# Bedroom-location interaction (bedroom premium varies by location)
df['BEDROOM_LOCATION_PREMIUM'] = df['BEDROOMS'] * df['LOCATION_MEAN_ENCODED']

# 4. MARKET SEGMENTATION FEATURES
print("3. Market Segmentation Features...")

# Affordable housing segment (KES 3M-8M range)[citation:10]
df['AFFORDABLE_SEGMENT'] = ((df['PRICE_KSH'] >= 3e6) & (df['PRICE_KSH'] <= 8e6)).astype(int)

# Luxury segment (based on top quartile)
luxury_threshold = df['PRICE_KSH'].quantile(0.75)
df['LUXURY_SEGMENT'] = (df['PRICE_KSH'] > luxury_threshold).astype(int)

# Size categories
df['SIZE_CATEGORY'] = pd.cut(df['SIZE_SQM'], 
                             bins=[0, 60, 120, 200, 400],
                             labels=['Studio', 'Small', 'Medium', 'Large'])

# 5. OUTLIER DETECTION & HANDLING
print("4. Outlier Detection and Handling...")

# Calculate IQR for key features
features_to_check = ['PRICE_KSH', 'SIZE_SQM', 'PRICE_PER_SQM']
outlier_info = {}

for feature in features_to_check:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
    outlier_pct = len(outliers) / len(df) * 100
    
    outlier_info[feature] = {
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'outlier_count': len(outliers),
        'outlier_pct': outlier_pct
    }
    
    print(f"   {feature}: {outlier_pct:.1f}% outliers ({len(outliers)} properties)")

# Cap outliers instead of removing (preserve data volume)
df['PRICE_KSH_CAPPED'] = df['PRICE_KSH'].clip(
    lower=df['PRICE_KSH'].quantile(0.01),
    upper=df['PRICE_KSH'].quantile(0.99)
)

df['SIZE_SQM_CAPPED'] = df['SIZE_SQM'].clip(
    lower=df['SIZE_SQM'].quantile(0.01),
    upper=df['SIZE_SQM'].quantile(0.99)
)

# 6. MISSING VALUE STRATEGY
print("5. Missing Value Analysis...")

# Check for missing values
missing_values = df.isnull().sum()
missing_pct = (missing_values / len(df)) * 100

missing_report = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Pct': missing_pct
}).sort_values('Missing_Pct', ascending=False)

print("\nMissing Value Report:")
print(missing_report[missing_report['Missing_Count'] > 0])

# Imputation strategy
if 'BATHROOMS' in df.columns and df['BATHROOMS'].isnull().any():
    # Impute bathrooms based on bedrooms (typical ratio)
    df['BATHROOMS'] = df['BATHROOMS'].fillna(df['BEDROOMS'].apply(lambda x: min(x, 3)))

# 7. GEOSPATIAL FEATURES (Proximity Proxies)
print("6. Creating Geospatial Proximity Features...")

# Define key Nairobi points of interest
nairobi_cbd = [-1.2921, 36.8219]  # Nairobi CBD
jomo_kenyatta_airport = [-1.3192, 36.9278]  # JKIA
two_rivers_mall = [-1.2110, 36.8872]  # Two Rivers Mall

# Simulate coordinates for demonstration
np.random.seed(42)
df['LATITUDE'] = np.random.uniform(-1.5, -1.1, len(df))
df['LONGITUDE'] = np.random.uniform(36.6, 37.1, len(df))

# Calculate distance proxies (simplified Euclidean)
def calculate_distance(lat1, lon1, lat2, lon2):
    # Simplified distance calculation for demonstration
    # In production, use Haversine formula
    return np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2) * 111  # Approx km

df['DISTANCE_TO_CBD'] = calculate_distance(df['LATITUDE'], df['LONGITUDE'], 
                                           nairobi_cbd[0], nairobi_cbd[1])
df['DISTANCE_TO_AIRPORT'] = calculate_distance(df['LATITUDE'], df['LONGITUDE'],
                                               jomo_kenyatta_airport[0], jomo_kenyatta_airport[1])
df['DISTANCE_TO_MALL'] = calculate_distance(df['LATITUDE'], df['LONGITUDE'],
                                            two_rivers_mall[0], two_rivers_mall[1])

# Infrastructure proximity score[citation:1]
df['INFRASTRUCTURE_SCORE'] = (
    1/(df['DISTANCE_TO_CBD'] + 1) * 0.4 +
    1/(df['DISTANCE_TO_AIRPORT'] + 1) * 0.3 +
    1/(df['DISTANCE_TO_MALL'] + 1) * 0.3
)

# ==================== FEATURE SELECTION PREPARATION ====================
print("\n7. Preparing Features for Modeling...")

# Separate features and target
target = 'PRICE_KSH_CAPPED'  # Using capped price as target

# Define feature categories
categorical_features = ['LOCATION', 'SIZE_CATEGORY']
binary_features = ['IS_SATELLITE', 'CORRIDOR_THIKA', 'CORRIDOR_MOMBASA', 
                   'CORRIDOR_NGONG', 'AFFORDABLE_SEGMENT', 'LUXURY_SEGMENT']
numeric_features = ['BEDROOMS', 'BATHROOMS', 'SIZE_SQM_CAPPED',
                    'PRICE_PER_SQM', 'BEDROOMS_PER_SQM', 'ROOM_DENSITY',
                    'LOCATION_MEAN_ENCODED', 'LOCATION_SIZE_INTERACTION',
                    'SATELLITE_SIZE_VALUE', 'BEDROOM_LOCATION_PREMIUM',
                    'DISTANCE_TO_CBD', 'DISTANCE_TO_AIRPORT', 
                    'DISTANCE_TO_MALL', 'INFRASTRUCTURE_SCORE']

# Create final feature set
feature_columns = categorical_features + binary_features + numeric_features

# Check feature availability
missing_features = [f for f in feature_columns if f not in df.columns]
if missing_features:
    print(f"Warning: Missing features: {missing_features}")
    feature_columns = [f for f in feature_columns if f in df.columns]

print(f"\nTotal features prepared: {len(feature_columns)}")
print(f"  - Categorical: {len(categorical_features)}")
print(f"  - Binary: {len(binary_features)}")
print(f"  - Numeric: {len(numeric_features)}")

# ==================== FEATURE IMPORTANCE PRE-ANALYSIS ====================
print("\n8. Preliminary Feature Analysis...")

# Calculate correlation with target price
feature_correlations = {}
for feature in numeric_features + binary_features:
    if feature in df.columns:
        corr = df[feature].corr(df[target])
        feature_correlations[feature] = abs(corr)

# Top 10 most correlated features
top_features = pd.Series(feature_correlations).sort_values(ascending=False).head(10)
print("\nTop 10 Features by Absolute Correlation with Price:")
for feature, corr in top_features.items():
    print(f"  {feature}: {corr:.3f}")

# ==================== SAVE PROCESSED DATA ====================
print("\n9. Saving Processed Dataset...")

# Create processed dataframe
processed_df = df[feature_columns + [target]].copy()

# One-hot encode categorical variables for some models
processed_df = pd.get_dummies(processed_df, columns=categorical_features, drop_first=True)

print(f"Processed dataset shape: {processed_df.shape}")
print(f"Features: {processed_df.shape[1] - 1}")  # Excluding target
print(f"Samples: {processed_df.shape[0]}")

# Save to file
processed_df.to_csv('../data/processed/nairobi_processed_features.csv', index=False)
print("\nProcessed data saved to '../data/processed/nairobi_processed_features.csv'")

# ==================== BUSINESS INSIGHTS FROM FEATURE ENGINEERING ====================
print("\n" + "="*60)
print("BUSINESS INSIGHTS FROM FEATURE ENGINEERING")
print("="*60)

# 1. Satellite town value proposition
satellite_stats = df[df['IS_SATELLITE'] == 1].agg({
    'PRICE_PER_SQM': 'median',
    'SIZE_SQM': 'median',
    'PRICE_KSH': 'median'
})

core_stats = df[df['IS_SATELLITE'] == 0].agg({
    'PRICE_PER_SQM': 'median',
    'SIZE_SQM': 'median',
    'PRICE_KSH': 'median'
})

print("\n1. Satellite Town Value Proposition[citation:1]:")
print(f"   - Price per SQM: KSh {satellite_stats['PRICE_PER_SQM']:,.0f} (Satellite) vs "
      f"KSh {core_stats['PRICE_PER_SQM']:,.0f} (Core)")
print(f"   - Discount: {(1 - satellite_stats['PRICE_PER_SQM']/core_stats['PRICE_PER_SQM'])*100:.1f}%")
print(f"   - Typical size: {satellite_stats['SIZE_SQM']:.0f} SQM (Satellite) vs "
      f"{core_stats['SIZE_SQM']:.0f} SQM (Core)")

# 2. Affordable housing opportunities
affordable_count = df['AFFORDABLE_SEGMENT'].sum()
affordable_pct = affordable_count / len(df) * 100
print(f"\n2. Affordable Housing Market[citation:5][citation:10]:")
print(f"   - {affordable_count:,} properties ({affordable_pct:.1f}%) in KES 3M-8M range")
print(f"   - Median price in segment: KSh {df[df['AFFORDABLE_SEGMENT']==1]['PRICE_KSH'].median()/1e6:.1f}M")

# 3. Infrastructure impact
infra_corridor_prices = {}
for corridor in ['CORRIDOR_THIKA', 'CORRIDOR_MOMBASA', 'CORRIDOR_NGONG']:
    if corridor in df.columns:
        median_price = df[df[corridor] == 1]['PRICE_KSH'].median() / 1e6
        infra_corridor_prices[corridor] = median_price

print(f"\n3. Infrastructure Corridor Premiums[citation:7]:")
for corridor, price in infra_corridor_prices.items():
    corridor_name = corridor.replace('CORRIDOR_', '').title()
    print(f"   - {corridor_name}: KSh {price:.1f}M median price")

print("\nFeature Engineering Complete. Proceed to Modeling.")

Initial dataset shape: (5000, 6)

BUSINESS-DRIVEN FEATURE ENGINEERING

1. Location Encoding Strategies...
2. Creating Interaction Features...
3. Market Segmentation Features...
4. Outlier Detection and Handling...
   PRICE_KSH: 0.9% outliers (47 properties)
   SIZE_SQM: 0.3% outliers (15 properties)
   PRICE_PER_SQM: 8.3% outliers (414 properties)
5. Missing Value Analysis...

Missing Value Report:
Empty DataFrame
Columns: [Missing_Count, Missing_Pct]
Index: []
6. Creating Geospatial Proximity Features...

7. Preparing Features for Modeling...

Total features prepared: 22
  - Categorical: 2
  - Binary: 6
  - Numeric: 14

8. Preliminary Feature Analysis...



Top 10 Features by Absolute Correlation with Price:
  LUXURY_SEGMENT: 0.827
  LOCATION_MEAN_ENCODED: 0.784
  IS_SATELLITE: 0.709
  LOCATION_SIZE_INTERACTION: 0.585
  BEDROOM_LOCATION_PREMIUM: 0.523
  SATELLITE_SIZE_VALUE: 0.475
  DISTANCE_TO_MALL: 0.394
  DISTANCE_TO_CBD: 0.366
  CORRIDOR_MOMBASA: 0.365
  INFRASTRUCTURE_SCORE: 0.317

9. Saving Processed Dataset...
Processed dataset shape: (5000, 38)
Features: 37
Samples: 5000



Processed data saved to '../data/processed/nairobi_processed_features.csv'

BUSINESS INSIGHTS FROM FEATURE ENGINEERING

1. Satellite Town Value Proposition[citation:1]:
   - Price per SQM: KSh 223,283 (Satellite) vs KSh 369,174 (Core)
   - Discount: 39.5%
   - Typical size: 122 SQM (Satellite) vs 120 SQM (Core)

2. Affordable Housing Market[citation:5][citation:10]:
   - 1 properties (0.0%) in KES 3M-8M range
   - Median price in segment: KSh 5.3M

3. Infrastructure Corridor Premiums[citation:7]:
   - Thika: KSh 26.9M median price
   - Mombasa: KSh 26.5M median price
   - Ngong: KSh 24.5M median price

Feature Engineering Complete. Proceed to Modeling.
