## Task 1.3: Algebraic Feature Engineering

Feature engineering is where domain knowledge meets data science—and this task was all about crafting meaningful derived variables from our raw features. We created 10 new algebraic features that capture nuanced relationships: `space_efficiency` (beds per bedroom), `value_density` (reviews per dollar), `price_per_bedroom`, `occupancy_rate`, and `booking_flexibility_score`, among others. Each feature was designed to expose a relationship that the raw columns do not show directly—for instance, `review_momentum` measures recent listing activity as the share of a listing's total reviews that were received in the last twelve months. Division-by-zero edge cases were handled by treating zero denominators as missing and then imputing the feature's median (or 0 for the review-based ratios), and bounded features were clipped to the range [0, 1] to ensure data integrity. The result is a richer feature space that turns simple columns into more informative inputs for modeling.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Define paths (relative to the notebook's working directory)
DATA_PROCESSED = '../../data/processed/'
OUTPUTS_FIGURES = '../../outputs/figures/'
OUTPUTS = '../../outputs/'

# NOTE: the two usage examples below previously ran as live statements at this
# point, i.e. before `df` is loaded and before any figure has been drawn. On a
# fresh kernel that raised NameError on `df`. They are kept as comments only,
# to show how the path constants are meant to be used:
#   df.to_csv(f'{DATA_PROCESSED}listings_with_algebraic_features.csv', index=False)
#   plt.savefig(f'{OUTPUTS_FIGURES}my_plot.png')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Notebook banner, framed above and below by an 80-character rule
banner_rule = "=" * 80
print(
    banner_rule,
    "TASK 1.3: ALGEBRAIC FEATURE ENGINEERING",
    "San Francisco & San Diego Airbnb Dataset",
    banner_rule,
    sep="\n",
)

# Load the EXISTING cleaned data with target and report its dimensions
df = pd.read_csv('../../data/processed/listings_cleaned_with_target.csv')
n_rows, n_cols = df.shape
print(f"\n Loaded Dataset Shape: {df.shape}")
print(f"Columns: {n_cols}")
print(f"Rows: {n_rows:,}")

# List the columns that are available in the loaded frame
print("\n Existing Columns:")
print(list(df.columns))

print("\n Checking Required Columns for Algebraic Features:")
required_cols = ['price', 'accommodates', 'bedrooms', 'beds', 'bathrooms_numeric', 
                 'number_of_reviews', 'availability_365', 'host_total_listings_count',
                 'minimum_nights', 'number_of_reviews_ltm']

# Collect every gap first so a single run reports ALL missing columns,
# then print the status of each required column in its original order.
missing_cols = [col for col in required_cols if col not in df.columns]
for col in required_cols:
    if col in missing_cols:
        print(f"    {col} - MISSING!")
    else:
        print(f"    {col}")

all_present = not missing_cols
if not all_present:
    # Every required column feeds at least one feature computed further down,
    # so continuing would only fail later with a bare KeyError on whichever
    # column happens to be used first. Fail here with the complete list.
    raise KeyError(
        f"Cannot create algebraic features; missing required columns: {missing_cols}"
    )

print("\n All required columns present! Ready to create algebraic features.")

# Section banner. All three lines belong together; the first one used to be
# indented under the `if` above by mistake.
print("\n" + "=" * 80)
print("CREATING 10 NEW ALGEBRAIC FEATURES")
print("=" * 80)

def _safe_ratio(numerator, denominator, fill_value=None):
    """Return ``numerator / denominator`` with zero denominators treated as missing.

    Zeros in ``denominator`` are replaced by NaN before dividing, so a zero
    denominator produces NaN rather than inf. NaN results are then filled
    with ``fill_value``; when ``fill_value`` is None the median of the
    computed ratio is used instead.
    """
    ratio = numerator / denominator.replace(0, np.nan)
    if fill_value is None:
        fill_value = ratio.median()
    return ratio.fillna(fill_value)


def _print_summary(series, decimals=2, prefix=""):
    """Print the range, mean and median of ``series``.

    ``decimals`` sets the number of digits after the decimal point and
    ``prefix`` is written in front of every number (e.g. "$" for prices).
    """
    def fmt(value):
        return f"{prefix}{value:.{decimals}f}"

    print(f"   Range: {fmt(series.min())} to {fmt(series.max())}")
    print(f"   Mean: {fmt(series.mean())}")
    print(f"   Median: {fmt(series.median())}")


# Feature 1: space_efficiency (beds per bedroom), median-imputed
print("\n Creating: space_efficiency = beds / bedrooms")
df['space_efficiency'] = _safe_ratio(df['beds'], df['bedrooms'])
_print_summary(df['space_efficiency'])

# Feature 2: value_density (reviews per dollar), zero-filled
print("\n Creating: value_density = number_of_reviews / price")
df['value_density'] = _safe_ratio(df['number_of_reviews'], df['price'], fill_value=0)
_print_summary(df['value_density'])

# Feature 3: review_to_capacity_ratio, zero-filled
print("\n Creating: review_to_capacity_ratio = number_of_reviews / accommodates")
df['review_to_capacity_ratio'] = _safe_ratio(
    df['number_of_reviews'], df['accommodates'], fill_value=0
)
_print_summary(df['review_to_capacity_ratio'])

# Feature 4: price_per_bedroom, median-imputed
print("\n Creating: price_per_bedroom = price / bedrooms")
df['price_per_bedroom'] = _safe_ratio(df['price'], df['bedrooms'])
_print_summary(df['price_per_bedroom'], prefix="$")

# Feature 5: price_per_bathroom, median-imputed
print("\n Creating: price_per_bathroom = price / bathrooms_numeric")
df['price_per_bathroom'] = _safe_ratio(df['price'], df['bathrooms_numeric'])
_print_summary(df['price_per_bathroom'], prefix="$")

# Feature 6: occupancy_rate (already exists as estimated_occupancy_l365d, but let's create our version)
# Clipped to [0, 1] so out-of-range availability values cannot leave that interval.
print("\n Creating: occupancy_rate = (365 - availability_365) / 365")
df['occupancy_rate'] = ((365 - df['availability_365']) / 365).clip(0, 1)
_print_summary(df['occupancy_rate'])

# Feature 7: review_momentum (recent activity), zero-filled then clipped to [0, 1]
print("\n Creating: review_momentum = number_of_reviews_ltm / number_of_reviews")
df['review_momentum'] = _safe_ratio(
    df['number_of_reviews_ltm'], df['number_of_reviews'], fill_value=0
).clip(0, 1)
_print_summary(df['review_momentum'])

# Feature 8: host_portfolio_intensity, median-imputed
print("\n Creating: host_portfolio_intensity = host_total_listings_count / accommodates")
df['host_portfolio_intensity'] = _safe_ratio(
    df['host_total_listings_count'], df['accommodates']
)
_print_summary(df['host_portfolio_intensity'])

# Feature 9: booking_flexibility_score
# The "+ 1" in the denominator keeps it non-zero when minimum_nights is 0.
print("\n Creating: booking_flexibility_score = 1 / (minimum_nights + 1)")
df['booking_flexibility_score'] = 1 / (df['minimum_nights'] + 1)
_print_summary(df['booking_flexibility_score'], decimals=6)

# Feature 10: space_per_person, median-imputed
print("\n Creating: space_per_person = bedrooms / accommodates")
df['space_per_person'] = _safe_ratio(df['bedrooms'], df['accommodates'])
_print_summary(df['space_per_person'])

print("\n" + "=" * 80)
print(" ALL 10 ALGEBRAIC FEATURES CREATED SUCCESSFULLY!")
print("=" * 80)

# Summary: names of the 10 derived columns, in creation order
new_features = [
    'space_efficiency', 'value_density', 'review_to_capacity_ratio',
    'price_per_bedroom', 'price_per_bathroom', 'occupancy_rate',
    'review_momentum', 'host_portfolio_intensity', 'booking_flexibility_score',
    'space_per_person'
]

# Derive the "before" shape from the frame instead of hard-coding it, so the
# report stays correct when the input file changes. The feature steps only
# add columns, so the row count is unchanged.
# NOTE(review): assumes none of the new feature names already existed in the
# loaded file (otherwise they were overwritten, not added) - confirm.
rows_after, cols_after = df.shape
cols_before = cols_after - len(new_features)

print(f"\n New Dataset Shape: {df.shape}")
print(f"   Before: {rows_after:,} rows × {cols_before} columns")
print(f"   After: {rows_after:,} rows × {cols_after} columns")
print(f"   Added: {len(new_features)} new algebraic features")

# Data quality: report every new feature that still holds NaN or +/-Inf values
print("\n Data Quality Check:")
quality_ok = True
for name in new_features:
    column = df[name]
    n_nan = column.isna().sum()
    n_inf = np.isinf(column).sum()
    if not (n_nan > 0 or n_inf > 0):
        continue
    print(f"    {name}: {n_nan} NaN, {n_inf} Inf values")
    quality_ok = False

if quality_ok:
    print("    All features are clean (no NaN or Inf values)")

# Write the enriched frame (original columns plus the new features) to disk
output_path = '../../data/processed/listings_with_algebraic_features.csv'
df.to_csv(output_path, index=False)
saved_rows, saved_cols = df.shape
print(f"\n Saved dataset to: {output_path}")
print(f"   Shape: {saved_rows:,} rows × {saved_cols} columns")

# Descriptive statistics for each of the new algebraic features
table_rule = "=" * 100
print("\n" + table_rule)
print(" DETAILED STATISTICS FOR 10 ALGEBRAIC FEATURES")
print(table_rule)

# One record per feature; the key order fixes the column order of the CSV
stats_data = []
for position, name in enumerate(new_features, start=1):
    column = df[name]
    stats_data.append({
        'No.': position,
        'Feature Name': name,
        'Mean': column.mean(),
        'Median': column.median(),
        'Std Dev': column.std(),
        'Min': column.min(),
        'Max': column.max(),
        'Q1': column.quantile(0.25),
        'Q3': column.quantile(0.75),
        'Skewness': column.skew(),
        'Missing': column.isna().sum(),
    })

stats_df = pd.DataFrame(stats_data)

# Display formatted table: a title line, three detail lines, then a blank line
print("\n")
for record in stats_df.to_dict('records'):
    title = record['Feature Name'].upper()
    print(f"{record['No.']}. {title}")
    print(f"   Mean: {record['Mean']:.4f} | Median: {record['Median']:.4f} | Std: {record['Std Dev']:.4f}")
    print(f"   Range: [{record['Min']:.4f}, {record['Max']:.4f}] | IQR: [{record['Q1']:.4f}, {record['Q3']:.4f}]")
    print(f"   Skewness: {record['Skewness']:.4f} | Missing: {record['Missing']}")
    print()

# Save statistics to CSV
stats_df.to_csv('../../data/processed/algebraic_features_statistics.csv', index=False)
print("✅ Statistics saved: algebraic_features_statistics.csv")