## Task 1.3: Algebraic Feature Engineering

**10 Algebraic Features Created:**
1. space_efficiency 
2. price_per_bedroom 
3. price_per_bathroom 
4. occupancy_rate 
5. booking_flexibility_score 
6. space_per_person 
7. host_portfolio_intensity 
8. bathroom_to_bedroom_ratio 
9. price_to_capacity_ratio 
10. availability_flexibility_score 

Feature engineering is where domain knowledge meets data science, and this task derives ten ratio-style variables from landlord-controlled listing attributes. Each feature is computed only from listing-level columns (price, capacity, bedrooms, beds, bathrooms, availability, minimum nights, and host listing count), so no review-based information leaks into it. Division-by-zero cases are handled by replacing zero denominators with NaN and imputing the resulting gaps with the feature's median; bounded scores are additionally clipped to their valid range. The result is a richer feature space that turns simple columns into more interpretable signals.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure produced in this notebook.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Title banner: an 80-character rule above and below the task name.
banner = "=" * 80
print(banner, "Task 1.3: Algebraic Feature Engineering", banner, sep="\n")

## 1. Load Data from T1.2

In [None]:
# Read the cleaned listings (including the target column) written by T1.2.
input_path = '../../data/processed/listings_cleaned_with_target.csv'
df = pd.read_csv(input_path)

# Report the dimensions of what was loaded.
n_rows, n_cols = df.shape
print(f"\n Loaded Dataset Shape: {df.shape}")
print(f"Columns: {n_cols}")
print(f"Rows: {n_rows:,}")

# List the available columns so the inputs for the features can be eyeballed.
print("\n Existing Columns:")
print(df.columns.tolist())

## 2. Verify Required Columns

In [None]:
# Confirm that every input column used by the algebraic features exists in df.
print("\n Checking Required Columns for Algebraic Features:")
required_cols = [
    'price',
    'accommodates',
    'bedrooms',
    'beds',
    'bathrooms_numeric',
    'availability_365',
    'host_total_listings_count',
    'minimum_nights',
]

# Print one line per required column, flagging the ones that are absent.
missing_cols = [name for name in required_cols if name not in df.columns]
for name in required_cols:
    print(f"{name} - Missing!" if name in missing_cols else f"{name}")

all_present = not missing_cols
if not all_present:
    print("\n Some required columns are missing!")
else:
    print("\n All required columns present! Ready to create algebraic features.")

## 3. Create 10 Landlord-Controlled Algebraic Features

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator with zero denominators median-imputed.

    Zeros in ``denominator`` are replaced with NaN before dividing, so the
    division never produces +/-inf from a zero divisor. Every NaN in the
    resulting ratio (from a zero or missing denominator, or a missing
    numerator) is then filled with the median of the remaining ratios.

    Parameters:
        numerator: pandas Series of dividends.
        denominator: pandas Series of divisors, aligned with ``numerator``.

    Returns:
        pandas Series holding the imputed ratio.
    """
    ratio = numerator / denominator.replace(0, np.nan)
    return ratio.fillna(ratio.median())


def _print_summary(values, prefix="", decimals=2):
    """Print the range, mean and median of a feature column.

    Parameters:
        values: pandas Series to summarise.
        prefix: text printed directly before each number (e.g. "$").
        decimals: number of digits shown after the decimal point.
    """
    spec = f".{decimals}f"
    print(f"Range: {prefix}{values.min():{spec}} to {prefix}{values.max():{spec}}")
    print(f"Mean: {prefix}{values.mean():{spec}}")
    print(f"Median: {prefix}{values.median():{spec}}")


print("\n" + "="*80)

print("="*80)

# Feature 1: space_efficiency (beds per bedroom)
print("\n Creating: space_efficiency = beds / bedrooms")
df['space_efficiency'] = _safe_ratio(df['beds'], df['bedrooms'])
_print_summary(df['space_efficiency'])

# Feature 2: price_per_bedroom
print("\n Creating: price_per_bedroom = price / bedrooms")
df['price_per_bedroom'] = _safe_ratio(df['price'], df['bedrooms'])
_print_summary(df['price_per_bedroom'], prefix="$")

# Feature 3: price_per_bathroom
print("\n Creating: price_per_bathroom = price / bathrooms_numeric")
df['price_per_bathroom'] = _safe_ratio(df['price'], df['bathrooms_numeric'])
_print_summary(df['price_per_bathroom'], prefix="$")

# Feature 4: occupancy_rate
# The divisor is the constant 365, so no zero guard is needed; the result is
# only clipped into [0, 1].
print("\n Creating: occupancy_rate = (365 - availability_365) / 365 (capped 0 to 1)")
df['occupancy_rate'] = ((365 - df['availability_365']) / 365).clip(0, 1)
_print_summary(df['occupancy_rate'])

# Feature 5: booking_flexibility_score
# The +1 keeps the divisor non-zero when minimum_nights is 0.
print("\n Creating: booking_flexibility_score = 1 / (minimum_nights + 1) (higher is more flexible)")
df['booking_flexibility_score'] = 1 / (df['minimum_nights'] + 1)
_print_summary(df['booking_flexibility_score'], decimals=6)

# Feature 6: space_per_person
print("\n Creating: space_per_person = bedrooms / accommodates ")
df['space_per_person'] = _safe_ratio(df['bedrooms'], df['accommodates'])
_print_summary(df['space_per_person'])

# Feature 7: host_portfolio_intensity
print("\n Creating: host_portfolio_intensity = host_total_listings_count / accommodates")
df['host_portfolio_intensity'] = _safe_ratio(df['host_total_listings_count'], df['accommodates'])
_print_summary(df['host_portfolio_intensity'])

# Feature 8: bathroom_to_bedroom_ratio
print("\n Creating: bathroom_to_bedroom_ratio = bathrooms_numeric / bedrooms")
df['bathroom_to_bedroom_ratio'] = _safe_ratio(df['bathrooms_numeric'], df['bedrooms'])
_print_summary(df['bathroom_to_bedroom_ratio'])
print("Interpretation: Higher ratio = more luxury (more bathrooms per bedroom)")

# Feature 9: price_to_capacity_ratio
# The zero guard is applied to the whole product, so a row with
# accommodates == 0 is imputed as well. Guarding only bedrooms would leave a
# zero divisor for such rows and put +/-inf into the column and its median.
print("\n Creating: price_to_capacity_ratio = price / (accommodates × bedrooms) []")
df['price_to_capacity_ratio'] = _safe_ratio(df['price'], df['accommodates'] * df['bedrooms'])
_print_summary(df['price_to_capacity_ratio'], prefix="$")
print("Interpretation: Price efficiency per unit of space")

# Feature 10: availability_flexibility_score
print("\n Creating: availability_flexibility_score = availability_365 / minimum_nights")
df['availability_flexibility_score'] = _safe_ratio(
    df['availability_365'], df['minimum_nights']
).clip(0, 365)  # Cap at 365
_print_summary(df['availability_flexibility_score'])
print("Interpretation: High availability + low minimum nights = more flexible booking")

print("\n" + "="*80)

print("="*80)

## 4. Summary and Data Quality Check

In [None]:
# Names of the ten engineered columns, in the order they were created.
new_features = [
    'space_efficiency',
    'price_per_bedroom',
    'price_per_bathroom',
    'occupancy_rate',
    'booking_flexibility_score',
    'space_per_person',
    'host_portfolio_intensity',
    'bathroom_to_bedroom_ratio',
    'price_to_capacity_ratio',
    'availability_flexibility_score',
]

print(f"\n New Dataset Shape: {df.shape}")
print(f"Added: {len(new_features)} new algebraic features")


# Scan each engineered column for NaN or infinite entries and report offenders.
print("\n Data Quality Check:")
quality_ok = True
for name in new_features:
    column = df[name]
    n_missing = column.isna().sum()
    n_infinite = np.isinf(column).sum()
    if n_missing == 0 and n_infinite == 0:
        continue
    print(f"{name}: {n_missing} NaN, {n_infinite} Inf values")
    quality_ok = False

if quality_ok:
    print(" All features are clean (no NaN or Inf values!)")

## 5. Save Dataset

In [None]:
# Write the enriched dataset (original columns plus the new features) to disk
# without the pandas index column.
output_path = '../../data/processed/listings_with_algebraic_features.csv'
df.to_csv(output_path, index=False)

n_rows, n_cols = df.shape
print(f"\n Saved dataset to: {output_path}")
print(f"Shape: {n_rows:,} rows × {n_cols} columns")

## 6. Detailed Statistics for New Algebraic Features

In [None]:
# One record of descriptive statistics per engineered feature, numbered from 1.
stats_data = [
    {
        'No.': position,
        'Feature Name': name,
        'Mean': df[name].mean(),
        'Median': df[name].median(),
        'Std Dev': df[name].std(),
        'Min': df[name].min(),
        'Max': df[name].max(),
        'Q1': df[name].quantile(0.25),
        'Q3': df[name].quantile(0.75),
        'Skewness': df[name].skew(),
        'Missing': df[name].isna().sum(),
    }
    for position, name in enumerate(new_features, 1)
]

stats_df = pd.DataFrame(stats_data)

# Print a four-line report per feature, followed by a blank separator line.
print("\n")
for _, row in stats_df.iterrows():
    report_lines = [
        f"{row['No.']}. {row['Feature Name'].upper()}",
        f"Mean: {row['Mean']:.4f} | Median: {row['Median']:.4f} | Std: {row['Std Dev']:.4f}",
        f"Range: [{row['Min']:.4f}, {row['Max']:.4f}] | IQR: [{row['Q1']:.4f}, {row['Q3']:.4f}]",
        f"Skewness: {row['Skewness']:.4f} | Missing: {row['Missing']}",
        "",
    ]
    print("\n".join(report_lines))

# Persist the statistics table next to the processed data.
stats_df.to_csv('../../data/processed/algebraic_features_statistics.csv', index=False)
print(" Statistics saved: data/processed/algebraic_features_statistics.csv")

## 7. Visualizations

In [None]:
# Make sure the figure output directory exists before anything is saved.
import os
os.makedirs('../../outputs/figures', exist_ok=True)

# 1. Histograms of all 10 features on a 5 x 2 grid, one panel per feature.
fig, axes = plt.subplots(5, 2, figsize=(15, 20))
axes = axes.ravel()

for panel, feature in zip(axes, new_features):
    panel.hist(df[feature], bins=50, edgecolor='black', alpha=0.7)
    panel.set_title(f'{feature}', fontsize=10, fontweight='bold')
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(
    '../../outputs/figures/algebraic_features_distributions.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()
print("\nSaved: outputs/figures/algebraic_features_distributions.png")

# 2. Annotated heatmap of the pairwise correlations between the new features.
correlation_matrix = df[new_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title(
    'Correlation Matrix of 10 Landlord-Controlled Algebraic Features',
    fontsize=14,
    fontweight='bold',
    pad=20,
)
plt.tight_layout()
plt.savefig(
    '../../outputs/figures/algebraic_features_correlation.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()
print("Saved: outputs/figures/algebraic_features_correlation.png")

# 3. One box plot per feature (5 x 2 grid) to make outliers visible.
fig, axes = plt.subplots(5, 2, figsize=(15, 20))
axes = axes.ravel()

for panel, feature in zip(axes, new_features):
    # Missing values are dropped before the column is handed to boxplot.
    observed = df[feature].dropna()
    panel.boxplot(observed)
    panel.set_title(f'{feature}', fontsize=10, fontweight='bold')
    panel.set_ylabel('Value')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(
    '../../outputs/figures/algebraic_features_boxplots.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()
print("Saved: outputs/figures/algebraic_features_boxplots.png")

# 4. Grouped bar chart comparing each feature's mean and standard deviation.
from sklearn.preprocessing import MinMaxScaler

fig, ax = plt.subplots(figsize=(14, 8))
x_pos = np.arange(len(new_features))
means = [df[f].mean() for f in new_features]
stds = [df[f].std() for f in new_features]

# Min-max scale the ten means (and, separately, the ten standard deviations)
# to [0, 1] across features so both series fit on one axis.
scaler = MinMaxScaler()
means_column = np.array(means).reshape(-1, 1)
stds_column = np.array(stds).reshape(-1, 1)
means_normalized = scaler.fit_transform(means_column).flatten()
stds_normalized = scaler.fit_transform(stds_column).flatten()

# Two bars per feature, shifted left/right of the tick position.
bar_series = (
    (-0.2, means_normalized, 'Mean (normalized)'),
    (0.2, stds_normalized, 'Std Dev (normalized)'),
)
for offset, heights, label in bar_series:
    ax.bar(x_pos + offset, heights, 0.4, label=label, alpha=0.8)

ax.set_xlabel('Features', fontweight='bold')
ax.set_ylabel('Normalized Value', fontweight='bold')
ax.set_title(
    'Mean and Standard Deviation of Algebraic Features (Normalized)',
    fontsize=14,
    fontweight='bold',
)
ax.set_xticks(x_pos)
ax.set_xticklabels(new_features, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(
    '../../outputs/figures/algebraic_features_variability.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()
print("Saved: outputs/figures/algebraic_features_variability.png")

## 8. Final Summary Report

In [None]:
# Final report: input/output shapes, feature formulas, and the files produced.
rule = '=' * 80
print("\n" + rule, "Task 1.3: Summary Report", sep="\n")

# The "before" column count is the current width minus the engineered columns.
n_rows, n_cols_after = df.shape
n_added = len(new_features)
n_cols_before = n_cols_after - n_added

summary = f"""
{rule}

Input Data:
  - Source: data/processed/listings_cleaned_with_target.csv (from T1.2)
  - Shape: {n_rows:,} rows × {n_cols_before} columns (before)

Output Data:
  - Destination: data/processed/listings_with_algebraic_features.csv
  - Shape: {n_rows:,} rows × {n_cols_after} columns (after)
  - Added: {n_added} new algebraic features

  New Algebraic Features:
  1. space_efficiency = beds / bedrooms
  2. price_per_bedroom = price / bedrooms
  3. price_per_bathroom = price / bathrooms_numeric
  4. occupancy_rate = (365 - availability_365) / 365
  5. booking_flexibility_score = 1 / (minimum_nights + 1)
  6. space_per_person = bedrooms / accommodates
  7. host_portfolio_intensity = host_total_listings_count / accommodates
  8. bathroom_to_bedroom_ratio = bathrooms_numeric / bedrooms
  9. price_to_capacity_ratio = price / (accommodates × bedrooms)
  10. availability_flexibility_score = availability_365 / minimum_nights

Output Files:
  - data/processed/listings_with_algebraic_features.csv
  - data/processed/algebraic_features_statistics.csv

Visualizations:
   - outputs/figures/algebraic_features_distributions.png
   - outputs/figures/algebraic_features_correlation.png
   - outputs/figures/algebraic_features_boxplots.png
   - outputs/figures/algebraic_features_variability.png



"""

print(summary)



