## Task 1.3: Algebraic Feature Engineering (Landlord Perspective)

This task creates 10 new algebraic features using only landlord-controlled variables to avoid data leakage.
The features are space_efficiency (beds per bedroom), price_per_bedroom, price_per_bathroom,
occupancy_rate ((365 - availability_365) / 365), space_per_person (bedrooms per guest accommodated),
host_portfolio_intensity (host_total_listings_count per guest accommodated),
booking_flexibility_score (1 / (minimum_nights + 1)), value_density (guests accommodated per dollar),
capacity_utilization (beds per guest accommodated), and price_normalized (min-max scaled price).
All features are derived from information available at listing creation time.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category so library chatter does not clutter the cell output.
warnings.filterwarnings('ignore')

# Plot defaults applied to the figures created later in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Title banner, emitted as a single write (one line per entry).
print('\n'.join([
    '=' * 80,
    'TASK 1.3: ALGEBRAIC FEATURE ENGINEERING - LANDLORD PERSPECTIVE',
    'San Francisco & San Diego Airbnb Dataset',
    '=' * 80,
]))

### 1. Load Data

In [None]:
# Load the cleaned listings table that every later cell operates on.
df = pd.read_csv('../../data/processed/listings_cleaned_with_target.csv')
print(f'\nLoaded Dataset Shape: {df.shape}')
print(f'Columns: {df.shape[1]}')
print(f'Rows: {df.shape[0]:,}')

print('\nChecking Required Columns for Algebraic Features:')
# Every column read by the feature formulas in the next section.
required_cols = ['price', 'accommodates', 'bedrooms', 'beds', 'bathrooms_numeric',
                 'availability_365', 'host_total_listings_count', 'minimum_nights']

missing_cols = [col for col in required_cols if col not in df.columns]
for col in required_cols:
    suffix = ' - MISSING' if col in missing_cols else ''
    print(f'  {col}{suffix}')

all_present = not missing_cols
if not all_present:
    # Fail here with the full list instead of letting a later cell die on the
    # first missing column after it has already added some features to df.
    raise KeyError(f'Required columns missing from dataset: {missing_cols}')

print('\nAll required columns present')

### 2. Create 10 Algebraic Features (Landlord-Controlled Only)

In [None]:
def _ratio_with_median_fill(numerator, denominator):
    """Return numerator / denominator with undefined results median-filled.

    Zero denominators are replaced by NaN before dividing, so the division
    never produces Inf. Every NaN left in the quotient is then replaced by
    the median of the quotient.
    """
    quotient = numerator / denominator.replace(0, np.nan)
    return quotient.fillna(quotient.median())


def _print_range_and_mean(values, precision=2, prefix=''):
    """Print the min/max range and the mean of *values* in this cell's report format."""
    print(f'   Range: {prefix}{values.min():.{precision}f} to {prefix}{values.max():.{precision}f}')
    print(f'   Mean: {prefix}{values.mean():.{precision}f}')


print('\n' + '='*80)
print('CREATING 10 NEW ALGEBRAIC FEATURES - NO REVIEW DATA')
print('='*80)

# Feature 1: space_efficiency (beds per bedroom)
print('\n1. Creating: space_efficiency = beds / bedrooms')
df['space_efficiency'] = _ratio_with_median_fill(df['beds'], df['bedrooms'])
_print_range_and_mean(df['space_efficiency'])

# Feature 2: price_per_bedroom
print('\n2. Creating: price_per_bedroom = price / bedrooms')
df['price_per_bedroom'] = _ratio_with_median_fill(df['price'], df['bedrooms'])
_print_range_and_mean(df['price_per_bedroom'], prefix='$')

# Feature 3: price_per_bathroom
print('\n3. Creating: price_per_bathroom = price / bathrooms_numeric')
df['price_per_bathroom'] = _ratio_with_median_fill(df['price'], df['bathrooms_numeric'])
_print_range_and_mean(df['price_per_bathroom'], prefix='$')

# Feature 4: occupancy_rate
# NOTE(review): this is the share of the year NOT marked available; whether
# that equals booked nights depends on how availability_365 is defined - confirm.
print('\n4. Creating: occupancy_rate = (365 - availability_365) / 365')
df['occupancy_rate'] = ((365 - df['availability_365']) / 365).clip(0, 1)
_print_range_and_mean(df['occupancy_rate'])

# Feature 5: space_per_person
print('\n5. Creating: space_per_person = bedrooms / accommodates')
df['space_per_person'] = _ratio_with_median_fill(df['bedrooms'], df['accommodates'])
_print_range_and_mean(df['space_per_person'])

# Feature 6: host_portfolio_intensity
print('\n6. Creating: host_portfolio_intensity = host_total_listings_count / accommodates')
df['host_portfolio_intensity'] = _ratio_with_median_fill(
    df['host_total_listings_count'], df['accommodates']
)
_print_range_and_mean(df['host_portfolio_intensity'])

# Feature 7: booking_flexibility_score
# The +1 keeps the denominator non-zero when minimum_nights is 0.
print('\n7. Creating: booking_flexibility_score = 1 / (minimum_nights + 1)')
df['booking_flexibility_score'] = 1 / (df['minimum_nights'] + 1)
_print_range_and_mean(df['booking_flexibility_score'], precision=6)

# Feature 8: value_density (accommodates per dollar)
print('\n8. Creating: value_density = accommodates / price')
df['value_density'] = _ratio_with_median_fill(df['accommodates'], df['price'])
_print_range_and_mean(df['value_density'], precision=4)

# Feature 9: capacity_utilization (beds per accommodation)
print('\n9. Creating: capacity_utilization = beds / accommodates')
df['capacity_utilization'] = _ratio_with_median_fill(df['beds'], df['accommodates'])
_print_range_and_mean(df['capacity_utilization'])

# Feature 10: price_normalized
print('\n10. Creating: price_normalized = (price - min) / (max - min)')
price_min = df['price'].min()
price_span = df['price'].max() - price_min
# If every price is identical the span is 0 and the formula would be 0/0 (NaN)
# for all rows; dividing by 1 instead maps the constant column to 0.0.
price_divisor = price_span if price_span != 0 else 1
df['price_normalized'] = (df['price'] - price_min) / price_divisor
_print_range_and_mean(df['price_normalized'], precision=4)

print('\n' + '='*80)
print('ALL 10 ALGEBRAIC FEATURES CREATED SUCCESSFULLY')
print('='*80)

### 3. Data Quality Check

In [None]:
# Names of the ten engineered columns, in creation order; later cells reuse this list.
new_features = [
    'space_efficiency',
    'price_per_bedroom',
    'price_per_bathroom',
    'occupancy_rate',
    'space_per_person',
    'host_portfolio_intensity',
    'booking_flexibility_score',
    'value_density',
    'capacity_utilization',
    'price_normalized',
]

print(f'\nNew Dataset Shape: {df.shape}')
print(f'Added: {len(new_features)} new algebraic features')

print('\nData Quality Check:')
# Per-feature (NaN count, Inf count), gathered up front and reported below.
issue_counts = {
    name: (df[name].isna().sum(), np.isinf(df[name]).sum())
    for name in new_features
}

quality_ok = True
for feature, (nan_count, inf_count) in issue_counts.items():
    if nan_count == 0 and inf_count == 0:
        continue
    print(f'  {feature}: {nan_count} NaN, {inf_count} Inf values')
    quality_ok = False

if quality_ok:
    print('  All features are clean (no NaN or Inf values)')

### 4. Feature Statistics

In [None]:
import os


def _describe_feature(position, name, values):
    """Build one row of the statistics table for the Series *values*."""
    return {
        'No': position,
        'Feature_Name': name,
        'Mean': values.mean(),
        'Median': values.median(),
        'Std_Dev': values.std(),
        'Min': values.min(),
        'Max': values.max(),
        'Q1': values.quantile(0.25),
        'Q3': values.quantile(0.75),
        'Skewness': values.skew(),
        'Missing': values.isna().sum(),
    }


print('\n' + '='*80)
print('DETAILED STATISTICS FOR 10 ALGEBRAIC FEATURES')
print('='*80)

stats_data = [
    _describe_feature(position, feature, df[feature])
    for position, feature in enumerate(new_features, 1)
]
stats_df = pd.DataFrame(stats_data)

print('\n')
# Iterate the plain records rather than stats_df.iterrows(): same values,
# without the detour through object-dtype row Series.
for row in stats_data:
    print(f"{row['No']}. {row['Feature_Name'].upper()}")
    print(f"   Mean: {row['Mean']:.4f} | Median: {row['Median']:.4f} | Std: {row['Std_Dev']:.4f}")
    print(f"   Range: [{row['Min']:.4f}, {row['Max']:.4f}] | IQR: [{row['Q1']:.4f}, {row['Q3']:.4f}]")
    print(f"   Skewness: {row['Skewness']:.4f} | Missing: {row['Missing']}")
    print()

# to_csv does not create parent directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('../../outputs', exist_ok=True)
stats_df.to_csv('../../outputs/algebraic_features_statistics.csv', index=False)
print('Statistics saved: outputs/algebraic_features_statistics.csv')

### 5. Visualizations

In [None]:
import os


def _feature_axes(n_features, n_cols=2):
    """Create a grid of axes with one panel per feature.

    Returns the figure and a flat array of axes. Panels beyond *n_features*
    are hidden. Ten features give the 5x2, 15x20-inch layout.
    """
    n_rows = max(1, -(-n_features // n_cols))  # ceiling division
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    flat_axes = grid.flatten()
    for spare in flat_axes[n_features:]:
        spare.set_visible(False)
    return fig, flat_axes


print('\n' + '='*80)
print('CREATING VISUALIZATIONS')
print('='*80)

# savefig does not create parent directories; make sure the target exists.
os.makedirs('../../outputs/figures', exist_ok=True)

# Distribution plots
fig, axes = _feature_axes(len(new_features))

for idx, feature in enumerate(new_features):
    # Drop NaN first (as the boxplots below do): the histogram range cannot
    # be computed from data containing NaN.
    axes[idx].hist(df[feature].dropna(), bins=50, edgecolor='black', alpha=0.7)
    axes[idx].set_title(f'{feature}', fontsize=10, fontweight='bold')
    axes[idx].set_xlabel('Value')
    axes[idx].set_ylabel('Frequency')
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../../outputs/figures/algebraic_features_distributions.png', dpi=300, bbox_inches='tight')
plt.show()
print('\nSaved: outputs/figures/algebraic_features_distributions.png')

# Correlation matrix
plt.figure(figsize=(12, 10))
correlation_matrix = df[new_features].corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1)
plt.title('Correlation Matrix of Algebraic Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../../outputs/figures/algebraic_features_correlation.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: outputs/figures/algebraic_features_correlation.png')

# Boxplots
fig, axes = _feature_axes(len(new_features))

for idx, feature in enumerate(new_features):
    axes[idx].boxplot(df[feature].dropna())
    axes[idx].set_title(f'{feature}', fontsize=10, fontweight='bold')
    axes[idx].set_ylabel('Value')
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../../outputs/figures/algebraic_features_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: outputs/figures/algebraic_features_boxplots.png')

### 6. Save Dataset

In [None]:
# Persist the listings table, now including the engineered columns, next to the input file.
output_path = '../../data/processed/listings_with_algebraic_features.csv'
df.to_csv(output_path, index=False)

n_rows, n_cols = df.shape
print(f'\nSaved dataset to: {output_path}')
print(f'Shape: {n_rows:,} rows x {n_cols} columns')

### 7. Summary Report

In [None]:
import os

print('\n' + '='*80)
print('TASK 1.3 SUMMARY REPORT')
print('='*80)

# Derive the data-quality lines from the data itself so the report cannot
# claim a clean dataset when NaN or Inf values are actually present.
nan_total = int(df[new_features].isna().sum().sum())
inf_total = int(np.isinf(df[new_features]).sum().sum())
nan_summary = 'No NaN values' if nan_total == 0 else f'{nan_total} NaN values found'
inf_summary = 'No Inf values' if inf_total == 0 else f'{inf_total} Inf values found'

# NOTE(review): the "All features properly scaled" line below is not verified
# by any check in this notebook - confirm or reword.
summary = f"""
ALGEBRAIC FEATURE ENGINEERING COMPLETED
{'='*80}

DATASET:
  - Final shape: {df.shape[0]:,} rows x {df.shape[1]} columns
  - Added features: {len(new_features)}

NEW FEATURES (LANDLORD-CONTROLLED ONLY):
  1. space_efficiency - beds per bedroom
  2. price_per_bedroom - price divided by bedrooms
  3. price_per_bathroom - price divided by bathrooms
  4. occupancy_rate - calculated from availability
  5. space_per_person - bedrooms per accommodation
  6. host_portfolio_intensity - listings per accommodation
  7. booking_flexibility_score - inverse of minimum nights
  8. value_density - accommodates per dollar
  9. capacity_utilization - beds per accommodation
  10. price_normalized - normalized price

DATA QUALITY:
  - {nan_summary}
  - {inf_summary}
  - All features properly scaled

OUTPUT FILES:
  - data/processed/listings_with_algebraic_features.csv
  - outputs/algebraic_features_statistics.csv
  - outputs/figures/algebraic_features_distributions.png
  - outputs/figures/algebraic_features_correlation.png
  - outputs/figures/algebraic_features_boxplots.png

CRITICAL NOTES:
  - NO REVIEW DATA USED
  - All features based on landlord-controlled variables
  - No data leakage



{'='*80}
"""

print(summary)

# Create the reports directory on demand, then write the same text that was printed.
os.makedirs('../../outputs/reports', exist_ok=True)
with open('../../outputs/reports/T1.3_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print('Summary saved to: outputs/reports/T1.3_summary.txt')