# Etap 2: Feature Engineering for Profitable Location Prediction

## Business Problem: Task 12 - "Where to look for new profitable locations"

**Objective**: Help Nocarz identify the most profitable neighborhoods in London for establishing new Airbnb properties.

**Target Variable**: We will predict `annual_revenue_potential` - a refined version of the historical annual revenue that addresses the issues identified in our critical evaluation of Etap 1.

**Key Improvements over Etap 1**:
1. Better handling of short observation periods
2. Seasonality considerations
3. More robust price filtering (removing inflated strategic prices)
4. Confidence scoring for revenue estimates
5. Feature engineering based on 182-day median observation period

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings

# Silence every warning for the rest of the session (notebook readability).
warnings.filterwarnings(action="ignore")

# Plot styling
plt.style.use("seaborn-v0_8")

# pandas display limits: cap printed rows at 100, never truncate columns
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

print("Setup complete. Libraries imported.")

In [None]:
# Read the three pre-processed Etap 2 pickles (listings, calendar, reviews)
processed_data_dir = "../data/processed/etap2/"

print("Loading datasets...")
listings_df, calendar_df, reviews_df = (
    pd.read_pickle(processed_data_dir + pickle_name)
    for pickle_name in (
        "listings_e2_df.pkl",
        "calendar_e2_df.pkl",
        "reviews_e2_df.pkl",
    )
)

# Report (rows, columns) of each table as a quick sanity check
print(f"Listings: {listings_df.shape}")
print(f"Calendar: {calendar_df.shape}")
print(f"Reviews: {reviews_df.shape}")
print("\nData loaded successfully!")

## 2. Robust Revenue Calculation

### Addressing Etap 1 Issues:
1. **Short observation periods**: Apply minimum observation requirements
2. **Inflated strategic pricing**: Filter out extreme prices
3. **Naive annualization**: Annualize the filtered daily revenue (ADR × occupancy × 365) and lower the confidence score when less than a full year was observed
4. **Missing confidence measures**: Add reliability scoring

In [None]:
def calculate_robust_revenue_metrics(calendar_df, min_observation_days=30):
    """
    Calculate per-listing revenue metrics and a confidence score from calendar data

    For each listing, days whose price is missing or falls outside that
    listing's own 5th-95th percentile range are dropped. Occupancy, ADR and an
    annualized revenue estimate are then derived from the remaining days.

    Parameters:
    - calendar_df: DataFrame with one row per listing and day. The columns
      `listing_id`, `date`, `price` and `available` are read. `date` values
      must support subtraction yielding a `.days` attribute and `price` must
      be numeric.
    - min_observation_days: Minimum days of data required for reliable estimates.
      Listings with fewer rows, or with fewer than 70% of this number left
      after price filtering, are skipped.

    Returns:
    - DataFrame with listing-level revenue metrics and confidence scores
      (columns: listing_id, observation_days, date_range_days, occupancy_rate,
      adr, daily_revenue, annual_revenue_potential, confidence_score,
      booked_days_count, data_completeness). When no listing qualifies, an
      empty DataFrame with these columns is returned.
    """
    result_columns = [
        "listing_id",
        "observation_days",
        "date_range_days",
        "occupancy_rate",
        "adr",
        "daily_revenue",
        "annual_revenue_potential",
        "confidence_score",
        "booked_days_count",
        "data_completeness",
    ]

    print(f"Calculating revenue metrics for {len(calendar_df)} calendar records...")

    listing_metrics = []

    for listing_id, group in calendar_df.groupby("listing_id"):
        # Skip listings with insufficient data
        total_days = len(group)
        if total_days < min_observation_days:
            continue

        # Calendar span covered by the listing, both end dates included
        date_range = (group["date"].max() - group["date"].min()).days + 1

        # Filter out extreme prices (likely strategic inflation): keep only
        # days priced within the listing's own 5th-95th percentile range.
        # Missing prices compare as False and are dropped as well.
        price = group["price"]
        price_05 = price.quantile(0.05)
        price_95 = price.quantile(0.95)
        filtered_group = group[(price >= price_05) & (price <= price_95)]

        # Need at least 70% of the minimum observation window as valid data
        total_valid_days = len(filtered_group)
        if total_valid_days < min_observation_days * 0.7:
            continue

        # Rows with available == False are counted as booked.
        # NOTE(review): assumes a boolean column - confirm the pickled
        # calendar does not store 't'/'f' strings.
        booked_days = filtered_group[filtered_group["available"] == False]  # noqa: E712
        booked_days_count = len(booked_days)

        # Occupancy rate = booked days / valid (price-filtered) days
        occupancy_rate = (
            booked_days_count / total_valid_days if total_valid_days > 0 else 0
        )

        # ADR (Average Daily Rate) over booked days; fall back to the mean
        # listed price when nothing was booked
        if booked_days_count > 0:
            adr = booked_days["price"].mean()
        else:
            adr = filtered_group["price"].mean()

        # Expected revenue per calendar day
        daily_revenue = adr * occupancy_rate

        # The annual figure is always a linear x365 extrapolation; a short
        # observation window only lowers the confidence, not the estimate.
        annual_revenue = daily_revenue * 365
        if date_range >= 365:
            confidence_score = 1.0  # Full year of data
        else:
            # Max 95% confidence for < 1 year
            confidence_score = min(date_range / 365, 0.95)

        # Additional confidence factors
        # How much data survived filtering
        data_completeness = total_valid_days / total_days
        # At least 10 booked days for full confidence
        booking_activity = min(booked_days_count / 10, 1.0)

        final_confidence = confidence_score * data_completeness * booking_activity

        listing_metrics.append(
            {
                "listing_id": listing_id,
                "observation_days": total_days,
                "date_range_days": date_range,
                "occupancy_rate": occupancy_rate,
                "adr": adr,
                "daily_revenue": daily_revenue,
                "annual_revenue_potential": annual_revenue,
                "confidence_score": final_confidence,
                "booked_days_count": booked_days_count,
                "data_completeness": data_completeness,
            }
        )

    # Passing the column list keeps the schema stable even when no listing
    # qualified (an empty list would otherwise give a column-less frame).
    result_df = pd.DataFrame(listing_metrics, columns=result_columns)
    print(f"Calculated metrics for {len(result_df)} listings with sufficient data")
    if not result_df.empty:
        print(f"Mean confidence score: {result_df['confidence_score'].mean():.3f}")

    return result_df


# Build the listing-level revenue table (default 30-day minimum) and preview it
revenue_metrics = calculate_robust_revenue_metrics(calendar_df=calendar_df)
revenue_metrics.head(5)

## 3. Feature Engineering

Create features that will help predict profitable locations:

In [None]:
def engineer_property_features(listings_df):
    """
    Engineer property-level features from listings data

    Works on a copy, so `listings_df` itself is not modified.

    Parameters:
    - listings_df: DataFrame of listings. Required columns: `price`,
      `accommodates`, `bedrooms`, `beds`, `room_type`, `latitude`,
      `longitude`. Optional columns: `bathrooms_text`, `bathrooms`,
      `review_scores_rating` (and any other `review_scores*` column),
      `host_is_superhost`, `host_response_rate`, `instant_bookable`.

    Returns:
    - Copy of the input with the size, host and location columns converted to
      numeric and these columns added: `price_clean`, `bathrooms`,
      `beds_per_bedroom`, `accommodates_per_bedroom`, `room_type_encoded`,
      `distance_from_center` and, when `review_scores_rating` exists,
      `high_review_score` and `review_score_normalized`. Missing optional
      flag columns become 0, a missing `host_response_rate` becomes 0.0.
    """
    df = listings_df.copy()

    # Clean and convert price: strip "$" and thousands separators from text
    # prices; numeric prices are used as they are
    if pd.api.types.is_numeric_dtype(df["price"]):
        df["price_clean"] = df["price"]
    else:
        df["price_clean"] = (
            df["price"].str.replace(r"[\$,]", "", regex=True).astype(float)
        )

    # Property size features
    for size_column in ("accommodates", "bedrooms", "beds"):
        df[size_column] = pd.to_numeric(df[size_column], errors="coerce")

    # Bathrooms: first number found in the free-text column; fall back to an
    # existing `bathrooms` column, else NaN
    if "bathrooms_text" in df.columns:
        bathrooms_raw = df["bathrooms_text"].str.extract(
            r"(\d+\.?\d*)", expand=False
        )
    elif "bathrooms" in df.columns:
        bathrooms_raw = df["bathrooms"]
    else:
        bathrooms_raw = pd.Series(np.nan, index=df.index)
    df["bathrooms"] = pd.to_numeric(bathrooms_raw, errors="coerce")

    # Create size ratios (0 bedrooms -> NaN instead of a division by zero)
    bedrooms_nonzero = df["bedrooms"].replace(0, np.nan)
    df["beds_per_bedroom"] = df["beds"] / bedrooms_nonzero
    df["accommodates_per_bedroom"] = df["accommodates"] / bedrooms_nonzero

    # Property type categories; anything unmapped (or missing) becomes "other"
    property_type_mapping = {
        "Entire home/apt": "entire_home",
        "Private room": "private_room",
        "Shared room": "shared_room",
        "Hotel room": "hotel_room",
    }
    df["room_type_encoded"] = df["room_type"].map(property_type_mapping).fillna("other")

    # Review-based features
    review_columns = [col for col in df.columns if "review_scores" in col]
    for col in review_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Overall review quality (thresholds assume a 0-5 rating scale)
    if "review_scores_rating" in df.columns:
        df["high_review_score"] = (df["review_scores_rating"] >= 4.5).astype(int)
        df["review_score_normalized"] = df["review_scores_rating"] / 5.0

    # Host features
    df["host_is_superhost"] = _binary_flag(df, "host_is_superhost")
    if "host_response_rate" in df.columns:
        # "95%" -> 0.95; unparseable or missing values become NaN
        response_rate_text = df["host_response_rate"].str.rstrip("%")
        df["host_response_rate"] = (
            pd.to_numeric(response_rate_text, errors="coerce") / 100
        )
    else:
        df["host_response_rate"] = 0.0

    # Instant book feature
    df["instant_bookable"] = _binary_flag(df, "instant_bookable")

    # Location features
    df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
    df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")

    # Distance from London center (approximate): straight-line distance in
    # raw degrees, not kilometres. A degree of longitude is shorter than a
    # degree of latitude at this latitude, so treat it as a rough proxy only.
    london_center_lat, london_center_lon = 51.5074, -0.1278
    df["distance_from_center"] = np.sqrt(
        (df["latitude"] - london_center_lat) ** 2
        + (df["longitude"] - london_center_lon) ** 2
    )

    print(f"Engineered features for {len(df)} listings")
    return df


def _binary_flag(df, column):
    """Return `column` as 0/1 integers ('t' or True -> 1); all 0 if absent."""
    if column not in df.columns:
        return pd.Series(0, index=df.index, dtype=int)
    return df[column].isin(["t", True]).astype(int)


# Run the property-level feature engineering on the raw listings
listings_with_features = engineer_property_features(listings_df)

# Report every column that was not present in the raw listings table
print("\nNew features created:")
original_columns = set(listings_df.columns)
new_columns = [
    name for name in listings_with_features.columns if name not in original_columns
]
for name in new_columns:
    print(f"- {name}")

In [None]:
def engineer_location_features(df, reviews_df):
    """
    Engineer review-activity and neighborhood-level features

    Parameters:
    - df: Listings DataFrame. Must contain a listing key (`listing_id` if
      present, otherwise `id`). The neighborhood features are only added when
      a `neighbourhood_cleansed` column exists; they additionally read
      `price_clean`, `accommodates` and `review_scores_rating`.
    - reviews_df: Reviews DataFrame with `listing_id` and `date` columns
      (`date` is parsed with `pd.to_datetime`). When empty, no review
      features are added.

    Returns:
    - New DataFrame (the input is not modified) with `review_count`,
      `reviews_per_month`, the `neighborhood_*` aggregates,
      `price_vs_neighborhood` and `above_neighborhood_median`. Columns with
      these names already present in `df` are recomputed, so the function
      can safely be re-run on its own output.
    """
    # The reviews table keys on `listing_id`; the listings table may expose
    # the same key as `id` (the neighborhood count below also uses `id`).
    listing_key = "listing_id" if "listing_id" in df.columns else "id"

    # Reviews activity by listing
    if not reviews_df.empty:
        reviews = pd.DataFrame(
            {
                "listing_id": reviews_df["listing_id"],
                "date": pd.to_datetime(reviews_df["date"]),
            }
        )
        review_stats = (
            reviews.groupby("listing_id")
            .agg(
                review_count=("date", "count"),
                first_review=("date", "min"),
                last_review=("date", "max"),
            )
            .reset_index()
        )

        # Calculate review velocity (reviews per month; 30.44 = average
        # days per month)
        review_stats["review_period_days"] = (
            review_stats["last_review"] - review_stats["first_review"]
        ).dt.days
        # A zero-day period (single review, or all reviews on one day) has no
        # defined velocity. Map it to NaN so it becomes 0 below instead of
        # dividing by zero and producing inf.
        review_period_months = (
            review_stats["review_period_days"].replace(0, np.nan) / 30.44
        )
        review_stats["reviews_per_month"] = (
            review_stats["review_count"] / review_period_months
        ).fillna(0)

        # Merge with main dataframe on the listings' own key; drop stale
        # copies first so the merge does not create _x/_y suffixed columns
        review_features = review_stats[
            ["listing_id", "review_count", "reviews_per_month"]
        ].rename(columns={"listing_id": listing_key})
        df = df.drop(columns=["review_count", "reviews_per_month"], errors="ignore")
        df = df.merge(review_features, on=listing_key, how="left")

        # Listings without any review get zero activity
        df["review_count"] = df["review_count"].fillna(0)
        df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

    # Neighborhood-level aggregations
    if "neighbourhood_cleansed" in df.columns:
        count_column = "id" if "id" in df.columns else listing_key
        neighborhood_columns = [
            "neighborhood_price_mean",
            "neighborhood_price_median",
            "neighborhood_price_std",
            "neighborhood_accommodates_mean",
            "neighborhood_review_mean",
            "neighborhood_listing_count",
            "neighborhood_density",
        ]
        # Remove results of a previous run so the merge stays suffix-free
        df = df.drop(columns=neighborhood_columns, errors="ignore")

        neighborhood_stats = (
            df.groupby("neighbourhood_cleansed")
            .agg(
                neighborhood_price_mean=("price_clean", "mean"),
                neighborhood_price_median=("price_clean", "median"),
                neighborhood_price_std=("price_clean", "std"),
                neighborhood_accommodates_mean=("accommodates", "mean"),
                neighborhood_review_mean=("review_scores_rating", "mean"),
                # Number of listings in neighborhood
                neighborhood_listing_count=(count_column, "count"),
            )
            .reset_index()
        )

        # Neighborhood competition density: currently just the listing count
        # (not normalized by area)
        neighborhood_stats["neighborhood_density"] = neighborhood_stats[
            "neighborhood_listing_count"
        ]

        # Merge back to main dataframe
        df = df.merge(neighborhood_stats, on="neighbourhood_cleansed", how="left")

        # Relative pricing features
        df["price_vs_neighborhood"] = df["price_clean"] / df["neighborhood_price_mean"]
        df["above_neighborhood_median"] = (
            df["price_clean"] > df["neighborhood_price_median"]
        ).astype(int)

    print(f"Location features engineered for {len(df)} listings")
    return df


# Add review-activity and neighborhood context on top of the property features
listings_with_features = engineer_location_features(
    df=listings_with_features, reviews_df=reviews_df
)
print("\nLocation features added successfully")

## 4. Create Final Dataset for Modeling

In [None]:
# Merge listings with revenue metrics (inner join: only listings that have a
# revenue estimate are kept)
modeling_df = listings_with_features.merge(
    revenue_metrics, left_on="id", right_on="listing_id", how="inner"
)

print(f"Final modeling dataset: {modeling_df.shape}")
print(
    f"Target variable (annual_revenue_potential) - Mean: £{modeling_df['annual_revenue_potential'].mean():.2f}"
)
print(
    f"Target variable (annual_revenue_potential) - Median: £{modeling_df['annual_revenue_potential'].median():.2f}"
)

# Filter for high-confidence predictions only
CONFIDENCE_THRESHOLD = 0.3  # minimum confidence_score for a row to be kept
high_confidence_df = modeling_df[
    modeling_df["confidence_score"] >= CONFIDENCE_THRESHOLD
].copy()
# Guard the share against an empty merge result (would divide by zero)
high_confidence_share = (
    len(high_confidence_df) / len(modeling_df) * 100 if len(modeling_df) else 0.0
)
print(
    f"\nHigh-confidence subset: {high_confidence_df.shape} ({high_confidence_share:.1f}%)"
)

# Select features for modeling
feature_columns = [
    # Property characteristics
    "accommodates",
    "bedrooms",
    "beds",
    "bathrooms",
    "beds_per_bedroom",
    "accommodates_per_bedroom",
    # Location
    "latitude",
    "longitude",
    "distance_from_center",
    # Reviews and ratings
    "review_scores_rating",
    "review_score_normalized",
    "high_review_score",
    "review_count",
    "reviews_per_month",
    # Host features
    "host_is_superhost",
    "host_response_rate",
    "instant_bookable",
    # Neighborhood context
    "neighborhood_price_mean",
    "neighborhood_price_median",
    "neighborhood_listing_count",
    "neighborhood_review_mean",
    "price_vs_neighborhood",
    "above_neighborhood_median",
    # Revenue calculation metadata
    # NOTE(review): annual_revenue_potential is computed as
    # adr * occupancy_rate * 365, so keeping occupancy_rate and adr as model
    # inputs leaks the target - confirm whether they should be excluded
    # before training.
    "occupancy_rate",
    "adr",
    "confidence_score",
]

# Add categorical features (will be encoded later)
categorical_features = ["room_type_encoded", "neighbourhood_cleansed"]

# Create feature matrix: keep only the columns that actually exist, since
# some features are created conditionally upstream
available_features = [
    col for col in feature_columns if col in high_confidence_df.columns
]
available_categorical = [
    col for col in categorical_features if col in high_confidence_df.columns
]

print(f"\nAvailable numerical features: {len(available_features)}")
print(f"Available categorical features: {len(available_categorical)}")

# Check for missing values in key features (only columns with gaps are listed)
print("\nMissing values in key features:")
for col in available_features + available_categorical:
    missing_pct = high_confidence_df[col].isna().mean() * 100
    if missing_pct > 0:
        print(f"{col}: {missing_pct:.1f}%")

## 5. Save Processed Data for Modeling

In [None]:
import json

# Persist the high-confidence modeling table as a pickle
output_path = "../data/processed/etap2/modeling_dataset.pkl"
high_confidence_df.to_pickle(output_path)
print(f"Modeling dataset saved to: {output_path}")

# Describe the feature set next to the data so the modeling step can read
# the column lists instead of hard-coding them
feature_count = len(available_features) + len(available_categorical)
feature_config = {
    "numerical_features": available_features,
    "categorical_features": available_categorical,
    "target_variable": "annual_revenue_potential",
    "confidence_threshold": 0.3,
    "total_samples": len(high_confidence_df),
    "feature_count": feature_count,
}

config_path = "../data/processed/etap2/feature_config.json"
with open(config_path, "w") as config_file:
    config_file.write(json.dumps(feature_config, indent=2))
print(f"Feature configuration saved to: {config_path}")

# Final recap
print("\n=== Feature Engineering Complete ===")
print(f"Dataset ready for modeling: {high_confidence_df.shape}")
print(f"Target variable: {feature_config['target_variable']}")
print(f"Total features: {feature_config['feature_count']}")

## 6. Data Summary for Modeling

### Key Improvements over Etap 1:
1. **Robust Revenue Calculation**: Addresses short observation periods and inflated pricing
2. **Confidence Scoring**: Each revenue estimate has an associated confidence level
3. **Feature Engineering**: Property, location, and neighborhood context features
4. **Data Quality**: High-confidence subset ensures reliable model training

### Next Steps:
1. **Baseline Model**: Simple linear regression or decision tree
2. **Advanced Model**: Random Forest, Gradient Boosting, or Neural Network
3. **Model Comparison**: Performance metrics and business impact analysis
4. **Microservice Implementation**: API for real-time predictions
5. **A/B Testing Framework**: Compare model predictions vs current methods