# 02 — Feature Engineering

**Objective**: Transform session-level data into customer-level features for segmentation.

**Approach**:
- Aggregate behavioral metrics per customer
- Create booking propensity features
- Engineer discount sensitivity indicators
- Build customer lifetime value proxies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Read the cleaned session table written by the EDA notebook.
eda_latest = Path('../artifacts/eda/latest/data')
sessions_path = eda_latest / 'sessions_clean.parquet'
sessions_clean = pd.read_parquet(sessions_path)

# Quick sanity check: row count and number of distinct customers.
n_loaded_sessions = len(sessions_clean)
n_unique_users = sessions_clean['user_id'].nunique()
print(f"Loaded {n_loaded_sessions:,} clean sessions")
print(f"Unique users: {n_unique_users:,}")
sessions_clean.head()

## Behavioral Features

In [None]:
# Per-customer behavioral aggregates: one row per user_id.
behavioral_agg_spec = {
    'session_id': 'nunique',  # number of distinct sessions
    'session_duration_sec': ['mean', 'std', 'max'],
    'page_clicks': ['mean', 'std', 'max', 'sum'],
    'flight_booked': ['mean', 'sum'],
    'hotel_booked': ['mean', 'sum'],
    'cancellation': ['mean', 'sum'],
    'flight_discount': 'mean',
    'hotel_discount': 'mean'
}
behavioral_features = sessions_clean.groupby('user_id').agg(behavioral_agg_spec)
behavioral_features = behavioral_features.round(3)

# The aggregation yields (column, statistic) pairs; collapse them to
# single "column_statistic" names.
behavioral_features.columns = [
    '_'.join(name_parts).strip() for name_parts in behavioral_features.columns
]

# Give the count and the mean-of-flag columns more descriptive names.
behavioral_renames = {
    'session_id_nunique': 'n_sessions',
    'flight_booked_mean': 'flight_booking_rate',
    'hotel_booked_mean': 'hotel_booking_rate',
    'cancellation_mean': 'cancellation_rate',
    'flight_discount_mean': 'flight_discount_rate',
    'hotel_discount_mean': 'hotel_discount_rate'
}
behavioral_features = behavioral_features.rename(columns=behavioral_renames)

print(f"Behavioral features shape: {behavioral_features.shape}")
behavioral_features.head()

## Booking Value Features

In [None]:
# Per-customer booking aggregates. Every customer gets a row; statistics
# that come out as NaN are zero-filled further down.
value_stats = ['mean', 'sum', 'count']
quantity_stats = ['mean', 'sum']
booking_agg_spec = {
    'flight_discount_amount': value_stats,
    'hotel_discount_amount': value_stats,
    'base_fare_usd': value_stats,
    'hotel_per_room_usd': value_stats,
    'nights': quantity_stats,
    'rooms': quantity_stats,
    'seats': quantity_stats,
    'checked_bags': quantity_stats
}
booking_features = sessions_clean.groupby('user_id').agg(booking_agg_spec).round(2)

# Collapse the (column, statistic) pairs into "column_statistic" names.
booking_features.columns = [
    '_'.join(name_parts).strip() for name_parts in booking_features.columns
]

# Missing aggregates are treated as "nothing booked" and become 0.
booking_features = booking_features.fillna(0)

print(f"Booking features shape: {booking_features.shape}")
booking_features.head()

## Customer Lifetime Value Proxies

In [None]:
# Calculate CLV proxies (one row per customer, aligned with behavioral_features)
clv_features = pd.DataFrame(index=behavioral_features.index)

# Flight revenue proxy: sum of base fares over the customer's sessions.
clv_features['total_flight_value'] = booking_features['base_fare_usd_sum']

# Hotel revenue proxy: nightly price x nights has to be multiplied per session
# and only then summed per customer. Multiplying the customer-level sums
# (sum(price) * sum(nights)) would count every price against every stay and
# overstate the value for anyone with more than one hotel session.
# NOTE(review): `rooms` is not multiplied in, matching the original price x nights
# proxy; confirm whether hotel_per_room_usd should also be scaled by rooms.
hotel_value_per_session = sessions_clean['hotel_per_room_usd'] * sessions_clean['nights']
clv_features['total_hotel_value'] = (
    hotel_value_per_session
    .groupby(sessions_clean['user_id'])
    .sum()  # NaN products (sessions without hotel data) are skipped, giving 0
    .reindex(clv_features.index, fill_value=0)
    .round(2)
)
clv_features['total_booking_value'] = clv_features['total_flight_value'] + clv_features['total_hotel_value']

# Engagement metrics
clv_features['avg_session_value'] = clv_features['total_booking_value'] / behavioral_features['n_sessions']
clv_features['engagement_score'] = (
    behavioral_features['n_sessions'] *
    behavioral_features['page_clicks_mean'] *
    behavioral_features['session_duration_sec_mean'] / 3600  # Convert to hours
).round(2)

# Discount sensitivity: average of the flight and hotel discount rates
clv_features['discount_sensitivity'] = (
    behavioral_features['flight_discount_rate'] +
    behavioral_features['hotel_discount_rate']
) / 2

# Fill NaN/inf with 0
clv_features = clv_features.fillna(0).replace([np.inf, -np.inf], 0)

print(f"CLV features shape: {clv_features.shape}")
clv_features.head()

## Customer Segmentation Features

In [None]:
def _segment_by_thresholds(values, thresholds, labels=('Low', 'Medium', 'High', 'Very High')):
    """Assign each value to a right-closed band defined by ascending thresholds.

    With thresholds (t1, t2, t3) the bands are: value <= t1, t1 < value <= t2,
    t2 < value <= t3, and value > t3. Unlike ``pd.cut`` with a hard-coded
    lower edge of 0, this keeps values equal to 0 in the lowest band instead
    of turning them into NaN, and it does not raise when two thresholds are
    equal (the band between them is simply empty).

    Args:
        values: numeric pandas Series to segment.
        thresholds: ascending cut points; must contain ``len(labels) - 1`` items.
        labels: band names, ordered from lowest to highest.

    Returns:
        Ordered categorical Series on the same index as ``values``; missing
        inputs stay missing.
    """
    cut_points = np.asarray(thresholds, dtype=float)
    # side='left' returns the count of thresholds strictly below each value,
    # which is exactly the index of its right-closed band.
    codes = np.searchsorted(cut_points, values.to_numpy(dtype=float), side='left')
    # NaN sorts past every threshold; mark it as missing (-1) explicitly.
    codes[values.isna().to_numpy()] = -1
    return pd.Series(
        pd.Categorical.from_codes(codes, categories=list(labels), ordered=True),
        index=values.index,
    )


# Create customer type indicators
segmentation_features = pd.DataFrame(index=behavioral_features.index)

# Customer type based on booking behavior
segmentation_features['flight_only_customer'] = (
    (behavioral_features['flight_booking_rate'] > 0) &
    (behavioral_features['hotel_booking_rate'] == 0)
).astype(int)

segmentation_features['hotel_only_customer'] = (
    (behavioral_features['hotel_booking_rate'] > 0) &
    (behavioral_features['flight_booking_rate'] == 0)
).astype(int)

segmentation_features['full_trip_customer'] = (
    (behavioral_features['flight_booking_rate'] > 0) &
    (behavioral_features['hotel_booking_rate'] > 0)
).astype(int)

segmentation_features['browser_only'] = (
    (behavioral_features['flight_booking_rate'] == 0) &
    (behavioral_features['hotel_booking_rate'] == 0)
).astype(int)

# Frequency segments (quartile bands of session count)
session_quartiles = behavioral_features['n_sessions'].quantile([0.25, 0.5, 0.75])
segmentation_features['frequency_segment'] = _segment_by_thresholds(
    behavioral_features['n_sessions'], session_quartiles
)

# Value segments (quartile bands of total booking value). Customers with zero
# booking value land in 'Low' rather than being dropped to NaN.
value_quartiles = clv_features['total_booking_value'].quantile([0.25, 0.5, 0.75])
segmentation_features['value_segment'] = _segment_by_thresholds(
    clv_features['total_booking_value'], value_quartiles
)

print(f"Segmentation features shape: {segmentation_features.shape}")
segmentation_features.head()

## Combine All Features

In [None]:
# Demographics: first non-null value of each attribute per customer.
demographic_columns = ['gender', 'married', 'has_children', 'home_country', 'home_city']
user_demographics = sessions_clean.groupby('user_id')[demographic_columns].first()

# Place the four engineered feature tables side by side (they share the
# user_id index), then attach the demographic columns.
feature_tables = [
    behavioral_features,
    booking_features,
    clv_features,
    segmentation_features,
]
customer_features = pd.concat(feature_tables, axis=1)
customer_features = customer_features.join(user_demographics)

print(f"Final customer features shape: {customer_features.shape}")
print(f"Features: {list(customer_features.columns)}")

## Feature Analysis & Validation

In [None]:
# Pairwise correlations between all numeric customer features.
numeric_features = customer_features.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

# Heatmap of the lower triangle only; the upper triangle (and the diagonal)
# is masked out because it duplicates the lower half.
plt.figure(figsize=(15, 12))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=False, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Collect every distinct feature pair whose absolute correlation exceeds 0.8.
# triu_indices(k=1) walks the strict upper triangle row by row, so each
# unordered pair is visited exactly once.
high_corr = correlation_matrix.abs() > 0.8
feature_names = high_corr.columns
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
high_corr_pairs = [
    (feature_names[row], feature_names[col], correlation_matrix.iloc[row, col])
    for row, col in zip(upper_rows, upper_cols)
    if high_corr.iloc[row, col]
]

print("High correlation pairs (>0.8):")
for left_name, right_name, coefficient in high_corr_pairs:
    print(f"  {left_name} <-> {right_name}: {coefficient:.3f}")

In [None]:
# Histograms of the headline features, one panel each on a 2x3 grid.
key_features = [
    'n_sessions', 'flight_booking_rate', 'hotel_booking_rate',
    'total_booking_value', 'engagement_score', 'discount_sensitivity'
]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for panel, feature in zip(axes, key_features):
    # A feature missing from the numeric table leaves its panel empty.
    if feature not in numeric_features.columns:
        continue
    observed = numeric_features[feature].dropna()
    panel.hist(observed, bins=30, alpha=0.7)
    panel.set_title(f'{feature} Distribution')
    panel.set_xlabel(feature)

plt.tight_layout()
plt.show()

## Export Features

In [None]:
# Save customer features
output_dir = Path('../artifacts/features')
output_dir.mkdir(parents=True, exist_ok=True)

# Reset index to include user_id as column
customer_features_export = customer_features.reset_index()

# Save as parquet and CSV
customer_features_export.to_parquet(output_dir / 'customer_features.parquet', index=False)
customer_features_export.to_csv(output_dir / 'customer_features.csv', index=False)

# The status strings previously contained mojibake (UTF-8 emoji decoded with
# the wrong codec); they are restored to the intended characters here.
print(f"✅ Exported customer features to {output_dir}")
print(f"   Shape: {customer_features_export.shape}")
print(f"   Numeric features: {len(numeric_features.columns)}")
# Everything that is not numeric is reported as categorical.
print(f"   Categorical features: {len(customer_features.columns) - len(numeric_features.columns)}")

# Feature summary
print("\n📊 Feature Summary:")
print(customer_features_export.describe().round(2))