# 03 — Customer Segmentation

**Objective**: Cluster customers into interpretable segments based on behavioral features.

**Approach**:
- Feature selection and scaling
- Optimal cluster number determination
- K-means clustering with validation
- Segment profiling and interpretation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.manifold import TSNE

# Locate the customer feature table: use the outputs folder when the file is
# there, otherwise fall back to the features folder.
primary_path = Path('../artifacts/outputs/customer_features.parquet')
fallback_path = Path('../artifacts/features/customer_features.parquet')
features_path = primary_path if primary_path.exists() else fallback_path

# Read the table and report its dimensions (rows = customers, columns = features)
customer_features = pd.read_parquet(features_path)
n_customers, n_columns = customer_features.shape
print(f"Loaded {n_customers:,} customers with {n_columns} features")
customer_features.head()

## Feature Selection for Clustering

In [None]:
# Candidate behavioral features for clustering. Only those actually present
# in customer_features are used (see the filter below).
clustering_features = [
    'n_sessions',
    'avg_session_duration_sec',
    'avg_page_clicks',
    'p_flight_booked',
    'p_hotel_booked',
    'p_cancellation',
    'p_flight_discount',
    'p_hotel_discount',
    'avg_base_fare_usd',
    'avg_hotel_per_room_usd',
    'avg_nights',
    'avg_rooms',
]

# Filter to available features, and surface the ones that were requested but
# are missing so a silently reduced feature set does not go unnoticed.
available_features = [f for f in clustering_features if f in customer_features.columns]
skipped_features = [f for f in clustering_features if f not in customer_features.columns]
if not available_features:
    # Fail here with a clear message instead of an obscure error from the
    # scaler / KMeans on an empty feature matrix further down.
    raise ValueError(
        "None of the requested clustering features are present in customer_features"
    )

print(f"Using {len(available_features)} features for clustering:")
for f in available_features:
    print(f"  • {f}")
if skipped_features:
    print(f"Skipped {len(skipped_features)} unavailable features: {skipped_features}")

# Create feature matrix. Missing values are counted BEFORE imputation: counting
# after fillna(0) would always report zero and hide how much was imputed.
raw_feature_matrix = customer_features[available_features]
n_missing = int(raw_feature_matrix.isnull().sum().sum())
X = raw_feature_matrix.fillna(0)
print(f"\nFeature matrix shape: {X.shape}")
print(f"Missing values filled with 0: {n_missing:,}")

In [None]:
# Standardize the feature matrix with scikit-learn's StandardScaler
# (fit and transform in a single pass).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep a labelled DataFrame view of the scaled values, aligned to X's index
X_scaled_df = pd.DataFrame(X_scaled, index=X.index, columns=available_features)

# Sanity check: per-feature mean and standard deviation of the scaled matrix
scaled_means = X_scaled.mean(axis=0).round(3)
scaled_stds = X_scaled.std(axis=0).round(3)
print("Feature scaling completed")
print(f"Scaled features mean: {scaled_means}")
print(f"Scaled features std: {scaled_stds}")

## Optimal Number of Clusters

In [None]:
# Sweep candidate cluster counts and record three quality metrics for each k:
# inertia (elbow method), silhouette score and Calinski-Harabasz score.
k_range = range(2, 11)
inertias = []
silhouette_scores = []
calinski_scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)

    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, labels))
    calinski_scores.append(calinski_harabasz_score(X_scaled, labels))

# One panel per metric: (values, line format, title, y-axis label)
metric_panels = [
    (inertias, 'bo-', 'Elbow Method', 'Inertia'),
    (silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Score'),
    (calinski_scores, 'go-', 'Calinski-Harabasz Score', 'CH Score'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel_ax, (metric_values, line_format, panel_title, y_label) in zip(axes, metric_panels):
    panel_ax.plot(k_range, metric_values, line_format)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# Pick the k that maximizes each score (inertia has no maximum to pick; it is
# read visually from the elbow plot).
best_silhouette_pos = np.argmax(silhouette_scores)
best_calinski_pos = np.argmax(calinski_scores)
best_k_silhouette = k_range[best_silhouette_pos]
best_k_calinski = k_range[best_calinski_pos]

print(f"Best k by Silhouette Score: {best_k_silhouette} (score: {max(silhouette_scores):.3f})")
print(f"Best k by Calinski-Harabasz: {best_k_calinski} (score: {max(calinski_scores):.0f})")

## Final Clustering

In [None]:
# Number of segments for the final model. This is a fixed choice and is not
# derived from the scores computed in the sweep above.
optimal_k = 5  # Can be adjusted based on business requirements

# Fit the final K-means model on the scaled features
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=20)
cluster_labels = final_kmeans.fit_predict(X_scaled)

# Copy of the customer table with the assigned cluster in a 'segment' column
customer_segments = customer_features.assign(segment=cluster_labels)

# Quality of the final partition
final_silhouette = silhouette_score(X_scaled, cluster_labels)
final_calinski = calinski_harabasz_score(X_scaled, cluster_labels)
print(f"Clustering completed with k={optimal_k}")
print(f"Silhouette Score: {final_silhouette:.3f}")
print(f"Calinski-Harabasz Score: {final_calinski:.0f}")

# Customers per segment, ordered by segment id
segment_sizes = pd.Series(cluster_labels).value_counts().sort_index()
total_customers = len(customer_segments)
print("\nSegment sizes:")
for seg, size in segment_sizes.items():
    share_pct = size / total_customers * 100
    print(f"  Segment {seg}: {size:,} customers ({share_pct:.1f}%)")

## Segment Profiling

In [None]:
# Per-segment summary statistics (mean, median, std) for every clustering
# feature, rounded to three decimals.
profile_stats = ['mean', 'median', 'std']
features_by_segment = customer_segments.groupby('segment')[available_features]
segment_profiles = features_by_segment.agg(profile_stats).round(3)

# Collapse the (feature, statistic) column pairs into "feature_statistic" names
segment_profiles.columns = [
    '_'.join(name_parts).strip() for name_parts in segment_profiles.columns
]

# Mean-only view, with the "_mean" marker removed so columns carry the plain
# feature names again.
print("Segment Profiles (Mean Values):")
mean_cols = [col for col in segment_profiles.columns if col.endswith('_mean')]
display_profiles = segment_profiles[mean_cols].rename(
    columns=lambda col: col.replace('_mean', '')
)
print(display_profiles)

In [None]:
# Segment visualization - Feature heatmap
#
# The features live on very different scales (e.g. USD amounts vs. 0-1 rates),
# so coloring by raw means lets the largest-valued feature dominate the
# colormap. Colors are therefore driven by each feature's z-score across
# segments, while the cell annotations still show the raw mean values.
profile_means = display_profiles.astype(float)
# Guard against a zero (or undefined) spread so constant features render as
# neutral cells instead of producing a division-by-zero / NaN row.
feature_spread = profile_means.std(axis=0).replace(0, 1.0).fillna(1.0)
profile_zscores = (profile_means - profile_means.mean(axis=0)) / feature_spread

plt.figure(figsize=(14, 8))
sns.heatmap(
    profile_zscores.T,
    annot=profile_means.T.to_numpy(),  # annotate with raw means, not z-scores
    cmap='viridis',
    fmt='.2f',
    cbar_kws={'label': 'Relative level across segments (z-score)'},
)
plt.title('Segment Feature Profiles (Mean Values)')
plt.xlabel('Segment')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

In [None]:
# PCA visualization: project the scaled features onto the first two principal
# components and color each point by its assigned segment.
# random_state is fixed (as for the KMeans models in this notebook) so the
# projection is reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(12, 8))
# Ten colors cover every k explored in the sweep (2-10); the modulo below
# keeps the loop from raising IndexError if optimal_k is set even higher.
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink',
          'gray', 'olive', 'cyan']
for i in range(optimal_k):
    mask = cluster_labels == i
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1],
                c=colors[i % len(colors)], label=f'Segment {i}', alpha=0.6)

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Customer Segments in PCA Space')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Share of total variance retained by the two plotted components
print(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.1%}")

## Segment Interpretation

In [None]:
# Segment interpretation: assign each segment a human-readable name and
# description by applying simple threshold rules to its mean feature profile.
segment_names = {}
segment_descriptions = {}

# Median of the per-segment mean session counts, computed once outside the loop.
# Falls back to 0 when 'n_sessions' was not among the available features, in
# line with the profile.get(..., 0) defaults used for every other lookup
# (a direct column access would raise KeyError in that case).
if 'n_sessions' in display_profiles.columns:
    sessions_median = display_profiles['n_sessions'].median()
else:
    sessions_median = 0

for seg in range(optimal_k):
    profile = display_profiles.loc[seg]

    # Analyze key characteristics
    high_sessions = profile.get('n_sessions', 0) > sessions_median
    high_flight_booking = profile.get('p_flight_booked', 0) > 0.1
    high_hotel_booking = profile.get('p_hotel_booked', 0) > 0.1
    # NOTE: the two flags below are computed but not used by the naming rules.
    high_cancellation = profile.get('p_cancellation', 0) > 0.1
    high_discount_usage = (profile.get('p_flight_discount', 0) + profile.get('p_hotel_discount', 0)) > 0.2

    # Generate segment names and descriptions. Branches are checked in order,
    # so each later branch implies the earlier conditions were false.
    if high_flight_booking and high_hotel_booking:
        if high_sessions:
            name = "Frequent Full-Trip Bookers"
            desc = "High-engagement customers who book both flights and hotels regularly"
        else:
            name = "Occasional Full-Trip Bookers"
            desc = "Customers who book complete trips but less frequently"
    elif high_flight_booking:
        name = "Flight-Focused Travelers"
        desc = "Primarily book flights, minimal hotel bookings"
    elif high_hotel_booking:
        name = "Hotel-Focused Travelers"
        desc = "Primarily book hotels, minimal flight bookings"
    elif high_sessions:
        name = "High-Intent Browsers"
        desc = "Frequent visitors with low conversion rates"
    else:
        name = "Low-Engagement Browsers"
        desc = "Infrequent visitors with minimal booking activity"

    segment_names[seg] = name
    segment_descriptions[seg] = desc

# Display segment interpretations
print("=== SEGMENT INTERPRETATIONS ===")
for seg in range(optimal_k):
    size = segment_sizes[seg]
    pct = size / len(customer_segments) * 100
    print(f"\n🎯 Segment {seg}: {segment_names[seg]}")
    print(f"   Size: {size:,} customers ({pct:.1f}%)")
    print(f"   Description: {segment_descriptions[seg]}")

    # Key metrics (missing features are reported as 0)
    profile = display_profiles.loc[seg]
    print("   Key metrics:")
    print(f"     • Avg sessions: {profile.get('n_sessions', 0):.1f}")
    print(f"     • Flight booking rate: {profile.get('p_flight_booked', 0):.1%}")
    print(f"     • Hotel booking rate: {profile.get('p_hotel_booked', 0):.1%}")
    print(f"     • Cancellation rate: {profile.get('p_cancellation', 0):.1%}")

## Export Segmentation Results

In [None]:
# Export the segmentation results (assignments, per-segment summary and mean
# profiles) under the outputs/segments folder, creating it if necessary.
output_dir = Path('../artifacts/outputs/segments')
output_dir.mkdir(parents=True, exist_ok=True)

# Export segment assignments (one row per customer) in both Parquet and CSV
segment_assignments = customer_segments[['user_id', 'segment']].copy()
segment_assignments.to_parquet(output_dir / 'segment_assignments.parquet', index=False)
segment_assignments.to_csv(output_dir / 'segment_assignments.csv', index=False)

# Export segment profiles: one row per segment with its name, description,
# size and share of all customers.
total_customers = len(customer_segments)
segment_ids = range(optimal_k)
segment_summary = pd.DataFrame({
    'segment': segment_ids,
    'segment_name': [segment_names[i] for i in segment_ids],
    'description': [segment_descriptions[i] for i in segment_ids],
    'size': [segment_sizes[i] for i in segment_ids],
    'percentage': [segment_sizes[i] / total_customers * 100 for i in segment_ids]
})

# Add key metrics to summary (only those that were available for clustering).
# Values are assigned positionally from display_profiles, which is indexed by
# segment id in ascending order.
for feature in ['n_sessions', 'p_flight_booked', 'p_hotel_booked', 'p_cancellation']:
    if feature in display_profiles.columns:
        segment_summary[f'avg_{feature}'] = display_profiles[feature].values

segment_summary.to_parquet(output_dir / 'segment_summary.parquet', index=False)
segment_summary.to_csv(output_dir / 'segment_summary.csv', index=False)

# Export detailed profiles
# NOTE(review): this writes display_profiles (per-segment means only); confirm
# whether the median/std columns of segment_profiles should be exported too.
display_profiles.to_csv(output_dir / 'segment_profiles_detailed.csv')

print(f"✅ Segmentation results exported to {output_dir}")
print(f"   • segment_assignments.parquet: {len(segment_assignments):,} customers")
print(f"   • segment_summary.parquet: {len(segment_summary)} segments")
print("   • segment_profiles_detailed.csv: Full feature profiles")

# Final summary
print("\n📊 Segmentation Summary:")
print(f"   • {optimal_k} customer segments identified")
print(f"   • {len(available_features)} features used for clustering")
print(f"   • Silhouette score: {silhouette_score(X_scaled, cluster_labels):.3f}")
# The values printed are the smallest and largest segment sizes, so the line
# is labelled as a size range (it was previously mislabelled "Most balanced segment").
print(f"   • Segment size range: {segment_sizes.min():,} - {segment_sizes.max():,} customers")