# Feature Engineering for Customer Retention

This notebook creates features for predicting customer repurchase probability:
- Customer behavioral features (RFM analysis)
- Purchase history aggregations
- Product preference features
- Seasonality features
- Target variable creation (will customer repurchase?)

Processed features are saved to the processed data layer.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# which also hides deprecation notices from the libraries imported above —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's 'default' style sheet with seaborn's 'husl' palette
plt.style.use('default')
sns.set_palette('husl')

## Load Clean Data

In [None]:
# Read the cleaned customer, product and transaction tables from the staging layer
stage_data_path = '../data/stage/'

customers_df = pd.read_csv(f'{stage_data_path}/cleaned_customers.csv')
products_df = pd.read_csv(f'{stage_data_path}/cleaned_products.csv')
transactions_df = pd.read_csv(f'{stage_data_path}/cleaned_transactions.csv')

# The CSVs carry dates as text; parse each table's date column in place
for _frame, _date_column in (
    (customers_df, 'registration_date'),
    (products_df, 'launch_date'),
    (transactions_df, 'transaction_date'),
):
    _frame[_date_column] = pd.to_datetime(_frame[_date_column])

# Report how many rows each table contributed
print("=== CLEAN DATA LOADED ===")
print(f"Customers: {len(customers_df):,} records")
print(f"Products: {len(products_df):,} records")
print(f"Transactions: {len(transactions_df):,} records")

# The most recent transaction date is the reference point ("today") for every
# recency-style feature computed later in the notebook
analysis_date = transactions_df['transaction_date'].max()
print(f"\nAnalysis date: {analysis_date.date()}")

## RFM Analysis (Recency, Frequency, Monetary)

In [None]:
# Calculate RFM metrics for each customer
def _quantile_score(values, labels):
    """Assign each value one of ``labels`` by equal-sized quantile bucket.

    The lowest values receive ``labels[0]`` and the highest ``labels[-1]``.
    Values are ranked with ``method='first'`` before binning, so ties are split
    by order of appearance instead of producing duplicate bin edges (which makes
    ``pd.qcut`` raise ``ValueError``).

    Args:
        values: numeric Series to score.
        labels: integer labels, one per bucket, ordered from lowest bucket up.

    Returns:
        Integer Series of scores sharing the index of ``values``.
    """
    if len(values) < 2:
        # No quantile split exists for zero or one value: fall back to the middle label
        return pd.Series(labels[len(labels) // 2], index=values.index, dtype=int)
    ranked = values.rank(method='first')
    return pd.qcut(ranked, len(labels), labels=labels).astype(int)


def calculate_rfm(transactions_df, analysis_date):
    """Compute Recency / Frequency / Monetary metrics and 1-5 scores per customer.

    Only rows whose ``order_status`` is ``'completed'`` are considered.

    Args:
        transactions_df: transactions with ``customer_id``, ``transaction_id``,
            ``transaction_date`` (datetime), ``total_amount`` and ``order_status``.
        analysis_date: timestamp that recency is measured against.

    Returns:
        DataFrame with one row per customer and the columns ``customer_id``,
        ``recency_days``, ``frequency``, ``monetary_value``, ``recency_score``,
        ``frequency_score``, ``monetary_score`` and ``rfm_score`` (the three
        scores concatenated as a three-character string).
    """
    completed_transactions = transactions_df[transactions_df['order_status'] == 'completed']

    # Aggregate with built-in reducers, then derive recency in one vectorised step
    rfm = completed_transactions.groupby('customer_id').agg({
        'transaction_date': 'max',   # last purchase -> recency
        'transaction_id': 'count',   # frequency
        'total_amount': 'sum'        # monetary
    })
    rfm.columns = ['recency_days', 'frequency', 'monetary_value']
    rfm['recency_days'] = (analysis_date - rfm['recency_days']).dt.days
    rfm['monetary_value'] = rfm['monetary_value'].round(2)

    # Scores on a 1-5 scale. Recency is inverted: fewer days since the last
    # purchase earns the higher score.
    rfm['recency_score'] = _quantile_score(rfm['recency_days'], [5, 4, 3, 2, 1])
    rfm['frequency_score'] = _quantile_score(rfm['frequency'], [1, 2, 3, 4, 5])
    rfm['monetary_score'] = _quantile_score(rfm['monetary_value'], [1, 2, 3, 4, 5])

    # Combined segment code, e.g. '545'
    rfm['rfm_score'] = (rfm['recency_score'].astype(str) +
                        rfm['frequency_score'].astype(str) +
                        rfm['monetary_score'].astype(str))

    return rfm.reset_index()

# Build the per-customer RFM table, measuring recency against analysis_date
rfm_df = calculate_rfm(transactions_df, analysis_date)
print(f"RFM analysis completed for {len(rfm_df):,} customers")

# Display RFM distribution
# Each describe() result is a multi-line summary embedded in the printed text
print("\n=== RFM SCORE DISTRIBUTION ===")
print(f"Recency Score: {rfm_df['recency_score'].describe()}")
print(f"Frequency Score: {rfm_df['frequency_score'].describe()}")
print(f"Monetary Score: {rfm_df['monetary_score'].describe()}")

# Trailing expression: the notebook renders the first rows as the cell output
rfm_df.head()

## Customer Behavioral Features

In [None]:
# Calculate additional behavioral features
def create_behavioral_features(transactions_df, customers_df, analysis_date):
    """Aggregate each customer's completed transactions into behavioural features.

    Args:
        transactions_df: transactions with ``customer_id``, ``order_status``,
            ``transaction_date``, ``total_amount``, ``quantity``,
            ``discount_amount`` and ``shipping_cost``.
        customers_df: customers with ``customer_id`` and ``registration_date``.
        analysis_date: timestamp used to measure days since the last purchase.

    Returns:
        DataFrame with one row per customer holding sum/mean (and std/count for
        ``total_amount``) aggregates rounded to 2 decimals, plus
        ``customer_lifetime_days`` (span between first and last purchase,
        inclusive), ``days_reg_to_first_purchase``, ``purchase_frequency``
        (purchases per lifetime day) and ``days_since_last_purchase``.
    """
    is_completed = transactions_df['order_status'] == 'completed'
    completed = transactions_df[is_completed].copy()

    aggregations = {
        'total_amount': ['sum', 'mean', 'std', 'count'],
        'quantity': ['sum', 'mean'],
        'discount_amount': ['sum', 'mean'],
        'shipping_cost': ['sum', 'mean'],
        'transaction_date': ['min', 'max'],
    }
    features = completed.groupby('customer_id').agg(aggregations).round(2)

    # Collapse the (column, statistic) MultiIndex into names such as 'quantity_sum'
    features.columns = [f'{column}_{stat}'.strip() for column, stat in features.columns]
    features = features.reset_index()

    first_purchase = features['transaction_date_min']
    last_purchase = features['transaction_date_max']

    # Inclusive span between first and last purchase (a single purchase counts as 1 day)
    features['customer_lifetime_days'] = (last_purchase - first_purchase).dt.days + 1

    # Gap between signing up and buying for the first time
    registration_lookup = customers_df.set_index('customer_id')['registration_date'].to_dict()
    registered_on = pd.to_datetime(features['customer_id'].map(registration_lookup))
    features['days_reg_to_first_purchase'] = (first_purchase - registered_on).dt.days

    # Purchases per day of lifetime
    purchases_per_day = features['total_amount_count'] / features['customer_lifetime_days']
    features['purchase_frequency'] = purchases_per_day.fillna(0).round(4)

    # A standard deviation is undefined for one-purchase customers; report 0 instead
    features['total_amount_std'] = features['total_amount_std'].fillna(0)

    features['days_since_last_purchase'] = (analysis_date - last_purchase).dt.days

    # The raw first/last purchase dates were only needed for the derived features
    return features.drop(columns=['transaction_date_min', 'transaction_date_max'])

# Build the behavioural feature table from completed transactions
behavioral_features = create_behavioral_features(transactions_df, customers_df, analysis_date)
print(f"Behavioral features created for {len(behavioral_features):,} customers")

# Trailing expression: the notebook renders the first rows as the cell output
behavioral_features.head()

## Product Preference Features

In [None]:
# Create product preference features
def create_product_features(transactions_df, products_df):
    """Summarise which product categories each customer buys from.

    Args:
        transactions_df: transactions with ``customer_id``, ``product_id`` and
            ``order_status``.
        products_df: products with ``product_id``, ``category``, ``price`` and
            ``rating``.

    Returns:
        DataFrame with one row per customer containing, for every category seen
        in completed transactions, a ``purchases_<category>`` count and a
        ``pct_purchases_<category>`` share (0-100, 2 decimals), plus
        ``total_purchases``, ``preferred_category`` (lower-cased name of the
        category with the largest share), ``avg_product_price`` and
        ``avg_product_rating``.
    """
    # Attach product attributes; transactions without a matching product are dropped
    product_info = products_df[['product_id', 'category', 'price', 'rating']]
    enriched = transactions_df.merge(product_info, on='product_id')
    completed = enriched[enriched['order_status'] == 'completed'].copy()

    # One column of purchase counts per category
    counts = completed.groupby(['customer_id', 'category']).size().unstack(fill_value=0)
    counts.columns = ['purchases_' + name.lower() for name in counts.columns]
    purchase_cols = list(counts.columns)

    counts['total_purchases'] = counts.sum(axis=1)

    # Share of each category in the customer's purchases, as a percentage
    for name in purchase_cols:
        share = counts[name] / counts['total_purchases'] * 100
        counts[f'pct_{name}'] = share.round(2)

    # Category with the largest share; strip the column prefix to recover its name
    share_cols = [f'pct_{name}' for name in purchase_cols]
    top_share = counts[share_cols].idxmax(axis=1)
    counts['preferred_category'] = top_share.str.replace('pct_purchases_', '')

    # Mean list price and rating of the products the customer bought
    averages = completed.groupby('customer_id')[['price', 'rating']].mean().round(2)
    averages.columns = ['avg_product_price', 'avg_product_rating']

    return counts.join(averages).reset_index()

# Build the per-customer category-preference table
product_features = create_product_features(transactions_df, products_df)
print(f"Product preference features created for {len(product_features):,} customers")

# Trailing expression: the notebook renders the first rows as the cell output
product_features.head()

## Seasonality and Time-based Features

In [None]:
# Create seasonality features
def create_seasonality_features(transactions_df):
    """Derive each customer's habitual purchase timing from completed transactions.

    Args:
        transactions_df: transactions with ``customer_id``, ``order_status`` and
            a datetime ``transaction_date``.

    Returns:
        DataFrame with one row per customer and the columns ``customer_id``,
        ``preferred_month``, ``preferred_day_of_week`` (0 = Monday),
        ``preferred_quarter``, ``weekend_purchase_rate`` (fraction between 0 and
        1, rounded to 2 decimals) and ``preferred_season``. "Preferred" means the
        most frequent value; ties resolve to the first value ``mode()`` returns.
    """
    season_months = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )

    def season_for(month):
        # Anything not listed above (September-November) is reported as fall
        return next((label for label, months in season_months if month in months), 'fall')

    def most_frequent(fallback):
        # Build a reducer returning the modal value, or `fallback` for an empty group
        def pick(values):
            if values.empty:
                return fallback
            return values.mode().iloc[0]
        return pick

    completed = transactions_df[transactions_df['order_status'] == 'completed'].copy()

    # Calendar components of every purchase
    purchased_at = completed['transaction_date']
    completed['month'] = purchased_at.dt.month
    completed['day_of_week'] = purchased_at.dt.dayofweek
    completed['quarter'] = purchased_at.dt.quarter
    completed['is_weekend'] = completed['day_of_week'].isin([5, 6]).astype(int)
    completed['season'] = completed['month'].apply(season_for)

    reducers = {
        'month': most_frequent(1),
        'day_of_week': most_frequent(1),
        'quarter': most_frequent(1),
        'is_weekend': 'mean',
        'season': most_frequent('spring'),
    }
    per_customer = completed.groupby('customer_id').agg(reducers).round(2)

    per_customer.columns = [
        'preferred_month',
        'preferred_day_of_week',
        'preferred_quarter',
        'weekend_purchase_rate',
        'preferred_season',
    ]

    return per_customer.reset_index()

# Build the per-customer purchase-timing table
seasonality_features = create_seasonality_features(transactions_df)
print(f"Seasonality features created for {len(seasonality_features):,} customers")

# Trailing expression: the notebook renders the first rows as the cell output
seasonality_features.head()

## Create Target Variable (Repurchase Probability)

In [None]:
# Create target variable for repurchase prediction
def create_target_variable(transactions_df, analysis_date, prediction_window_days=90):
    """
    Simulate a binary "will the customer repurchase?" label for every customer.

    No future transactions are consulted: each customer is placed in a
    probability tier by rules on recency, purchase count and average order
    value, and the label is then drawn at random with that tier's probability.
    NumPy's global random generator is re-seeded with 42 on every call, so the
    result is reproducible for identical input.

    NOTE(review): the label derives from the same purchase history the feature
    tables are built from, so features such as recency are correlated with it
    by construction.

    Args:
        transactions_df: transactions with ``customer_id``, ``order_status``,
            ``transaction_date`` (datetime) and ``total_amount``.
        analysis_date: timestamp that recency is measured against.
        prediction_window_days: accepted for interface compatibility; the
            simulation does not read it.

    Returns:
        DataFrame with the columns ``customer_id`` and ``will_repurchase`` (0/1).
    """
    # [P(no repurchase), P(repurchase)] for each tier
    high_odds = [0.2, 0.8]       # 80% chance
    medium_odds = [0.5, 0.5]     # 50% chance
    low_odds = [0.7, 0.3]        # 30% chance
    very_low_odds = [0.9, 0.1]   # 10% chance

    def draw_label(row):
        recency = row['days_since_last']
        orders = row['purchase_count']
        basket = row['avg_order_value']

        is_high = (
            (recency <= 30 and orders >= 5)
            or (basket >= 100 and recency <= 60)
            or (orders >= 10 and recency <= 90)
        )
        is_medium = (
            (recency <= 60 and orders >= 3)
            or (basket >= 50 and recency <= 120)
        )
        is_low = recency <= 180 and orders >= 2

        if is_high:
            odds = high_odds
        elif is_medium:
            odds = medium_odds
        elif is_low:
            odds = low_odds
        else:
            odds = very_low_odds
        return np.random.choice([0, 1], p=odds)

    completed = transactions_df[transactions_df['order_status'] == 'completed'].copy()

    # Per-customer inputs to the tier rules
    customer_metrics = completed.groupby('customer_id').agg({
        'transaction_date': ['min', 'max', 'count'],
        'total_amount': ['sum', 'mean'],
    })
    customer_metrics.columns = [
        'first_purchase', 'last_purchase', 'purchase_count', 'total_spent', 'avg_order_value',
    ]
    customer_metrics = customer_metrics.reset_index()

    last_purchase = customer_metrics['last_purchase']
    customer_metrics['days_since_last'] = (analysis_date - last_purchase).dt.days
    customer_metrics['customer_lifetime'] = (
        last_purchase - customer_metrics['first_purchase']
    ).dt.days + 1

    # Kept alongside the other metrics; the tier rules do not read it
    customer_metrics['avg_days_between_purchases'] = (
        customer_metrics['customer_lifetime'] / customer_metrics['purchase_count']
    ).fillna(0)

    # Fixed seed so the row-by-row draws are reproducible
    np.random.seed(42)
    customer_metrics['will_repurchase'] = customer_metrics.apply(draw_label, axis=1)

    return customer_metrics[['customer_id', 'will_repurchase']]

# Simulate the repurchase label for every customer with completed transactions
target_df = create_target_variable(transactions_df, analysis_date)
print(f"Target variable created for {len(target_df):,} customers")

# Display target distribution
# reindex guarantees both classes are present (count 0 when a class never occurs),
# so the label lookups below cannot raise KeyError
target_dist = target_df['will_repurchase'].value_counts().reindex([0, 1], fill_value=0)
# Avoid a division by zero when no customer qualified at all
target_total = max(len(target_df), 1)
print(f"\n=== TARGET DISTRIBUTION ===")
print(f"Will NOT repurchase (0): {target_dist[0]:,} ({target_dist[0]/target_total*100:.1f}%)")
print(f"Will repurchase (1): {target_dist[1]:,} ({target_dist[1]/target_total*100:.1f}%)")

# Trailing expression: the notebook renders the first rows as the cell output
target_df.head()

## Merge All Features

In [None]:
# Merge all feature sets
print("=== MERGING ALL FEATURES ===")

# Start with customer basic info
customer_basic = customers_df[['customer_id', 'age', 'gender', 'preferred_category']].copy()

# Feature tables to attach, in order, with a display name for the progress log
feature_sets = [
    (rfm_df, 'RFM'),
    (behavioral_features, 'Behavioral'),
    (product_features, 'Product Preferences'),
    (seasonality_features, 'Seasonality'),
    (target_df, 'Target')
]

final_dataset = customer_basic.copy()

for df, name in feature_sets:
    initial_count = len(final_dataset)

    # A column present on both sides (e.g. 'preferred_category' from the customer
    # profile and from the product features) would be silently renamed by pandas
    # to '<col>_x' / '<col>_y', which hides the original column from later cells
    # that look it up by name. Rename the incoming copy explicitly instead.
    overlapping = [
        col for col in df.columns
        if col != 'customer_id' and col in final_dataset.columns
    ]
    if overlapping:
        suffix = name.lower().replace(' ', '_')
        df = df.rename(columns={col: f'{col}_{suffix}' for col in overlapping})
        print(f"Renamed overlapping columns from {name}: {overlapping} (suffix '_{suffix}')")

    # Inner join: customers missing from any feature table are dropped
    final_dataset = final_dataset.merge(df, on='customer_id', how='inner')
    print(f"After merging {name}: {len(final_dataset):,} customers (lost {initial_count - len(final_dataset)})")

print(f"\n✅ Final dataset created with {len(final_dataset):,} customers and {len(final_dataset.columns)} features")

# Display feature summary
print(f"\n=== FEATURE SUMMARY ===")
print(f"Total features: {len(final_dataset.columns)}")
print(f"Features: {list(final_dataset.columns)}")

# Trailing expression: the notebook renders the first rows as the cell output
final_dataset.head()

## Feature Engineering - Categorical Encoding

In [None]:
# Encode categorical variables
final_dataset_encoded = final_dataset.copy()

# Label encode categorical variables
# NOTE(review): each column below is both label-encoded and one-hot encoded, so
# the two representations are redundant; the integer codes also impose an
# arbitrary order on nominal categories.
categorical_cols = ['gender', 'preferred_category', 'preferred_season']
label_encoders = {}

for col in categorical_cols:
    if col in final_dataset_encoded.columns:
        le = LabelEncoder()
        # astype(str) turns missing values into the literal category 'nan'
        final_dataset_encoded[f'{col}_encoded'] = le.fit_transform(final_dataset_encoded[col].astype(str))
        label_encoders[col] = le
        print(f"Encoded {col}: {dict(zip(le.classes_, le.transform(le.classes_)))}")

# One-hot encode some categorical variables for better model performance.
# dtype=int is requested explicitly: recent pandas versions return boolean
# indicator columns by default, and the numeric-only column selection used for
# the correlation analysis would then leave every one-hot column out.
# Gender
if 'gender' in final_dataset_encoded.columns:
    gender_dummies = pd.get_dummies(final_dataset_encoded['gender'], prefix='gender', dtype=int)
    final_dataset_encoded = pd.concat([final_dataset_encoded, gender_dummies], axis=1)

# Preferred category
if 'preferred_category' in final_dataset_encoded.columns:
    category_dummies = pd.get_dummies(final_dataset_encoded['preferred_category'], prefix='prefers', dtype=int)
    final_dataset_encoded = pd.concat([final_dataset_encoded, category_dummies], axis=1)

# Preferred season
if 'preferred_season' in final_dataset_encoded.columns:
    season_dummies = pd.get_dummies(final_dataset_encoded['preferred_season'], prefix='season', dtype=int)
    final_dataset_encoded = pd.concat([final_dataset_encoded, season_dummies], axis=1)

print(f"\n✅ Categorical encoding completed. Dataset now has {len(final_dataset_encoded.columns)} features")

## Feature Correlation Analysis

In [None]:
# Analyze feature correlations with target variable
# Only columns with a numeric dtype take part in the correlation
numeric_cols = final_dataset_encoded.select_dtypes(include=[np.number]).columns
# Pairwise correlation matrix -> keep the target's column -> order by absolute strength
correlation_with_target = final_dataset_encoded[numeric_cols].corr()['will_repurchase'].sort_values(key=abs, ascending=False)

# NOTE(review): this listing still contains the target's correlation with itself;
# it is only removed for the chart below.
print("=== TOP FEATURES CORRELATED WITH REPURCHASE ===")
print(correlation_with_target.head(15))

# Visualize top correlations
plt.figure(figsize=(10, 8))
# Drop the target's own entry before taking the 15 strongest features
top_features = correlation_with_target.drop('will_repurchase').head(15)
# Green bars for positive correlations, red for zero or negative
colors = ['green' if x > 0 else 'red' for x in top_features.values]
plt.barh(range(len(top_features)), top_features.values, color=colors, alpha=0.7)
plt.yticks(range(len(top_features)), top_features.index)
plt.xlabel('Correlation with Repurchase Target')
plt.title('Top 15 Features Correlated with Customer Repurchase')
# Vertical reference line at zero correlation
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.tight_layout()
plt.show()

## Prepare Final Training Dataset

In [None]:
# Prepare final training dataset
# Remove non-numeric and redundant columns
columns_to_drop = [
    'customer_id',  # Identifier, not a feature
    'gender', 'preferred_category', 'preferred_season',  # Original categorical (we have encoded versions)
    'rfm_score'  # String combination of scores (we have individual scores)
]

# Drop columns that exist in the dataset
columns_to_drop = [col for col in columns_to_drop if col in final_dataset_encoded.columns]
training_data = final_dataset_encoded.drop(columns=columns_to_drop)

# Any column that is still neither numeric nor boolean (for example a text column
# that reached the dataset under an unexpected name) cannot be used as a model
# input and would make the mean-imputation below fail, so drop it and say so.
non_numeric_cols = training_data.select_dtypes(exclude=[np.number, 'bool']).columns.tolist()
if non_numeric_cols:
    print(f"⚠️ Dropping remaining non-numeric columns: {non_numeric_cols}")
    training_data = training_data.drop(columns=non_numeric_cols)

# Separate features and target
X = training_data.drop('will_repurchase', axis=1)
y = training_data['will_repurchase']

print(f"=== FINAL TRAINING DATASET ===")
print(f"Samples: {len(X):,}")
print(f"Features: {len(X.columns)}")
print(f"Target distribution: {y.value_counts().to_dict()}")

print(f"\nFeature list: {list(X.columns)}")

# Check for any remaining missing values
missing_values = X.isnull().sum()
if missing_values.sum() > 0:
    print(f"\n⚠️ Missing values found:")
    print(missing_values[missing_values > 0])
    # Fill missing values with each column's mean
    # NOTE(review): only X is imputed; training_data keeps its missing values.
    X = X.fillna(X.mean())
    print("✅ Missing values filled with mean")
else:
    print("\n✅ No missing values found")

# Display final dataset info
print(f"\nFinal feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

## Save Processed Data

In [None]:
import json
import pickle

# Create output directory if it doesn't exist
processed_data_path = '../data/processed/'
os.makedirs(processed_data_path, exist_ok=True)

# Save feature engineered dataset
final_dataset_encoded.to_csv(f'{processed_data_path}/customer_features.csv', index=False)

# Save training data
training_data.to_csv(f'{processed_data_path}/training_data.csv', index=False)

# Save feature matrix and target separately for ML
X.to_csv(f'{processed_data_path}/X_features.csv', index=False)
y.to_csv(f'{processed_data_path}/y_target.csv', index=False)

# Save label encoders for future use
# NOTE(review): pickle files must only ever be loaded from a trusted location.
with open(f'{processed_data_path}/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

# Save feature engineering summary
# The json module only serialises built-in Python types; the encoder codes (and
# possibly the value counts) are NumPy scalars, so convert them explicitly —
# otherwise json.dump raises TypeError.
feature_summary = {
    'created_date': datetime.now().isoformat(),
    'total_customers': len(final_dataset_encoded),
    'total_features': len(X.columns),
    'target_distribution': {int(label): int(count) for label, count in y.value_counts().items()},
    'feature_list': [str(col) for col in X.columns],
    'categorical_encodings': {
        col: {str(cls): int(code) for cls, code in zip(le.classes_, le.transform(le.classes_))}
        for col, le in label_encoders.items()
    }
}

with open(f'{processed_data_path}/feature_engineering_summary.json', 'w') as f:
    json.dump(feature_summary, f, indent=2)

print("✅ Processed data successfully saved to processed data layer:")
print(f"   - customer_features.csv: {len(final_dataset_encoded):,} records, {len(final_dataset_encoded.columns)} features")
print(f"   - training_data.csv: {len(training_data):,} records")
print(f"   - X_features.csv: {X.shape[0]:,} samples, {X.shape[1]} features")
print(f"   - y_target.csv: {len(y):,} samples")
print(f"   - label_encoders.pkl: {len(label_encoders)} encoders")
print(f"   - feature_engineering_summary.json: metadata")

print("\n🎯 Data is now ready for machine learning model training!")