# 03. Advanced Temporal Feature Engineering

**Credit Card Default Analysis - Feature Engineering Phase**
- **Repository**: Kelompok-Nyengir/tubes-data-jumboh
- **Phase**: 3 of 5 - Advanced Feature Creation

## 📋 Notebook Objectives

1. **Temporal Feature Engineering**: Create 25+ features from 6-month payment history
2. **Payment Behavior Analysis**: Develop behavioral indicators and trends
3. **Credit Utilization Features**: Engineer credit usage and efficiency metrics
4. **Risk Scoring Components**: Build comprehensive risk assessment features
5. **Customer Segmentation**: Create business-oriented customer categories

## 🎯 Expected Outcomes
- Rich feature set for machine learning models
- Temporal insights into payment behaviors
- Business-interpretable risk indicators
- Enhanced predictive capabilities

## Setup and Configuration

In [None]:
# Enhanced setup for feature engineering
import sys
import os
sys.path.append('../src')

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Import custom modules
from src.feature_engineering import TemporalFeatureEngineer
from src.visualization import CreditCardVisualizer

import warnings
warnings.filterwarnings('ignore')

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette('Set2')
%matplotlib inline

print('=' * 80)
print('⚙️ CREDIT CARD DEFAULT ANALYSIS - FEATURE ENGINEERING')
print('=' * 80)
print(f'📅 Analysis Date: 2025-06-20 16:04:47 UTC')
print(f'👤 Analyst: ardzz')
print(f'📝 Phase: 3 of 5 - Advanced Feature Creation')
print(f'🔗 Repository: Kelompok-Nyengir/tubes-data-jumboh')
print('=' * 80)

In [None]:
# Build (or reuse) the Spark session for this notebook. Adaptive query
# execution with partition coalescing is switched on, and Kryo is selected
# as the serializer.
spark = (
    SparkSession.builder
    .appName('CreditCardFeatureEngineering')
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .getOrCreate()
)

# Keep Spark's own logging at WARN so the cell output stays readable.
spark.sparkContext.setLogLevel('WARN')

print('✅ Spark Session initialized successfully')
print(f'   Spark Version: {spark.version}')
print(f'   Spark UI: {spark.sparkContext.uiWebUrl}')

# Project helpers used by the feature-engineering phases in later cells.
feature_engineer = TemporalFeatureEngineer()
visualizer = CreditCardVisualizer()

print('✅ Custom modules initialized')

## Data Loading and Pre-Engineering Assessment

In [None]:
# Load cleaned dataset from previous phase
print('📂 Loading cleaned dataset for feature engineering...')

# Sources are tried in order of preference:
#   1. the cleaned parquet written by Phase 2,
#   2. the Phase 1 exploration cache (with basic category cleaning applied),
#   3. the original CSV, cleaned through CreditCardDataProcessor.
# Only `Exception` is caught so that KeyboardInterrupt / SystemExit still
# stop the cell instead of silently triggering the next fallback.
try:
    df_clean = spark.read.parquet('../data/processed/02_cleaned_data.parquet')
    print(f'✅ Loaded cleaned dataset from Phase 2')
except Exception:
    try:
        # Fallback to exploration cache
        df_clean = spark.read.parquet('../data/processed/01_exploration_cache.parquet')
        print(f'⚠️  Using exploration cache - applying basic cleaning')

        # Basic cleaning: remap EDUCATION codes 0/5/6 to 4 and MARRIAGE code 0 to 3.
        df_clean = df_clean.withColumn(
            'EDUCATION', when(col('EDUCATION').isin([0, 5, 6]), 4).otherwise(col('EDUCATION'))
        ).withColumn(
            'MARRIAGE', when(col('MARRIAGE') == 0, 3).otherwise(col('MARRIAGE'))
        )
    except Exception:
        # Final fallback to original CSV with cleaning
        from src.data_processing import CreditCardDataProcessor
        processor = CreditCardDataProcessor(spark)

        # The CSV may sit under data/ or directly in the parent directory.
        try:
            df_raw = processor.load_data('../data/sample.csv')
        except Exception:
            df_raw = processor.load_data('../sample.csv')

        df_clean = processor.clean_data(df_raw)
        print(f'⚠️  Loaded and cleaned original CSV')

# Dataset assessment
print(f'\n📊 PRE-FEATURE ENGINEERING ASSESSMENT:')
print(f'   Records: {df_clean.count():,}')
print(f'   Columns: {len(df_clean.columns)}')

# Columns the later feature-engineering phases read, grouped by category.
required_columns = {
    'demographic': ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE'],
    'payment_status': ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'],
    'bill_amounts': ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6'],
    'payment_amounts': ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
}

print(f'\n📋 REQUIRED COLUMNS AVAILABILITY:')
for category, cols in required_columns.items():
    # Loop variable is named `name` (not `col`) to avoid any confusion with
    # pyspark's `col` function used elsewhere in this cell.
    missing = [name for name in cols if name not in df_clean.columns]
    available = len(cols) - len(missing)
    print(f'   {category.title()}: {available}/{len(cols)} columns available')

    if missing:
        print(f"      Missing: {', '.join(missing)}")

print(f'\n✅ Dataset ready for feature engineering')

## Phase 1: Payment Trend and Volatility Features

In [None]:
# Create payment trend and volatility features
print('📈 PHASE 1: PAYMENT TREND AND VOLATILITY FEATURES')
print('=' * 60)

# Apply payment trend feature engineering
df_features = feature_engineer.create_payment_trend_features(df_clean)

# Row count is needed by both the trend section and the volatility section,
# so compute it once here. Previously it was only assigned inside the
# PAYMENT_TREND_SLOPE branch, which made the volatility section raise a
# NameError whenever that column was absent.
total_customers = df_features.count()

# Columns this phase is expected to add; each is checked before use.
trend_features = [
    'PAYMENT_TREND_SLOPE', 'PAYMENT_STATUS_VOLATILITY',
    'MAX_PAYMENT_DELAY', 'MIN_PAYMENT_DELAY', 'PAYMENT_DELAY_RANGE'
]

print(f'\n📊 PAYMENT TREND FEATURES ANALYSIS:')
print(f"{'Feature':<25} {'Mean':<10} {'Std':<10} {'Min':<10} {'Max':<10}")
print('-' * 70)

for feature in trend_features:
    if feature in df_features.columns:
        # avg/stddev/min/max are the Spark column aggregates brought in by the
        # wildcard import of pyspark.sql.functions, not the Python builtins.
        stats = df_features.select(
            avg(feature).alias('mean'),
            stddev(feature).alias('std'),
            min(feature).alias('min'),
            max(feature).alias('max')
        ).collect()[0]

        print(f"{feature:<25} {stats['mean']:<10.3f} {stats['std']:<10.3f} "
              f"{stats['min']:<10.3f} {stats['max']:<10.3f}")

# Analyze payment trend slope distribution
if 'PAYMENT_TREND_SLOPE' in df_features.columns:
    print(f'\n🔍 PAYMENT TREND SLOPE INTERPRETATION:')

    # Bucket the slope into five labelled bands and count customers per band.
    trend_categories = df_features.withColumn(
        'TREND_CATEGORY',
        when(col('PAYMENT_TREND_SLOPE') > 1, 'Strongly Improving')
        .when(col('PAYMENT_TREND_SLOPE') > 0.2, 'Improving')
        .when(col('PAYMENT_TREND_SLOPE') > -0.2, 'Stable')
        .when(col('PAYMENT_TREND_SLOPE') > -1, 'Deteriorating')
        .otherwise('Strongly Deteriorating')
    ).groupBy('TREND_CATEGORY').count().orderBy(desc('count'))

    print(f'   Payment Trend Distribution:')
    for row in trend_categories.collect():
        category = row['TREND_CATEGORY']
        # Named `category_count` so the pyspark `count` function from the
        # wildcard import is not rebound to an int for later cells.
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

# Payment volatility analysis
if 'PAYMENT_STATUS_VOLATILITY' in df_features.columns:
    print(f'\n📊 PAYMENT VOLATILITY ANALYSIS:')

    volatility_percentiles = df_features.select(
        expr('percentile_approx(PAYMENT_STATUS_VOLATILITY, 0.25)').alias('q1'),
        expr('percentile_approx(PAYMENT_STATUS_VOLATILITY, 0.5)').alias('median'),
        expr('percentile_approx(PAYMENT_STATUS_VOLATILITY, 0.75)').alias('q3'),
        expr('percentile_approx(PAYMENT_STATUS_VOLATILITY, 0.9)').alias('p90')
    ).collect()[0]

    print(f'   Volatility Percentiles:')
    print(f"      25th percentile: {volatility_percentiles['q1']:.3f}")
    print(f"      Median: {volatility_percentiles['median']:.3f}")
    print(f"      75th percentile: {volatility_percentiles['q3']:.3f}")
    print(f"      90th percentile: {volatility_percentiles['p90']:.3f}")

    # High volatility customers: at or above the approximate 90th percentile.
    high_volatility_threshold = volatility_percentiles['p90']
    high_volatility_count = df_features.filter(
        col('PAYMENT_STATUS_VOLATILITY') >= high_volatility_threshold
    ).count()

    print(f'\n   High Volatility Customers (>90th percentile):')
    print(f'      Count: {high_volatility_count:,} ({high_volatility_count/total_customers*100:.1f}%)')
    print(f'      Threshold: {high_volatility_threshold:.3f}')

print(f'\n✅ Phase 1 completed: {len(trend_features)} trend and volatility features created')

## Phase 2: Temporal Segmentation Features

In [None]:
# Create temporal segmentation features
print('🕒 PHASE 2: TEMPORAL SEGMENTATION FEATURES')
print('=' * 60)

# Apply temporal segmentation feature engineering
df_features = feature_engineer.create_temporal_segmentation_features(df_features)

# Count rows once: every percentage below divides by this value, and calling
# df_features.count() inside the loops would launch a Spark job per row printed.
total_customers = df_features.count()

# Columns this phase is expected to add; each is checked before use.
temporal_features = [
    'RECENT_AVG_DELAY', 'HISTORICAL_AVG_DELAY',
    'PAYMENT_IMPROVEMENT_SCORE', 'RECOVERY_INSTANCES'
]

print(f'\n📊 TEMPORAL SEGMENTATION FEATURES ANALYSIS:')
print(f"{'Feature':<25} {'Mean':<10} {'Std':<10} {'Min':<10} {'Max':<10}")
print('-' * 70)

for feature in temporal_features:
    if feature in df_features.columns:
        # avg/stddev/min/max are the Spark column aggregates brought in by the
        # wildcard import of pyspark.sql.functions, not the Python builtins.
        stats = df_features.select(
            avg(feature).alias('mean'),
            stddev(feature).alias('std'),
            min(feature).alias('min'),
            max(feature).alias('max')
        ).collect()[0]

        print(f"{feature:<25} {stats['mean']:<10.3f} {stats['std']:<10.3f} "
              f"{stats['min']:<10.3f} {stats['max']:<10.3f}")

# Payment improvement analysis
if 'PAYMENT_IMPROVEMENT_SCORE' in df_features.columns:
    print(f'\n📈 PAYMENT IMPROVEMENT ANALYSIS:')

    # Bucket the improvement score into five labelled bands.
    improvement_categories = df_features.withColumn(
        'IMPROVEMENT_CATEGORY',
        when(col('PAYMENT_IMPROVEMENT_SCORE') > 2, 'Significant Improvement')
        .when(col('PAYMENT_IMPROVEMENT_SCORE') > 0.5, 'Moderate Improvement')
        .when(col('PAYMENT_IMPROVEMENT_SCORE') > -0.5, 'Stable')
        .when(col('PAYMENT_IMPROVEMENT_SCORE') > -2, 'Moderate Deterioration')
        .otherwise('Significant Deterioration')
    ).groupBy('IMPROVEMENT_CATEGORY').count().orderBy(desc('count'))

    print(f'   Payment Improvement Distribution:')
    for row in improvement_categories.collect():
        category = row['IMPROVEMENT_CATEGORY']
        # Named `category_count` so the pyspark `count` function from the
        # wildcard import is not rebound to an int for later cells.
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

    # Correlation with default
    if 'default payment next month' in df_features.columns:
        # Correlation.corr works on a single vector column, so assemble the
        # two columns into one and read the off-diagonal entry of the matrix.
        assembler = VectorAssembler(
            inputCols=['PAYMENT_IMPROVEMENT_SCORE', 'default payment next month'],
            outputCol='features'
        )
        df_corr = assembler.transform(df_features).select('features')
        correlation_matrix = Correlation.corr(df_corr, 'features').head()[0]
        correlation_value = float(correlation_matrix.toArray()[0, 1])

        print(f'\n   Correlation with Default: {correlation_value:.4f}')
        if abs(correlation_value) > 0.1:
            direction = 'negative' if correlation_value < 0 else 'positive'
            # Inside this branch |r| > 0.1 always holds, so only the
            # strong / moderate distinction is reachable.
            strength = 'strong' if abs(correlation_value) > 0.3 else 'moderate'
            print(f'      {strength.title()} {direction} correlation detected')

# Recovery instances analysis
if 'RECOVERY_INSTANCES' in df_features.columns:
    print(f'\n🔄 RECOVERY INSTANCES ANALYSIS:')

    recovery_distribution = df_features.groupBy('RECOVERY_INSTANCES').count().orderBy('RECOVERY_INSTANCES')

    print(f'   Recovery Instances Distribution:')
    print(f"   {'Instances':<12} {'Count':<10} {'Percentage':<12} {'Interpretation'}")
    print('-' * 55)

    for row in recovery_distribution.collect():
        instances = row['RECOVERY_INSTANCES']
        instance_count = row['count']
        percentage = instance_count / total_customers * 100

        # Map the raw instance count to a human-readable label for the table.
        if instances == 0:
            interpretation = 'No recovery'
        elif instances <= 2:
            interpretation = 'Limited recovery'
        elif instances <= 4:
            interpretation = 'Good recovery'
        else:
            interpretation = 'Excellent recovery'

        print(f'   {instances:<12} {instance_count:<10,} {percentage:<12.1f}% {interpretation}')

    # Average recovery by default status
    if 'default payment next month' in df_features.columns:
        avg_recovery_by_default = df_features.groupBy('default payment next month') \
            .agg(avg('RECOVERY_INSTANCES').alias('avg_recovery')) \
            .orderBy('default payment next month')

        print(f'\n   Average Recovery by Default Status:')
        for row in avg_recovery_by_default.collect():
            default_status = 'No Default' if row['default payment next month'] == 0 else 'Default'
            avg_recovery = row['avg_recovery']
            print(f'      {default_status}: {avg_recovery:.2f} recovery instances')

print(f'\n✅ Phase 2 completed: {len(temporal_features)} temporal segmentation features created')

## Phase 3: Bill Statement and Financial Features

In [None]:
# Create bill statement and financial features
print('💰 PHASE 3: BILL STATEMENT AND FINANCIAL FEATURES')
print('=' * 60)

# Apply bill statement feature engineering
df_features = feature_engineer.create_bill_statement_features(df_features)

# Count rows once: every percentage below divides by this value, and calling
# df_features.count() at each use would launch a separate Spark job each time.
total_customers = df_features.count()

# Columns this phase is expected to add; each is checked before use.
bill_features = [
    'BILL_TREND_SLOPE', 'BILL_AMOUNT_VOLATILITY',
    'DEBT_ACCUMULATION_RATE', 'AVG_BILL_AMOUNT'
]

print(f'\n📊 BILL STATEMENT FEATURES ANALYSIS:')
print(f"{'Feature':<25} {'Mean':<12} {'Std':<12} {'Min':<12} {'Max':<12}")
print('-' * 80)

for feature in bill_features:
    if feature in df_features.columns:
        # avg/stddev/min/max are the Spark column aggregates brought in by the
        # wildcard import of pyspark.sql.functions, not the Python builtins.
        stats = df_features.select(
            avg(feature).alias('mean'),
            stddev(feature).alias('std'),
            min(feature).alias('min'),
            max(feature).alias('max')
        ).collect()[0]

        print(f"{feature:<25} {stats['mean']:<12.2f} {stats['std']:<12.2f} "
              f"{stats['min']:<12.2f} {stats['max']:<12.2f}")

# Bill trend analysis
if 'BILL_TREND_SLOPE' in df_features.columns:
    print(f'\n📈 BILL TREND ANALYSIS:')

    # Bucket the bill slope into five labelled bands (thresholds at ±1000 and ±5000).
    bill_trend_categories = df_features.withColumn(
        'BILL_TREND_CATEGORY',
        when(col('BILL_TREND_SLOPE') > 5000, 'Rapidly Increasing')
        .when(col('BILL_TREND_SLOPE') > 1000, 'Increasing')
        .when(col('BILL_TREND_SLOPE') > -1000, 'Stable')
        .when(col('BILL_TREND_SLOPE') > -5000, 'Decreasing')
        .otherwise('Rapidly Decreasing')
    ).groupBy('BILL_TREND_CATEGORY').count().orderBy(desc('count'))

    print(f'   Bill Trend Distribution:')
    for row in bill_trend_categories.collect():
        category = row['BILL_TREND_CATEGORY']
        # Named `category_count` so the pyspark `count` function from the
        # wildcard import is not rebound to an int for later cells.
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

# Debt accumulation analysis
if 'DEBT_ACCUMULATION_RATE' in df_features.columns:
    print(f'\n📊 DEBT ACCUMULATION ANALYSIS:')

    # Approximate percentiles of the accumulation rate.
    debt_percentiles = df_features.select(
        expr('percentile_approx(DEBT_ACCUMULATION_RATE, 0.1)').alias('p10'),
        expr('percentile_approx(DEBT_ACCUMULATION_RATE, 0.25)').alias('q1'),
        expr('percentile_approx(DEBT_ACCUMULATION_RATE, 0.5)').alias('median'),
        expr('percentile_approx(DEBT_ACCUMULATION_RATE, 0.75)').alias('q3'),
        expr('percentile_approx(DEBT_ACCUMULATION_RATE, 0.9)').alias('p90')
    ).collect()[0]

    print(f'   Debt Accumulation Rate Percentiles:')
    print(f"      10th percentile: {debt_percentiles['p10']:.3f}")
    print(f"      25th percentile: {debt_percentiles['q1']:.3f}")
    print(f"      Median: {debt_percentiles['median']:.3f}")
    print(f"      75th percentile: {debt_percentiles['q3']:.3f}")
    print(f"      90th percentile: {debt_percentiles['p90']:.3f}")

    # High debt accumulation customers
    high_debt_threshold = 0.5  # 50% increase
    high_debt_count = df_features.filter(col('DEBT_ACCUMULATION_RATE') > high_debt_threshold).count()

    print(f'\n   High Debt Accumulation (>50% increase):')
    print(f'      Count: {high_debt_count:,} ({high_debt_count/total_customers*100:.1f}%)')

    # Negative debt accumulation (debt reduction)
    debt_reduction_count = df_features.filter(col('DEBT_ACCUMULATION_RATE') < -0.1).count()
    print(f'\n   Debt Reduction (>10% decrease):')
    print(f'      Count: {debt_reduction_count:,} ({debt_reduction_count/total_customers*100:.1f}%)')

# Average bill amount analysis
if 'AVG_BILL_AMOUNT' in df_features.columns and 'LIMIT_BAL' in df_features.columns:
    print(f'\n💳 CREDIT UTILIZATION ANALYSIS:')

    # Basic utilization = average bill / credit limit; rows with a
    # non-positive limit are given 0 to avoid dividing by zero.
    utilization_stats = df_features.withColumn(
        'BASIC_UTILIZATION',
        when(col('LIMIT_BAL') > 0, col('AVG_BILL_AMOUNT') / col('LIMIT_BAL')).otherwise(0)
    ).select(
        avg('BASIC_UTILIZATION').alias('avg_util'),
        expr('percentile_approx(BASIC_UTILIZATION, 0.5)').alias('median_util'),
        expr('percentile_approx(BASIC_UTILIZATION, 0.9)').alias('p90_util')
    ).collect()[0]

    print(f'   Credit Utilization Statistics:')
    print(f"      Average utilization: {utilization_stats['avg_util']:.3f} ({utilization_stats['avg_util']*100:.1f}%)")
    print(f"      Median utilization: {utilization_stats['median_util']:.3f} ({utilization_stats['median_util']*100:.1f}%)")
    print(f"      90th percentile: {utilization_stats['p90_util']:.3f} ({utilization_stats['p90_util']*100:.1f}%)")

    # High utilization customers
    high_util_count = df_features.filter(
        (col('LIMIT_BAL') > 0) & (col('AVG_BILL_AMOUNT') / col('LIMIT_BAL') > 0.8)
    ).count()

    print(f'\n   High Utilization (>80%):')
    print(f'      Count: {high_util_count:,} ({high_util_count/total_customers*100:.1f}%)')

print(f'\n✅ Phase 3 completed: {len(bill_features)} bill statement and financial features created')

## Phase 4: Payment Efficiency and Consistency Features

In [None]:
# Create payment efficiency and consistency features
print('⚡ PHASE 4: PAYMENT EFFICIENCY AND CONSISTENCY FEATURES')
print('=' * 60)

# Apply payment efficiency feature engineering
df_features = feature_engineer.create_payment_efficiency_features(df_features)

# Count rows once: every percentage below divides by this value, and calling
# df_features.count() inside the loops would launch a Spark job per row printed.
total_customers = df_features.count()

# Columns this phase is expected to add; each is checked before use.
efficiency_features = [
    'AVG_PAYMENT_EFFICIENCY', 'PAYMENT_EFFICIENCY_TREND',
    'PAYMENT_CONSISTENCY_SCORE', 'AVG_PAYMENT_AMOUNT'
]

print(f'\n📊 PAYMENT EFFICIENCY FEATURES ANALYSIS:')
print(f"{'Feature':<25} {'Mean':<12} {'Std':<12} {'Min':<12} {'Max':<12}")
print('-' * 80)

for feature in efficiency_features:
    if feature in df_features.columns:
        # avg/stddev/min/max are the Spark column aggregates brought in by the
        # wildcard import of pyspark.sql.functions, not the Python builtins.
        stats = df_features.select(
            avg(feature).alias('mean'),
            stddev(feature).alias('std'),
            min(feature).alias('min'),
            max(feature).alias('max')
        ).collect()[0]

        print(f"{feature:<25} {stats['mean']:<12.3f} {stats['std']:<12.3f} "
              f"{stats['min']:<12.3f} {stats['max']:<12.3f}")

# Payment efficiency analysis
if 'AVG_PAYMENT_EFFICIENCY' in df_features.columns:
    print(f'\n⚡ PAYMENT EFFICIENCY ANALYSIS:')

    # Bucket average efficiency into five labelled bands.
    efficiency_categories = df_features.withColumn(
        'EFFICIENCY_CATEGORY',
        when(col('AVG_PAYMENT_EFFICIENCY') >= 1.0, 'Full Payment')
        .when(col('AVG_PAYMENT_EFFICIENCY') >= 0.8, 'High Efficiency')
        .when(col('AVG_PAYMENT_EFFICIENCY') >= 0.5, 'Medium Efficiency')
        .when(col('AVG_PAYMENT_EFFICIENCY') >= 0.2, 'Low Efficiency')
        .otherwise('Very Low Efficiency')
    ).groupBy('EFFICIENCY_CATEGORY').count().orderBy(desc('count'))

    print(f'   Payment Efficiency Distribution:')
    for row in efficiency_categories.collect():
        category = row['EFFICIENCY_CATEGORY']
        # Named `category_count` so the pyspark `count` function from the
        # wildcard import is not rebound to an int for later cells.
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

    # Efficiency vs default correlation
    if 'default payment next month' in df_features.columns:
        efficiency_by_default = df_features.groupBy('default payment next month') \
            .agg(avg('AVG_PAYMENT_EFFICIENCY').alias('avg_efficiency')) \
            .orderBy('default payment next month')

        print(f'\n   Average Efficiency by Default Status:')
        for row in efficiency_by_default.collect():
            default_status = 'No Default' if row['default payment next month'] == 0 else 'Default'
            avg_efficiency = row['avg_efficiency']
            print(f'      {default_status}: {avg_efficiency:.3f} ({avg_efficiency*100:.1f}%)')

# Payment consistency analysis
if 'PAYMENT_CONSISTENCY_SCORE' in df_features.columns:
    print(f'\n📊 PAYMENT CONSISTENCY ANALYSIS:')

    # Approximate percentiles of the consistency score.
    consistency_percentiles = df_features.select(
        expr('percentile_approx(PAYMENT_CONSISTENCY_SCORE, 0.1)').alias('p10'),
        expr('percentile_approx(PAYMENT_CONSISTENCY_SCORE, 0.25)').alias('q1'),
        expr('percentile_approx(PAYMENT_CONSISTENCY_SCORE, 0.5)').alias('median'),
        expr('percentile_approx(PAYMENT_CONSISTENCY_SCORE, 0.75)').alias('q3'),
        expr('percentile_approx(PAYMENT_CONSISTENCY_SCORE, 0.9)').alias('p90')
    ).collect()[0]

    print(f'   Consistency Score Percentiles:')
    print(f"      10th percentile: {consistency_percentiles['p10']:.3f}")
    print(f"      25th percentile: {consistency_percentiles['q1']:.3f}")
    print(f"      Median: {consistency_percentiles['median']:.3f}")
    print(f"      75th percentile: {consistency_percentiles['q3']:.3f}")
    print(f"      90th percentile: {consistency_percentiles['p90']:.3f}")

    # High consistency customers: at or above the approximate 75th percentile.
    high_consistency_threshold = consistency_percentiles['q3']
    high_consistency_count = df_features.filter(
        col('PAYMENT_CONSISTENCY_SCORE') >= high_consistency_threshold
    ).count()

    print(f'\n   High Consistency (≥75th percentile):')
    print(f'      Count: {high_consistency_count:,} ({high_consistency_count/total_customers*100:.1f}%)')
    print(f'      Threshold: {high_consistency_threshold:.3f}')

# Payment efficiency trend analysis
if 'PAYMENT_EFFICIENCY_TREND' in df_features.columns:
    print(f'\n📈 PAYMENT EFFICIENCY TREND ANALYSIS:')

    # Three bands around zero with a ±0.1 dead zone labelled as stable.
    trend_categories = df_features.withColumn(
        'EFFICIENCY_TREND_CATEGORY',
        when(col('PAYMENT_EFFICIENCY_TREND') > 0.1, 'Improving Efficiency')
        .when(col('PAYMENT_EFFICIENCY_TREND') > -0.1, 'Stable Efficiency')
        .otherwise('Declining Efficiency')
    ).groupBy('EFFICIENCY_TREND_CATEGORY').count().orderBy(desc('count'))

    print(f'   Efficiency Trend Distribution:')
    for row in trend_categories.collect():
        category = row['EFFICIENCY_TREND_CATEGORY']
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

print(f'\n✅ Phase 4 completed: {len(efficiency_features)} payment efficiency and consistency features created')

## Phase 5: Credit Utilization and Risk Features

In [None]:
# Create credit utilization features
print('💳 PHASE 5: CREDIT UTILIZATION AND RISK FEATURES')
print('=' * 60)

# Apply credit utilization feature engineering
df_features = feature_engineer.create_credit_utilization_features(df_features)

# Count rows once: every percentage below divides by this value, and calling
# df_features.count() inside the loops would launch a Spark job per row printed.
total_customers = df_features.count()

# Columns this phase is expected to add; each is checked before use.
utilization_features = [
    'CREDIT_UTILIZATION_RATIO', 'CREDIT_BUFFER', 'CREDIT_UTILIZATION_TREND'
]

print(f'\n📊 CREDIT UTILIZATION FEATURES ANALYSIS:')
print(f"{'Feature':<25} {'Mean':<12} {'Std':<12} {'Min':<12} {'Max':<12}")
print('-' * 80)

for feature in utilization_features:
    if feature in df_features.columns:
        # avg/stddev/min/max are the Spark column aggregates brought in by the
        # wildcard import of pyspark.sql.functions, not the Python builtins.
        stats = df_features.select(
            avg(feature).alias('mean'),
            stddev(feature).alias('std'),
            min(feature).alias('min'),
            max(feature).alias('max')
        ).collect()[0]

        if feature == 'CREDIT_BUFFER':
            # Display credit buffer in thousands
            print(f"{feature:<25} {stats['mean']/1000:<12.1f}K {stats['std']/1000:<12.1f}K "
                  f"{stats['min']/1000:<12.1f}K {stats['max']/1000:<12.1f}K")
        else:
            print(f"{feature:<25} {stats['mean']:<12.3f} {stats['std']:<12.3f} "
                  f"{stats['min']:<12.3f} {stats['max']:<12.3f}")

# Credit utilization ratio analysis
if 'CREDIT_UTILIZATION_RATIO' in df_features.columns:
    print(f'\n💳 CREDIT UTILIZATION RATIO ANALYSIS:')

    # Bucket the ratio into six labelled bands, the last one for ratios above 1.0.
    utilization_categories = df_features.withColumn(
        'UTILIZATION_CATEGORY',
        when(col('CREDIT_UTILIZATION_RATIO') <= 0.1, 'Very Low (≤10%)')
        .when(col('CREDIT_UTILIZATION_RATIO') <= 0.3, 'Low (10-30%)')
        .when(col('CREDIT_UTILIZATION_RATIO') <= 0.5, 'Medium (30-50%)')
        .when(col('CREDIT_UTILIZATION_RATIO') <= 0.8, 'High (50-80%)')
        .when(col('CREDIT_UTILIZATION_RATIO') <= 1.0, 'Very High (80-100%)')
        .otherwise('Over Limit (>100%)')
    ).groupBy('UTILIZATION_CATEGORY').count().orderBy(desc('count'))

    print(f'   Credit Utilization Distribution:')
    for row in utilization_categories.collect():
        category = row['UTILIZATION_CATEGORY']
        # Named `category_count` so the pyspark `count` function from the
        # wildcard import is not rebound to an int for later cells.
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

    # Utilization vs default analysis
    if 'default payment next month' in df_features.columns:
        utilization_default = df_features.groupBy('default payment next month') \
            .agg(avg('CREDIT_UTILIZATION_RATIO').alias('avg_utilization')) \
            .orderBy('default payment next month')

        print(f'\n   Average Utilization by Default Status:')
        for row in utilization_default.collect():
            default_status = 'No Default' if row['default payment next month'] == 0 else 'Default'
            avg_utilization = row['avg_utilization']
            print(f'      {default_status}: {avg_utilization:.3f} ({avg_utilization*100:.1f}%)')

# Credit buffer analysis
if 'CREDIT_BUFFER' in df_features.columns:
    print(f'\n💰 CREDIT BUFFER ANALYSIS:')

    # Mean plus approximate quartiles of the buffer.
    buffer_stats = df_features.select(
        avg('CREDIT_BUFFER').alias('avg_buffer'),
        expr('percentile_approx(CREDIT_BUFFER, 0.25)').alias('q1'),
        expr('percentile_approx(CREDIT_BUFFER, 0.5)').alias('median'),
        expr('percentile_approx(CREDIT_BUFFER, 0.75)').alias('q3')
    ).collect()[0]

    print(f'   Credit Buffer Statistics (NT$):')
    print(f"      Average: NT$ {buffer_stats['avg_buffer']:,.0f}")
    print(f"      25th percentile: NT$ {buffer_stats['q1']:,.0f}")
    print(f"      Median: NT$ {buffer_stats['median']:,.0f}")
    print(f"      75th percentile: NT$ {buffer_stats['q3']:,.0f}")

    # Low buffer customers (potential risk)
    low_buffer_count = df_features.filter(col('CREDIT_BUFFER') < 10000).count()
    print(f'\n   Low Buffer (<NT$ 10,000):')
    print(f'      Count: {low_buffer_count:,} ({low_buffer_count/total_customers*100:.1f}%)')

# Credit utilization trend analysis
if 'CREDIT_UTILIZATION_TREND' in df_features.columns:
    print(f'\n📈 CREDIT UTILIZATION TREND ANALYSIS:')

    # Three bands around zero with a ±0.1 dead zone labelled as stable.
    trend_categories = df_features.withColumn(
        'UTILIZATION_TREND_CATEGORY',
        when(col('CREDIT_UTILIZATION_TREND') > 0.1, 'Increasing Utilization')
        .when(col('CREDIT_UTILIZATION_TREND') > -0.1, 'Stable Utilization')
        .otherwise('Decreasing Utilization')
    ).groupBy('UTILIZATION_TREND_CATEGORY').count().orderBy(desc('count'))

    print(f'   Utilization Trend Distribution:')
    for row in trend_categories.collect():
        category = row['UTILIZATION_TREND_CATEGORY']
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

print(f'\n✅ Phase 5 completed: {len(utilization_features)} credit utilization features created')

## Phase 6: Behavioral Classification and Risk Scoring

In [None]:
# Create behavioral classification and risk scoring features
print('🎯 PHASE 6: BEHAVIORAL CLASSIFICATION AND RISK SCORING')
print('=' * 60)

# Apply behavioral classification and risk scoring feature engineering.
# The calls go through the `feature_engineer` instance created during setup
# (the same object the earlier phases use); `src.feature_engineer` is not a
# bound name in this notebook.
df_features = feature_engineer.create_behavioral_classification_features(df_features)
df_features = feature_engineer.create_risk_scoring_features(df_features)

# Count rows once: every percentage below divides by this value, and calling
# df_features.count() inside the loops would launch a Spark job per row printed.
total_customers = df_features.count()

# Columns this phase is expected to add; each is checked before use.
behavioral_features = [
    'PAYMENT_BEHAVIOR_TYPE', 'TEMPORAL_RISK_LEVEL', 'CUSTOMER_SEGMENT',
    'TEMPORAL_RISK_SCORE', 'RISK_SCORE_CATEGORY'
]

print(f'\n🎯 BEHAVIORAL CLASSIFICATION ANALYSIS:')

# Payment behavior type analysis
if 'PAYMENT_BEHAVIOR_TYPE' in df_features.columns:
    print(f'\n   Payment Behavior Type Distribution:')
    behavior_dist = df_features.groupBy('PAYMENT_BEHAVIOR_TYPE').count().orderBy(desc('count'))

    for row in behavior_dist.collect():
        behavior_type = row['PAYMENT_BEHAVIOR_TYPE']
        # Named `type_count` so the pyspark `count` function from the
        # wildcard import is not rebound to an int for later cells.
        type_count = row['count']
        percentage = type_count / total_customers * 100
        print(f'      {behavior_type}: {type_count:,} ({percentage:.1f}%)')

    # Behavior type vs default analysis
    if 'default payment next month' in df_features.columns:
        # Pivot the target so each behavior type becomes one row with one
        # column per target value; the code below reads columns '0' and '1'.
        behavior_default = df_features.groupBy('PAYMENT_BEHAVIOR_TYPE', 'default payment next month') \
            .count() \
            .groupBy('PAYMENT_BEHAVIOR_TYPE') \
            .pivot('default payment next month') \
            .sum('count') \
            .fillna(0)

        behavior_default = behavior_default.withColumn(
            'total', col('0') + col('1')
        ).withColumn(
            'default_rate', col('1') / col('total') * 100
        )

        print(f'\n   Default Rate by Payment Behavior Type:')
        print(f"   {'Behavior Type':<20} {'No Default':<12} {'Default':<10} {'Default Rate':<12}")
        print('   ' + '-' * 60)

        for row in behavior_default.collect():
            behavior_type = row['PAYMENT_BEHAVIOR_TYPE']
            no_default = int(row['0']) if row['0'] else 0
            default = int(row['1']) if row['1'] else 0
            default_rate = row['default_rate']
            print(f'   {behavior_type:<20} {no_default:<12,} {default:<10,} {default_rate:<12.1f}%')

# Temporal risk score analysis
if 'TEMPORAL_RISK_SCORE' in df_features.columns:
    print(f'\n📊 TEMPORAL RISK SCORE ANALYSIS:')

    # Mean, standard deviation and approximate percentiles of the risk score.
    risk_score_stats = df_features.select(
        avg('TEMPORAL_RISK_SCORE').alias('avg_score'),
        stddev('TEMPORAL_RISK_SCORE').alias('std_score'),
        expr('percentile_approx(TEMPORAL_RISK_SCORE, 0.1)').alias('p10'),
        expr('percentile_approx(TEMPORAL_RISK_SCORE, 0.25)').alias('q1'),
        expr('percentile_approx(TEMPORAL_RISK_SCORE, 0.5)').alias('median'),
        expr('percentile_approx(TEMPORAL_RISK_SCORE, 0.75)').alias('q3'),
        expr('percentile_approx(TEMPORAL_RISK_SCORE, 0.9)').alias('p90')
    ).collect()[0]

    print(f'   Risk Score Statistics:')
    print(f"      Average: {risk_score_stats['avg_score']:.4f}")
    print(f"      Std Dev: {risk_score_stats['std_score']:.4f}")
    print(f"      10th percentile: {risk_score_stats['p10']:.4f}")
    print(f"      25th percentile: {risk_score_stats['q1']:.4f}")
    print(f"      Median: {risk_score_stats['median']:.4f}")
    print(f"      75th percentile: {risk_score_stats['q3']:.4f}")
    print(f"      90th percentile: {risk_score_stats['p90']:.4f}")

# Risk score category analysis
if 'RISK_SCORE_CATEGORY' in df_features.columns:
    print(f'\n🎯 RISK SCORE CATEGORY ANALYSIS:')

    risk_category_dist = df_features.groupBy('RISK_SCORE_CATEGORY').count().orderBy(desc('count'))

    print(f'   Risk Score Category Distribution:')
    for row in risk_category_dist.collect():
        category = row['RISK_SCORE_CATEGORY']
        category_count = row['count']
        percentage = category_count / total_customers * 100
        print(f'      {category}: {category_count:,} ({percentage:.1f}%)')

    # Risk category vs default analysis
    if 'default payment next month' in df_features.columns:
        # Same pivot as for behavior types: one row per risk category,
        # with columns '0' and '1' read below.
        risk_default = df_features.groupBy('RISK_SCORE_CATEGORY', 'default payment next month') \
            .count() \
            .groupBy('RISK_SCORE_CATEGORY') \
            .pivot('default payment next month') \
            .sum('count') \
            .fillna(0)

        risk_default = risk_default.withColumn(
            'total', col('0') + col('1')
        ).withColumn(
            'default_rate', col('1') / col('total') * 100
        )

        print(f'\n   Default Rate by Risk Category:')
        print(f"   {'Risk Category':<15} {'No Default':<12} {'Default':<10} {'Default Rate':<12}")
        print('   ' + '-' * 55)

        for row in risk_default.collect():
            category = row['RISK_SCORE_CATEGORY']
            no_default = int(row['0']) if row['0'] else 0
            default = int(row['1']) if row['1'] else 0
            default_rate = row['default_rate']
            print(f'   {category:<15} {no_default:<12,} {default:<10,} {default_rate:<12.1f}%')

print(f'\n✅ Phase 6 completed: Behavioral classification and risk scoring features created')
# Comprehensive feature engineering summary
print('📋 FEATURE ENGINEERING SUMMARY AND VALIDATION')
print('=' * 60)

# Columns treated as the raw input; every other column in df_features is
# counted as an engineered feature. The PAY_ indices listed here skip 1.
original_columns = {
    'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    *(f'PAY_{i}' for i in (0, 2, 3, 4, 5, 6)),
    *(f'BILL_AMT{i}' for i in range(1, 7)),
    *(f'PAY_AMT{i}' for i in range(1, 7)),
    'default payment next month',
}

all_columns = set(df_features.columns)
new_features = all_columns - original_columns

print('\n📊 FEATURE CREATION SUMMARY:')
print(f'   Original columns: {len(original_columns)}')
print(f'   Total columns after engineering: {len(all_columns)}')
print(f'   New features created: {len(new_features)}')
print(f'   Feature expansion: +{len(new_features)/len(original_columns)*100:.1f}%')

# Categorize new features by substring match on the column name. A feature
# may land in more than one category when its name matches several keywords.
category_keywords = {
    'Payment Trends': ('TREND', 'VOLATILITY', 'DELAY'),
    'Temporal Segmentation': ('RECENT', 'HISTORICAL', 'IMPROVEMENT', 'RECOVERY'),
    # 'AVG_BILL' needs no keyword of its own: it already contains 'BILL'.
    'Financial Analysis': ('BILL', 'DEBT'),
    'Payment Efficiency': ('EFFICIENCY', 'CONSISTENCY', 'AVG_PAYMENT'),
    'Credit Utilization': ('CREDIT', 'UTILIZATION', 'BUFFER'),
    'Behavioral Classification': ('BEHAVIOR', 'SEGMENT', 'RISK'),
}

# Iterate in sorted order so the category lists are deterministic
# (plain set iteration order is arbitrary).
sorted_new_features = sorted(new_features)
feature_categories = {
    category: [f for f in sorted_new_features if any(keyword in f for keyword in keywords)]
    for category, keywords in category_keywords.items()
}

print('\n📋 NEW FEATURES BY CATEGORY:')
for category, features in feature_categories.items():
    print(f'   {category}: {len(features)} features')
    for feature in sorted(features)[:3]:  # Show first 3 features per category
        print(f'      - {feature}')
    if len(features) > 3:
        print(f'      ... and {len(features)-3} more')

# Feature quality validation: report engineered features that contain nulls
# or have (near-)zero variance.
print('\n🔍 FEATURE QUALITY VALIDATION:')

# Only inspect features that are still present; sorted for a stable order.
available_columns = set(df_features.columns)
features_to_check = sorted(f for f in new_features if f in available_columns)

# Row count is loop-invariant: compute it once instead of once per feature.
total_records = df_features.count()

# Check for features with missing values
missing_features = []
for feature in features_to_check:
    missing_count = df_features.filter(col(feature).isNull()).count()
    if missing_count > 0:
        missing_pct = missing_count / total_records * 100
        missing_features.append((feature, missing_count, missing_pct))

# Worst offenders first, so the five shown below really are the top five.
missing_features.sort(key=lambda entry: entry[1], reverse=True)

if missing_features:
    print(f'   ⚠️  Features with missing values: {len(missing_features)}')
    for feature, missing_count, missing_pct in missing_features[:5]:  # Show top 5
        print(f'      {feature}: {missing_count:,} ({missing_pct:.1f}%)')
else:
    print('   ✅ No missing values in engineered features')

# Check for features with zero variance
zero_variance_features = []
for feature in features_to_check:
    try:
        # The result must NOT be stored in a variable called `variance`: that
        # would rebind the pyspark `variance` function at module scope and make
        # every later iteration fail with "object is not callable".
        feature_variance = df_features.select(variance(feature)).collect()[0][0]
    except Exception:
        # Best effort: columns Spark cannot aggregate this way are skipped.
        continue
    if feature_variance is not None and feature_variance < 1e-10:
        zero_variance_features.append(feature)

if zero_variance_features:
    print(f'   ⚠️  Features with zero/low variance: {len(zero_variance_features)}')
    for feature in zero_variance_features[:5]:
        print(f'      - {feature}')
else:
    print('   ✅ All features have sufficient variance')

# Feature correlation with target: rank numeric engineered features by the
# absolute value of their correlation with 'default payment next month'.
if 'default payment next month' in df_features.columns:
    print('\n🎯 TOP FEATURES BY CORRELATION WITH TARGET:')

    feature_correlations = []

    # Build the name -> dtype lookup once rather than once per feature.
    column_types = dict(df_features.dtypes)
    numeric_types = {'int', 'bigint', 'double', 'float'}

    # Sorted so the 15-feature cap below selects the same features on every
    # run (new_features is a set, whose iteration order is arbitrary).
    numeric_features = sorted(
        f for f in new_features if column_types.get(f) in numeric_types
    )

    for feature in numeric_features[:15]:  # Limit to prevent long computation
        try:
            assembler = VectorAssembler(inputCols=[feature, 'default payment next month'], outputCol='features')
            df_corr = assembler.transform(df_features).select('features')
            correlation_matrix = Correlation.corr(df_corr, 'features').head()[0]
            # Off-diagonal entry of the 2x2 matrix: feature vs. target.
            correlation_value = float(correlation_matrix.toArray()[0, 1])
        except Exception as exc:
            # Best effort: report and skip features that cannot be correlated
            # instead of silently dropping them.
            print(f'   ⚠️  Skipping {feature}: {type(exc).__name__}')
            continue
        if np.isnan(correlation_value):
            # A NaN correlation would make the abs() sort below unreliable.
            continue
        feature_correlations.append((feature, correlation_value))

    # Sort by absolute correlation
    feature_correlations.sort(key=lambda x: abs(x[1]), reverse=True)

    print(f"   {'Feature':<30} {'Correlation':<12} {'Strength':<12}")
    print('   ' + '-' * 60)

    for feature, corr in feature_correlations[:10]:  # Top 10
        abs_corr = abs(corr)
        if abs_corr >= 0.3:
            strength = 'Strong'
        elif abs_corr >= 0.1:
            strength = 'Moderate'
        elif abs_corr >= 0.05:
            strength = 'Weak'
        else:
            strength = 'Very Weak'

        print(f'   {feature:<30} {corr:<12.4f} {strength:<12}')

# Closing insights for the feature engineering phase.
# '\n' (not '\\n') so a blank line is printed rather than a literal backslash-n.
print('\n💡 FEATURE ENGINEERING INSIGHTS:')
insights = [
    f'Created {len(new_features)} temporal and behavioral features',
    f'Enhanced dataset from {len(original_columns)} to {len(all_columns)} dimensions',
    'Temporal features capture 6-month payment behavior patterns',
    'Risk scoring provides business-interpretable customer assessment',
    'Features ready for advanced machine learning modeling',
]

for insight in insights:
    print(f'   ✅ {insight}')

print('\n✅ FEATURE ENGINEERING PHASE COMPLETED SUCCESSFULLY')
print('📁 Proceed to notebook: 04_visualization_analysis.ipynb')

# Save enhanced dataset with all engineered features
print('💾 SAVING ENHANCED DATASET WITH ENGINEERED FEATURES')
print('=' * 60)

try:
    # Save complete enhanced dataset (replaces any previous output at this path)
    enhanced_features_path = '../data/processed/03_enhanced_features.parquet'
    df_features.write.mode('overwrite').parquet(enhanced_features_path)

    print('✅ Enhanced dataset saved successfully')
    print(f'   📁 Location: {enhanced_features_path}')
    print(f'   📊 Records: {df_features.count():,}')
    print(f'   📋 Total columns: {len(df_features.columns)}')
    print(f'   ⚙️  New features: {len(new_features)}')

    # Build the feature documentation summary. It is only held in memory:
    # nothing in this cell writes it to disk. 'created_date' is a fixed string.
    feature_docs = {
        'original_features': len(original_columns),
        'engineered_features': len(new_features),
        'total_features': len(df_features.columns),
        'feature_categories': {k: len(v) for k, v in feature_categories.items()},
        'created_date': '2025-06-20 16:12:16 UTC',
        'analyst': 'ardzz'
    }

    print('\n📋 Feature documentation created')

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure and carry on.
    print(f'⚠️  Could not save enhanced dataset: {e}')

# Display final feature summary
# '\n' (not '\\n') so a blank line is printed rather than a literal backslash-n.
print('\n🎯 READY FOR ADVANCED ANALYSIS:')
print('   Next Phase: Visualization and Business Intelligence')
print('   Enhanced features: Payment trends, temporal patterns, risk scoring')
print('   Business value: Comprehensive customer behavior analysis')

print('\n🚀 Feature Engineering Complete - Ready for Phase 4: Visualization Analysis')