In [None]:
import pm4py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy.stats import ks_2samp, chi2_contingency
import warnings
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

# Destination folder for all figures and CSV exports (created if missing)
output_dir = Path("../../Results/Advanced_Analysis/data_drift")
output_dir.mkdir(parents=True, exist_ok=True)

print("Libraries imported successfully")


Libraries imported successfully


In [None]:
# Read the XES event log and convert it to a DataFrame
log_path = "../../Dataset/BPI Challenge 2017.xes"
log = pm4py.read_xes(log_path)
df = pm4py.convert_to_dataframe(log)

# Columns this cell works with
case_col, ts_col = 'case:concept:name', 'time:timestamp'

# Parse timestamps as timezone-aware (UTC) datetimes
df[ts_col] = pd.to_datetime(df[ts_col], utc=True)

# Order events by case, then chronologically within each case
df = df.sort_values(by=[case_col, ts_col])

n_events = len(df)
n_cases = df[case_col].nunique()
print(f"Loaded {n_events:,} events")
print(f"Number of cases: {n_cases:,}")


In [None]:
# Collapse the event table to one row per case.
# Named aggregation yields the final flat column names directly:
# first/last event time, the full list of activities, and the first
# recorded value of each case-level attribute.
case_data = (
    df.groupby('case:concept:name')
    .agg(
        start_time=('time:timestamp', 'min'),
        end_time=('time:timestamp', 'max'),
        activity_sequence=('concept:name', lambda events: list(events)),
        loan_goal=('case:LoanGoal', 'first'),
        app_type=('case:ApplicationType', 'first'),
        requested_amount=('case:RequestedAmount', 'first'),
    )
    .reset_index()
    .rename(columns={'case:concept:name': 'case_id'})
)

SECONDS_PER_DAY = 24 * 3600

case_data = case_data.assign(
    # Elapsed time between first and last event, expressed in days
    duration_days=lambda cases: (
        (cases['end_time'] - cases['start_time']).dt.total_seconds() / SECONDS_PER_DAY
    ),
    # Number of events recorded for the case
    case_length=lambda cases: cases['activity_sequence'].map(len),
)

# Derive a coarse outcome label from the activities a case contains
def classify_outcome(activities):
    """Return the outcome label for one case's activity sequence.

    Parameters
    ----------
    activities : sequence of str or None
        Activity names recorded for the case.

    Returns
    -------
    str
        'Unknown' when ``activities`` is None or empty; otherwise 'Denied',
        'Cancelled' or 'Pending' for the first of 'A_Denied', 'A_Cancelled',
        'A_Pending' (checked in that order) that occurs anywhere in the
        sequence; 'Other' when none of them occurs.
    """
    if activities is None or len(activities) == 0:
        return 'Unknown'

    seen = set(activities)
    # Ordered by precedence: the first marker found decides the label.
    precedence = (
        ('A_Denied', 'Denied'),
        ('A_Cancelled', 'Cancelled'),
        ('A_Pending', 'Pending'),
    )
    for marker, label in precedence:
        if marker in seen:
            return label
    return 'Other'

case_data['outcome'] = case_data['activity_sequence'].apply(classify_outcome)

# Create monthly time windows based on start_time.
# Periods carry no timezone, and calling to_period() on tz-aware timestamps
# emits a UserWarning about the dropped timezone, so the timezone is removed
# explicitly first. tz_localize(None) keeps the wall-clock values, so the
# resulting months are the same as before.
case_data['month'] = case_data['start_time'].dt.tz_localize(None).dt.to_period('M')
case_data['month_str'] = case_data['month'].astype(str)

# Keep only months with enough cases for the per-month comparisons
MIN_CASES_PER_MONTH = 100
month_counts = case_data['month_str'].value_counts()
valid_months = month_counts[month_counts >= MIN_CASES_PER_MONTH].index.tolist()
case_data = case_data[case_data['month_str'].isin(valid_months)].copy()

# Order cases chronologically by their start time
case_data = case_data.sort_values('start_time')

print(f"Prepared {len(case_data):,} cases")
print(f"Number of months: {case_data['month_str'].nunique()}")
print("\nMonths in dataset:")
print(case_data['month_str'].value_counts().sort_index())


In [None]:
# Outcome (categorical): share of each outcome per start month, in percent
outcome_by_month = 100 * pd.crosstab(
    index=case_data['month_str'],
    columns=case_data['outcome'],
    normalize='index',
)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
bar_ax, line_ax = axes

# Left panel: stacked bars, one bar per month
outcome_by_month.plot(kind='bar', stacked=True, ax=bar_ax, colormap='Set2', width=0.8)
bar_ax.set_title('Outcome Distribution Over Time', fontsize=14, fontweight='bold')
bar_ax.legend(title='Outcome', bbox_to_anchor=(1.05, 1), loc='upper left')
bar_ax.grid(True, alpha=0.3, axis='y')

# Right panel: one line per outcome
for outcome in outcome_by_month.columns:
    line_ax.plot(outcome_by_month.index, outcome_by_month[outcome], marker='o', label=outcome, linewidth=2)
line_ax.set_title('Outcome Proportions Over Time', fontsize=14, fontweight='bold')
line_ax.legend(title='Outcome')
line_ax.grid(True, alpha=0.3)

# Axis labelling common to both panels
for ax in axes:
    ax.set_xlabel('Month', fontsize=12)
    ax.set_ylabel('Proportion (%)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(output_dir / 'outcome_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved: outcome_distribution.png")


In [None]:
# Distribution visualizations for Case Duration (continuous)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Box plot by month
case_data.boxplot(column='duration_days', by='month_str', ax=axes[0, 0])
# With `by=...`, pandas adds an automatic "Boxplot grouped by month_str"
# suptitle to the whole figure; clear it so it does not caption all four panels.
fig.suptitle('')
axes[0, 0].set_title('Case Duration Distribution by Month', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Month', fontsize=11)
axes[0, 0].set_ylabel('Duration (days)', fontsize=11)
axes[0, 0].tick_params(axis='x', rotation=45)

# Histogram comparison: first month vs last month.
# first_month / last_month are also read by later cells in this notebook.
first_month = case_data['month_str'].min()
last_month = case_data['month_str'].max()
first_month_data = case_data.loc[case_data['month_str'] == first_month, 'duration_days']
last_month_data = case_data.loc[case_data['month_str'] == last_month, 'duration_days']

# Use one set of bin edges for both samples; with independently computed
# edges the overlaid bars cover different intervals and cannot be compared.
shared_bins = np.histogram_bin_edges(
    pd.concat([first_month_data, last_month_data]).dropna(), bins=30
)
axes[0, 1].hist(first_month_data, bins=shared_bins, alpha=0.6, label=f'First Month ({first_month})', density=True)
axes[0, 1].hist(last_month_data, bins=shared_bins, alpha=0.6, label=f'Last Month ({last_month})', density=True)
axes[0, 1].set_xlabel('Duration (days)', fontsize=11)
axes[0, 1].set_ylabel('Density', fontsize=11)
axes[0, 1].set_title('Duration Distribution: First vs Last Month', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Time series of mean and median, with a +/- 1 standard deviation band
duration_stats = case_data.groupby('month_str')['duration_days'].agg(['mean', 'median', 'std']).reset_index()
axes[1, 0].plot(duration_stats['month_str'], duration_stats['mean'], marker='o', label='Mean', linewidth=2)
axes[1, 0].plot(duration_stats['month_str'], duration_stats['median'], marker='s', label='Median', linewidth=2)
axes[1, 0].fill_between(duration_stats['month_str'],
                       duration_stats['mean'] - duration_stats['std'],
                       duration_stats['mean'] + duration_stats['std'],
                       alpha=0.3, label='±1 Std Dev')
axes[1, 0].set_xlabel('Month', fontsize=11)
axes[1, 0].set_ylabel('Duration (days)', fontsize=11)
axes[1, 0].set_title('Duration Statistics Over Time', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].tick_params(axis='x', rotation=45)

# KDE plot comparison of the same two months
for month, month_data in ((first_month, first_month_data), (last_month, last_month_data)):
    sns.kdeplot(data=month_data, ax=axes[1, 1], label=f'{month}', linewidth=2)
axes[1, 1].set_xlabel('Duration (days)', fontsize=11)
axes[1, 1].set_ylabel('Density', fontsize=11)
axes[1, 1].set_title('Duration KDE: First vs Last Month', fontsize=12, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(output_dir / 'duration_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved: duration_distribution.png")


In [None]:
# Distribution visualizations for Case Length / Activity Frequency (continuous)
# Relies on first_month / last_month defined in the case-duration cell.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Box plot by month
case_data.boxplot(column='case_length', by='month_str', ax=axes[0, 0])
# With `by=...`, pandas adds an automatic "Boxplot grouped by month_str"
# suptitle to the whole figure; clear it so it does not caption all four panels.
fig.suptitle('')
axes[0, 0].set_title('Case Length Distribution by Month', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Month', fontsize=11)
axes[0, 0].set_ylabel('Case Length (activities)', fontsize=11)
axes[0, 0].tick_params(axis='x', rotation=45)

# Histogram comparison: first month vs last month
first_month_data_len = case_data.loc[case_data['month_str'] == first_month, 'case_length']
last_month_data_len = case_data.loc[case_data['month_str'] == last_month, 'case_length']

# Use one set of bin edges for both samples; with independently computed
# edges the overlaid bars cover different intervals and cannot be compared.
shared_bins = np.histogram_bin_edges(
    pd.concat([first_month_data_len, last_month_data_len]).dropna(), bins=30
)
axes[0, 1].hist(first_month_data_len, bins=shared_bins, alpha=0.6, label=f'First Month ({first_month})', density=True)
axes[0, 1].hist(last_month_data_len, bins=shared_bins, alpha=0.6, label=f'Last Month ({last_month})', density=True)
axes[0, 1].set_xlabel('Case Length (activities)', fontsize=11)
axes[0, 1].set_ylabel('Density', fontsize=11)
axes[0, 1].set_title('Case Length Distribution: First vs Last Month', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Time series of mean and median, with a +/- 1 standard deviation band
length_stats = case_data.groupby('month_str')['case_length'].agg(['mean', 'median', 'std']).reset_index()
axes[1, 0].plot(length_stats['month_str'], length_stats['mean'], marker='o', label='Mean', linewidth=2)
axes[1, 0].plot(length_stats['month_str'], length_stats['median'], marker='s', label='Median', linewidth=2)
axes[1, 0].fill_between(length_stats['month_str'],
                       length_stats['mean'] - length_stats['std'],
                       length_stats['mean'] + length_stats['std'],
                       alpha=0.3, label='±1 Std Dev')
axes[1, 0].set_xlabel('Month', fontsize=11)
axes[1, 0].set_ylabel('Case Length (activities)', fontsize=11)
axes[1, 0].set_title('Case Length Statistics Over Time', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].tick_params(axis='x', rotation=45)

# KDE plot comparison of the same two months
for month, month_data in ((first_month, first_month_data_len), (last_month, last_month_data_len)):
    sns.kdeplot(data=month_data, ax=axes[1, 1], label=f'{month}', linewidth=2)
axes[1, 1].set_xlabel('Case Length (activities)', fontsize=11)
axes[1, 1].set_ylabel('Density', fontsize=11)
axes[1, 1].set_title('Case Length KDE: First vs Last Month', fontsize=12, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(output_dir / 'case_length_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved: case_length_distribution.png")


In [None]:
# Distribution visualizations for Loan Goal (categorical):
# share of each loan goal per start month, in percent
loan_goal_by_month = pd.crosstab(case_data['month_str'], case_data['loan_goal'], normalize='index') * 100

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Stacked bar chart.
# 'tab20' rather than 'tab10': tab10 provides only 10 distinct colours, so
# with more than 10 loan-goal categories several segments share a colour.
loan_goal_by_month.plot(kind='bar', stacked=True, ax=axes[0], colormap='tab20', width=0.8)
axes[0].set_xlabel('Month', fontsize=12)
axes[0].set_ylabel('Proportion (%)', fontsize=12)
axes[0].set_title('Loan Goal Distribution Over Time', fontsize=14, fontweight='bold')
axes[0].legend(title='Loan Goal', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
axes[0].grid(True, alpha=0.3, axis='y')
axes[0].tick_params(axis='x', rotation=45)

# Time series for the five loan goals with the largest summed monthly share.
# top_goals is taken from the table's own columns, so no membership check is needed.
top_goals = loan_goal_by_month.sum().nlargest(5).index
for goal in top_goals:
    axes[1].plot(loan_goal_by_month.index, loan_goal_by_month[goal], marker='o', label=goal, linewidth=2)
axes[1].set_xlabel('Month', fontsize=12)
axes[1].set_ylabel('Proportion (%)', fontsize=12)
axes[1].set_title('Top Loan Goals Over Time', fontsize=14, fontweight='bold')
axes[1].legend(title='Loan Goal', fontsize=9)
axes[1].grid(True, alpha=0.3)
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(output_dir / 'loan_goal_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved: loan_goal_distribution.png")


In [None]:
# Application Type (categorical): share of each type per start month, in percent
app_type_by_month = 100 * pd.crosstab(
    index=case_data['month_str'],
    columns=case_data['app_type'],
    normalize='index',
)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
bar_ax, line_ax = axes

# Left panel: stacked bars, one bar per month
app_type_by_month.plot(kind='bar', stacked=True, ax=bar_ax, colormap='Set3', width=0.8)
bar_ax.set_title('Application Type Distribution Over Time', fontsize=14, fontweight='bold')
bar_ax.legend(title='Application Type', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
bar_ax.grid(True, alpha=0.3, axis='y')

# Right panel: one line per application type
for app_type in app_type_by_month.columns:
    line_ax.plot(app_type_by_month.index, app_type_by_month[app_type], marker='o', label=app_type, linewidth=2)
line_ax.set_title('Application Type Proportions Over Time', fontsize=14, fontweight='bold')
line_ax.legend(title='Application Type', fontsize=9)
line_ax.grid(True, alpha=0.3)

# Axis labelling common to both panels
for ax in axes:
    ax.set_xlabel('Month', fontsize=12)
    ax.set_ylabel('Proportion (%)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(output_dir / 'app_type_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved: app_type_distribution.png")


In [None]:
# Distribution visualizations for Requested Amount (continuous)
# Relies on first_month / last_month defined in the case-duration cell.
# Remove outliers for better visualization (top 1%); rows with a missing
# amount fail the comparison and are dropped as well.
q99 = case_data['requested_amount'].quantile(0.99)
case_data_viz = case_data[case_data['requested_amount'] <= q99].copy()

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Box plot by month
case_data_viz.boxplot(column='requested_amount', by='month_str', ax=axes[0, 0])
# With `by=...`, pandas adds an automatic "Boxplot grouped by month_str"
# suptitle to the whole figure; clear it so it does not caption all four panels.
fig.suptitle('')
axes[0, 0].set_title('Requested Amount Distribution by Month', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Month', fontsize=11)
axes[0, 0].set_ylabel('Requested Amount', fontsize=11)
axes[0, 0].tick_params(axis='x', rotation=45)

# Histogram comparison: first month vs last month
first_month_data_amt = case_data_viz.loc[case_data_viz['month_str'] == first_month, 'requested_amount']
last_month_data_amt = case_data_viz.loc[case_data_viz['month_str'] == last_month, 'requested_amount']

# Use one set of bin edges for both samples; with independently computed
# edges the overlaid bars cover different intervals and cannot be compared.
shared_bins = np.histogram_bin_edges(
    pd.concat([first_month_data_amt, last_month_data_amt]).dropna(), bins=30
)
axes[0, 1].hist(first_month_data_amt, bins=shared_bins, alpha=0.6, label=f'First Month ({first_month})', density=True)
axes[0, 1].hist(last_month_data_amt, bins=shared_bins, alpha=0.6, label=f'Last Month ({last_month})', density=True)
axes[0, 1].set_xlabel('Requested Amount', fontsize=11)
axes[0, 1].set_ylabel('Density', fontsize=11)
axes[0, 1].set_title('Requested Amount Distribution: First vs Last Month', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Time series of mean and median, with a +/- 1 standard deviation band.
# NOTE(review): this panel is computed on the unfiltered case_data, so the
# top 1% of amounts still influence mean and std here, unlike the other three
# panels which use case_data_viz. Confirm that this is intended.
amount_stats = case_data.groupby('month_str')['requested_amount'].agg(['mean', 'median', 'std']).reset_index()
axes[1, 0].plot(amount_stats['month_str'], amount_stats['mean'], marker='o', label='Mean', linewidth=2)
axes[1, 0].plot(amount_stats['month_str'], amount_stats['median'], marker='s', label='Median', linewidth=2)
axes[1, 0].fill_between(amount_stats['month_str'],
                       amount_stats['mean'] - amount_stats['std'],
                       amount_stats['mean'] + amount_stats['std'],
                       alpha=0.3, label='±1 Std Dev')
axes[1, 0].set_xlabel('Month', fontsize=11)
axes[1, 0].set_ylabel('Requested Amount', fontsize=11)
axes[1, 0].set_title('Requested Amount Statistics Over Time', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].tick_params(axis='x', rotation=45)

# KDE plot comparison of the same two months
for month, month_data in ((first_month, first_month_data_amt), (last_month, last_month_data_amt)):
    sns.kdeplot(data=month_data, ax=axes[1, 1], label=f'{month}', linewidth=2)
axes[1, 1].set_xlabel('Requested Amount', fontsize=11)
axes[1, 1].set_ylabel('Density', fontsize=11)
axes[1, 1].set_title('Requested Amount KDE: First vs Last Month', fontsize=12, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(output_dir / 'requested_amount_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved: requested_amount_distribution.png")


In [None]:
# Statistical Tests for Data Drift Detection
# Every later month is compared to the baseline (first month).

ALPHA = 0.05  # per-test significance level


def significance_stars(p_value):
    """Return '***', '**' or '*' for p < 0.001, p < 0.01, p < 0.05, else ''."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return ""


baseline_month = case_data['month_str'].min()
baseline_data = case_data[case_data['month_str'] == baseline_month]
months = sorted(case_data['month_str'].unique())
comparison_months = [m for m in months if m != baseline_month]

# Results storage: one dict per (variable, month) test
drift_results = []

print("=== Statistical Tests for Data Drift ===\n")
print(f"Baseline month: {baseline_month}\n")

# Continuous variables: two-sample Kolmogorov-Smirnov test.
# NOTE(review): case_length is an integer count, so ties are expected and
# the KS p-value for that variable should be read as approximate.
continuous_vars = ['duration_days', 'case_length', 'requested_amount']

for var in continuous_vars:
    baseline_values = baseline_data[var].dropna()
    print(f"\n{var.upper().replace('_', ' ')}:")
    print("-" * 50)

    for month in comparison_months:
        month_values = case_data.loc[case_data['month_str'] == month, var].dropna()

        # The test needs at least one observation on each side
        if baseline_values.empty or month_values.empty:
            continue

        ks_stat, p_value = ks_2samp(baseline_values, month_values)
        print(f"  {month}: KS-stat={ks_stat:.4f}, p-value={p_value:.4e} {significance_stars(p_value)}")

        drift_results.append({
            'variable': var,
            'month': month,
            'test': 'Kolmogorov-Smirnov',
            'statistic': ks_stat,
            'p_value': p_value,
            'significant': p_value < ALPHA
        })

# Categorical variables: Chi-square test of independence
categorical_vars = ['outcome', 'loan_goal', 'app_type']

for var in categorical_vars:
    baseline_counts = baseline_data[var].value_counts()
    print(f"\n{var.upper().replace('_', ' ')}:")
    print("-" * 50)

    for month in comparison_months:
        month_cat_counts = case_data.loc[case_data['month_str'] == month, var].value_counts()

        # Contingency table: one row per category seen in either sample
        # (outer join on the category index), missing counts filled with 0.
        contingency = (
            pd.concat([baseline_counts, month_cat_counts], axis=1, keys=['baseline', 'month'])
            .fillna(0)
            .astype(int)
        )

        # chi2_contingency raises ValueError when an expected frequency is
        # zero, which happens if either sample has no non-missing values.
        if (contingency.sum(axis=0) == 0).any():
            print(f"  {month}: skipped (no non-missing values in baseline or month)")
            continue

        chi2, p_value, dof, expected = chi2_contingency(contingency)
        print(f"  {month}: Chi2={chi2:.4f}, p-value={p_value:.4e}, dof={dof} {significance_stars(p_value)}")

        drift_results.append({
            'variable': var,
            'month': month,
            'test': 'Chi-square',
            'statistic': chi2,
            'p_value': p_value,
            'significant': p_value < ALPHA
        })

# Convert results to DataFrame; explicit columns keep the schema stable even
# when no test could be run.
drift_df = pd.DataFrame(
    drift_results,
    columns=['variable', 'month', 'test', 'statistic', 'p_value', 'significant'],
)

# Many tests are run (variables x months), so some p < 0.05 results are
# expected by chance alone. Add Bonferroni-adjusted values as extra columns;
# the unadjusted 'p_value' / 'significant' columns are left unchanged.
if not drift_df.empty:
    n_tests = len(drift_df)
    drift_df['p_value_bonferroni'] = (drift_df['p_value'] * n_tests).clip(upper=1.0)
    drift_df['significant_bonferroni'] = drift_df['p_value_bonferroni'] < ALPHA

drift_df.to_csv(output_dir / 'drift_test_results.csv', index=False)
print("\n\nSaved drift test results to: drift_test_results.csv")


In [None]:
# Summary Table of Drift Results
if len(drift_df) == 0:
    print("No drift results to summarize")
else:
    # Variable x month grids; each (variable, month) pair holds one test
    # result, so 'first' simply picks that value.
    pivot_opts = dict(index='variable', columns='month', aggfunc='first')
    summary_pivot = drift_df.pivot_table(values='p_value', **pivot_opts)
    sig_pivot = drift_df.pivot_table(values='significant', **pivot_opts)

    print("=== Summary of Data Drift Tests ===\n")
    print("P-values (significant drift if p < 0.05):")
    print(summary_pivot.round(4))

    print("\n\nSignificant Drift (True = p < 0.05):")
    print(sig_pivot)

    # Count significant drifts per variable
    drift_counts = drift_df.groupby('variable')['significant'].sum()
    print("\n\nNumber of months with significant drift per variable:")
    print(drift_counts)

    # Save the three summary tables
    exports = (
        (summary_pivot, 'drift_summary_pvalues.csv'),
        (sig_pivot, 'drift_summary_significance.csv'),
        (drift_counts.to_frame('num_significant_drifts'), 'drift_counts.csv'),
    )
    for table, filename in exports:
        table.to_csv(output_dir / filename)

    print("\n\nSaved summary tables to output directory")

    # Heatmap of -log10(p-value). The 1e-10 offset keeps log10 finite for
    # p = 0 and therefore caps every cell at 10.
    fig, ax = plt.subplots(figsize=(14, 8))
    summary_pivot_plot = -np.log10(summary_pivot + 1e-10)

    sns.heatmap(
        summary_pivot_plot,
        annot=True,
        fmt='.2f',
        cmap='RdYlGn_r',
        cbar_kws={'label': '-log10(p-value)'},
        ax=ax,
        linewidths=0.5,
    )

    # Black frame around the grid
    n_rows, n_cols = summary_pivot_plot.shape
    for y_edge in (0, n_rows):
        ax.axhline(y=y_edge, color='black', linewidth=2)
    for x_edge in (0, n_cols):
        ax.axvline(x=x_edge, color='black', linewidth=2)

    ax.set_title('Data Drift Significance Heatmap\n(-log10 p-value, higher = more significant drift)',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Month', fontsize=12)
    ax.set_ylabel('Variable', fontsize=12)
    plt.tight_layout()
    plt.savefig(output_dir / 'drift_significance_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("Saved: drift_significance_heatmap.png")


## 3. Interpretation

### Key Findings:

1. **Data Drift Detection**: Statistical tests identify the months whose distributions differ significantly (p < 0.05) from the baseline month, which is the earliest month retained after filtering.

2. **Continuous Variables**: Kolmogorov-Smirnov tests detect distribution shifts in:
   - Case duration
   - Case length (activity frequency)
   - Requested amount

3. **Categorical Variables**: Chi-square tests detect proportion changes in:
   - Outcome distributions
   - Loan goal distributions
   - Application type distributions

### Implications for Prediction/Simulation:

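- **Model Retraining**: Months with significant drift indicate when prediction models may need retraining. With large monthly samples even small shifts can reach p < 0.05, so consider the size of the test statistic as well as the p-value before retraining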
- **Time-Aware Models**: Incorporate temporal features to account for distribution changes
- **Simulation Accuracy**: Use time-varying distributions in simulation models for more realistic scenarios
