In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import cumfreq
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
def compute_ecdf(data, column):
    """Return the sorted observations of ``data[column]`` with their ECDF heights.

    The i-th smallest observation (1-based) is paired with the cumulative
    proportion ``i / n``, where ``n`` is the number of observations.

    Parameters
    ----------
    data : DataFrame or mapping
        Object from which ``data[column]`` yields a one-dimensional
        collection of sortable values.
    column : hashable
        Key of the values to summarise.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sorted_values, cumulative_proportions)``, both of length ``n``.
    """
    ordered = np.sort(data[column])
    count = ordered.size
    # Ranks 1..count; dividing by count turns them into cumulative proportions.
    ranks = np.arange(1, count + 1)
    return ordered, ranks / count

In [None]:
def filter_ecdf(data, column, threshold=0.8):
    """Keep the rows whose ``column`` value lies in the lower part of its ECDF.

    The cutoff is the ``threshold`` quantile of the non-missing values of
    ``data[column]`` (``np.percentile`` with its default interpolation).
    Rows whose value is less than or equal to the cutoff are retained; rows
    with a missing value never satisfy the comparison and are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    column : hashable
        Name of the numeric column to filter on.
    threshold : float, default 0.8
        Fraction of the distribution to retain, between 0 and 1 inclusive.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, so callers can add columns
        to it without touching ``data``. Empty when ``column`` has no
        non-missing values.

    Raises
    ------
    ValueError
        If ``threshold`` is outside ``[0, 1]``.
    """
    if not 0 <= threshold <= 1:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    # Missing values would turn the percentile into NaN and silently filter
    # out every row, so the cutoff is computed on observed values only.
    observed = data[column].dropna()
    if observed.empty:
        return data.iloc[0:0].copy()

    cutoff = np.percentile(observed.to_numpy(), threshold * 100)
    return data[data[column] <= cutoff].copy()

In [None]:
def apply_kmeans_clustering(data, column, n_clusters=3):
    """Cluster the values of ``column`` with K-Means and label each row.

    The column is standardised with ``StandardScaler`` before fitting. The
    number of clusters is taken as given: no silhouette analysis or other
    model selection is performed here.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    column : hashable
        Name of the numeric column to cluster on.
    n_clusters : int, default 3
        Number of clusters passed to ``KMeans``.

    Returns
    -------
    tuple
        ``(clustered, kmeans)`` where ``clustered`` is a copy of ``data`` with
        an added ``'Cluster'`` column of labels and ``kmeans`` is the fitted
        ``KMeans`` estimator.
    """
    # Label a copy: writing into the argument would alter the caller's frame
    # and raises SettingWithCopyWarning when that frame is a filtered slice.
    clustered = data.copy()

    scaled_values = StandardScaler().fit_transform(clustered[[column]])

    # Fixed random_state keeps the cluster assignment reproducible across runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clustered['Cluster'] = kmeans.fit_predict(scaled_values)

    return clustered, kmeans

In [None]:
def compute_median_duration_per_cluster(data, column='event_interval'):
    """Compute the median of ``column`` within each cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Cluster'`` column and the column to summarise.
    column : hashable, default 'event_interval'
        Name of the numeric column whose per-cluster median is wanted. The
        default matches the column name used elsewhere in this notebook.

    Returns
    -------
    pandas.Series
        Medians indexed by the distinct values of ``'Cluster'``.
    """
    return data.groupby('Cluster')[column].median()

In [None]:
def plot_results(data):
    """Show a two-panel figure summarising the event intervals.

    The left panel plots the empirical CDF of ``data['event_interval']``;
    the right panel draws one box plot of ``event_interval`` per value of
    ``data['Cluster']``. The figure is displayed with ``plt.show()`` and
    nothing is returned.
    """
    fig, (ecdf_ax, cluster_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ECDF drawn as unconnected points.
    interval_values, cumulative_share = compute_ecdf(data, 'event_interval')
    ecdf_ax.plot(interval_values, cumulative_share, marker='.', linestyle='none')
    ecdf_ax.set_xlabel('Event Interval (days)')
    ecdf_ax.set_ylabel('ECDF')
    ecdf_ax.set_title('Empirical CDF of Event Intervals')

    # Right panel: distribution of intervals within each cluster.
    sns.boxplot(x='Cluster', y='event_interval', data=data, ax=cluster_ax)
    cluster_ax.set_xlabel('Cluster')
    cluster_ax.set_ylabel('Event Interval (days)')
    cluster_ax.set_title('K-Means Clustering Results')

    fig.tight_layout()
    plt.show()

In [None]:
def sessa_empirical_estimator(data):
    """Main function implementing the SEE algorithm.

    Steps, in order:

    1. Sort events by patient and date and compute, for every event that has
       a predecessor for the same patient, the interval in days since it.
    2. Keep the lower part of the interval ECDF (``filter_ecdf`` defaults).
    3. Cluster the retained intervals with K-Means.
    4. Take the median interval of each cluster as its computed duration.
    5. Attach cluster label and computed duration to every interval row and
       plot the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'patient_id'`` and a datetime ``'event_date'`` column.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per consecutive pair of events of a patient, with the added
        columns ``'prev_event_date'``, ``'event_interval'``, ``'Cluster'``
        and ``'computed_duration'``. Rows whose interval was excluded by the
        ECDF filter have NaN in the last two columns (so ``'Cluster'`` is
        float-typed whenever any row was excluded).
    """
    # sort_values returns a new frame, so the assignments below stay local.
    data = data.sort_values(['patient_id', 'event_date'])
    data['prev_event_date'] = data.groupby('patient_id')['event_date'].shift(1)

    # Drop only rows that cannot yield an interval. A blanket dropna() would
    # also discard events with missing values in unrelated columns.
    # The fresh RangeIndex gives every row a unique label for the join below.
    data = (
        data
        .dropna(subset=['event_date', 'prev_event_date'])
        .reset_index(drop=True)
    )

    data['event_interval'] = (data['event_date'] - data['prev_event_date']).dt.days

    # Retain only the lower part of the ECDF (80% by default)
    filtered_data = filter_ecdf(data, 'event_interval')

    # Apply K-Means clustering; the fitted model itself is not needed here
    clustered_data, _ = apply_kmeans_clustering(filtered_data, 'event_interval')

    # Compute median duration per cluster
    cluster_medians = compute_median_duration_per_cluster(clustered_data)

    # Assign cluster labels and durations back to the full interval table.
    # 'Cluster' exists only on the filtered rows, so it is joined by row index;
    # merging ``data`` on 'Cluster' directly raises KeyError.
    cluster_info = clustered_data[['Cluster']].assign(
        computed_duration=clustered_data['Cluster'].map(cluster_medians)
    )
    # Discard stale result columns so the function can be re-run on its own
    # output without a column-overlap error.
    data = (
        data
        .drop(columns=['Cluster', 'computed_duration'], errors='ignore')
        .join(cluster_info)
    )

    # Plot results
    plot_results(data)

    return data

In [None]:
# Example usage
# df = pd.read_csv('your_data.csv', parse_dates=['event_date'])
# result = sessa_empirical_estimator(df)
# print(result.head())