In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict, Counter
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

In [None]:
class ProcessMiningTool:
    """
    Process mining tool for analyzing AI agent execution logs.

    Loads an event log from CSV, derives case-level duration KPIs and
    action-to-action transitions, and renders the transitions as a directed
    graph whose edges are annotated with average duration and occurrence
    count.

    The CSV must provide the columns ``caseid``, ``timestamp`` and ``action``
    (header names are matched after lowercasing and stripping whitespace).
    """

    # Columns every event log must provide (checked after header normalization).
    _REQUIRED_COLUMNS = ('caseid', 'timestamp', 'action')
    # Fixed layouts so that empty result tables still expose the columns the
    # downstream group-by / sort / graph code relies on.
    _TRANSITION_COLUMNS = ('source', 'target', 'duration', 'caseid')
    _METRIC_COLUMNS = ('source', 'target', 'avg_duration', 'count')

    def __init__(self, csv_path):
        """Load the event log at ``csv_path`` and preprocess it.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of the
                event-log CSV.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
            KeyError: If a required column is missing from the CSV.
        """
        # Caches filled lazily by extract_transitions() / compute_kpis().
        self.transitions = None
        self.kpis = None
        self.df = pd.read_csv(csv_path)
        self.preprocess_data()

    def preprocess_data(self):
        """Normalize headers, parse timestamps, sort events and print an overview.

        Raises:
            KeyError: If any of ``caseid``, ``timestamp`` or ``action`` is
                missing after header normalization.
        """
        # Normalize column names to lowercase for consistency
        self.df.columns = self.df.columns.str.lower().str.strip()

        # Fail early with a message naming every missing column instead of a
        # bare KeyError from the first failing lookup further down.
        missing = [col for col in self._REQUIRED_COLUMNS if col not in self.df.columns]
        if missing:
            raise KeyError(f"Missing required column(s): {', '.join(missing)}")

        # Convert timestamp to datetime
        self.df['timestamp'] = pd.to_datetime(self.df['timestamp'])

        # Sort by caseid and timestamp so consecutive rows of a case are
        # consecutive events.
        self.df = self.df.sort_values(['caseid', 'timestamp']).reset_index(drop=True)

        # Display data overview
        print(f"‚úì Loaded {len(self.df)} events from {self.df['caseid'].nunique()} cases")
        print(f"‚úì Date range: {self.df['timestamp'].min()} to {self.df['timestamp'].max()}")
        print(f"‚úì Unique actions: {self.df['action'].nunique()}")
        print(f"\nüìã Action distribution:")
        for action, count in self.df['action'].value_counts().items():
            print(f"   ‚Ä¢ {action}: {count}")

    def compute_kpis(self):
        """Compute case-level duration KPIs and cache them in ``self.kpis``.

        A case's duration is the span in seconds between its earliest and
        latest timestamp.

        Returns:
            dict with keys ``total_cases``, ``avg_duration_seconds``,
            ``median_duration_seconds``, ``min_duration_seconds`` and
            ``max_duration_seconds``.

        Raises:
            ValueError: If the event log contains no cases.
        """
        if self.df.empty:
            raise ValueError("Cannot compute KPIs: the event log contains no cases")

        # One grouped aggregation instead of a Python loop over every case.
        spans = self.df.groupby('caseid')['timestamp'].agg(['min', 'max'])
        case_durations = (spans['max'] - spans['min']).dt.total_seconds().to_numpy()

        if len(case_durations) == 0:
            # Reached when every caseid is missing (groupby drops NaN keys).
            raise ValueError("Cannot compute KPIs: the event log contains no cases")

        self.kpis = {
            'total_cases': len(case_durations),
            'avg_duration_seconds': np.mean(case_durations),
            'median_duration_seconds': np.median(case_durations),
            'min_duration_seconds': np.min(case_durations),
            'max_duration_seconds': np.max(case_durations)
        }

        return self.kpis

    def extract_transitions(self):
        """Extract every consecutive action pair per case with its duration.

        Cases with fewer than two events contribute no transitions and are
        reported with a warning.

        Returns:
            DataFrame with columns ``source``, ``target``, ``duration``
            (seconds) and ``caseid``; also cached in ``self.transitions``.
            The columns are present even when no transition exists.
        """
        records = []

        for caseid, group in self.df.groupby('caseid'):
            actions = group['action'].tolist()
            timestamps = group['timestamp'].tolist()

            # Only process cases with at least 2 actions
            if len(actions) < 2:
                print(f"‚ö†Ô∏è  Warning: Case {caseid} has only {len(actions)} action(s), skipping transitions")
                continue

            # Pair each event with its successor within the case.
            steps = zip(actions, actions[1:], timestamps, timestamps[1:])
            for source, target, started, ended in steps:
                records.append({
                    'source': source,
                    'target': target,
                    'duration': (ended - started).total_seconds(),
                    'caseid': caseid
                })

        # Explicit columns keep the schema intact when `records` is empty;
        # without them the later groupby on source/target raises KeyError.
        self.transitions = pd.DataFrame(records, columns=list(self._TRANSITION_COLUMNS))

        if self.transitions.empty:
            print("‚ö†Ô∏è  No transitions found in the data!")
        else:
            print(f"‚úì Extracted {len(self.transitions)} transitions")

        return self.transitions

    def compute_transition_metrics(self):
        """Aggregate transitions into average duration and count per edge.

        Extracts the transitions first if that has not happened yet.

        Returns:
            DataFrame with columns ``source``, ``target``, ``avg_duration``
            (seconds) and ``count``; empty (but with those columns) when
            there are no transitions.
        """
        if self.transitions is None:
            self.extract_transitions()

        if self.transitions.empty:
            return pd.DataFrame(columns=list(self._METRIC_COLUMNS))

        metrics = self.transitions.groupby(['source', 'target']).agg(
            avg_duration=('duration', 'mean'),
            count=('duration', 'count'),
        ).reset_index()

        return metrics

    @staticmethod
    def format_duration(seconds):
        """Format a duration given in seconds as a short human-readable string.

        Picks the unit by magnitude: ``ms`` below one second, then ``s``,
        ``m``, ``h`` and ``d``. A missing value (NaN) is rendered as ``n/a``.
        """
        # NaN fails every `<` comparison and would otherwise fall through to
        # the day branch and print "nand".
        if pd.isna(seconds):
            return "n/a"
        if seconds < 1:
            return f"{seconds*1000:.0f}ms"
        elif seconds < 60:
            return f"{seconds:.1f}s"
        elif seconds < 3600:
            return f"{seconds/60:.1f}m"
        elif seconds < 86400:
            return f"{seconds/3600:.1f}h"
        else:
            return f"{seconds/86400:.1f}d"

    def build_process_graph(self):
        """Build a directed graph representing the process flow.

        Returns:
            ``networkx.DiGraph`` with one node per action that appears in a
            transition and one edge per (source, target) pair carrying
            ``weight`` (count), ``avg_duration`` (seconds) and ``label``
            ("<formatted avg duration> / <count>").
        """
        metrics = self.compute_transition_metrics()

        G = nx.DiGraph()

        # Add all unique actions as nodes
        all_actions = set(metrics['source'].unique()) | set(metrics['target'].unique())
        G.add_nodes_from(all_actions)

        # Add edges with attributes
        for _, row in metrics.iterrows():
            avg_dur_formatted = self.format_duration(row['avg_duration'])
            edge_label = f"{avg_dur_formatted} / {int(row['count'])}"

            G.add_edge(
                row['source'],
                row['target'],
                weight=row['count'],
                avg_duration=row['avg_duration'],
                label=edge_label
            )

        return G

    def visualize_workflow(self, figsize=(16, 12), output_path=None):
        """Draw the process graph with timing and frequency annotations.

        Args:
            figsize: Matplotlib figure size in inches.
            output_path: If given, the figure is also saved to this path.

        Returns:
            The ``networkx.DiGraph`` that was drawn.
        """
        G = self.build_process_graph()

        # Create figure
        fig, ax = plt.subplots(figsize=figsize)

        # Force-directed layout with a fixed seed for reproducible positions;
        # fall back to a shell layout if it fails. Catch Exception rather than
        # everything so Ctrl-C / SystemExit still propagate.
        try:
            pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
        except Exception:
            pos = nx.shell_layout(G)

        # Calculate node sizes based on frequency (total incoming + outgoing)
        node_frequencies = defaultdict(int)
        for edge in G.edges():
            node_frequencies[edge[0]] += G.edges[edge]['weight']
            node_frequencies[edge[1]] += G.edges[edge]['weight']

        max_freq = max(node_frequencies.values()) if node_frequencies else 1
        node_sizes = [3000 + (node_frequencies[node] / max_freq) * 5000 for node in G.nodes()]

        # Draw nodes
        nx.draw_networkx_nodes(
            G, pos,
            node_size=node_sizes,
            node_color='lightblue',
            edgecolors='darkblue',
            linewidths=2,
            ax=ax
        )

        # Draw edges with varying thickness based on frequency
        edge_weights = [G.edges[edge]['weight'] for edge in G.edges()]
        max_weight = max(edge_weights) if edge_weights else 1
        edge_widths = [1 + (w / max_weight) * 4 for w in edge_weights]

        nx.draw_networkx_edges(
            G, pos,
            width=edge_widths,
            edge_color='gray',
            arrows=True,
            arrowsize=20,
            arrowstyle='->',
            connectionstyle='arc3,rad=0.1',
            ax=ax
        )

        # Draw node labels
        nx.draw_networkx_labels(
            G, pos,
            font_size=10,
            font_weight='bold',
            font_color='darkblue',
            ax=ax
        )

        # Draw edge labels
        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(
            G, pos,
            edge_labels=edge_labels,
            font_size=8,
            font_color='red',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='none', alpha=0.7),
            ax=ax
        )

        ax.set_title('AI Agent Process Flow\n(Edge labels: avg_duration / occurrence_count)',
                     fontsize=16, fontweight='bold', pad=20)
        ax.axis('off')
        plt.tight_layout()

        if output_path:
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
            print(f"‚úì Workflow visualization saved to {output_path}")

        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

        return G

    def print_kpi_report(self):
        """Print a formatted KPI report, computing the KPIs first if needed."""
        if self.kpis is None:
            self.compute_kpis()

        print("\n" + "="*60)
        print("üìä PROCESS MINING KPI REPORT")
        print("="*60)
        print(f"\nüìÅ Total Cases: {self.kpis['total_cases']}")
        print(f"\n‚è±Ô∏è  Case Duration Statistics:")
        print(f"   ‚Ä¢ Average:  {self.format_duration(self.kpis['avg_duration_seconds'])}")
        print(f"   ‚Ä¢ Median:   {self.format_duration(self.kpis['median_duration_seconds'])}")
        print(f"   ‚Ä¢ Minimum:  {self.format_duration(self.kpis['min_duration_seconds'])}")
        print(f"   ‚Ä¢ Maximum:  {self.format_duration(self.kpis['max_duration_seconds'])}")
        print("\n" + "="*60 + "\n")

    def print_transition_summary(self, top_n=10):
        """Print the ``top_n`` most frequent transitions as a table.

        Args:
            top_n: Maximum number of transitions to list, most frequent first.
        """
        metrics = self.compute_transition_metrics()
        metrics_sorted = metrics.sort_values('count', ascending=False)

        print(f"\nüîÑ Top {top_n} Most Frequent Transitions:")
        print("-" * 80)
        print(f"{'Source':<25} {'Target':<25} {'Avg Duration':<15} {'Count':<10}")
        print("-" * 80)

        for _, row in metrics_sorted.head(top_n).iterrows():
            duration_str = self.format_duration(row['avg_duration'])
            print(f"{row['source']:<25} {row['target']:<25} {duration_str:<15} {int(row['count']):<10}")

        print("-" * 80 + "\n")

    def run_complete_analysis(self, output_path='process_workflow.png'):
        """Run the full pipeline: KPIs, transition summary and visualization.

        Args:
            output_path: Where the workflow figure is saved.

        Returns:
            ``self``, so calls can be chained.
        """
        print("\nüöÄ Starting Process Mining Analysis...\n")

        # Compute KPIs
        self.compute_kpis()
        self.print_kpi_report()

        # Extract and summarize transitions
        self.extract_transitions()
        self.print_transition_summary()

        # Visualize workflow
        print("üìà Generating workflow visualization...\n")
        self.visualize_workflow(output_path=output_path)

        print("‚úÖ Analysis complete!")

        return self

In [None]:
# =============================================================================
# USAGE EXAMPLE
# =============================================================================

if __name__ == "__main__":
    # Event-log CSV to analyze; point this at your own file.
    csv_filename = 'Sample.csv'

    try:
        # Loading the log and running the pipeline share one try block so a
        # failure in either step is reported by the handlers below.
        tool = ProcessMiningTool(csv_filename)
        tool.run_complete_analysis(output_path='agent_workflow.png')

        # The individual stages can also be called on their own:
        #   tool.compute_kpis()         -> dict of case-duration KPIs
        #   tool.extract_transitions()  -> DataFrame of action transitions
        #   tool.build_process_graph()  -> networkx.DiGraph of the process

    except FileNotFoundError:
        print(f"‚ùå Error: File '{csv_filename}' not found!")
        print("Please ensure the CSV file is in the same directory as this script.")
    except Exception as e:
        import traceback

        # Report the message first, then the full stack trace for debugging.
        print(f"‚ùå Error occurred: {e!s}")
        traceback.print_exc()
