In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [2]:
# Set random seed for reproducibility.
# Both the stdlib `random` module and NumPy's global RNG are used below,
# so both are seeded.
np.random.seed(42)
random.seed(42)

# Define processes
processes = ['Claim Admin', 'Communication', 'Coverage', 'Desktop Management', 
             'Liability', 'Recovery', 'Schedule Services', 'Settlement', 
             'Total Loss', 'Claim related teams chats']

# Generate activity pool for each process (between 3 and 8 activity types each)
process_activities = {}
common_actions = ["Review", "Assessment", "Verification", "Approval", "Documentation", "Update", "Analysis", "Processing", "Entry", "Check"]

for proc in processes:
    # Draw order matters for reproducibility: first the pool size, then the sample.
    num_types = random.randint(3, 8)
    # random.sample picks without replacement, so a process never gets the
    # same action twice.
    selected_actions = random.sample(common_actions, num_types)
    # Activity labels have the form "<action> - <process>".
    process_activities[proc] = [f"{action} - {proc}" for action in selected_actions]

# Generate claim numbers (similar to your data)
num_claims = 1000
CLAIM_NUMBER_LOW = 40000000   # inclusive lower bound
CLAIM_NUMBER_HIGH = 60000000  # exclusive upper bound (np.random.randint is half-open)
claim_numbers = np.random.randint(CLAIM_NUMBER_LOW, CLAIM_NUMBER_HIGH, size=num_claims)

# np.random.randint samples WITH replacement, so a draw can contain duplicate
# claim numbers; two simulated claims would then share one Claim_Number and be
# merged by any per-claim grouping. Redraw until every claim number is distinct.
while np.unique(claim_numbers).size < num_claims:
    claim_numbers = np.random.randint(CLAIM_NUMBER_LOW, CLAIM_NUMBER_HIGH, size=num_claims)

# Function to generate activity sequences for a claim
def generate_claim_activities(claim_number, start_date, process_pool=None, activity_pool=None):
    """Simulate the chronological activity log of a single claim.

    The claim walks through 5-15 process steps. Each step emits 1-4 activity
    records for the current process, then a next process is chosen.

    Parameters
    ----------
    claim_number
        Identifier stored unchanged in every generated record.
    start_date : datetime
        Timestamp of the first activity; later activities are offset from it.
    process_pool : sequence of str, optional
        Processes a claim may visit. Defaults to the notebook-level
        ``processes`` list.
    activity_pool : mapping of str to sequence of str, optional
        Activities available for each process. Defaults to the notebook-level
        ``process_activities`` dict. Must contain every entry of
        ``process_pool`` as a key.

    Returns
    -------
    list of dict
        One dict per activity, in chronological order, with the keys
        ``Claim_Number``, ``Process``, ``Activity``, ``First_TimeStamp`` and
        ``Active_Minutes``.

    Notes
    -----
    Randomness comes from the global ``random`` and ``np.random`` generators,
    so results are reproducible only if both are seeded beforehand. The order
    of the random draws is part of the reproducible behaviour.
    """
    # Fall back to the notebook-level pools when none are supplied explicitly.
    if process_pool is None:
        process_pool = processes
    if activity_pool is None:
        activity_pool = process_activities

    activities = []
    current_time = start_date
    
    # Generate 5-15 process interactions per claim
    num_process_steps = random.randint(5, 15)
    
    # Start with a random process
    current_process = random.choice(process_pool)
    
    # History of processes in visiting order (repeats are kept, so processes
    # visited more often are more likely to be revisited)
    visited_processes = [current_process]
    
    for _ in range(num_process_steps):
        # Generate 1-4 specific activities for this process step
        num_sub_activities = random.randint(1, 4)
        
        for _ in range(num_sub_activities):
            # Pick a random activity from the pool for this process
            current_activity = random.choice(activity_pool[current_process])
            
            # Exponentially distributed active time (mean 5) shifted by 0.1,
            # so the value is never below 0.1
            active_minutes = np.random.exponential(scale=5) + 0.1
            
            activities.append({
                'Claim_Number': claim_number,
                'Process': current_process,
                'Activity': current_activity,
                'First_TimeStamp': current_time,
                'Active_Minutes': active_minutes
            })
            
            # Move time forward (shorter gap between activities in same process)
            current_time += timedelta(minutes=random.randint(5, 60))
        
        # Move time forward (larger gap between process steps)
        current_time += timedelta(minutes=random.randint(60, 300))
        
        # Choose next process.
        # random.random() is always drawn first. With probability 0.3 (and
        # once the history holds more than one entry) a process is drawn from
        # the visit history; otherwise it is drawn from the full pool, which
        # may yield the current process again.
        if random.random() < 0.3 and len(visited_processes) > 1:
            current_process = random.choice(visited_processes)
        else:
            current_process = random.choice(process_pool)
        
        # This also runs after the final step; the extra draws are kept so the
        # random stream seen by subsequent calls is unchanged.
        visited_processes.append(current_process)
    
    return activities

# Generate all claim activities
all_activities = []
base_date = datetime(2024, 12, 1, 8, 0, 0)

for claim in claim_numbers:
    # Each claim starts at a random time within a 2-week window
    start_offset = timedelta(days=random.randint(0, 14), 
                            hours=random.randint(0, 23),
                            minutes=random.randint(0, 59))
    claim_start = base_date + start_offset
    
    claim_activities = generate_claim_activities(claim, claim_start)
    all_activities.extend(claim_activities)

# Create DataFrame
activity_df = pd.DataFrame(all_activities)

# Sort by claim and timestamp
activity_df = activity_df.sort_values(['Claim_Number', 'First_TimeStamp']).reset_index(drop=True)

print(f"Original rows: {len(activity_df)}")
print(f"Number of unique claims: {activity_df['Claim_Number'].nunique()}")
print(f"\nProcess distribution:")
print(activity_df['Process'].value_counts())
print(f"\nActivity distribution (top 10):")
print(activity_df['Activity'].value_counts().head(10))
print(f"\nFirst few rows:")
activity_df.head(10)

Original rows: 25370
Number of unique claims: 1000

Process distribution:
Process
Recovery                     2646
Claim Admin                  2636
Schedule Services            2634
Communication                2553
Coverage                     2518
Settlement                   2503
Total Loss                   2497
Desktop Management           2479
Claim related teams chats    2477
Liability                    2427
Name: count, dtype: int64

Activity distribution (top 10):
Activity
Update - Schedule Services           899
Documentation - Schedule Services    883
Entry - Settlement                   858
Check - Schedule Services            852
Processing - Settlement              835
Assessment - Settlement              810
Approval - Recovery                  678
Assessment - Recovery                677
Update - Recovery                    676
Review - Desktop Management          658
Name: count, dtype: int64

First few rows:


Unnamed: 0,Claim_Number,Process,Activity,First_TimeStamp,Active_Minutes
0,40043585,Desktop Management,Analysis - Desktop Management,2024-12-11 17:54:00,5.662605
1,40043585,Desktop Management,Analysis - Desktop Management,2024-12-11 18:34:00,1.677966
2,40043585,Communication,Documentation - Communication,2024-12-11 20:36:00,0.523595
3,40043585,Communication,Check - Communication,2024-12-11 21:07:00,24.619091
4,40043585,Communication,Update - Communication,2024-12-11 21:12:00,0.949208
5,40043585,Liability,Documentation - Liability,2024-12-12 00:04:00,22.392808
6,40043585,Liability,Assessment - Liability,2024-12-12 01:02:00,19.022534
7,40043585,Liability,Update - Liability,2024-12-12 01:50:00,7.994983
8,40043585,Claim Admin,Assessment - Claim Admin,2024-12-12 06:45:00,5.48557
9,40043585,Total Loss,Documentation - Total Loss,2024-12-12 11:21:00,4.411464


In [12]:
# Create process change tracking (like in your image)
# Only the ordering step is present in this cell: rows are sorted by claim
# number and then by timestamp.
activity_df_sorted = activity_df.sort_values(
    by=['Claim_Number', 'First_TimeStamp']
)



In [4]:
# Write the simulated activity log to CSV, leaving out the DataFrame index column
activity_df.to_csv(path_or_buf='simulated_claim_activities.csv', index=False)