In [10]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [11]:
# Set random seed for reproducibility (NumPy and the stdlib generator are seeded separately)
np.random.seed(42)
random.seed(42)

# Define processes
processes = ['Claim Admin', 'Communication', 'Coverage', 'Desktop Management', 
             'Liability', 'Recovery', 'Schedule Services', 'Settlement', 
             'Total Loss', 'Claim related teams chats']

# Generate claim numbers (similar to your data)
num_claims = 1000
CLAIM_ID_LOW, CLAIM_ID_HIGH = 40000000, 60000000  # half-open range [low, high)
claim_numbers = np.random.randint(CLAIM_ID_LOW, CLAIM_ID_HIGH, size=num_claims)

# randint samples WITH replacement, so two claims can occasionally receive the
# same number, which would merge their activity logs. Keep the first occurrence
# of every number (in draw order) and redraw only the missing ones. When the
# initial draw is already unique this loop never runs, so the random stream and
# the resulting numbers are exactly what a plain randint call produces.
while np.unique(claim_numbers).size < num_claims:
    _, first_seen = np.unique(claim_numbers, return_index=True)
    kept = claim_numbers[np.sort(first_seen)]
    refill = np.random.randint(CLAIM_ID_LOW, CLAIM_ID_HIGH,
                               size=num_claims - kept.size)
    claim_numbers = np.concatenate([kept, refill])

# Function to generate activity sequences for a claim
def generate_claim_activities(claim_number, start_date, process_pool=None):
    """Simulate the chronological activity log of a single claim.

    Draws from both the global ``random`` and ``np.random`` generators, so the
    result is reproducible only when both have been seeded.

    Args:
        claim_number: Identifier copied into every generated record.
        start_date: ``datetime`` used as the timestamp of the first activity.
            Each following activity is stamped 1-120 minutes after the
            previous one.
        process_pool: Optional sequence of process names to draw from.
            When omitted, the notebook-level ``processes`` list is used.

    Returns:
        A list of 5-15 dicts with keys ``Claim_Number``, ``Process``,
        ``First_TimeStamp`` and ``Active_Minutes``, in chronological order.

    Raises:
        ValueError: If the process pool is empty.
    """
    pool = processes if process_pool is None else list(process_pool)
    if not pool:
        raise ValueError("process pool must contain at least one process")

    activities = []
    visited = []  # processes already logged for this claim, duplicates kept
    current_time = start_date

    # Generate 5-15 activities per claim with back-and-forth between processes
    num_activities = random.randint(5, 15)

    # Start with a random process
    current_process = random.choice(pool)

    for _ in range(num_activities):
        # Exponentially distributed active time (scale 5) with a 0.1 floor
        active_minutes = np.random.exponential(scale=5) + 0.1

        activities.append({
            'Claim_Number': claim_number,
            'Process': current_process,
            'First_TimeStamp': current_time,
            'Active_Minutes': active_minutes
        })
        visited.append(current_process)

        # Move time forward
        current_time += timedelta(minutes=random.randint(1, 120))

        # Choose the next process. random.random() is drawn on every pass; a
        # revisit additionally requires at least two logged activities, so the
        # first transition always samples the full pool. Revisits pick from
        # `visited` with duplicates, i.e. weighted by how often each process
        # has already occurred. The draw after the final activity is never
        # recorded but is kept so the random stream stays unchanged.
        if random.random() < 0.3 and len(activities) > 1:
            current_process = random.choice(visited)
        else:
            # Any process from the pool (may equal the current one)
            current_process = random.choice(pool)

    return activities

# Build the combined activity log for every claim
base_date = datetime(2024, 12, 1, 8, 0, 0)
all_activities = []

for claim in claim_numbers:
    # Stagger the claim's start: 0-14 days, 0-23 hours and 0-59 minutes after
    # base_date (roughly a 15-day window). The three values are drawn in
    # days -> hours -> minutes order.
    offset_days = random.randint(0, 14)
    offset_hours = random.randint(0, 23)
    offset_minutes = random.randint(0, 59)
    start_offset = timedelta(days=offset_days,
                             hours=offset_hours,
                             minutes=offset_minutes)
    claim_start = base_date + start_offset

    all_activities += generate_claim_activities(claim, claim_start)

# One row per activity, ordered by claim and then chronologically
activity_df = (
    pd.DataFrame(all_activities)
    .sort_values(by=['Claim_Number', 'First_TimeStamp'])
    .reset_index(drop=True)
)

# Quick sanity summary of the simulated data
print(f"Original rows: {len(activity_df)}")
print(f"Number of unique claims: {activity_df['Claim_Number'].nunique()}")
print("\nProcess distribution:")
print(activity_df['Process'].value_counts())
print("\nFirst few rows:")
activity_df.head(10)

Original rows: 9924
Number of unique claims: 1000

Process distribution:
Process
Liability                    1046
Total Loss                   1040
Schedule Services            1039
Communication                1030
Settlement                   1008
Recovery                      986
Coverage                      976
Desktop Management            963
Claim related teams chats     941
Claim Admin                   895
Name: count, dtype: int64

First few rows:


Unnamed: 0,Claim_Number,Process,First_TimeStamp,Active_Minutes
0,40043585,Desktop Management,2024-12-08 09:58:00,0.827176
1,40043585,Settlement,2024-12-08 10:34:00,7.059049
2,40043585,Communication,2024-12-08 11:39:00,5.449471
3,40043585,Settlement,2024-12-08 13:07:00,15.788164
4,40043585,Coverage,2024-12-08 14:10:00,0.457255
5,40043585,Total Loss,2024-12-08 16:02:00,0.393735
6,40043585,Liability,2024-12-08 16:47:00,1.757731
7,40043585,Communication,2024-12-08 18:23:00,1.617064
8,40043585,Desktop Management,2024-12-08 19:00:00,1.518309
9,40043585,Desktop Management,2024-12-08 19:21:00,11.935862


In [12]:
# Order the activity log by claim, then chronologically within each claim.
# NOTE(review): this cell only sorts; no process-change tracking columns are
# derived here.
claim_time_order = ['Claim_Number', 'First_TimeStamp']
activity_df_sorted = activity_df.sort_values(by=claim_time_order)



In [13]:
# Persist the sorted activity log next to the notebook, without the index column
activity_df_sorted.to_csv(path_or_buf='simulated_claim_activities.csv', index=False)