In [5]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

In [7]:
# Target activity to predict (the alternative target is "O_Accepted").
event_of_interest = "A_Cancelled"

# Load the event log, shorten the column names to time / case / event, and
# parse the timestamps. Values that cannot be parsed become NaT instead of
# raising, because of errors="coerce".
df_bpi = (
    pd.read_parquet("../data/BPI_Challenge_2017.gzip")
    .rename(
        columns={
            "time:timestamp": "time",
            "case:concept:name": "case",
            "concept:name": "event",
        }
    )
    .assign(time=lambda log: pd.to_datetime(log["time"], errors="coerce"))
)

# Filtering on end events

In [3]:
# For every case, pick the first and the last event in row order.
events_by_case = df_bpi.groupby('case')['event']
start_events = events_by_case.first()
end_events = events_by_case.last()

# Tally how often each activity opens / closes a case.
start_event_counts = Counter(start_events)
end_event_counts = Counter(end_events)

# One row per activity, one column per tally. An activity that is missing
# from one of the tallies gets a count of 0 in that column.
tallies = (
    ('Start_Event_Count', start_event_counts),
    ('End_Event_Count', end_event_counts),
)
results = pd.DataFrame({column: pd.Series(tally) for column, tally in tallies})
results = results.fillna(0).astype(int)

# Show the table.
print(results)
# Conclusion: only O_Cancelled, W_Call after offers, W_Call incomplete files
# and W_Validate application occur as frequent end events.
# Removing the traces that end in any other event makes the process more uniform.

# This is what the "filter_log" function below does.

                            Start_Event_Count  End_Event_Count
A_Create Application                    31509                0
A_Denied                                    0                1
O_Cancelled                                 0             4436
O_Returned                                  0                1
O_Sent (mail and online)                    0               15
O_Sent (online only)                        0                8
W_Assess potential fraud                    0              102
W_Call after offers                         0             9457
W_Call incomplete files                     0             4676
W_Complete application                      0              144
W_Personal Loan collection                  0                2
W_Shortened completion                      0                5
W_Validate application                      0            12662


# Why bucketing could be useful to distinguish short starting prefixes from longer prefixes

In [None]:
# Collect the events of each case into one list per case.
grouped = df_bpi.groupby('case')['event'].apply(list)

# Keep only the cases with at least 5 events and reduce each of them to a
# hashable tuple holding its first 5 events.
has_five_events = grouped.map(len) >= 5
sequences = grouped[has_five_events].map(lambda events: tuple(events[:5]))

# Frequency of every distinct opening sequence.
sequence_counts = Counter(sequences)

# Every sequence that was counted has a frequency of at least 1, so the
# "frequency > 0" selection is a plain-dict copy of all the counts.
filtered_sequences = dict(sequence_counts)

print("Dictionary of Starting Sequences (Length 5) with Frequencies > 0:")
print(filtered_sequences)

# Tabulate the counts, most frequent sequence first, for plotting.
hist_data = pd.DataFrame(sequence_counts.items(), columns=['Sequence', 'Frequency'])
hist_data = hist_data.sort_values(by='Frequency', ascending=False)

# Bar chart with one bar per opening sequence; each label joins the events with "-".
bar_labels = hist_data['Sequence'].map('-'.join).tolist()
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_labels, hist_data['Frequency'])
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_xlabel('Unique Opening Sequences (First 5 Events)')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Unique Opening Sequences')
plt.show()

NameError: name 'df_bpi' is not defined

# O_Create Offer Bucketing:

- Create a first bucket with all prefixes before O_Create Offer occurred
- Create a second bucket for all prefixes which have already seen O_Create Offer

In [4]:
# Copy of the event log with an extra column holding each event's 0-based
# position (in row order) within its case.
df = df_bpi.assign(event_index=df_bpi.groupby('case').cumcount())

# Rows whose event is 'O_Create Offer', reduced to the case and the position.
is_offer_creation = df['event'] == 'O_Create Offer'
o_create_offer_indices = df.loc[is_offer_creation, ['case', 'event_index']]

# Show the result.
print(o_create_offer_indices)

NameError: name 'df_bpi' is not defined