In [None]:
import pm4py

In [None]:
def import_xes(file_path):
    """Load an XES event log and return it as a DataFrame.

    The file at ``file_path`` is read with ``pm4py.read_xes`` and the result
    is passed through ``pm4py.convert_to_dataframe``.

    Args:
        file_path: Location of the ``.xes`` file to read.

    Returns:
        The value returned by ``pm4py.convert_to_dataframe`` for the log.
    """
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))

# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
xes_log_path = "/Users/6706363/Downloads/BPI_Challenge_2013_incidents.xes"
event_log = import_xes(xes_log_path)

In [None]:
# Keep only the four columns used below, ordered by case id and then timestamp.
selected_columns = ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']
df = event_log[selected_columns].sort_values(by=['case:concept:name', 'time:timestamp'])

df.head(10)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def create_activity_sequences(df, prefix_length):
    """Build fixed-length activity prefixes and their next activity, one row per case.

    Rows of ``df`` are grouped by ``'case:concept:name'``. Within each group the
    ``'concept:name'`` values are taken in the row order of ``df`` (no sorting
    is done here, so the caller must order events beforehand). Cases with fewer
    than ``prefix_length + 1`` rows are skipped, because they have no event
    left to serve as the prediction target.

    Args:
        df: DataFrame with at least the columns ``'case:concept:name'`` and
            ``'concept:name'``.
        prefix_length: Number of leading activities to use as the prefix.

    Returns:
        DataFrame with columns ``activity_1`` .. ``activity_<prefix_length>``,
        ``next_activity`` (the activity right after the prefix) and
        ``'case:concept:name'`` (the case the row was built from). Empty when
        no case is long enough.
    """
    prefixes = []
    next_activities = []
    case_ids = []

    for case_id, case_df in df.groupby('case:concept:name'):
        # Plain Python list so row construction below does not depend on the
        # array type backing the column.
        activities = case_df['concept:name'].tolist()
        if len(activities) <= prefix_length:
            # Not enough events to have both a full prefix and a target.
            continue
        prefixes.append(activities[:prefix_length])
        next_activities.append(activities[prefix_length])
        case_ids.append(case_id)

    activity_columns = [f"activity_{i + 1}" for i in range(prefix_length)]
    sequences_df = pd.DataFrame(prefixes, columns=activity_columns)
    sequences_df['next_activity'] = next_activities
    # Column name previously misspelled as 'case:conept:name'.
    sequences_df['case:concept:name'] = case_ids
    return sequences_df


# Report the largest number of rows that any single case contributes to df.
rows_per_case = df.groupby('case:concept:name').size()
max_sequence_length = rows_per_case.max()
print(f"Maximum sequence length: {max_sequence_length}")

# Prefix-length grids, keyed by "<dataset> <grouping>". These were previously
# kept as commented-out assignments; switch experiments by changing
# ACTIVE_PRESET instead of editing comments.
PREFIX_LENGTH_PRESETS = {
    "BPIC2013 resources": [10, 20, 30, 40, 50, 75, 100, 125, 150],
    "BPIC2013 cases": [10, 15, 20, 25, 30, 35, 40, 45, 50],
    "BPIC2017 resources": [100, 150, 200, 400, 600, 800, 1000, 1200, 1400, 1500, 2000],
    "BPIC2017 cases": [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80],
    "BPIC2018 resources": [100, 150, 200, 400, 600, 800, 1000, 1200, 1400, 1500, 2000, 2500],
    "BPIC2018 cases": [100, 125, 150, 175, 200, 225, 250, 300, 350, 400, 450, 500, 600],
    "BPIC2019 resources": [100, 150, 200, 300, 400, 500, 600, 700, 800],
    "BPIC2019 cases": [100, 125, 150, 175, 200, 225, 250, 300, 400],
}

ACTIVE_PRESET = "BPIC2013 cases"

# Copy so later mutation of prefix_lengths cannot alter the preset table.
prefix_lengths = list(PREFIX_LENGTH_PRESETS[ACTIVE_PRESET])


# Evaluate a most-frequent-class baseline for next-activity prediction at each
# prefix length; one summary dict per prefix length is collected in `results`.
results = []

for prefix_length in prefix_lengths:
    sequences_df = create_activity_sequences(df, prefix_length)

    if sequences_df.empty:
        print(f"No valid sequences for prefix length {prefix_length}")
        results.append({'Prefix Length': prefix_length, 'Accuracy': None, 'Num Samples': 0})
        continue

    if len(sequences_df) < 2:
        # train_test_split raises ValueError when a side of the split would be
        # empty, which happens with a single sample; record the row instead.
        print(f"Too few sequences to split for prefix length {prefix_length}")
        results.append({
            'Prefix Length': prefix_length,
            'Accuracy': None,
            'Num Samples': len(sequences_df)
        })
        continue

    feature_columns = [f"activity_{i + 1}" for i in range(prefix_length)]
    encoded_columns = feature_columns + ['next_activity']

    # Label encode activities: a single encoder is fitted on prefix and target
    # values together so both share one integer code per activity.
    label_encoder = LabelEncoder()
    label_encoder.fit(sequences_df[encoded_columns].values.flatten())
    for col in encoded_columns:
        sequences_df[col] = label_encoder.transform(sequences_df[col])

    # Define features and target
    X = sequences_df[feature_columns]
    y = sequences_df['next_activity']

    # Handle rare classes: targets seen exactly once are merged into -1, a
    # value LabelEncoder never produces (its codes start at 0).
    class_counts = y.value_counts()
    rare_classes = class_counts[class_counts == 1].index.tolist()
    if rare_classes:
        y = y.replace(rare_classes, -1)

    # Train/test split (fixed seed so the split is repeatable)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Majority class predictor
    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X_train, y_train)
    y_pred = dummy_clf.predict(X_test)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Save results
    results.append({
        'Prefix Length': prefix_length,
        'Accuracy': round(accuracy, 4),
        'Num Samples': len(sequences_df)
    })

# Write the per-prefix-length results to an Excel workbook. The path is held
# in one variable so the confirmation message always names the file written
# (it previously reported "majority_class_accuracies.xlsx").
output_path = "majority_class_accuracies_BPIC2013.xlsx"
results_df = pd.DataFrame(results)
results_df.to_excel(output_path, index=False)

print(f"Results saved to {output_path}")