In [None]:
import os

import numpy as np
import pandas as pd
import pm4py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

In [None]:
def import_xes(file_path):
    """Read the XES file at ``file_path`` and return it in dataframe form.

    The file is parsed with ``pm4py.read_xes`` and the parsed log is passed
    through ``pm4py.convert_to_dataframe``; that result is returned as is.
    """
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))

# The log location can be overridden with the BPI_XES_PATH environment variable so
# the notebook is not tied to one machine; the original path remains the default.
event_log = import_xes(
    os.environ.get("BPI_XES_PATH", "/Users/6706363/Downloads/BPI_Challenge_2019.xes")
)


In [None]:
# Keep only the case id, activity, resource and timestamp columns and order the
# events by resource, then by timestamp within each resource.
df = (
    event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    .sort_values(by=['org:resource', 'time:timestamp'])
)


In [None]:
def create_activity_sequences(df, prefix_length):
    """Build one (prefix, next activity) sample per resource.

    For every ``org:resource`` group in ``df`` that holds more than
    ``prefix_length`` rows, the first ``prefix_length`` values of
    ``concept:name`` (in the row order of ``df``) form the prefix and the
    value immediately after them is the prediction target. Groups with fewer
    rows are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``org:resource`` and ``concept:name``.
    prefix_length : int
        Number of leading activities used as features.

    Returns
    -------
    pandas.DataFrame
        Columns ``activity_1`` .. ``activity_<prefix_length>``, followed by
        ``next_activity`` and ``org:resource``; one row per retained resource.
    """
    activity_columns = [f"activity_{position}" for position in range(1, prefix_length + 1)]
    required_events = prefix_length + 1  # prefix plus the activity to predict

    prefixes, targets, owners = [], [], []
    for owner, owner_events in df.groupby('org:resource'):
        trace = owner_events['concept:name'].values
        if len(trace) < required_events:
            continue  # not enough history for a full prefix and a target
        prefixes.append(trace[:prefix_length])
        targets.append(trace[prefix_length])
        owners.append(owner)

    # One row per resource: prefix columns first, then target and resource id
    sequences_df = pd.DataFrame(prefixes, columns=activity_columns)
    sequences_df['next_activity'] = targets
    sequences_df['org:resource'] = owners
    return sequences_df

# Number of leading activities per resource used as features; resources with
# fewer than prefix_length + 1 events yield no sample. Tune as needed.
prefix_length = 700
sequences_df = create_activity_sequences(df=df, prefix_length=prefix_length)

In [None]:
# Columns that hold activity labels: every prefix position plus the target
activity_columns = [f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']

# A single encoder is fitted on the labels of all these columns so that a given
# activity maps to the same integer wherever it appears
label_encoder = LabelEncoder()
all_activities = sequences_df[activity_columns].values.flatten()
label_encoder.fit(all_activities)

# Replace the labels by their integer codes, one column at a time
for col in activity_columns:
    sequences_df[col] = label_encoder.transform(sequences_df[col])

# Lookup tables for decoding later: label -> code and code -> label
activity_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
inverse_activity_mapping = {code: label for label, code in activity_mapping.items()}

# Show the first 10 rows of the resulting dataframe
sequences_df.head(n=10)


In [None]:
## Experiment 1: Next Activity Prediction without activity information

# Define features (prefix activities) and target (next_activity)
X = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)]]
y = sequences_df['next_activity']

# Identify classes that occur exactly once; a stratified split needs at least
# two samples per class
class_counts = y.value_counts()
rare_classes = class_counts[class_counts == 1].index.tolist()

if len(rare_classes) >= 2:
    # Pool all singleton classes under one new label so the pooled class has
    # enough members to be stratified
    new_label = y.max() + 1
    y = y.where(~y.isin(rare_classes), new_label)
    train_only = pd.Series(False, index=y.index)
else:
    # At most one singleton sample: it cannot be stratified, so it is kept for
    # training only. Duplicating it before the split (as done previously) could
    # place the very same row in both the training and the test set.
    train_only = y.isin(rare_classes)

# Split data into training and testing sets (X and y stay index-aligned)
X_train, X_test, y_train, y_test = train_test_split(
    X[~train_only], y[~train_only], test_size=0.2, random_state=42, stratify=y[~train_only]
)
X_train = pd.concat([X_train, X[train_only]], axis=0)
y_train = pd.concat([y_train, y[train_only]], axis=0)


# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Train Random Forest Classifier with GridSearchCV
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate model on the final test set
y_pred = best_rf_model.predict(X_test)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Cross-validate once with all four metrics (same folds and same seeded model as
# four separate cross_val_score calls, but a quarter of the fits)
cv_scores = cross_validate(
    best_rf_model, X_train, y_train, cv=5,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)
cv_results = cv_scores['test_accuracy']
accuracy_sd = np.std(cv_results)
precision_sd = np.std(cv_scores['test_precision_weighted'])
recall_sd = np.std(cv_scores['test_recall_weighted'])
f1_sd = np.std(cv_scores['test_f1_weighted'])

# Print results; the ± value is the standard deviation across the CV folds of
# the training set, not of the test-set score itself
print(f"Test Set Accuracy: {accuracy:.4f} (±{accuracy_sd:.4f})")
print(f"Precision: {precision:.4f} (±{precision_sd:.4f})")
print(f"Recall: {recall:.4f} (±{recall_sd:.4f})")
print(f"F1-Score: {f1:.4f} (±{f1_sd:.4f})")

In [None]:
# Helper module that is not defined in this notebook
import binary_classifier

# Build the diversity matrix from the complete event log.
# NOTE(review): presumably one row per resource, with 'org:resource' as the first
# column and one count column per activity after it — confirm against
# binary_classifier.create_diversity_matrix.
ra_diversity_matrix = binary_classifier.create_diversity_matrix(event_log)
# Work on a copy so the original values stay available in ra_diversity_matrix
ra_diversity_matrix_binary = ra_diversity_matrix.copy()
# Binarise every column except the first: any value > 0 becomes 1, everything else 0
ra_diversity_matrix_binary.iloc[:, 1:] = (ra_diversity_matrix_binary.iloc[:, 1:] > 0).astype(int)

activities = ra_diversity_matrix.columns[1:].tolist()  # Names of all columns after the first, as a list

# Full slice (all rows, all columns) of the binarised matrix, used by later cells
binary_activities = ra_diversity_matrix_binary.iloc[:, :]


In [None]:
# Keep only resources that are in sequences_df
filtered_binary_activities = binary_activities[binary_activities['org:resource'].isin(sequences_df['org:resource'])]

# Reset the indexes so both frames start from a clean RangeIndex
filtered_binary_activities = filtered_binary_activities.reset_index(drop=True)
sequences_df = sequences_df.reset_index(drop=True)

# Join on the resource key rather than concatenating by row position: a
# positional concat is only correct if both frames list the resources in exactly
# the same order, and it also produces two 'org:resource' columns.
# validate='one_to_one' raises if either frame holds a resource more than once;
# resources absent from binary_activities are dropped by the inner join.
merged_df = sequences_df.merge(
    filtered_binary_activities,
    on='org:resource',
    how='inner',
    validate='one_to_one',
)

merged_df.head(n=10)



In [None]:
## Experiment 2: Next Activity Prediction with activity information

# Features: the per-resource binary activity indicators plus the prefix activities.
# NOTE(review): the indicators come from a matrix built on the complete event log;
# if that includes events after the prefix they may reveal the target — confirm.
X = merged_df[activities + [f"activity_{i+1}" for i in range(prefix_length)]]
y = merged_df['next_activity']

# Identify classes that occur exactly once; a stratified split needs at least
# two samples per class
class_counts = y.value_counts()
rare_classes = class_counts[class_counts == 1].index.tolist()

if len(rare_classes) >= 2:
    # Pool all singleton classes under one new label so the pooled class has
    # enough members to be stratified
    new_label = y.max() + 1
    y = y.where(~y.isin(rare_classes), new_label)
    train_only = pd.Series(False, index=y.index)
else:
    # At most one singleton sample: it cannot be stratified, so it is kept for
    # training only. Duplicating it before the split (as done previously) could
    # place the very same row in both the training and the test set.
    train_only = y.isin(rare_classes)

# Split data into training and testing sets (X and y stay index-aligned)
X_train, X_test, y_train, y_test = train_test_split(
    X[~train_only], y[~train_only], test_size=0.2, random_state=42, stratify=y[~train_only]
)
X_train = pd.concat([X_train, X[train_only]], axis=0)
y_train = pd.concat([y_train, y[train_only]], axis=0)


# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Train Random Forest Classifier with GridSearchCV
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate model on the final test set
y_pred = best_rf_model.predict(X_test)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Cross-validate once with all four metrics (same folds and same seeded model as
# four separate cross_val_score calls, but a quarter of the fits)
cv_scores = cross_validate(
    best_rf_model, X_train, y_train, cv=5,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)
cv_results = cv_scores['test_accuracy']
accuracy_sd = np.std(cv_results)
precision_sd = np.std(cv_scores['test_precision_weighted'])
recall_sd = np.std(cv_scores['test_recall_weighted'])
f1_sd = np.std(cv_scores['test_f1_weighted'])

# Print results; the ± value is the standard deviation across the CV folds of
# the training set, not of the test-set score itself
print(f"Test Set Accuracy: {accuracy:.4f} (±{accuracy_sd:.4f})")
print(f"Precision: {precision:.4f} (±{precision_sd:.4f})")
print(f"Recall: {recall:.4f} (±{recall_sd:.4f})")
print(f"F1-Score: {f1:.4f} (±{f1_sd:.4f})")

**Experiment 3: Next Activity Prediction with activity transition counts**

In [None]:
# Drop the resource id so only encoded activity columns remain.
# errors='ignore' keeps the cell idempotent: re-running it after the column has
# already been removed no longer raises a KeyError.
sequences_df = sequences_df.drop(columns=['org:resource'], errors='ignore')

In [None]:
## Experiment 3: Next Activity Prediction with activity transitions count
from collections import Counter
from functools import partial
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_validate

# Work on the encoded prefix columns and the target only, selected by name
prefix_columns = [f"activity_{i+1}" for i in range(prefix_length)]
encoded_df = sequences_df[prefix_columns + ['next_activity']]

# Get unique activities from the dataset (sorted)
unique_activities = np.unique(encoded_df.to_numpy()).tolist()

# Column names for all possible transitions, in a fixed, sorted order
transition_columns = [f"{a}->{b}" for a in unique_activities for b in unique_activities]

# Count transitions inside the PREFIX only. The target column must not take part:
# counting the step from the last prefix activity to next_activity would write
# the label into the features.
transition_counts = []
for prefix_activities in encoded_df[prefix_columns].to_numpy().tolist():
    observed = Counter(zip(prefix_activities, prefix_activities[1:]))
    transition_counts.append({f"{a}->{b}": count for (a, b), count in observed.items()})

# Convert to a DataFrame; transitions that never occur in a row are filled with 0
transitions_df = (
    pd.DataFrame(transition_counts, index=encoded_df.index)
    .reindex(columns=transition_columns)
    .fillna(0)
    .astype(int)
)

# Merge with original DataFrame
result_df = pd.concat([encoded_df, transitions_df], axis=1)

X = result_df.drop(columns=['next_activity'])
y = result_df['next_activity']

# Identify classes that occur exactly once; a stratified split needs at least
# two samples per class
class_counts = y.value_counts()
rare_classes = class_counts[class_counts == 1].index.tolist()

if len(rare_classes) >= 2:
    # Pool all singleton classes under one new label so the pooled class has
    # enough members to be stratified
    new_label = y.max() + 1
    y = y.where(~y.isin(rare_classes), new_label)
    train_only = pd.Series(False, index=y.index)
else:
    # At most one singleton sample: it cannot be stratified, so it is kept for
    # training only. Duplicating it before the split (as done previously) could
    # place the very same row in both the training and the test set.
    train_only = y.isin(rare_classes)

# Split data into training and testing sets (X and y stay index-aligned)
X_train, X_test, y_train, y_test = train_test_split(
    X[~train_only], y[~train_only], test_size=0.2, random_state=42, stratify=y[~train_only]
)
X_train = pd.concat([X_train, X[train_only]], axis=0)
y_train = pd.concat([y_train, y[train_only]], axis=0)

# Select the 20 best features using the TRAINING data only, then apply the same
# selection to the test data, so test labels never influence which features are
# kept. The score function is seeded to make the selection reproducible.
# Note: the folds inside GridSearchCV still share this selection.
selector = SelectKBest(partial(mutual_info_classif, random_state=42), k=20)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200, 300], 
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4], 
    'bootstrap': [True, False]
}

# Train Random Forest Classifier with GridSearchCV
rf_model_selected = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model_selected, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate model on the final test set
y_pred = best_rf_model.predict(X_test)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Cross-validate once with all four metrics (same folds and same seeded model as
# four separate cross_val_score calls, but a quarter of the fits)
cv_scores = cross_validate(
    best_rf_model, X_train, y_train, cv=5,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)
cv_results = cv_scores['test_accuracy']
accuracy_sd = np.std(cv_results)
precision_sd = np.std(cv_scores['test_precision_weighted'])
recall_sd = np.std(cv_scores['test_recall_weighted'])
f1_sd = np.std(cv_scores['test_f1_weighted'])

# Print results; the ± value is the standard deviation across the CV folds of
# the training set, not of the test-set score itself
print(f"Test Set Accuracy: {accuracy:.4f} (±{accuracy_sd:.4f})")
print(f"Precision: {precision:.4f} (±{precision_sd:.4f})")
print(f"Recall: {recall:.4f} (±{recall_sd:.4f})")
print(f"F1-Score: {f1:.4f} (±{f1_sd:.4f})")


In [None]:
# Experiment 4: Next Activity Prediction with activity transitions count and repeat pattern features
from collections import Counter
from functools import partial
from itertools import groupby
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd

# Work on the encoded prefix columns and the target only, selected by name
prefix_columns = [f"activity_{i+1}" for i in range(prefix_length)]
encoded_df = sequences_df[prefix_columns + ['next_activity']]

# Get unique activities from the dataset (sorted)
unique_activities = np.unique(encoded_df.to_numpy()).tolist()

# Column names for all possible transitions, in a fixed, sorted order
transition_columns = [f"{a}->{b}" for a in unique_activities for b in unique_activities]

transition_counts = []
repeat_pattern_features = []

# Both feature families are derived from the PREFIX only. The target column must
# not take part, otherwise the label is written into the features.
for prefix_activities in encoded_df[prefix_columns].to_numpy().tolist():
    # --- Transition Counting ---
    observed = Counter(zip(prefix_activities, prefix_activities[1:]))
    transition_counts.append({f"{a}->{b}": count for (a, b), count in observed.items()})

    # --- Repeat Pattern Features ---
    # A "run" is a maximal stretch of identical consecutive activities
    runs = [(activity, sum(1 for _ in members)) for activity, members in groupby(prefix_activities)]
    run_lengths = [length for _, length in runs]

    repeat_pattern_features.append({
        'max_run_length': max(run_lengths, default=0),
        'avg_run_length': float(np.mean(run_lengths)) if run_lengths else 0.0,
        'num_runs': len(run_lengths),
        # activities that occur at least once in a run longer than one
        'num_repetitive_activities': len({activity for activity, length in runs if length > 1}),
    })

# Convert to DataFrames; transitions that never occur in a row are filled with 0
transitions_df = (
    pd.DataFrame(transition_counts, index=encoded_df.index)
    .reindex(columns=transition_columns)
    .fillna(0)
    .astype(int)
)

repeat_df = pd.DataFrame(repeat_pattern_features, index=encoded_df.index)

# Compute mutual information scores for repeat pattern features.
# avg_run_length is continuous, so only the other three are flagged as discrete;
# the seed makes the estimate for the continuous feature reproducible.
is_discrete = [column != 'avg_run_length' for column in repeat_df.columns]
mi_scores = mutual_info_classif(
    repeat_df, encoded_df['next_activity'], discrete_features=is_discrete, random_state=42
)
feature_scores = dict(zip(repeat_df.columns, mi_scores))
sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)

print("\nMutual Information Scores for Repeat Pattern Features:")
for feature, score in sorted_features:
    print(f"{feature}: {score:.4f}")

# Remove the least important features based on MI scores (i.e., num_repetitive_activities, max_run_length)
# NOTE(review): these two columns are hard-coded rather than derived from the
# scores printed above — re-check them whenever the features change.
repeat_df = repeat_df.drop(columns=['num_repetitive_activities', 'max_run_length'])

# Merge everything
result_df = pd.concat([encoded_df, transitions_df, repeat_df], axis=1)

# Prepare features and labels
X = result_df.drop(columns=['next_activity'])
y = result_df['next_activity']

# Handle rare classes: a stratified split needs at least two samples per class
class_counts = y.value_counts()
rare_classes = class_counts[class_counts == 1].index.tolist()
if len(rare_classes) >= 2:
    # Pool all singleton classes under one new label so the pooled class has
    # enough members to be stratified
    new_label = y.max() + 1
    y = y.where(~y.isin(rare_classes), new_label)
    train_only = pd.Series(False, index=y.index)
else:
    # At most one singleton sample: it cannot be stratified, so it is kept for
    # training only. Duplicating it before the split (as done previously) could
    # place the very same row in both the training and the test set.
    train_only = y.isin(rare_classes)

# Train/test split (X and y stay index-aligned)
X_train, X_test, y_train, y_test = train_test_split(
    X[~train_only], y[~train_only], test_size=0.2, random_state=42, stratify=y[~train_only]
)
X_train = pd.concat([X_train, X[train_only]], axis=0)
y_train = pd.concat([y_train, y[train_only]], axis=0)

# Feature selection fitted on the TRAINING data only and then applied to the
# test data, so test labels never influence which features are kept. The score
# function is seeded to make the selection reproducible.
# Note: the folds inside GridSearchCV still share this selection.
selector = SelectKBest(partial(mutual_info_classif, random_state=42), k=20)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# GridSearchCV for best RandomForest
param_grid = {
    'n_estimators': [50, 100, 200, 300], 
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4], 
    'bootstrap': [True, False]
}
rf_model_selected = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model_selected, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

# Evaluate model
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Cross-validate once with all four metrics; means and standard deviations are
# both taken from this single run instead of seven separate refits
cv_scores = cross_validate(
    best_rf_model, X_train, y_train, cv=5,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)
cv_results = cv_scores['test_accuracy']

accuracy_sd = np.std(cv_results)
precision_sd = np.std(cv_scores['test_precision_weighted'])
recall_sd = np.std(cv_scores['test_recall_weighted'])
f1_sd = np.std(cv_scores['test_f1_weighted'])

accuracy_cv = np.mean(cv_results)
precision_cv = np.mean(cv_scores['test_precision_weighted'])
recall_cv = np.mean(cv_scores['test_recall_weighted'])
f1_cv = np.mean(cv_scores['test_f1_weighted'])


print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy (from GridSearch): {grid_search.best_score_:.4f}")
print(f"Cross-Validated Accuracy: {accuracy_cv:.4f} (±{accuracy_sd:.4f})")
print(f"Cross-Validated Precision: {precision_cv:.4f} (±{precision_sd:.4f})")
print(f"Cross-Validated Recall: {recall_cv:.4f} (±{recall_sd:.4f})")
print(f"Cross-Validated F1-Score: {f1_cv:.4f} (±{f1_sd:.4f})")
print(f"\nTest Set Accuracy: {accuracy:.4f}")
print(f"Test Set Precision: {precision:.4f}")
print(f"Test Set Recall: {recall:.4f}")
print(f"Test Set F1-Score: {f1:.4f}")

