In [37]:
import pm4py
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [38]:
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    event_log = pm4py.convert_to_dataframe(log)

    return event_log

event_log = import_xes("/Users/6706363/Downloads/BPI_Challenge_2019.xes")

parsing log, completed traces :: 100%|██████████| 251734/251734 [00:48<00:00, 5154.09it/s]


In [39]:
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]

df = df.sort_values(by=['org:resource', 'time:timestamp'])


In [86]:
def create_activity_sequences(df, prefix_length):
    sequences = []
    next_activities = []
    resources = []

    # Iterate through the dataframe, grouped by resource
    for resource, resource_df in df.groupby('org:resource'):
        activities = resource_df['concept:name'].values  # Get the activities for this resource

        # Only generate sequences if there are enough activities for a valid prefix
        if len(activities) >= prefix_length + 1:
            prefix = activities[:prefix_length]  # Get the first 'prefix_length' activities
            next_activity = activities[prefix_length]  # Next activity after the prefix
            sequences.append(prefix)
            next_activities.append(next_activity)
            resources.append(resource)  # Append the resource for each sequence

    # Convert to DataFrame with column names indicating the sequence of activities
    sequences_df = pd.DataFrame(sequences, columns=[f"activity_{i + 1}" for i in range(prefix_length)])
    sequences_df['next_activity'] = next_activities  # Add the next activity to the DataFrame
    sequences_df['org:resource'] = resources  # Add the resource column (e.g., user_1, user_2)

    return sequences_df


# Example of how to use this function
prefix_length = 700  # You can change this value
sequences_df = create_activity_sequences(df, prefix_length)

In [87]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit on all unique activity values (including 'next_activity')
all_activities = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']].values.flatten()

label_encoder.fit(all_activities)  # Fit on the entire dataset

# Apply the same encoding across all columns
for col in [f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']:
    sequences_df[col] = label_encoder.transform(sequences_df[col])

sequences_df.head()

Unnamed: 0,activity_1,activity_2,activity_3,activity_4,activity_5,activity_6,activity_7,activity_8,activity_9,activity_10,...,activity_693,activity_694,activity_695,activity_696,activity_697,activity_698,activity_699,activity_700,next_activity,org:resource
0,31,30,31,30,31,30,31,30,31,30,...,18,18,18,18,18,18,18,18,18,NONE
1,11,15,11,15,11,15,11,15,11,15,...,15,11,15,11,15,11,15,11,15,batch_00
2,17,17,17,17,17,17,17,17,2,2,...,17,17,17,17,17,17,17,17,17,batch_01
3,11,16,16,16,16,16,16,16,16,16,...,16,16,16,16,16,16,16,16,16,batch_02
4,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,batch_03


In [61]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.api.models import Sequential
from keras.api.layers import LSTM, Dense
from keras.api.callbacks import EarlyStopping
from keras.api.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

## Experiment 1: Next Activity Prediction without activity information

# Define features (prefix activities) and target (next_activity)
X = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)]]
y = sequences_df['next_activity']

# Identify and handle rare numeric classes
rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()

# Handle rare class scenario by replacing rare classes with a placeholder value (e.g., -1)
if len(rare_classes) > 0:
    y = y.replace(rare_classes, -1)  # Replace rare classes with -1 or some placeholder value
elif len(rare_classes) == 1:
    # Duplicate the rare class to avoid error
    y = y.append(pd.Series(rare_classes * 2)).reset_index(drop=True)

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One-hot encode target labels
y_encoded = to_categorical(y_encoded)

# Reshape features to 3D array for LSTM input
X_reshaped = np.array(X)
X_reshaped = X_reshaped.reshape((X_reshaped.shape[0], X_reshaped.shape[1], 1))  # (samples, timesteps, features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_encoded, test_size=0.2, random_state=42)


# Build the LSTM model
def build_model():
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(y_encoded.shape[1], activation='softmax'))  # 'softmax' for multi-class classification
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Implement Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Initialize KFold for cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Prepare lists to store metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train, np.argmax(y_train, axis=1)):
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]

    # Build and train the model for each fold
    model = build_model()
    model.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, validation_data=(X_val_cv, y_val_cv), callbacks=[early_stopping], verbose=0)

    # Make predictions
    y_pred_prob = model.predict(X_val_cv)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Convert one-hot encoded validation labels back to single class values
    y_val_decoded = np.argmax(y_val_cv, axis=1)

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_val_decoded, y_pred)
    precision = precision_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val_decoded, y_pred, average='weighted', zero_division=0)

    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate standard deviation for each metric
accuracy_sd = np.std(accuracies)
precision_sd = np.std(precisions)
recall_sd = np.std(recalls)
f1_sd = np.std(f1_scores)

# Calculate mean for each metric (cross-validation average)
accuracy_mean = np.mean(accuracies)
precision_mean = np.mean(precisions)
recall_mean = np.mean(recalls)
f1_mean = np.mean(f1_scores)

# Print the results
print(f"Mean Accuracy: {accuracy_mean:.4f} (±{accuracy_sd:.4f})")
print(f"Mean Precision: {precision_mean:.4f} (±{precision_sd:.4f})")
print(f"Mean Recall: {recall_mean:.4f} (±{recall_sd:.4f})")
print(f"Mean F1-Score: {f1_mean:.4f} (±{f1_sd:.4f})")


  super().__init__(**kwargs)


KeyboardInterrupt: 

In [62]:
## Experiment 2: Next Activity Prediction with activity information
import binary_classifier

ra_diversity_matrix = binary_classifier.create_diversity_matrix(event_log)
ra_diversity_matrix_binary = ra_diversity_matrix.copy()
# Apply a binary transformation: any count > 0 becomes 1 (yes), else 0 (no)
ra_diversity_matrix_binary.iloc[:, 1:] = (ra_diversity_matrix_binary.iloc[:, 1:] > 0).astype(int)

activities = ra_diversity_matrix.columns[1:].tolist()  # Convert to a list of activities

binary_activities = ra_diversity_matrix_binary.iloc[:, :]

In [63]:
# Keep only resources that are in sequences_df
filtered_binary_activities = binary_activities[binary_activities['org:resource'].isin(sequences_df['org:resource'])]

# Reset index to ensure proper alignment
filtered_binary_activities = filtered_binary_activities.reset_index(drop=True)
sequences_df = sequences_df.reset_index(drop=True)

# Merge again
merged_df = pd.concat([sequences_df, filtered_binary_activities], axis=1)

In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.api.models import Sequential
from keras.api.layers import LSTM, Dense
from keras.api.callbacks import EarlyStopping
from keras.api.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

X = merged_df[[f"activity_{i+1}" for i in range(prefix_length)] + activities]
y = merged_df['next_activity']

# Identify and handle rare numeric classes
rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()

# Handle rare class scenario by replacing rare classes with a placeholder value (e.g., -1)
if len(rare_classes) > 0:
    y = y.replace(rare_classes, -1)  # Replace rare classes with -1 or some placeholder value
elif len(rare_classes) == 1:
    # Duplicate the rare class to avoid error
    y = y.append(pd.Series(rare_classes * 2)).reset_index(drop=True)

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One-hot encode target labels
y_encoded = to_categorical(y_encoded)

# Reshape features to 3D array for LSTM input
X_reshaped = np.array(X)
X_reshaped = X_reshaped.reshape((X_reshaped.shape[0], X_reshaped.shape[1], 1))  # (samples, timesteps, features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_encoded, test_size=0.2, random_state=42)


# Build the LSTM model
def build_model():
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(y_encoded.shape[1], activation='softmax'))  # 'softmax' for multi-class classification
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Implement Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Initialize KFold for cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Prepare lists to store metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train, np.argmax(y_train, axis=1)):
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]

    # Build and train the model for each fold
    model = build_model()
    model.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, validation_data=(X_val_cv, y_val_cv), callbacks=[early_stopping], verbose=0)

    # Make predictions
    y_pred_prob = model.predict(X_val_cv)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Convert one-hot encoded validation labels back to single class values
    y_val_decoded = np.argmax(y_val_cv, axis=1)

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_val_decoded, y_pred)
    precision = precision_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val_decoded, y_pred, average='weighted', zero_division=0)

    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate standard deviation for each metric
accuracy_sd = np.std(accuracies)
precision_sd = np.std(precisions)
recall_sd = np.std(recalls)
f1_sd = np.std(f1_scores)

# Calculate mean for each metric (cross-validation average)
accuracy_mean = np.mean(accuracies)
precision_mean = np.mean(precisions)
recall_mean = np.mean(recalls)
f1_mean = np.mean(f1_scores)

# Print the results
print(f"Mean Accuracy: {accuracy_mean:.4f} (±{accuracy_sd:.4f})")
print(f"Mean Precision: {precision_mean:.4f} (±{precision_sd:.4f})")
print(f"Mean Recall: {recall_mean:.4f} (±{recall_sd:.4f})")
print(f"Mean F1-Score: {f1_mean:.4f} (±{f1_sd:.4f})")

  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
Mean Accuracy: 0.5749 (±0.1597)
Mean Precision: 0.3851 (±0.2033)
Mean Recall: 0.5749 (±0.1597)
Mean F1-Score: 0.4511 (±0.2014)


In [88]:
sequences_df = sequences_df.drop(columns=['org:resource'])

In [77]:
from collections import defaultdict
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.api.models import Sequential
from keras.api.layers import LSTM, Dense
from keras.api.callbacks import EarlyStopping

# Get unique activities from the dataset
unique_activities = sorted(set(sequences_df.values.flatten()))

# Generate all possible transitions
all_possible_transitions = {(a, b) for a in unique_activities for b in unique_activities}

# Create a list to store transition count dictionaries
transition_counts = []

# Iterate through each row to count transitions
for _, row in sequences_df.iterrows():
    transitions = defaultdict(int)
    activities = row.dropna().values  # Extract non-null activities

    # Count actual transitions
    for i in range(len(activities) - 1):
        transition = (activities[i], activities[i + 1])
        transitions[transition] += 1

    # Ensure every possible transition exists (fill with 0 if not present)
    row_counts = {t: transitions.get(t, 0) for t in all_possible_transitions}
    transition_counts.append(row_counts)

# Convert list of transition count dictionaries to a DataFrame
transitions_df = pd.DataFrame(transition_counts)

# Rename columns to string format (e.g., '0->0', '0->1', etc.)
transitions_df.columns = [f"{a}->{b}" for a, b in transitions_df.columns]

# Merge with original DataFrame
result_df = pd.concat([sequences_df, transitions_df], axis=1)

X = result_df.drop(columns=['next_activity'])
y = result_df['next_activity']

# Identify rare classes (fewer than 2 instances)
rare_classes = y.value_counts()[y.value_counts() < 2].index.tolist()

# Replace rare classes with -1
if len(rare_classes) > 0:
    y = y.replace(rare_classes, -1)

    # Ensure at least two instances of -1 for StratifiedKFold
    if (y == -1).sum() == 1:
        y = pd.concat([y, pd.Series([-1])], ignore_index=True)
        X = pd.concat([X, X.iloc[[0]]], ignore_index=True)  # Duplicate one row in X


# One-hot encode target variable
y_encoded = pd.get_dummies(y).values  # Convert categorical labels to one-hot encoding

# Feature selection
X_selected = SelectKBest(mutual_info_classif, k=20).fit_transform(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Reshape X_train and X_test to 3D for LSTM
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))  # (samples, timesteps=1, features)
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the LSTM model
def build_model():
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(1, X_train.shape[2])))  # Fix input shape
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(y_encoded.shape[1], activation='softmax'))  # Ensure correct output size
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Implement Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Initialize KFold for cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Prepare lists to store metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train, y_train.argmax(axis=1)):
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]

    # Build and train the model for each fold
    model = build_model()
    model.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, validation_data=(X_val_cv, y_val_cv), callbacks=[early_stopping], verbose=0)

    # Make predictions
    y_pred_prob = model.predict(X_val_cv)
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    # Convert one-hot encoded validation labels back to single class values
    y_val_decoded = np.argmax(y_val_cv, axis=1)

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_val_decoded, y_pred)
    precision = precision_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val_decoded, y_pred, average='weighted', zero_division=0)

    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate standard deviation for each metric
accuracy_sd = np.std(accuracies)
precision_sd = np.std(precisions)
recall_sd = np.std(recalls)
f1_sd = np.std(f1_scores)

# Calculate mean for each metric (cross-validation average)
accuracy_mean = np.mean(accuracies)
precision_mean = np.mean(precisions)
recall_mean = np.mean(recalls)
f1_mean = np.mean(f1_scores)

# Print the results
print(f"Mean Accuracy: {accuracy_mean:.4f} (±{accuracy_sd:.4f})")
print(f"Mean Precision: {precision_mean:.4f} (±{precision_sd:.4f})")
print(f"Mean Recall: {recall_mean:.4f} (±{recall_sd:.4f})")
print(f"Mean F1-Score: {f1_mean:.4f} (±{f1_sd:.4f})")


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
Mean Accuracy: 0.6693 (±0.0384)
Mean Precision: 0.4910 (±0.0465)
Mean Recall: 0.6693 (±0.0384)
Mean F1-Score: 0.5608 (±0.0442)


In [89]:
# Experiment 5: Next Activity Prediction with activity transitions count and repeat pattern features
from collections import defaultdict
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
from keras.api.models import Sequential
from keras.api.layers import LSTM, Dense
from keras.api.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold

# Get unique activities from the dataset
unique_activities = sorted(set(sequences_df.values.flatten()))

# Generate all possible transitions
all_possible_transitions = {(a, b) for a in unique_activities for b in unique_activities}

# Create a list to store transition count dictionaries
transition_counts = []
repeat_pattern_features = []

# Iterate through each row to count transitions and compute repeat features
for _, row in sequences_df.iterrows():
    transitions = defaultdict(int)
    activities = row.dropna().values  # Non-null activities
    
    # --- Transition Counting ---
    for i in range(len(activities) - 1):
        transition = (activities[i], activities[i + 1])
        transitions[transition] += 1
    row_counts = {t: transitions.get(t, 0) for t in all_possible_transitions}
    transition_counts.append(row_counts)
    
    # --- Repeat Pattern Features ---
    max_run = 1
    current_run = 1
    run_lengths = []
    repetitive_activities = set()
    
    for i in range(1, len(activities)):
        if activities[i] == activities[i - 1]:
            current_run += 1
            repetitive_activities.add(activities[i])
        else:
            run_lengths.append(current_run)
            current_run = 1
    run_lengths.append(current_run)  # Add final run
    
    max_run_length = max(run_lengths)
    avg_run_length = np.mean(run_lengths)
    num_runs = len(run_lengths)
    num_repetitive_activities = len(repetitive_activities)

    repeat_pattern_features.append({
        'max_run_length': max_run_length,
        'avg_run_length': avg_run_length,
        'num_runs': num_runs,
        'num_repetitive_activities': num_repetitive_activities
    })

# Convert to DataFrames
transitions_df = pd.DataFrame(transition_counts)
transitions_df.columns = [f"{a}->{b}" for a, b in transitions_df.columns]

repeat_df = pd.DataFrame(repeat_pattern_features)

# Merge everything
result_df = pd.concat([sequences_df, transitions_df, repeat_df], axis=1)

# Compute mutual information scores for repeat pattern features
mi_scores = mutual_info_classif(repeat_df, result_df['next_activity'], discrete_features=True)
feature_scores = dict(zip(repeat_df.columns, mi_scores))
sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)

print("\nMutual Information Scores for Repeat Pattern Features:")
for feature, score in sorted_features:
    print(f"{feature}: {score:.4f}")

# Remove the least important features based on MI scores (i.e., num_repetitive_activities, max_run_length)
repeat_df = repeat_df.drop(columns=['num_repetitive_activities', 'max_run_length'])

# Merge updated repeat_df with result_df
result_df = pd.concat([sequences_df, transitions_df, repeat_df], axis=1)

# Prepare features and labels
X = result_df.drop(columns=['next_activity'])
y = result_df['next_activity']

# Identify rare classes (fewer than 2 instances)
rare_classes = y.value_counts()[y.value_counts() < 2].index.tolist()

# Replace rare classes with -1
if len(rare_classes) > 0:
    y = y.replace(rare_classes, -1)

    # Ensure at least two instances of -1 for StratifiedKFold
    if (y == -1).sum() == 1:
        y = pd.concat([y, pd.Series([-1])], ignore_index=True)
        X = pd.concat([X, X.iloc[[0]]], ignore_index=True)  # Duplicate one row in X
        
# Feature selection
X_selected = SelectKBest(mutual_info_classif, k=20).fit_transform(X, y)

# One-hot encode target variable
y_encoded = pd.get_dummies(y).values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Reshape X_train and X_test to 3D for LSTM
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))  # (samples, timesteps=1, features)
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the LSTM model
def build_model():
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(1, X_train.shape[2])))  # Fix input shape
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(y_encoded.shape[1], activation='softmax'))  # Ensure correct output size
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Implement Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Initialize KFold for cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Prepare lists to store metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train, y_train.argmax(axis=1)):
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]

    # Build and train the model for each fold
    model = build_model()
    model.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, validation_data=(X_val_cv, y_val_cv), callbacks=[early_stopping], verbose=0)

    # Make predictions
    y_pred_prob = model.predict(X_val_cv)
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    # Convert one-hot encoded validation labels back to single class values
    y_val_decoded = np.argmax(y_val_cv, axis=1)

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_val_decoded, y_pred)
    precision = precision_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_val_decoded, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val_decoded, y_pred, average='weighted', zero_division=0)

    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate standard deviation for each metric
accuracy_sd = np.std(accuracies)
precision_sd = np.std(precisions)
recall_sd = np.std(recalls)
f1_sd = np.std(f1_scores)

# Calculate mean for each metric (cross-validation average)
accuracy_mean = np.mean(accuracies)
precision_mean = np.mean(precisions)
recall_mean = np.mean(recalls)
f1_mean = np.mean(f1_scores)

# Print the results
print(f"Mean Accuracy: {accuracy_mean:.4f} (±{accuracy_sd:.4f})")
print(f"Mean Precision: {precision_mean:.4f} (±{precision_sd:.4f})")
print(f"Mean Recall: {recall_mean:.4f} (±{recall_sd:.4f})")
print(f"Mean F1-Score: {f1_mean:.4f} (±{f1_sd:.4f})")





Mutual Information Scores for Repeat Pattern Features:
max_run_length: 1.2873
num_runs: 1.1414
avg_run_length: 1.1414
num_repetitive_activities: 0.4906


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step
Mean Accuracy: 0.6328 (±0.0481)
Mean Precision: 0.4690 (±0.0556)
Mean Recall: 0.6328 (±0.0481)
Mean F1-Score: 0.5312 (±0.0535)
