# Next Activity Prediction

## Importing libraries

In [None]:
from Split_functions import data_split

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

import pm4py as pm4

import plotly.express as px
from xgboost import XGBClassifier
from hyperopt import hp, tpe, Trials, fmin, space_eval, STATUS_OK
import pickle

## Loading the data

In [None]:
# Alternative source kept for reference (reads the XES event log through pm4py):
# df = pm4.convert_to_dataframe(pm4.read.read_xes('BPI_Challenge_2012.xes.gz'))

# Read the event log from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

# Last expression of the cell: shows the first five rows in the notebook.
df.head(n=5)

## Feature Engineering
- Deriving new variables: next_activity and previous_activity1 through previous_activity5
- Label encoding the derived variables so they can be fed to the XGBoost and Random Forest classifiers

In [None]:
# Group the activity names by case once; every shift below then stays inside
# a single case and never leaks across case boundaries.
activities_by_case = df.groupby('case:concept:name')['concept:name']

# Activity of the following event in the same case (missing for a case's last event).
df['next_activity'] = activities_by_case.shift(-1)

# Activities of the 1st..5th preceding events in the same case
# (missing when the case has fewer earlier events).
for lag in range(1, 6):
    df[f'previous_activity{lag}'] = activities_by_case.shift(lag)

# Integer-encode the current activity and every derived column.
# NOTE: the same encoder object is re-fitted for each column, so the integer
# codes are per column and `le` ends up fitted on `previous_activity5` only.
le = LabelEncoder()
df['current_activity_encoded'] = le.fit_transform(df['concept:name'])
for column in ['next_activity'] + [f'previous_activity{lag}' for lag in range(1, 6)]:
    df[f'{column}_encoded'] = le.fit_transform(df[column])

## Train Test Split Function

In [None]:
# Encoded activity columns used as model inputs: the current activity plus
# the five preceding ones.
activity_columns = ['current_activity_encoded'] + [
    f'previous_activity{k}_encoded' for k in range(1, 6)
]
# Case id and timestamp are passed along with both frames for the split.
trace_columns = ['case:concept:name', 'time:timestamp']

predictor = df[activity_columns + trace_columns]
target = df[['next_activity_encoded'] + trace_columns]
train_size = 0.8

# NOTE(review): `data_split` comes from Split_functions; presumably a
# time-ordered split that also drops traces spanning the cut - confirm there.
X, X_test, y, y_test = data_split(predictor, target, train_size)

banner = '+----------------------------------------------------------------+'
print(banner)
print('After cleaning traces!')
print('Training dataset max time:', X['time:timestamp'].max())
print('Testing dataset min time:', X_test['time:timestamp'].min())
print(banner)

# Next action prediction

## Taking only necessary columns for the prediction

In [None]:
# Model inputs: the encoded current activity, the five encoded previous
# activities, and - last - the case id. The case id is kept only for
# fold cleaning and is sliced off with X_features[:-1] before fitting.
X_features = (
    ['current_activity_encoded']
    + [f'previous_activity{lag}_encoded' for lag in range(1, 6)]
    + ['case:concept:name']
)

# Target column first (selected later via y_features[0]), then the case id.
y_features = ['next_activity_encoded', 'case:concept:name']

In [None]:
# Keep only the columns listed in X_features / y_features (drops the timestamp).
X, X_test = X[X_features], X_test[X_features]
y, y_test = y[y_features], y_test[y_features]

## Random Forest Model

In [None]:
# Give both training frames a fresh 0..len-1 index; the fold loops below
# slice them by row label with `.loc`.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [None]:
# Hyperparameter search space for the Random Forest (all discrete choices).
space = {
    'n_estimators': hp.choice('n_estimators', [int(x) for x in np.linspace(start = 5, stop = 100, num = 60)]),
    'max_depth': hp.choice('max_depth', [5, 6, 7, 9, 10, 12, 13, 15, 16, 17, 19, 20, 22, 23, 25]),
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4, 6, 8])
}

# One entry per fold: the best parameter set found and its validation accuracy.
best_params_rc_list = []
best_scores_list = []

# Number of folds
n = 10

# Custom K-fold cross validation over contiguous row blocks of X / y.
# The first fold validates on the last `step_size` rows; every following fold
# moves the validation block one step towards the start of the data.
start = 0
end = len(X)
step_size = end//n

train_start = 0 
train_end = end - step_size

test_start = train_end
test_end = end

for i in range(n):
    # `.loc` slices on the reset integer index include BOTH endpoints.
    test_x = X.loc[test_start:test_end]
    test_y = y.loc[test_start:test_end]

    if train_start == test_start:
        # Validation block sits at the very start: train on everything after it.
        train_x = X.loc[test_end+1:]
        train_y = y.loc[test_end+1:]
    elif test_end + 1 >= len(X):
        # Validation block sits at the very end: train on everything before it.
        train_x = X.loc[train_start:train_end-1]
        train_y = y.loc[train_start:train_end-1]
    else:
        # Validation block in the middle: train on the rows on both sides.
        train_x = pd.concat([X.loc[train_start:train_end-1], X.loc[test_end+1:]])
        train_y = pd.concat([y.loc[train_start:train_end-1], y.loc[test_end+1:]])

    # Cases that have events on both sides of the split would leak information
    # between training and validation, so they are removed from both.
    overlapping_sets = list(set(train_x['case:concept:name'].unique()).intersection(set(test_x['case:concept:name'].unique())))

    # Clean train.
    # FIX: `isin` must receive the flat list of case ids (the original passed
    # `[overlapping_sets]`, a list containing the list), and the targets must be
    # filtered with the same rule as the features so rows stay paired.
    X_train = train_x[~train_x['case:concept:name'].isin(overlapping_sets)]
    y_train = train_y[~train_y['case:concept:name'].isin(overlapping_sets)]

    # Clean test (same rule).
    X_validation = test_x[~test_x['case:concept:name'].isin(overlapping_sets)]
    y_validation = test_y[~test_y['case:concept:name'].isin(overlapping_sets)]

    # Finalizing the data: drop the case id column and convert to numpy arrays.
    X_train = X_train[X_features[:-1]].values
    X_validation = X_validation[X_features[:-1]].values
    y_train = y_train[y_features[0]].values
    y_validation = y_validation[y_features[0]].values


    # Hyperopt objective: fit on this fold's training rows and score on its
    # validation rows. Hyperopt minimises, hence the negated score.
    def objective(params):
        rfc = RandomForestClassifier(**params, n_jobs = -1)
        rfc.fit(X_train, np.ravel(y_train))
        score = rfc.score(X_validation, y_validation)
        return {'loss': -score, 'status': STATUS_OK}

    # Define Trials object to store optimization results
    trials = Trials()

    # return_argmin=False makes fmin return the actual parameter values
    # rather than indices into the hp.choice lists.
    best = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials, return_argmin=False)

    # Store the best parameters and corresponding score
    best_params_rc_list.append(best)
    best_scores_list.append(-trials.best_trial['result']['loss'])  # Convert back to positive

    # Slide the validation window one step towards the start of the data.
    test_end = test_start
    train_end -= step_size
    test_start = train_end

# Print the best parameters of every fold and the average of the best scores.
print("Best Parameters:")
for params in best_params_rc_list:
    print(params)
print("Average Score:", np.mean(best_scores_list))

In [None]:
# Refit one Random Forest per fold-winning parameter set on the full training
# data, report its score on the held-out test set, and pickle it for later use.
for count, params in enumerate(best_params_rc_list, start=1):
    model = RandomForestClassifier(**params)
    model.fit(X[X_features[:-1]].values, np.ravel(y[y_features[0]].values))
    print(f"Model {count} with scores: ",model.score(X_test[X_features[:-1]].values, y_test[y_features[0]].values), 'saved!')

    # The context manager closes the file handle even if pickling fails
    # (the original left the handle returned by open() unclosed).
    # NOTE(review): the '.pk1' extension (digit one) looks like a typo for
    # '.pkl'; kept as-is because other code may load these exact filenames.
    with open(f'next_activity_prediction_rfc_{count}.pk1', 'wb') as model_file:
        pickle.dump(model, model_file)

## Gradient Boosting (XGBoost)

In [None]:
# Hyperparameter search space for XGBoost (discrete choices plus uniform ranges).
space = {
    'n_estimators': hp.choice('n_estimators', [int(x) for x in np.linspace(start = 5, stop = 50, num = 45)]),
    'max_depth': hp.choice('max_depth', [int(i) for i in range(3,11)]),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'gamma': hp.uniform('gamma', 0, 0.2)
}

# One entry per fold: the best parameter set found and its validation accuracy.
best_params_xgbc_list = []
best_scores_list = []

# Number of folds
n = 10

# Custom K-fold cross validation over contiguous row blocks of X / y.
# The first fold validates on the last `step_size` rows; every following fold
# moves the validation block one step towards the start of the data.
start = 0
end = len(X)
step_size = end//n

train_start = 0 
train_end = end - step_size

test_start = train_end
test_end = end

for i in range(n):
    # `.loc` slices on the reset integer index include BOTH endpoints.
    test_x = X.loc[test_start:test_end]
    test_y = y.loc[test_start:test_end]

    if train_start == test_start:
        # Validation block sits at the very start: train on everything after it.
        train_x = X.loc[test_end+1:]
        train_y = y.loc[test_end+1:]
    elif test_end + 1 >= len(X):
        # Validation block sits at the very end: train on everything before it.
        train_x = X.loc[train_start:train_end-1]
        train_y = y.loc[train_start:train_end-1]
    else:
        # Validation block in the middle: train on the rows on both sides.
        train_x = pd.concat([X.loc[train_start:train_end-1], X.loc[test_end+1:]])
        train_y = pd.concat([y.loc[train_start:train_end-1], y.loc[test_end+1:]])

    # Cases that have events on both sides of the split would leak information
    # between training and validation, so they are removed from both.
    overlapping_sets = list(set(train_x['case:concept:name'].unique()).intersection(set(test_x['case:concept:name'].unique())))

    # Clean train.
    # FIX: `isin` must receive the flat list of case ids (the original passed
    # `[overlapping_sets]`, a list containing the list), and the targets must be
    # filtered with the same rule as the features so rows stay paired.
    X_train = train_x[~train_x['case:concept:name'].isin(overlapping_sets)]
    y_train = train_y[~train_y['case:concept:name'].isin(overlapping_sets)]

    # Clean test (same rule).
    X_validation = test_x[~test_x['case:concept:name'].isin(overlapping_sets)]
    y_validation = test_y[~test_y['case:concept:name'].isin(overlapping_sets)]

    # Finalizing the data: drop the case id column and convert to numpy arrays.
    X_train = X_train[X_features[:-1]].values
    X_validation = X_validation[X_features[:-1]].values
    y_train = y_train[y_features[0]].values
    y_validation = y_validation[y_features[0]].values

    # Hyperopt objective: fit on this fold's training rows and score on its
    # validation rows. Hyperopt minimises, hence the negated score.
    # NOTE(review): XGBClassifier may reject a fold whose training labels are
    # not a contiguous 0..k-1 range - confirm against the installed version.
    def objective(params):
        xgb = XGBClassifier(**params)
        xgb.fit(X_train, y_train)
        score = xgb.score(X_validation, y_validation)
        return {'loss': -score, 'status': STATUS_OK}

    # Define Trials object to store optimization results
    trials = Trials()

    # return_argmin=False makes fmin return the actual parameter values
    # rather than indices into the hp.choice lists.
    best = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=trials, return_argmin=False)

    # Store the best parameters and corresponding score
    best_params_xgbc_list.append(best)
    best_scores_list.append(-trials.best_trial['result']['loss'])  # Convert back to positive

    # Slide the validation window one step towards the start of the data.
    test_end = test_start
    train_end -= step_size
    test_start = train_end

# Print the best parameters of every fold and the average of the best scores.
print("Best Parameters:")
for params in best_params_xgbc_list:
    print(params)
print("Average Score:", np.mean(best_scores_list))

In [None]:
# Refit one XGBoost classifier per fold-winning parameter set on the full
# training data, report its score on the held-out test set, and pickle it.
for count, params in enumerate(best_params_xgbc_list, start=1):
    model = XGBClassifier(**params)
    model.fit(X[X_features[:-1]].values, np.ravel(y[y_features[0]].values))
    print(f"Model {count}",model.score(X_test[X_features[:-1]].values, y_test[y_features[0]].values), 'successfully saved!')

    # The context manager closes the file handle even if pickling fails
    # (the original left the handle returned by open() unclosed).
    # NOTE(review): the '.pk1' extension (digit one) looks like a typo for
    # '.pkl'; kept as-is because other code may load these exact filenames.
    with open(f'next_activity_prediction_xgbc_{count}.pk1', 'wb') as model_file:
        pickle.dump(model, model_file)