# Import of necessary libraries

In [None]:
from Split_functions import data_split_viz

In [None]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import pm4py as pm4

import plotly.express as px
import seaborn as sns

# Data loading

In [None]:
# Read 'cleaned_data.csv' into the working frame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
df.head(5)

# Feature engineering

In [None]:
# Per-case context features: the activity that follows each event and the two
# activities that precede it. Shifting inside each case group leaves NaN where
# a case has no following / preceding event.
activities_by_case = df.groupby('case:concept:name')['concept:name']
df['next_activity'] = activities_by_case.shift(-1)
df['previous_activity1'] = activities_by_case.shift(1)
df['previous_activity2'] = activities_by_case.shift(2)

# Encode all four activity columns with ONE encoder fitted on the union of
# their values. Re-fitting the encoder per column (as before) gives the same
# activity a different integer in each column and leaves `le` able to decode
# only the last column it was fitted on.
# The NaN produced by shift() is replaced by an explicit sentinel label so the
# encoder sees a homogeneous string column regardless of scikit-learn version.
MISSING_ACTIVITY = '<no activity>'
encoded_column_for = {
    'concept:name': 'current_activity_encoded',
    'next_activity': 'next_activity_encoded',
    'previous_activity1': 'previous_activity1_encoded',
    'previous_activity2': 'previous_activity2_encoded',
}
filled_activities = {
    source: df[source].fillna(MISSING_ACTIVITY).astype(str)
    for source in encoded_column_for
}

le = LabelEncoder()
le.fit(pd.concat(list(filled_activities.values()), ignore_index=True))
for source, encoded in encoded_column_for.items():
    df[encoded] = le.transform(filled_activities[source])

In [None]:
# Columns handed to the split helper: encoded features plus the case id,
# activity name and timestamp for the predictors; encoded label plus case id
# and timestamp for the target.
predictor_columns = [
    'current_activity_encoded',
    'previous_activity1_encoded',
    'previous_activity2_encoded',
    'case:concept:name',
    'concept:name',
    'time:timestamp',
]
target_columns = [
    'next_activity_encoded',
    'case:concept:name',
    'time:timestamp',
]

predictor = df[predictor_columns]
target = df[target_columns]

# Fraction passed to the split helper as the training share.
train_size = 0.8

In [None]:
# Split predictors and targets with the project helper from Split_functions.
# NOTE(review): the helper's logic is not visible in this notebook. The cells
# below treat X / X_test as the train / test frames and use `drop_set` as the
# collection of case ids labelled 'Deleted' — confirm against Split_functions.
X, X_test, y, y_test, drop_set = data_split_viz(predictor, target, train_size)

In [None]:
# Turn the current index of both split frames into a regular column, in place;
# the plots below read it back as the 'index' column.
for split_frame in (X, X_test):
    split_frame.reset_index(inplace=True)

In [None]:
# Copy of the full frame with its row index exposed as a column, so the
# scatter plot below can use it as the y value ('index').
dz = df.reset_index()

### Showcasing the data (train and test) before deleting intersecting applications

In [None]:
# Scatter of every event in the full frame: timestamp on x, row index on y,
# coloured by activity name.
fig1 = px.scatter(
    dz,
    x='time:timestamp',
    y='index',
    color='concept:name',
    labels={
        'time:timestamp': 'Time (2011-2012)',
        # Keyed on the column actually used for `color`; a key that matches no
        # plotted column is ignored and the legend keeps the raw column name.
        'concept:name': 'Concept Name',
        'index': 'Index of the trace',
    },
    title='Depiction of the BPI Challenge 2012 data',
    width=1100,
    height=600,
)

# Dashed guides: vertical at the latest timestamp in X, horizontal at the
# 'index' value of the first row of X_test.
split_time = pd.to_datetime(X['time:timestamp'].max())
first_test_index = X_test['index'].iloc[0]
fig1.add_vline(x=split_time, line_width=1, line_dash="dash", line_color="black")
fig1.add_hline(y=first_test_index, line_width=1, line_dash="dash", line_color="black")

# Region captions, placed at fixed coordinates chosen for this data set.
fig1.add_annotation(text='Training Data', x=pd.to_datetime('2011-11-06'), y=60000)
fig1.add_annotation(text='Test Data', x=pd.to_datetime('2012-02-24'), y=250000)
fig1.show()

### Showcasing the data (train and test) after deleting intersecting applications

In [None]:
# Stack the train and test frames, order the rows by case id, and renumber
# the rows (the previous row labels are kept as an extra column).
concated = (
    pd.concat([X, X_test])
    .sort_values(by='case:concept:name')
    .reset_index()
)

In [None]:
# Scatter of the rows remaining in X and X_test, coloured by activity name.
axis_labels = {
    'time:timestamp': 'Time (2011-2012)',
    'index': 'Index of the trace',
    'concept:name': 'Concept Name',
}
fig = px.scatter(
    concated,
    x='time:timestamp',
    y='index',
    color='concept:name',
    labels=axis_labels,
    title='Cleaned Train Test split of BPI Challenge 2012',
    width=1100,
    height=600,
)

# Dashed guides: vertical at the latest timestamp in X, horizontal at the
# 'index' value of the first row of X_test.
dashed_black = dict(line_width=1, line_dash="dash", line_color="black")
fig.add_vline(x=pd.to_datetime(X['time:timestamp'].max()), **dashed_black)
fig.add_hline(y=X_test.head(1)['index'].iloc[0], **dashed_black)

# Region captions at fixed (date, y) positions.
for caption, when, y_position in (
    ('Training Data', '2011-11-06', 60000),
    ('Test Data', '2012-02-24', 250000),
):
    fig.add_annotation(text=caption, x=pd.to_datetime(when), y=y_position)

fig.show()

### Showcasing which events were in both test and train time frames and had to be deleted

In [None]:
# Label every event 'Deleted' when its case id is in drop_set, else 'Used'.
# Series.isin performs the membership test for the whole column at once
# instead of calling a Python lambda per row.
case_in_drop_set = df['case:concept:name'].isin(drop_set)
df['is_depreciated'] = case_in_drop_set.map({True: 'Deleted', False: 'Used'})

In [None]:
# Copy of the full frame (now including 'is_depreciated') with the row index
# exposed as a column for the scatter plot below.
dr = df.reset_index()

In [None]:
# Scatter of every event, coloured by its 'is_depreciated' label.
deletion_labels = {
    'time:timestamp': 'Time (2011-2012)',
    'index': 'Index of the trace',
    'is_depreciated': 'Case Concept Name:',
}
fig1 = px.scatter(
    dr,
    x='time:timestamp',
    y='index',
    color='is_depreciated',
    # NOTE(review): 'is_depreciated' holds the strings 'Deleted' / 'Used', so
    # this continuous scale presumably has no visible effect — confirm.
    color_continuous_scale='YlGn',
    labels=deletion_labels,
    title='Deletion of traces which overlap in the test time set',
    width=1100,
    height=600,
)

# Dashed guides: vertical at the latest timestamp in X, horizontal at the
# 'index' value of the first row of X_test.
guide_style = dict(line_width=1, line_dash="dash", line_color="black")
fig1.add_vline(x=pd.to_datetime(X['time:timestamp'].max()), **guide_style)
fig1.add_hline(y=X_test.head(1)['index'].iloc[0], **guide_style)

# Region captions at fixed (date, y) positions.
for caption, when, y_position in (
    ('Training Data', '2011-11-06', 60000),
    ('Test Data', '2012-02-24', 250000),
):
    fig1.add_annotation(text=caption, x=pd.to_datetime(when), y=y_position)

fig1.show()

In [None]:
# Day of month of each event.
# 'time:timestamp' is loaded by read_csv without date parsing, so it must be
# converted before the .dt accessor can be used.
# utc=True keeps the conversion well-defined when the strings carry differing
# UTC offsets; the day is therefore the UTC calendar day.
# NOTE(review): confirm UTC (rather than local time) is the intended day.
df['day'] = pd.to_datetime(df['time:timestamp'], utc=True).dt.day

In [None]:
# Keep, for every case id, only the row that appears last in df.
is_last_row_of_case = ~df.duplicated(subset=['case:concept:name'], keep='last')
gl = df[is_last_row_of_case]

In [None]:
# Distinct activity names found on the last row of each case.
gl['concept:name'].unique()

### Distribution of the event types with which traces end

In [None]:
# Histogram of the activity names on the last row of each case.
px.histogram(gl['concept:name'])

### Distribution of the event types with which traces start

In [None]:
# Keep, for every case id, only the row that appears first in df.
is_first_row_of_case = ~df.duplicated(subset=['case:concept:name'], keep='first')
hf = df[is_first_row_of_case]

In [None]:
# Histogram of the activity names on the first row of each case.
px.histogram(hf['concept:name'])

### Visualization of all possible traces via the pm4py library

In [None]:
# Load the XES event log from 'BPI_Challenge_2012.xes.gz' with pm4py.
log = pm4.read_xes('BPI_Challenge_2012.xes.gz')

# Discover a heuristics net from the log and display it.
# Named `heuristics_net` rather than `map` so the builtin map() is not shadowed
# for the rest of the notebook session.
heuristics_net = pm4.discover_heuristics_net(log)
pm4.view_heuristics_net(heuristics_net)