In [7]:
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [8]:
from dateutil.parser import parse
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures directly below the cell that creates them.
%matplotlib inline
# Setting this option to None turns off pandas' chained-assignment
# (SettingWithCopy) warning for the whole session. It only hides the warning;
# an assignment that lands on a temporary copy is still silently lost.
pd.options.mode.chained_assignment = None

In [9]:
# Event log to process. Change this single assignment to switch datasets.
# Other names that were used here: 'bpi_2012', 'bpi_2013', 'large_log'
name = 'small_log'

# Notebook settings, exposed as attributes (args.data_dir, args.train_pct, ...).
# The CSV file name and the input directory are both derived from `name`.
args = argparse.Namespace(
    data_dir='../data/',
    data_file=name + '.csv',
    input_dir='../input/{}/'.format(name),
    train_pct=0.6,
    val_pct=0.2,
    anomaly_pct=0.1,
    scaler='standardization',
)

# Load data

In [10]:
# Only consider Case, Activity, Timestamp
cols = ['CaseID', 'Activity', 'CompleteTimestamp']

# Raw CSV header -> column name used in the rest of the notebook.
raw_to_clean = {
    'Case ID': 'CaseID',
    'Activity': 'Activity',
    'Complete Timestamp': 'CompleteTimestamp',
}

data = pd.read_csv(args.data_dir + args.data_file, usecols=list(raw_to_clean))

# read_csv returns the selected columns in *file* order, not in the order
# given to `usecols`, so rename by label (never by position) and then pin the
# column order to `cols` for code that indexes columns positionally.
data = data.rename(columns=raw_to_clean)[cols]

# Format for each column
# Case IDs are space-separated strings; keep the second token and make it
# numeric. A value without a space raises here rather than passing through.
data['CaseID'] = pd.to_numeric(data['CaseID'].apply(lambda x: x.split(' ')[1]))

# For Timestamp: Convert to datetime. errors='coerce' turns unparseable
# values into NaT instead of raising.
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')

In [11]:
# Preview the first rows to check the renamed columns and parsed values.
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Activity A,1970-01-01 09:00:00
1,1,Activity B,1970-01-01 10:00:00
2,1,Activity C,1970-01-01 11:00:00
3,1,Activity D,1970-01-01 12:00:00
4,1,Activity E,1970-01-01 13:00:00


In [12]:
# Number of events recorded for each activity label.
data['Activity'].value_counts()

Activity A    2000
Activity F    2000
Activity L    2000
Activity N    2000
Activity G    2000
Activity I    2000
Activity H    2000
Activity B    2000
Activity K    2000
Activity J    2000
Activity D    2000
Activity E    2000
Activity C    2000
Activity M    2000
Name: Activity, dtype: int64

In [13]:
# Calculate each case's duration in seconds: the timestamp of its last row
# minus the timestamp of its first row (rows are taken in their existing order).
#
# Group by the column label rather than a one-element list: iterating a
# groupby built from ['CaseID'] yields 1-tuple keys such as (1,) on
# pandas >= 2.0, which would end up as the keys of case_dict.
groupByCase = data.groupby('CaseID')
case_dict = {}

for case, group in groupByCase:
    # Select the timestamp column by name so the result does not depend on
    # the column order of `data`.
    timestamps = group['CompleteTimestamp']
    starting_time = timestamps.iloc[0]
    ending_time = timestamps.iloc[-1]
    duration = (ending_time - starting_time).total_seconds()
    case_dict[case] = duration

In [7]:
# One row per case: each (case id, duration) pair of case_dict fills the two columns.
duration_df = pd.DataFrame(list(case_dict.items()), columns=['CaseID', 'Duration'])

In [8]:
# Preview the per-case durations (seconds).
duration_df.head()

Unnamed: 0,CaseID,Duration
0,1,21600.0
1,2,21600.0
2,3,21600.0
3,4,21600.0
4,5,21600.0


In [9]:
# Summary statistics of case duration in seconds. Only the 25th percentile is
# requested; describe() adds the median (50%) on its own.
duration_df['Duration'].describe(percentiles=[0.25])

count    15000.0
mean     21600.0
std          0.0
min      21600.0
25%      21600.0
50%      21600.0
max      21600.0
Name: Duration, dtype: float64

In [None]:
# Display the Duration column (one value per case, in seconds).
duration_df['Duration']