In [10]:
import os
from lxml import etree
import csv
import time


In [2]:
a=100

In [3]:
# path="datasets/cm2.5k.mxml"
filename="PrepaidTravelCost.xes"
dataset=filename[:filename.rindex(".")]
path="datasets/"+filename

In [4]:
def Parse(in_path):
    pathExtension = os.path.splitext(in_path)[1]
    traceControlFlow = []
    if pathExtension == ".MXML" or pathExtension == ".mxml":
        traceControlFlow,events = parseMXML(in_path)
    elif pathExtension == ".XES" or pathExtension == ".xes":
        doc = etree.parse(in_path)
        root = doc.getroot()
        if len(root.nsmap) < 1:
            traceControlFlow,events = parseXESNoXmlns(in_path)
        else:
            prefix = root.nsmap[None]
            traceControlFlow,events = parseXES(in_path, prefix)
    else:
        print("Error: the file must be .XES or .MXML! Please upload again.")
    return traceControlFlow,events

def parseMXML(in_path):
    traces = []
    events = set()
    tree = etree.parse(in_path)
    root = tree.getroot()
    process = root.xpath('./Process')[0]
    allTraces = process.xpath('./ProcessInstance')
    for case in allTraces:
        trace = []
        for subNode in case.iterfind('AuditTrailEntry'):
            dictEvent = parseMxmlEvent(subNode)
            dictEvent["id"]=case.attrib["id"]
            trace.append(dictEvent)
            events.add(dictEvent["name"])
        traces.append(trace)
    traceControlFlow = onlyControlFlow(traces)
    return traces,events

def parseXESNoXmlns(in_path):
    traces = []
    events = set()
    doc = etree.parse(in_path)
    allTraces = doc.xpath('trace')
    for case in allTraces:
        trace = []
        eventIntrace = case.xpath('./event')
        for subNode in eventIntrace:
            dictEvent = parseXesEventNoXmlns(subNode)
            trace.append(dictEvent)
            events.add(dictEvent["name"])
        traces.append(trace)
    # traceControlFlow = onlyControlFlow(traces)
    return traces,events
def parseXES(in_path, prefix):
    traces = []
    events = set()
    doc = etree.parse(in_path)
    allTraces = doc.xpath('//pre:trace', namespaces={"pre": prefix})
    for case in allTraces:
        trace = []
        eventIntrace = case.xpath('./pre:event', namespaces={"pre": prefix})
        for subNode in eventIntrace:
            dictEvent = parseXesEvent(subNode, prefix)
            trace.append(dictEvent)
            events.add(dictEvent["name"])
        traces.append(trace)
    # traceControlFlow = onlyControlFlow(traces)
    return traces,events

def parseMxmlEvent(subNode):
    dictEvent = dict()
    for item in subNode:
        if item.tag == 'WorkflowModelElement':
            dictEvent['name'] = item.text.strip()
        elif item.tag == 'EventType':
            dictEvent['type'] = item.text.strip()
        elif item.tag == 'Timestamp' or item.tag == 'timestamp':
            dictEvent['timestamp'] = item.text.strip()
    return dictEvent

def parseXesEventNoXmlns(subNode):
    dictEvent = dict()
    stringNode = subNode.xpath('./string')
    dateNode = subNode.xpath('./date')

    for item in stringNode:
        if item.get('key') == 'concept:name':
            dictEvent['name'] = item.get('value').strip()
        elif item.get('key') == 'lifecycle:transition':
            dictEvent['type'] = item.get('value').strip()
    for item in dateNode:
        if item.get('key') == 'time:timestamp':
            dictEvent['timestamp'] = item.get('value').strip()
    return dictEvent
def parseXesEvent(subNode, prefix):
    dictEvent = dict()
    if prefix == None:
        stringNode = subNode.xpath('./pre:string')
        dateNode = subNode.xpath('./pre:date')
    else:
        stringNode = subNode.xpath('./pre:string', namespaces={"pre": prefix})
        dateNode = subNode.xpath('./pre:date', namespaces={"pre": prefix})

    for item in stringNode:
        if item.get('key') == 'concept:name':
            dictEvent['name'] = item.get('value').strip()
        elif item.get('key') == 'lifecycle:transition':
            dictEvent['type'] = item.get('value').strip()
    for item in dateNode:
        if item.get('key') == 'time:timestamp':
            dictEvent['timestamp'] = item.get('value').strip()
    return dictEvent
def onlyControlFlow(traces):
    listControl = []
    for t in traces:
        tempTrace = []
        for t_event in t:
            if t_event['type'] == "complete" or t_event['type'] == "COMPLETE":
                tempTrace.append(t_event['name'])
            else:
                pass
        listControl.append(tempTrace)
    return listControl

In [5]:
traces,events=Parse(path)

In [8]:
len(events)

29

In [11]:
traces[0][0]['timestamp']

'2017-01-09T14:48:43.000+01:00'

In [None]:
len(traces)

In [None]:
traces[0]

In [15]:
def time_format(ttime):
        ttime=" ".join(ttime.split("T"))
        ttime=ttime.split(".")[0]
        try:
            date_format_str = '%Y-%m-%d %H:%M:%S'
            conversion = time.strptime(ttime, date_format_str)
        except:
            date_format_str = '%Y/%m/%d %H:%M:%S'
            conversion = time.strptime(ttime, date_format_str)
        return time.mktime(conversion)

In [16]:
def analyze(traces,events):
    num_activity=len(events)
    num_trace=len(traces)
    num_event=0
    average_trace_length=0
    max_trace_length=0
    max_event_duration=0
    average_event_duration=0
    max_trace_duration=0
    average_trace_duration=0
    event_dif_list=[]
    trace_dif_list=[]
    for trace in traces:
        start=0
        pre=0
        end=0
        num_event+=len(trace)
        max_trace_length=max(max_trace_length,len(trace))
        for event in trace:
            if start==0:
                start=time_format(event["timestamp"])
                pre=start
            else:
                cur=time_format(event["timestamp"])
                event_dif_list.append(cur-pre)
                pre=cur
        trace_dif_list.append(cur-start)
    max_trace_duration=max(trace_dif_list)
    max_event_duration=max(event_dif_list)
    average_event_duration=sum(event_dif_list)/len(event_dif_list)
    average_trace_duration=sum(trace_dif_list)/len(trace_dif_list)
    average_trace_length=num_event/len(traces)
    print("活动数量：",num_activity)
    print("事件数量：",num_event)
    print("轨迹数量：",num_trace)
    print("平均轨迹长度：",average_trace_length)
    print("最大轨迹长度：",max_trace_length)
    print("最大活动持续时间：",max_event_duration)
    print("平均活动持续时间：",average_event_duration)
    print("最大轨迹持续时间：",max_trace_duration)
    print("平均轨迹持续时间：",average_trace_duration)
            

In [17]:
analyze(traces,events)

活动数量： 29
事件数量： 18246
轨迹数量： 2099
平均轨迹长度： 8.692710814673655
最大轨迹长度： 21
最大活动持续时间： 27376881.0
平均活动持续时间： 413261.8457298569
最大轨迹持续时间： 28077926.0
平均轨迹持续时间： 3189328.243925679


In [None]:
event_list=list(events)
event_list.sort()
events_index=[i for i in range(len(event_list))]
event_dict={event_list[i]:i for i in range(len(event_list))}
if not os.path.exists("datasets/"+dataset):
    print("sure does")
    os.mkdir("datasets/"+dataset)
with open("datasets/"+dataset+"/"+dataset+".csv","w",encoding='utf8',newline='') as f:
    writer=csv.DictWriter(f,fieldnames=['id','name','type',"timestamp"])
    writer.writeheader()
    for i in range(len(traces)):
        for j in range(len(traces[i])):
            if traces[i][j]["type"]=="assign":
                continue
            traces[i][j]["id"]=i
            traces[i][j]["name"]=event_dict[traces[i][j]["name"]]
            writer.writerow(traces[i][j])


In [None]:
event_dict