In [30]:
from pm4py.objects.log.exporter.xes import factory as xes_exporter
from pm4py.objects.log.util.log import log as pmlog
import json 
from datetime import datetime
from Text_Preprocessing import preprocess_text
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
import numpy as np

In [31]:
# Location of the MSDialog intent dataset, relative to the working directory.
msdialog_filepath = "data/MSDialog/Intent/MSDialog-Intent.json"
# Decode explicitly as UTF-8: without an encoding argument open() uses the
# platform's locale encoding, which can mis-decode or reject non-ASCII text.
with open(msdialog_filepath, encoding="utf-8") as f:
    data = json.load(f)

### 0: TF-IDF Mining

In [32]:
# Flatten every utterance of every thread into one list of preprocessed
# documents (threads in dict order, utterances in list order).
filtered_messages = [
    " ".join(preprocess_text(utterance["utterance"]))
    for conversation in data.values()
    for utterance in conversation["utterances"]
]

# Vectorizer configuration goes here; IDF weighting is requested explicitly.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit on the full list of documents and keep the resulting matrix.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(filtered_messages)


### Helper Functions

**Tags Rule Set**  
Used to filter the tags in the dataset, to make the output .xes file more comprehensible. 
Filters:
* Remove Greetings if there are other classifications

**Tags to Set**  
Takes a string of tags and converts it to a list of tags. Also filters the tags with the **Tags Rule Set**.  
Example.  "IR OQ NF" -> ["IR", "OQ", "NF"]

**Utterance to event**  
Creates a PM4PY event object from each utterance. 

In [33]:
def get_max_word(item):
    """Return the vocabulary term with the highest TF-IDF weight in one row.

    Parameters
    ----------
    item : int
        Row index into ``tfidf_vectorizer_vectors``.

    Returns
    -------
    str
        The term whose column holds the largest weight in that row, or ``""``
        when the row has no positive weight (e.g. a message that produced no
        tokens), since no term can meaningfully be called its topic.
    """
    row = tfidf_vectorizer_vectors[item]
    # For an all-zero row argmax would return 0 and the message would be
    # labelled with whichever term happens to own column 0.
    if row.max() <= 0:
        return ""
    max_elem = np.argmax(row)
    # vocabulary_ maps term -> column index; scan it lazily and stop at the
    # first match instead of materialising two full lists on every call.
    return next(
        word
        for word, column in tfidf_vectorizer.vocabulary_.items()
        if column == max_elem
    )

def tags_rule_set(tags_list):
    """Apply the tag filtering rules and return the filtered tags.

    Rule: when more than one tag is present, the greeting tag ``"GG"`` is
    dropped so that the remaining classifications describe the utterance.
    A lone ``"GG"`` is kept.

    Parameters
    ----------
    tags_list : list of str
        Tags of a single utterance.

    Returns
    -------
    list of str
        A new list with the rules applied; ``tags_list`` itself is not modified.
    """
    # Work on a copy so the caller's list is never mutated as a side effect.
    filtered_tags = list(tags_list)

    # Remove Greetings only if there are other classifications.
    if len(filtered_tags) > 1 and "GG" in filtered_tags:
        filtered_tags.remove("GG")

    return filtered_tags

def tags_to_set(tags_string):
    """Convert a whitespace-separated tag string into a list of unique tags.

    The tags are de-duplicated, sorted, and then passed through
    ``tags_rule_set``. Sorting makes the result deterministic: the iteration
    order of a ``set`` of strings is arbitrary, so the same combination of
    tags could otherwise come out in different orders.

    Example: ``"IR OQ NF"`` -> ``["IR", "NF", "OQ"]``

    Parameters
    ----------
    tags_string : str
        Tags separated by whitespace.

    Returns
    -------
    list of str
        Unique tags in sorted order, filtered by ``tags_rule_set``.
    """
    # split() without an argument ignores repeated/leading/trailing whitespace,
    # so no empty-string pseudo-tags are produced.
    unique_tags = sorted(set(tags_string.split()))
    return tags_rule_set(unique_tags)

def date_to_datetime(date_string):
    """Parse a ``YYYY-MM-DDTHH:MM:SS`` timestamp string into a ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) if the string does not
    match that layout.
    """
    timestamp_layout = '%Y-%m-%dT%H:%M:%S'
    parsed = datetime.strptime(date_string, timestamp_layout)
    return parsed

def utterance_to_event(utterance, tf_idf_index):
    """Build a PM4PY ``Event`` from a single utterance record.

    Parameters
    ----------
    utterance : dict
        Record providing the keys ``"user_id"``, ``"tags"`` and
        ``"utterance_time"``.
    tf_idf_index : int
        Index forwarded to ``get_max_word`` to look up the utterance's topic.

    Returns
    -------
    pmlog.Event
        Event carrying the resource, the filtered tag list as its name, the
        parsed timestamp and the topic word.
    """
    attributes = {
        "org:resource": utterance["user_id"],
        "concept:name": tags_to_set(utterance["tags"]),
        "time:timestamp": date_to_datetime(utterance["utterance_time"]),
        "topics": get_max_word(tf_idf_index),
    }
    return pmlog.Event(attributes)

### Log Extract

In [34]:
# Event log that collects one trace per conversation thread.
log = pmlog.EventLog()
# Running utterance counter handed to utterance_to_event; it advances once per
# utterance across all threads.
tf_idf_index = 0
for conversation in data.values():
    trace = pmlog.Trace()
    for utterance in conversation["utterances"]:
        trace.append(utterance_to_event(utterance, tf_idf_index))
        tf_idf_index += 1
    log.append(trace)

# Write the finished log to disk in XES format.
xes_exporter.export_log(log, "MSDialogLog.xes")