In [1]:
'''
This notebook preprocesses the training data and writes the result to a JSON file.
Jay Ho, David Khankin
'''
import json, re

In [2]:
# Path to the raw training data (a JSON file).
train_data_path = "text_train/Subtask_1_2_train.json"
# Load the JSON training data. The encoding is given explicitly so that
# decoding does not depend on the platform's locale default, which could
# mis-decode non-ASCII characters in the file.
with open(train_data_path, "r", encoding="utf-8") as read_file:
    train_dataset = json.load(read_file)

In [3]:
def preprocess(text):
    """Normalize a piece of text into lowercase, whitespace-separated tokens.

    The following steps are applied in order:

    1. Lowercase the text.
    2. Replace the HTML entities &quot; &amp; &gt; &lt; with a space.
    3. Replace tab and the characters , ; " ! ? + = * | ( ) [ ] { } with a space.
    4. Replace runs of two or more dots with a space.
    5. Replace a dot followed by whitespace, and leading/trailing whitespace,
       with a single space.
    6. Replace leading/trailing dots with a space.
    7. Collapse runs of whitespace into a single space.
    8. Remove anything of the form <...> (non-greedy, single line).
    9. Collapse whitespace again and strip both ends.

    Args:
        text: The input string.

    Returns:
        The cleaned string.
    """
    # convert everything to lowercase
    new_str = text.lower()
    # turn HTML entity markup into white space
    new_str = re.sub(r'&quot;|&amp;|&gt;|&lt;', " ", new_str)
    # turn punctuation / special characters into white space
    new_str = re.sub(r'[\t,;"!?+=*|()\[\]{}]', " ", new_str)
    # Raw strings are used for every pattern so that `\.` and `\s` reach the
    # regex engine untouched instead of being parsed as (invalid) string escapes.
    new_str = re.sub(r'\.{2,}', " ", new_str)
    new_str = re.sub(r'\.\s|^\s+|\s+$', " ", new_str)
    new_str = re.sub(r'^\.+|\.+$', " ", new_str)
    new_str = re.sub(r'\s{2,}', " ", new_str)
    new_str = re.sub(r'<.*?>', "", new_str)
    # Deleting a tag can leave two spaces side by side ("a <b> c" -> "a  c"),
    # so whitespace is collapsed once more after tag removal.
    new_str = re.sub(r'\s{2,}', " ", new_str)
    # remove any whitespace left at either end
    return new_str.strip()

In [4]:
# Reshape every raw conversation into a flat record of the form
# {"conversation_id": int,
#  "emo_cause_pairs": [(emotion utterance id, cause utterance id), ...],
#  "utterances": [{"utterance_id": int, "emotion": str-like value,
#                  "speaker": value, "utterance_text": cleaned text}, ...]}
conversation_list = []

for conversation in train_dataset:
    conversation_list.append({
        "conversation_id": int(conversation["conversation_ID"]),
        # Each element of a raw pair is an underscore-separated string; only
        # the part before the first underscore is kept, converted to int.
        "emo_cause_pairs": [
            (int(pair[0].split("_")[0]), int(pair[1].split("_")[0]))
            for pair in conversation["emotion-cause_pairs"]
        ],
        # One dictionary per utterance, with the text run through preprocess().
        "utterances": [
            {
                "utterance_id": int(turn["utterance_ID"]),
                "emotion": turn["emotion"],
                "speaker": turn["speaker"],
                "utterance_text": preprocess(turn["text"]),
            }
            for turn in conversation["conversation"]
        ],
    })

In [5]:
# Serialize the processed conversations to disk as a single JSON document.
with open("emocause_data.json", mode="w") as json_out:
    json.dump(conversation_list, fp=json_out)