In [146]:
import json
import os
import numpy as np
import pandas as pd

# Canonical emotion names. The position of a name in this list is its numeric label id.
emotion_label_num_to_emotion = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral', 'excited', 'frustrated']

# Alternative spellings that share the label id of their canonical name.
# NOTE(review): 'happines' is kept as its own key - presumably a misspelling that
# occurs in the raw labels; verify against the dataset before removing it.
_emotion_aliases = {
    'angry': ('anger',),
    'happy': ('happines', 'happiness'),
    'sad': ('sadness',),
    'surprise': ('surprised',),
}

# Lookup from every accepted spelling (canonical name or alias) to its label id.
# Each canonical name is inserted first, immediately followed by its aliases.
emotion_label_policy = {
    spelling: label_id
    for label_id, canonical in enumerate(emotion_label_num_to_emotion)
    for spelling in (canonical, *_emotion_aliases.get(canonical, ()))
}

In [147]:
# Paths of the three dataset JSON files, in train / valid / test order.
datasets = ['../dataset/dailydialog_train.json', '../dataset/dailydialog_valid.json', '../dataset/dailydialog_test.json']

# Parse every split file into its own dictionary.
split_dicts = []
for split_path in datasets:
    with open(split_path, 'r', encoding='utf-8') as split_file:
        split_dicts.append(json.load(split_file))
train, valid, test = split_dicts

# Merge the three dictionaries into one. The merge happens in place:
# `valid` absorbs `test`, then `train` absorbs `valid`, and `total_data`
# is the very same object as `train`.
valid.update(test)
train.update(valid)
total_data = train

# Dump the merged data to 'dailydialog_total.json' only when that file does not
# exist yet; an already existing file is left untouched.
total_json_path = '../dataset/dailydialog_total.json'
if not os.path.isfile(total_json_path):
    with open(total_json_path, 'w', encoding='utf-8') as total_file:
        json.dump(total_data, total_file, ensure_ascii=False, indent='\t')

# All dialog ids, in the key order of the merged dictionary.
dialog_ids = list(total_data)

## Dialog_table_dict 만드는 코드

In [148]:
# Column layout of the utterance-level table (one row per utterance).
table_columns = ['dialog_id', 'turn', 'speaker', 'utterance', 'emotion', 'cause turn', 'cause span', 'cause_type', 'explanation']


def _utterance_to_row(dialog_id, utterance):
    """Convert one utterance dict into a row ordered like ``table_columns``.

    The raw emotion label is mapped to its canonical name through
    ``emotion_label_policy`` / ``emotion_label_num_to_emotion`` and written back
    into ``utterance['emotion']``, so ``total_data`` ends up holding canonical
    labels as well.

    Neutral utterances get empty strings in the four cause-related columns. For
    every other emotion the cause fields are read from the utterance, and
    ``explanation`` falls back to '' when the key is absent.

    Raises:
        KeyError: if the emotion label is not a key of ``emotion_label_policy``,
            or if a non-neutral utterance lacks one of the cause fields.
    """
    emotion = emotion_label_num_to_emotion[emotion_label_policy[utterance['emotion']]]
    utterance['emotion'] = emotion

    if emotion == 'neutral':
        cause_fields = ['', '', '', '']
    else:
        cause_fields = [
            utterance['expanded emotion cause evidence'],
            utterance['expanded emotion cause span'],
            utterance['type'],
            utterance.get('explanation', ''),
        ]
    return [dialog_id, utterance['turn'], utterance['speaker'], utterance['utterance'], emotion, *cause_fields]


# Collect all rows first and build the DataFrame once; concatenating a
# single-row frame per utterance copies the whole table on every iteration.
# The utterance list of a dialog is the first element of its entry in total_data.
utterance_rows = [
    _utterance_to_row(dialog_id, utterance)
    for dialog_id, dialog_entry in total_data.items()
    for utterance in dialog_entry[0]
]
dialog_table_all = pd.DataFrame(utterance_rows, columns=table_columns)

# Split the table into one DataFrame per dialog with a single groupby pass.
# Each per-dialog frame keeps the row index it has in dialog_table_all.
rows_by_dialog = {
    dialog_id: dialog_rows
    for dialog_id, dialog_rows in dialog_table_all.groupby('dialog_id', sort=False)
}
# Iterate over total_data so that every dialog id gets an entry, in the original
# key order; a dialog without utterances maps to an empty frame.
empty_dialog = dialog_table_all.iloc[0:0]
dialog_table_dict = {
    dialog_id: rows_by_dialog.get(dialog_id, empty_dialog)
    for dialog_id in total_data
}


## Dialog_table 만드는 코드

In [171]:
# Build the dialog-level table: one row per dialog holding its id, a
# representative emotion and its length (number of utterance rows).
table_columns = ['dialog_id', 'emotion', 'len']

dialog_rows = []
for dialog_id, dialog in dialog_table_dict.items():
    # value_counts() lists labels from most to least frequent, so the first
    # label that is not 'neutral' is the most frequent non-neutral emotion.
    # A dialog with only neutral utterances (or no utterances) stays 'neutral'.
    emotion = next(
        (label for label in dialog['emotion'].value_counts().index if label != 'neutral'),
        'neutral',
    )
    dialog_rows.append([dialog_id, emotion, len(dialog)])

# Build the DataFrame once instead of concatenating one row per dialog.
dialog_table = pd.DataFrame(dialog_rows, columns=table_columns)

In [172]:
# Flatten dialog_table_dict back into a single utterance-level DataFrame.
table_columns = ['dialog_id', 'turn', 'speaker', 'utterance', 'emotion', 'cause turn', 'cause span', 'cause_type', 'explanation']

per_dialog_frames = list(dialog_table_dict.values())
if per_dialog_frames:
    # One concat over all frames; concatenating inside a loop would copy the
    # growing table once per dialog.
    dialog_table_all = pd.concat(per_dialog_frames, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # table that still has the expected columns.
    dialog_table_all = pd.DataFrame(columns=table_columns)

## 정리한 데이터
<b><i>dialog_table_dict</i></b>: dialog_id를 key로 갖는 dictionary. 각 dialog는 DataFrame 형태로 utterance들을 가짐  <br>
<b><i>dialog_table</i></b>: DataFrame 타입. Dialog 단위로 (dialog_id, emotion(neutral제외), 대화길이(발화수))를 가짐 <br>

In [173]:
# Display the utterance-level table of a single dialog (id 'tr_4466') as a sample
dialog_table_dict['tr_4466']

Unnamed: 0,dialog_id,turn,speaker,utterance,emotion,cause turn,cause span,cause_type,explanation
0,tr_4466,1,A,"Hey , you wanna see a movie tomorrow ?",happy,[1],[see a movie tomorrow ?],[no-context],
1,tr_4466,2,B,Sounds like a good plan . What do you want to ...,happy,[1],[see a movie tomorrow ?],[inter-personal],
2,tr_4466,3,A,How about Legally Blonde .,neutral,,,,
3,tr_4466,4,B,"Ah , my girlfriend wanted to see that movie . ...",neutral,,,,
4,tr_4466,5,A,Isn't that a scary movie ?,neutral,,,,
5,tr_4466,6,B,"How scary can it be ? Come on , it'll be fun .",neutral,,,,
6,tr_4466,7,A,Ok . I'll give it a try .,happy,[6],[it'll be fun .],[inter-personal],
7,tr_4466,8,B,That's the spirit . I'll see you tomorrow afte...,happy,"[6, 7]","[it'll be fun ., Ok . I'll give it a try .]",[hybrid],
8,tr_4466,9,A,Ok . See you tomorrow .,happy,"[1, 6]","[see a movie tomorrow ?, it'll be fun .]",[hybrid],


In [174]:
# Show how the dialog-level emotion labels in dialog_table are distributed
dialog_table['emotion'].value_counts()

happy       789
angry       115
sad          83
surprise     69
disgust      34
fear         16
Name: emotion, dtype: int64

In [175]:
# Check the type of the flattened utterance-level table
type(dialog_table_all)

pandas.core.frame.DataFrame

In [188]:
# Save both tables as intermediate ("before_processed") CSV files, without the
# index column.
# NOTE(review): the files are opened in text mode without newline=''. The pandas
# to_csv documentation asks for newline='' when a file object is passed; without
# it, platforms that translate '\n' to '\r\n' on write can end up with an extra
# blank line after every row. The intermediate files are cleaned up by a later
# cell, so the way they are written is deliberately left unchanged here.
csv_exports = [
    (dialog_table_all, '../dataset/dailydialog_table_all_before_processed.csv'),
    (dialog_table, '../dataset/dailydialog_table_before_processed.csv'),
]
for table, csv_path in csv_exports:
    with open(csv_path, 'w', encoding='utf-8') as csv_file:
        table.to_csv(csv_file, index=False)

In [189]:
# Display the dialog-level table (columns: dialog_id, emotion, len)
dialog_table

Unnamed: 0,dialog_id,emotion,len
0,tr_4466,happy,9
1,tr_7536,angry,10
2,tr_754,happy,8
3,tr_4110,happy,5
4,tr_3432,happy,5
...,...,...,...
1101,te_999,happy,12
1102,tr_9789,happy,7
1103,te_767,sad,6
1104,te_796,happy,8


In [190]:
# Intermediate CSV files and the final files they are cleaned into (same order).
path_list = ['../dataset/dailydialog_table_before_processed.csv', '../dataset/dailydialog_table_all_before_processed.csv']
output_path_list = ['../dataset/dailydialog_table.csv', '../dataset/dailydialog_table_all.csv']

# Copy each intermediate CSV to its final location without blank lines.
# Only lines that are empty apart from their line terminator are removed.
# Filtering by line position (keeping every other line) is only correct when
# every row is followed by exactly one blank line; on a file without that
# pattern it would silently throw away half of the rows. Filtering by content
# gives the same result for the alternating pattern and never discards a
# non-empty line.
for path, output_path in zip(path_list, output_path_list):
    with open(path, 'r', encoding='utf-8') as f:
        kept_lines = [line for line in f if line.rstrip('\r\n')]
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(kept_lines)