In [1]:
## データセットemotional_persuasive_dialogueを整形してデータベースに格納する
## このセルは1回実行しておくだけでOK

import random
import sqlite3
from pathlib import Path
import time

def select_label(mode, label_options):
    """
    Choose the majority label for one utterance from its annotator votes.

    Up to three votes are drawn at random from ``label_options``; a label
    that appears at least twice among the drawn votes is adopted.

    Args:
        mode: ``"system_emotion"`` or ``"user_emotion"`` to match against the
            emotion labels, ``"user_acceptance"`` to match against the
            acceptance levels ``"1"`` .. ``"5"``. Any other value matches
            nothing and yields ``"NONE"``.
        label_options: Hyphen-separated votes, e.g. ``"平静-平静-安心-怒り"``.
            Surrounding whitespace (such as a trailing newline) on each vote
            is ignored.

    Returns:
        The label that received two or more of the drawn votes, or the
        string ``"NONE"`` when no label did.
    """
    votes = label_options.split("-")
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size for utterances with fewer
    # than three votes instead of crashing the whole import.
    sampled = random.sample(votes, min(3, len(votes)))
    # Acceptance votes sit at the end of the line and carry the newline;
    # without stripping it the last vote would never match a known label.
    sampled = [vote.strip() for vote in sampled]

    if mode in ("system_emotion", "user_emotion"):
        known_labels = ["平静", "安心", "喜び", "怒り", "悲しみ"]
    elif mode == "user_acceptance":
        known_labels = [str(level) for level in range(1, 6)]
    else:
        known_labels = []

    # With at most three votes drawn, at most one label can reach two votes,
    # so the first hit is the only hit.
    for label in known_labels:
        if sampled.count(label) >= 2:
            return label
    return "NONE"

def data_preperation(
    origin_path="../emotional_persuasive_dialogue/",
    db_path="dialogue_corpus.db",
    scenes=("cleaning", "exercise", "game", "lunch", "sleep"),
    dialogues_per_scene=200,
):
    """
    Load the labelled dialogue files into an SQLite database.

    For every scene and every dialogue index ``i`` in
    ``range(dialogues_per_scene)`` the file
    ``<origin_path>/<scene>/<i>.txt.label`` is read and a table named
    ``<scene><i>`` is created with the columns
    ``talker | talk_content | emotion | acceptance``. Each line of the file
    becomes one row:

    * a line whose first field contains ``ロボット`` is stored as talker
      ``"system"`` with acceptance ``"-"``;
    * a line whose first field contains ``A君`` is stored as talker
      ``"user"``;
    * any other line is stored as a row of empty strings.

    Emotion and acceptance values are whatever ``select_label`` returns for
    the hyphen-separated votes, which may be ``"NONE"``. Per the original
    notes, such rows are kept here and are meant to be dropped at training
    time.

    Args:
        origin_path: Directory that contains one sub-directory per scene.
        db_path: SQLite database file to write to.
        scenes: Names of the scene sub-directories to import.
        dialogues_per_scene: Number of dialogue files per scene, numbered
            from 0.

    Raises:
        sqlite3.OperationalError: If a table to be created already exists
            (the import is intended to be run once per database).
        OSError: If a dialogue file cannot be opened.
    """
    corpus_root = Path(origin_path)
    db = sqlite3.connect(db_path)
    try:
        c = db.cursor()
        for scene in scenes:
            for i in range(dialogues_per_scene):
                path = corpus_root / scene / (str(i) + ".txt.label")

                # Identifiers cannot be bound with "?", so the table name is
                # quoted (with embedded quotes doubled) before being spliced
                # into the statements.
                table_name = scene + str(i)
                quoted_table = '"' + table_name.replace('"', '""') + '"'
                c.execute(
                    "create table " + quoted_table
                    + " (talker text, talk_content text, emotion text, acceptance text)"
                )
                # Values, unlike identifiers, are always passed as bound
                # parameters.
                sql = (
                    "insert into " + quoted_table
                    + " (talker, talk_content, emotion, acceptance) values (?, ?, ?, ?)"
                )

                with path.open('r', encoding='utf-8') as file:
                    for line in file:
                        # e.g. ['ロボット1', 'Ａ君、お部屋を掃除をしましょう。', '平静-平静-安心-怒り', '\n']
                        columns = line.split(",")
                        talker, talk_content, emotion, acceptance = "", "", "", ""
                        if "ロボット" in columns[0]:
                            # System utterance: no acceptance label exists.
                            talker = "system"
                            talk_content = columns[1]
                            emotion = select_label("system_emotion", columns[2])
                            acceptance = "-"
                        elif "A君" in columns[0]:
                            # User utterance: both label kinds are present.
                            talker = "user"
                            talk_content = columns[1]
                            emotion = select_label("user_emotion", columns[2])
                            acceptance = select_label("user_acceptance", columns[3])

                        c.execute(sql, (talker, talk_content, emotion, acceptance))

                # Commit once per dialogue file rather than once per row; the
                # stored data is the same but far fewer transactions are made.
                db.commit()
                print("Table:{} has been registered.".format(table_name))
            print("Scene:{} has been registered.".format(scene))
    finally:
        # Release the database file even when a file or SQL error aborts
        # the import.
        db.close()
    print("{} has been structed completely.".format(db_path))

# Run the one-off import and report how long it took in wall-clock seconds.
before_time = time.time()
data_preperation()
after_time = time.time()
elapsed_seconds = after_time - before_time
print("By time.time(): data_preperation took about {} seconds.".format(elapsed_seconds))

Table:cleaning0 has been registered.
Table:cleaning1 has been registered.
Table:cleaning2 has been registered.
Table:cleaning3 has been registered.
Table:cleaning4 has been registered.
Table:cleaning5 has been registered.
Table:cleaning6 has been registered.
Table:cleaning7 has been registered.
Table:cleaning8 has been registered.
Table:cleaning9 has been registered.
Table:cleaning10 has been registered.
Table:cleaning11 has been registered.
Table:cleaning12 has been registered.
Table:cleaning13 has been registered.
Table:cleaning14 has been registered.
Table:cleaning15 has been registered.
Table:cleaning16 has been registered.
Table:cleaning17 has been registered.
Table:cleaning18 has been registered.
Table:cleaning19 has been registered.
Table:cleaning20 has been registered.
Table:cleaning21 has been registered.
Table:cleaning22 has been registered.
Table:cleaning23 has been registered.
Table:cleaning24 has been registered.
Table:cleaning25 has been registered.
Table:cleaning26 has b

Table:exercise13 has been registered.
Table:exercise14 has been registered.
Table:exercise15 has been registered.
Table:exercise16 has been registered.
Table:exercise17 has been registered.
Table:exercise18 has been registered.
Table:exercise19 has been registered.
Table:exercise20 has been registered.
Table:exercise21 has been registered.
Table:exercise22 has been registered.
Table:exercise23 has been registered.
Table:exercise24 has been registered.
Table:exercise25 has been registered.
Table:exercise26 has been registered.
Table:exercise27 has been registered.
Table:exercise28 has been registered.
Table:exercise29 has been registered.
Table:exercise30 has been registered.
Table:exercise31 has been registered.
Table:exercise32 has been registered.
Table:exercise33 has been registered.
Table:exercise34 has been registered.
Table:exercise35 has been registered.
Table:exercise36 has been registered.
Table:exercise37 has been registered.
Table:exercise38 has been registered.
Table:exerci

Table:game29 has been registered.
Table:game30 has been registered.
Table:game31 has been registered.
Table:game32 has been registered.
Table:game33 has been registered.
Table:game34 has been registered.
Table:game35 has been registered.
Table:game36 has been registered.
Table:game37 has been registered.
Table:game38 has been registered.
Table:game39 has been registered.
Table:game40 has been registered.
Table:game41 has been registered.
Table:game42 has been registered.
Table:game43 has been registered.
Table:game44 has been registered.
Table:game45 has been registered.
Table:game46 has been registered.
Table:game47 has been registered.
Table:game48 has been registered.
Table:game49 has been registered.
Table:game50 has been registered.
Table:game51 has been registered.
Table:game52 has been registered.
Table:game53 has been registered.
Table:game54 has been registered.
Table:game55 has been registered.
Table:game56 has been registered.
Table:game57 has been registered.
Table:game58 h

Table:lunch65 has been registered.
Table:lunch66 has been registered.
Table:lunch67 has been registered.
Table:lunch68 has been registered.
Table:lunch69 has been registered.
Table:lunch70 has been registered.
Table:lunch71 has been registered.
Table:lunch72 has been registered.
Table:lunch73 has been registered.
Table:lunch74 has been registered.
Table:lunch75 has been registered.
Table:lunch76 has been registered.
Table:lunch77 has been registered.
Table:lunch78 has been registered.
Table:lunch79 has been registered.
Table:lunch80 has been registered.
Table:lunch81 has been registered.
Table:lunch82 has been registered.
Table:lunch83 has been registered.
Table:lunch84 has been registered.
Table:lunch85 has been registered.
Table:lunch86 has been registered.
Table:lunch87 has been registered.
Table:lunch88 has been registered.
Table:lunch89 has been registered.
Table:lunch90 has been registered.
Table:lunch91 has been registered.
Table:lunch92 has been registered.
Table:lunch93 has be

Table:sleep96 has been registered.
Table:sleep97 has been registered.
Table:sleep98 has been registered.
Table:sleep99 has been registered.
Table:sleep100 has been registered.
Table:sleep101 has been registered.
Table:sleep102 has been registered.
Table:sleep103 has been registered.
Table:sleep104 has been registered.
Table:sleep105 has been registered.
Table:sleep106 has been registered.
Table:sleep107 has been registered.
Table:sleep108 has been registered.
Table:sleep109 has been registered.
Table:sleep110 has been registered.
Table:sleep111 has been registered.
Table:sleep112 has been registered.
Table:sleep113 has been registered.
Table:sleep114 has been registered.
Table:sleep115 has been registered.
Table:sleep116 has been registered.
Table:sleep117 has been registered.
Table:sleep118 has been registered.
Table:sleep119 has been registered.
Table:sleep120 has been registered.
Table:sleep121 has been registered.
Table:sleep122 has been registered.
Table:sleep123 has been register

In [1]:
import my_data_preparation_kit as my_dpkit

## 1つ下のセルを実行した時は、my_data_preparation_kitの内容をこのファイルに直接書いていた。
## それで実行したら、PCがフリーズした（出力ログの溜まりすぎ？）。
## -> データの前処理の関数を1つのpythonファイルにまとめる。
## -> 実際の処理は複数のノートブックに分割し、それぞれで上記のpythonファイルをimportして利用する。
## これで行けるはず。

In [2]:
## Build word_table and synonym_table and save them in JSON format.
## This cell only needs to be run once; after that, load the JSON files instead.

my_dpkit.record_word_and_synonym()

Finished getting all user talks from dialogue_corpus.db: about 0.21516919136047363 seconds.
Finished making word_table, synonym_table: about 19292.525589942932 seconds.
Finished registering word_table as a dict: word_table_data.json
Finished registering synonym_table as a dict: synonym_table_data.json
Finished saving word_table, synonym_table: about 0.16710400581359863 seconds.
