In [1]:
import random
import sqlite3
from pathlib import Path

def select_label(mode, label_options):
    """
    受諾度合い、感情ラベルで、3つ中2つ以上一致している
    ラベルをその発話のラベルとして採用する
    """
    
    # ラベル候補のリストからランダムに3つ要素を取得する
    label_options = random.sample(label_options.split("-"), 3)
    # 改行文字を取り除く
    # 感情状態ラベルの末尾には「,」が付いていて、改行文字が末尾に来ない
    # 受諾度合いラベルの末尾には「,」が付いていないため、改行文字が末尾に来てしまう
    # 改行文字を除去しないと、ラベルの付与数を正常にcountできない
    for i in range(len(label_options)):
        label_options[i] = label_options[i].strip()
    
    # 感情ラベルと受諾度合いラベルのパターン
    emotions = ["平静", "安心", "喜び", "怒り", "悲しみ"]
    acceptances = [str(i) for i in range(1, 6)]
    
    # 各ラベルが何個付与されているかを数えて、辞書として記録する
    count_dict = {}
    if mode == "system_emotion" or mode == "user_emotion":
        for x in emotions:
            count_dict[x] = label_options.count(x)
    elif mode == "user_acceptance":
        for x in acceptances:
            count_dict[x] = label_options.count(x)
    
    # 2個以上付与されているラベルが何なのかを求める
    # 2つ or 3つ全て同じラベルが付与されていたら、リストの要素は必ず1つになる
    # 1つずつ違うラベルが付与されていたら、resultは空になる
    result = [k for k, v in count_dict.items() if v >= 2]
    if result != []:
        return result[0]
    else:
        return "NONE"

def data_preperation():
    """
    データセットをデータベースに登録する。
    受諾度合いラベルと感情状態ラベルは、1発話に付与されているもの先頭3個を使う。
    その3個のうち2個以上同じものがあれば、その発話のラベルとして採用する。
    1つも同じものが無ければ、NONEを付与する。
    データベースに登録はするが、実際の学習では、NONEが付与されたものは除去する。
    """
    
    origin_path = "../emotional_persuasive_dialogue/"
    # データセットを格納するデータベース
    db = sqlite3.connect("dialogue_corpus.db")
    c = db.cursor()
    
    for scene in ["cleaning", "exercise", "game", "lunch", "sleep"]:
        for i in range(0, 200):
            # 5場面 * 200対話
            # Pathオブジェクト
            path = Path(origin_path + scene + "/" + str(i) + ".txt.label")
            
            # executeメソッドでSQL文を実行する
            # カラム：talker | talk_content | acceptance | emotion
            table_name = scene + str(i)
            create_table = "create table " + table_name + " (talker text, talk_content text, emotion text, acceptance text)"
            c.execute(create_table)
            
            with path.open('r', encoding='utf-8') as file:
                for line in file:
                    # データセットから話者などの情報を取得する
                    columns = line.split(",") # 例：['ロボット1', 'Ａ君、お部屋を掃除をしましょう。', '平静-平静-安心-怒り', '\n']
                    talker, talk_content, emotion, acceptance = "", "", "", ""
                    if columns[0].find("ロボット") != -1:
                        # システムの発話を読み込んでいる時
                        talker, talk_content, emotion, acceptance = "system", columns[1], select_label("system_emotion", columns[2]), "-"
                    elif columns[0].find("A君") != -1:
                        # ユーザの発話を読み込んでいる時
                        talker, talk_content, emotion, acceptance = "user", columns[1], select_label("user_emotion", columns[2]), select_label("user_acceptance", columns[3])
                    
                    # SQL文に値をセットする場合は，Pythonのformatメソッドなどは使わずに，
                    # セットしたい場所に?を記述し，executeメソッドの第2引数に?に当てはめる値を
                    # タプルで渡す．
                    sql = "insert into " + table_name + " (talker, talk_content, emotion, acceptance) values (?, ?, ?, ?)"
                    data = (talker, talk_content, emotion, acceptance)
                    c.execute(sql, data)

                    db.commit()
            print("Table:{} has been registered.".format(table_name))
        print("Scene:{} has been registerd.".format(scene))
    db.close()
    print("dialogue_corpus.db has been structed completely.")

data_preperation()

Table:cleaning0 has been registered.
Table:cleaning1 has been registered.
Table:cleaning2 has been registered.
Table:cleaning3 has been registered.
Table:cleaning4 has been registered.
Table:cleaning5 has been registered.
Table:cleaning6 has been registered.
Table:cleaning7 has been registered.
Table:cleaning8 has been registered.
Table:cleaning9 has been registered.
Table:cleaning10 has been registered.
Table:cleaning11 has been registered.
Table:cleaning12 has been registered.
Table:cleaning13 has been registered.
Table:cleaning14 has been registered.
Table:cleaning15 has been registered.
Table:cleaning16 has been registered.
Table:cleaning17 has been registered.
Table:cleaning18 has been registered.
Table:cleaning19 has been registered.
Table:cleaning20 has been registered.
Table:cleaning21 has been registered.
Table:cleaning22 has been registered.
Table:cleaning23 has been registered.
Table:cleaning24 has been registered.
Table:cleaning25 has been registered.
Table:cleaning26 has b

Table:exercise13 has been registered.
Table:exercise14 has been registered.
Table:exercise15 has been registered.
Table:exercise16 has been registered.
Table:exercise17 has been registered.
Table:exercise18 has been registered.
Table:exercise19 has been registered.
Table:exercise20 has been registered.
Table:exercise21 has been registered.
Table:exercise22 has been registered.
Table:exercise23 has been registered.
Table:exercise24 has been registered.
Table:exercise25 has been registered.
Table:exercise26 has been registered.
Table:exercise27 has been registered.
Table:exercise28 has been registered.
Table:exercise29 has been registered.
Table:exercise30 has been registered.
Table:exercise31 has been registered.
Table:exercise32 has been registered.
Table:exercise33 has been registered.
Table:exercise34 has been registered.
Table:exercise35 has been registered.
Table:exercise36 has been registered.
Table:exercise37 has been registered.
Table:exercise38 has been registered.
Table:exerci

Table:game29 has been registered.
Table:game30 has been registered.
Table:game31 has been registered.
Table:game32 has been registered.
Table:game33 has been registered.
Table:game34 has been registered.
Table:game35 has been registered.
Table:game36 has been registered.
Table:game37 has been registered.
Table:game38 has been registered.
Table:game39 has been registered.
Table:game40 has been registered.
Table:game41 has been registered.
Table:game42 has been registered.
Table:game43 has been registered.
Table:game44 has been registered.
Table:game45 has been registered.
Table:game46 has been registered.
Table:game47 has been registered.
Table:game48 has been registered.
Table:game49 has been registered.
Table:game50 has been registered.
Table:game51 has been registered.
Table:game52 has been registered.
Table:game53 has been registered.
Table:game54 has been registered.
Table:game55 has been registered.
Table:game56 has been registered.
Table:game57 has been registered.
Table:game58 h

Table:lunch65 has been registered.
Table:lunch66 has been registered.
Table:lunch67 has been registered.
Table:lunch68 has been registered.
Table:lunch69 has been registered.
Table:lunch70 has been registered.
Table:lunch71 has been registered.
Table:lunch72 has been registered.
Table:lunch73 has been registered.
Table:lunch74 has been registered.
Table:lunch75 has been registered.
Table:lunch76 has been registered.
Table:lunch77 has been registered.
Table:lunch78 has been registered.
Table:lunch79 has been registered.
Table:lunch80 has been registered.
Table:lunch81 has been registered.
Table:lunch82 has been registered.
Table:lunch83 has been registered.
Table:lunch84 has been registered.
Table:lunch85 has been registered.
Table:lunch86 has been registered.
Table:lunch87 has been registered.
Table:lunch88 has been registered.
Table:lunch89 has been registered.
Table:lunch90 has been registered.
Table:lunch91 has been registered.
Table:lunch92 has been registered.
Table:lunch93 has be

Table:sleep96 has been registered.
Table:sleep97 has been registered.
Table:sleep98 has been registered.
Table:sleep99 has been registered.
Table:sleep100 has been registered.
Table:sleep101 has been registered.
Table:sleep102 has been registered.
Table:sleep103 has been registered.
Table:sleep104 has been registered.
Table:sleep105 has been registered.
Table:sleep106 has been registered.
Table:sleep107 has been registered.
Table:sleep108 has been registered.
Table:sleep109 has been registered.
Table:sleep110 has been registered.
Table:sleep111 has been registered.
Table:sleep112 has been registered.
Table:sleep113 has been registered.
Table:sleep114 has been registered.
Table:sleep115 has been registered.
Table:sleep116 has been registered.
Table:sleep117 has been registered.
Table:sleep118 has been registered.
Table:sleep119 has been registered.
Table:sleep120 has been registered.
Table:sleep121 has been registered.
Table:sleep122 has been registered.
Table:sleep123 has been register

In [1]:
# 形態素解析以外の前処理

import nltk
import unicodedata
import re
from pathlib import Path

def normalize_unicode(text, form = "NFKC"):
    """
    unicodeの正規化
    半角カタカナを全角カタカナに変換したりする
    """
    normalized_text = unicodedata.normalize(form, text)
    return normalized_text

def normalize_number(text):
    """
    1個以上連続した数字を0で置換
    """
    replaced_text = re.sub(r"\d+", "0", text)
    return replaced_text

def lower_text(text):
    """
    英字を小文字に変換する
    """
    return text.lower()

def normalize(text):
    """
    語の正規化
    """
    normalized_text = normalize_unicode(text)
    normalized_text = normalize_number(normalized_text)
    normalized_text = lower_text(normalized_text)
    return normalized_text

def get_stopwords():
    """
    ストップワードの取得
    以下からストップワードの情報を保存する
    http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt
    """
    stopwords_file = Path("../stopwords_jpn.txt")
    with stopwords_file.open("r", encoding="utf-8") as file:
        # 改行文字のみの行は無視
        # 各要素の末尾にある改行文字を除去
        stopwords = [line.rstrip() for line in file if line != "\n"]
    stopwords.remove("時間") # 同じ「時間」でも、「2時間」(接尾語)はあまり重要ではないが「時間が...」は重要となるため
    return stopwords

In [2]:
import MeCab
import sqlite3
from pprint import pprint

# タスクメモ
# SVRの学習特徴量 コーパスに含まれる全単語分の次元数の疎ベクトル
# (row, col) = (各単語, コーパス内の各発話)

def feature_vector(user_talks):
    """
    入力発話からの特徴ベクトルの作成
    1. MeCabを用いて形態素解析→ユーザ発話から言語特徴量を抽出する
    ※言語特徴量...当該発話に含まれる単語と、WordNetを用いて抽出した同義語からなるbag of wordsの単語特徴ベクトル
    ※SVRの学習特徴量...コーパスに含まれる全単語数の次元数のベクトルを用意する
      入力された発話に含まれる単語に相当する次元を1、含まれない単語に相当する次元を0とする疎ベクトルを作成する
    2. 単語極性辞書→発話文に含まれる単語に付与された極性スコアの平均値を算出する
    3. 2の値を上記の単語特徴ベクトルに付加→1発話の特徴ベクトルとする
    """
    # WordNet、単語極性辞書の読み込み
    wn_db = sqlite3.connect("../wnjpn.db")
    wn_c = wn_db.cursor()
    pn_table = {}
    with open("../pn_ja.dic") as file:
        for line in file:
            line = line.split(":")
            pn_table[line[0]] = float(line[3])
    
    m = MeCab.Tagger("-Ochasen")
    # 1対話内の各ユーザ発話について
    for talk in user_talks:
        # 発話の正規化
        talk = normalize(talk)
        # MeCabによる形態素解析で、発話に含まれる単語を抽出する
        node = m.parseToNode(talk)
        # 抽出した単語に対応する単語IDを取得する
        word_table = {} # {単語: 単語ID}
        while node:
            # 1.表層形のデータが空ではない
            # 2.表層形がget_stopwords()に含まれていない
            # 3.単語の品詞が名詞、動詞、形容詞以外(これら以外は解析においてあまり重要ではないため)
            # 4.名詞だとしても接尾語ではない(同じ名詞の「時間」でも、「2時間」はあまり重要ではないが「時間が...」は重要) 
            surface_has_content = node.surface != ""
            not_in_stopwords = node.surface not in get_stopwords()
            is_meaningful_word = node.feature.split(",")[0] == "名詞" or node.feature.split(",")[0] == "動詞" or node.feature.split(",")[0] == "形容詞"
            not_suffix = node.feature.split(",")[1] != "接尾"
            if surface_has_content and not_in_stopwords and is_meaningful_word and not_suffix:
                # 各単語の語幹を取り出す
                # ※node.surface -> 表層形(区切った単語自体)
                # ※node.feature -> 表層形の詳細(品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用型,活用形,原形,読み,発音)
                orig_form = node.feature.split(",")[6]
                wn_c.execute("select * from word where lemma = ?", (orig_form,))
                word_table[node.surface] = wn_c.fetchall() # 単語ID|jpnかeng|単語|None|品詞
            print(node.surface, node.feature)
            node = node.next
        
        # 単語IDを用いて各単語の同義語を取得する
        synonym_table = {} # {単語ID: 同義語のリスト}
        for k, v in word_table.items():
            if v != []:
                wordid = v[0][0] # word_tableから単語IDを参照する
                wn_c.execute("select * from sense where wordid = ?", (wordid,))
                synset_info = wn_c.fetchall() # synset|wordid|lang|rank|lexid|freq|src
                synsets = [s[0] for s in synset_info] # synset_infoからsynsetだけを取得
                for s in synsets:
                    wn_c.execute("select lemma from sense, word where synset = ? and word.lang = \"jpn\" and sense.wordid = word.wordid", (s,))
                    synonym_table[wordid] = wn_c.fetchall()
        
        print("word_table ", end = "")
        pprint(word_table)
        print("synonym_table ", end = "")
        pprint(synonym_table)
        print()
        
        # 当該発話に含まれる単語と，WordNetを用いて抽出した同義語からなる bag of wordsの単語特徴ベクトル
        # (row, col) = (同義語, 単語)

def estimate_acceptance():
    """
    ユーザの受諾度合いの推定
    学習データ：入力とするユーザ発話の特徴ベクトルおよび出力する正解ラベル
    正解ラベル->発話に付与される5段階のユーザの受諾度合いラベルを用いる
    """

# システムから発話
# ユーザの入力発話
# ユーザの受諾度合いの推定
# システム感情の遷移
# 応答文選択
# システムの応答
if __name__ == "__main__":
    db = sqlite3.connect("dialogue_corpus.db")
    c = db.cursor()
    c.execute("select * from cleaning78")
    usertalk_list = [talk[1] for i, talk in enumerate(c.fetchall()) if i % 2 == 1]
    print(usertalk_list)
    print()
    feature_vector(usertalk_list)

['そんなことないよ', 'ちょっとじゃん', 'えー', '僕は平気だよ', 'えーそれはいやだー', '僕一人で？いっぱい時間がかかるじゃん', 'じゃあ床だけ掃除機かけるよ', '掃除機持ってきて', '手伝ってよ', 'じゃあ始めよう！']

 BOS/EOS,*,*,*,*,*,*,*,*
そんな 連体詞,*,*,*,*,*,そんな,ソンナ,ソンナ
こと 名詞,非自立,一般,*,*,*,こと,コト,コト
ない 形容詞,自立,*,*,形容詞・アウオ段,基本形,ない,ナイ,ナイ
よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
 BOS/EOS,*,*,*,*,*,*,*,*
word_table {'ない': [(157029, 'jpn', 'ない', None, 'r')]}
synonym_table {157029: [('ない',)]}

 BOS/EOS,*,*,*,*,*,*,*,*
ちょっと 副詞,助詞類接続,*,*,*,*,ちょっと,チョット,チョット
じゃん 助動詞,*,*,*,不変化型,基本形,じゃん,ジャン,ジャン
 BOS/EOS,*,*,*,*,*,*,*,*
word_table {}
synonym_table {}

 BOS/EOS,*,*,*,*,*,*,*,*
えー フィラー,*,*,*,*,*,えー,エー,エー
 BOS/EOS,*,*,*,*,*,*,*,*
word_table {}
synonym_table {}

 BOS/EOS,*,*,*,*,*,*,*,*
僕 名詞,代名詞,一般,*,*,*,僕,ボク,ボク
は 助詞,係助詞,*,*,*,*,は,ハ,ワ
平気 名詞,一般,*,*,*,*,平気,ヘイキ,ヘイキ
だ 助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ
よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ
 BOS/EOS,*,*,*,*,*,*,*,*
word_table {'僕': [(172602, 'jpn', '僕', None, 'n')],
 '平気': [(208887, 'jpn', '平気', None, 'n'), (208888, 'jpn', '平気', None, 'a')]}
synonym_table {172602: [('召使い',), ('僕',), ('下郎',)