In [1]:
import copy
import collections
import glob
import numpy as np
import os
import subprocess
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import random

from itertools import groupby
# Seed PyTorch only. NOTE(review): NumPy's and `random`'s generators are used by the
# simulation helpers below but are not seeded in this cell.
torch.manual_seed(1)

# --- Input / output locations -------------------------------------------------
ans_path = './ner-mturk/answers*'
ans_file_list = sorted(glob.glob(ans_path))
single_ans_list = sorted(glob.glob('ner-mturk/single_ans*'))
truth_file = './ner-mturk/ground_truth.txt'
# Directory that receives the aggregated-label / prediction files written by this notebook.
path_w = './MBEM_outputs'
# --- Sizes and hyper-parameters -----------------------------------------------
# Number of sentences kept by load_crowd_data and used to slice the model predictions.
train_num = 5385
# Slice length used for predictions in the multi-answer (redundancy 5) runs.
train_num5 = 1077
# NOTE(review): test_num is not referenced anywhere in the visible notebook.
test_num = 600
# Number of classes handed to the aggregation functions (one-hot width).
k = 9
# Number of worker answers per word assumed by ans_matrix / init_posdis.
redundancy = 5
# Number of rounds used by the automated iteration loops.
iteration_times = 5
# Alternative answer files that were used in earlier experiments:
#ans_file = ans_file_list[6]
#ans_file = single_ans_list[1]
#ans_file = '../MBEM/ner-mturk/sim_data_cw0_08.txt'
#ans_file = '../MBEM/ner-mturk/sim_data_single_cw0_07.txt'
# Crowd-answer file that is actually loaded by load_crowd_data further down.
ans_file = '../MBEM/ner-mturk/mild_single_07.txt'

In [2]:
def get_ans(workers, answers):
    """Return the class ids of the answers given by the selected workers.

    Args:
        workers: iterable of positions into ``answers``.
        answers: sequence of label strings, one entry per worker column.

    Returns:
        list with ``class_dic[answers[w]]`` for every ``w`` in ``workers``.
        ``class_dic`` is the module-level label -> id mapping.
    """
    return [class_dic[answers[position]] for position in workers]

In [3]:
def get_n_ans(answers, n):
    """Return the ``n``-th element of every row of ``answers`` as a list."""
    return [answers[row][n] for row in range(len(answers))]

In [4]:
# Tuples are not supported (rows are rewritten in place), so it may be better to
# build a fresh nested list for the return value.
def label2num(answers, class_dic, class_dic_r):
    """Convert a nested list of labels between label strings and class ids, in place.

    The direction is decided from the first available label: if it is a key of
    ``class_dic`` every entry is mapped label -> id; if it is a key of
    ``class_dic_r`` every entry is mapped id -> label (entries are passed through
    ``int`` first). Otherwise ``"error"`` is printed and nothing is changed.

    Args:
        answers: list of mutable rows (lists) of labels; modified in place.
        class_dic: mapping label -> id.
        class_dic_r: mapping id -> label.

    Returns:
        The same ``answers`` object.
    """
    # Look at the first label that exists instead of answers[0][0], so that an
    # empty input (or an empty first row) no longer raises IndexError.
    _missing = object()
    first = next((row[0] for row in answers if len(row) > 0), _missing)
    if first is _missing:
        return answers
    if first in class_dic:
        for row in answers:
            for w in range(len(row)):
                row[w] = class_dic[row[w]]
    elif first in class_dic_r:
        for row in answers:
            for w in range(len(row)):
                row[w] = class_dic_r[int(row[w])]
    else:
        print("error")
    return answers

In [5]:
def read_output(path, class_dic_r, to_label=False):
    """Read a blank-line separated, whitespace-column file into per-sentence columns.

    Lines that are empty or start with ``-DOCSTART-`` separate sentences. Each
    sentence is transposed so that column 0 holds the words and the remaining
    column(s) hold the labels.

    Args:
        path: file to read.
        class_dic_r: mapping id -> label, only used when ``to_label`` is true.
        to_label: when true, column 1 is parsed as ints and mapped through
            ``class_dic_r``.

    Returns:
        ``(words, labels)`` where ``words`` is a list of tuples (one per sentence)
        and ``labels`` is a list of lists. With more than two columns each
        ``labels`` entry is the list of the remaining column tuples. A file
        without any content line yields ``([], [])``.
    """
    def is_content_line(line):
        return line.strip() != '' and not line.startswith('-DOCSTART-')

    # groupby consumes the file lazily, so the groups must be materialised
    # before the handle is closed.
    with open(path) as f:
        xypairs = [
            list(zip(*[line.strip().split() for line in group]))
            for keep, group in groupby(f, is_content_line)
            if keep
        ]
    if not xypairs:
        return ([], [])
    words = [p[0] for p in xypairs]
    if to_label:
        return (words, [[class_dic_r[int(v)] for v in p[1]] for p in xypairs])
    if len(xypairs[0]) == 2:
        return (words, [list(p[1]) for p in xypairs])
    return (words, [list(p[1:]) for p in xypairs])

In [6]:
# Load the crowd-sourced answers and return the training portion.
def load_crowd_data(ans_file, truth_file, train_num):
    """Parse the crowd answer file into per-sentence records.

    Each non-blank line is ``word ans_0 ans_1 ...`` with ``?`` marking a worker
    who did not answer. A line shorter than two characters ends the sentence.

    Args:
        ans_file: path of the crowd answer file.
        truth_file: kept for interface compatibility; the ground-truth file is no
            longer read here because its content was never used.
        train_num: number of leading sentences to return.

    Returns:
        ``(ans_train, words_dic)`` where ``ans_train`` is a list of
        ``[words, worker_ids, answers]`` records (``answers`` holds one list of
        class ids per word, produced by ``get_ans``) and ``words_dic`` maps each
        word to the index at which it was first seen.
    """
    # answers_list[task_num] = [[words, ...], worker ids, [[class ids per word], ...]]
    answers_list = []
    sentence, answers, worker = [], [], []
    words_dic = {}
    with open(ans_file) as f:
        for l in f:
            if len(l) < 2:
                answers_list.append([sentence, worker, answers])
                sentence, answers, worker = [], [], []
                continue
            ans = l.split()
            # The answering workers are taken from the first line of the sentence
            # (columns that are not '?') and reused for its remaining lines.
            if len(worker) < 1:
                worker = tuple(i for i, a in enumerate(ans[1:]) if a != '?')
            if ans[0] not in words_dic:
                words_dic[ans[0]] = len(words_dic)
            sentence.append(ans[0])
            answers.append(get_ans(worker, ans[1:]))
    # Flush the last sentence when the file does not end with a blank line;
    # previously it was silently dropped.
    if sentence:
        answers_list.append([sentence, worker, answers])
    ans_train = answers_list[0:train_num]
    return ans_train, words_dic

In [7]:
def ans_matrix(ans_train, redundancy, k):
    """Build the one-hot answer tensor of shape ``(num_words, redundancy, k)``.

    Args:
        ans_train: list of ``[words, worker_ids, answers]`` records; ``answers[i]``
            is the list of class ids given for word ``i``.
        redundancy: number of answers kept per word.
        k: number of classes (one-hot width).

    Returns:
        ``numpy.ndarray`` with one ``(redundancy, k)`` slab per word, words taken
        in sentence order. A word with a single answer is broadcast over all
        ``redundancy`` slots (NumPy assignment broadcasting).
    """
    # Size the tensor from the data itself rather than from a notebook global.
    total_words = sum(len(task[0]) for task in ans_train)
    ans_m = np.zeros((total_words, redundancy, k))
    one_hot = np.eye(k)
    m = 0
    for task in ans_train:
        for i in range(len(task[0])):
            # Keep the first `redundancy` answers (was a hard-coded 5).
            ans_m[m] = one_hot[task[2][i][:redundancy]]
            m += 1
    return ans_m

In [8]:
# Read the predictions produced by the deep-learning model.
def get_newest_prediction():
    """Return the tokens of the most recently created ``pred.txt``.

    The newest entry (by ``os.path.getctime``) under
    ``./deep-learning-model/outputs/`` is selected, then the newest entry inside
    it, and ``best_model_log/pred.txt`` below that is read.

    Returns:
        list with one list of whitespace-separated tokens per line, skipping
        lines that are exactly a newline.
    """
    files_list = glob.glob("./deep-learning-model/outputs/*")
    newest_group = max(files_list, key=os.path.getctime)
    newest_run = max(glob.glob(newest_group + "/*"), key=os.path.getctime)
    latest_file = newest_run + "/best_model_log/pred.txt"
    # Use a context manager so the handle is closed deterministically.
    with open(latest_file) as f:
        prediction_list = [list(l.split()) for l in f if l != "\n"]
    return prediction_list

In [9]:
# Initialise the posterior over labels from the workers' answers.
# "Initialize posterior distribution using weighted mv" in Algorithm 1 (paper p.6).
def init_posdis(redundancy, ans_m, k):
    """Majority-vote label for every word.

    Args:
        redundancy: number of answer slots to average over.
        ans_m: array of shape ``(redundancy, num_words, k)`` with one-hot answers.
        k: number of classes.

    Returns:
        float array of length ``num_words`` holding the argmax class of the
        averaged answers (ties resolve to the lowest class id).
    """
    ans_m = np.asarray(ans_m)
    # The number of words is taken from ans_m instead of a notebook global.
    simple_agg = np.zeros((ans_m.shape[1], k))
    for r in range(redundancy):
        simple_agg += (1/redundancy)*ans_m[r]
    return np.argmax(simple_agg, axis=1).astype(float)

In [10]:
# Initialise the posterior over labels from the workers' answers.
# Variant without majority voting: only the first answer slot is used.
def init_posdis_single(redundancy, ans_m, k):
    """Label of the first answer slot for every word.

    Args:
        redundancy: only used as the scaling factor ``1/redundancy``.
        ans_m: array of shape ``(slots, num_words, k)``; only ``ans_m[0]`` is read.
        k: number of classes.

    Returns:
        float array of length ``num_words`` with the argmax class of ``ans_m[0]``.
    """
    ans_m = np.asarray(ans_m)
    # The number of words is taken from ans_m instead of a notebook global.
    simple_agg = np.zeros((ans_m.shape[1], k))
    simple_agg += (1/redundancy)*ans_m[0]
    return np.argmax(simple_agg, axis=1).astype(float)

In [11]:
def post_prob_DS(est_labels, ans_train, k, worker_num, num_words):
    """One Dawid-Skene style update of worker confusion matrices and labels.

    Args:
        est_labels: nested list, ``est_labels[i][l]`` is the current class id of
            word ``l`` in sentence ``i``; overwritten in place with the new estimate.
        ans_train: list of ``[words, worker_ids, answers]`` records where
            ``answers[l][a]`` is the class id given by ``worker_ids[a]`` for word ``l``.
        k: number of classes.
        worker_num: number of workers (first dimension of the confusion tensor).
        num_words: unused; kept for interface compatibility.

    Returns:
        ``(est_labels, e_conf)``. ``e_conf[j, t, a]`` is the row-normalised count
        of worker ``j`` answering ``a`` on words currently labelled ``t``; every
        cell starts at ``1/k``, so no entry is zero.
    """
    n = len(ans_train)
    temp_conf = (1/float(k))*np.ones((worker_num, k, k))
    labels_md = np.zeros((k))
    # Build the confusion counts treating est_labels as the truth (Eq. (7)).
    for i in range(n):
        for l in range(len(ans_train[i][2])):
            current = int(est_labels[i][l])
            # Update the marginal distribution of the true label.
            labels_md[current] += 1
            for a, j in enumerate(ans_train[i][1]):  # every answering worker
                # Same as adding outer(one_hot(current), one_hot(answer)).
                temp_conf[j, current, ans_train[i][2][l][a]] += 1
    # Normalise each row once (the original repeated this k times per worker).
    e_conf = temp_conf / temp_conf.sum(axis=2, keepdims=True)
    labels_md = labels_md / np.sum(labels_md)
    print("labels_md", labels_md)
    # Classes that never occur get a prior of 0 and therefore a log-prior of -inf;
    # that is intended, so the divide warning is silenced.
    with np.errstate(divide='ignore'):
        log_prior = np.log(labels_md)
    log_conf = np.log(e_conf)
    # Re-estimate the labels from the confusion matrices (Eq. (5), paper p.5).
    for i in range(n):
        for l in range(len(ans_train[i][2])):
            score = 0.0
            for a, j in enumerate(ans_train[i][1]):
                # log P(answer | true label t) for every candidate t.
                score = score + log_conf[j, :, ans_train[i][2][l][a]]
            est_labels[i][l] = np.argmax(log_prior + score)
    return est_labels, e_conf

In [12]:
def post_prob_DS_single(est_labels, ans_train, k, worker_num, num_words):
    """Dawid-Skene style update that uses only the first worker of each sentence.

    Args:
        est_labels: nested list, ``est_labels[i][l]`` is the current class id of
            word ``l`` in sentence ``i``; overwritten in place with the new estimate.
        ans_train: list of ``[words, worker_ids, answers]`` records; only
            ``worker_ids[0]`` and ``answers[l][0]`` are read.
        k: number of classes.
        worker_num: number of workers (first dimension of the confusion tensor).
        num_words: unused; kept for interface compatibility.

    Returns:
        ``(est_labels, e_conf)`` with ``e_conf[j, t, a]`` the row-normalised count
        of worker ``j`` answering ``a`` on words currently labelled ``t`` (every
        cell starts at ``1/k``).
    """
    n = len(ans_train)
    temp_conf = (1/float(k))*np.ones((worker_num, k, k))
    labels_md = np.zeros((k))
    # Build the confusion counts treating est_labels as the truth (Eq. (7)).
    for i in range(n):
        worker = ans_train[i][1][0]
        for l in range(len(ans_train[i][2])):
            current = int(est_labels[i][l])
            # Update the marginal distribution of the true label.
            labels_md[current] += 1
            temp_conf[worker, current, ans_train[i][2][l][0]] += 1
    # Normalise each row once (the original repeated this k times per worker).
    e_conf = temp_conf / temp_conf.sum(axis=2, keepdims=True)
    labels_md = labels_md / np.sum(labels_md)
    print("labels_md", labels_md)
    with np.errstate(divide='ignore'):
        log_prior = np.log(labels_md)
    log_conf = np.log(e_conf)
    # Re-estimate the labels from the confusion matrices (Eq. (5), paper p.5).
    # The former `worker_acc[...] =+ temp_acc` line (a `+=` typo) was dropped:
    # worker_acc was never read or returned.
    for i in range(n):
        worker = ans_train[i][1][0]
        for l in range(len(ans_train[i][2])):
            score = 0.0 + log_conf[worker, :, ans_train[i][2][l][0]]
            est_labels[i][l] = np.argmax(log_prior + score)
    return est_labels, e_conf

In [13]:
# Write the data that is handed to the deep-learning model.
def write_prediction(est_labels, ans_train, iter_num):
    """Write ``word label`` lines (blank line between sentences) to two files.

    The same content goes to ``prediction.txt`` and ``prediction<iter_num>.txt``
    inside the module-level output directory ``path_w``.

    Args:
        est_labels: nested list, ``est_labels[n][i]`` is the label of word ``i``
            in sentence ``n`` (written via ``str``).
        ans_train: list of records whose first element is the word list.
        iter_num: iteration number used in the second file name.
    """
    w_list = []
    for n in range(len(ans_train)):
        for i in range(len(ans_train[n][0])):
            w_list.append(ans_train[n][0][i] + " " + str(est_labels[n][i]) + "\n")
        w_list.append("\n")
    # Both files now follow path_w; the per-iteration copy used to hard-code
    # "./MBEM_outputs" and would have diverged if path_w were changed.
    for file_name in ("prediction.txt", "prediction" + str(iter_num) + ".txt"):
        with open(path_w + "/" + file_name, mode='w') as f:
            f.writelines(w_list)

In [14]:
# Generate one worker's answer labels from the true labels and a confusion matrix.
def worker_ans(true_labels, labelnum_dic, class_dic_r, conf):
    """Sample one simulated worker answer per true label.

    Args:
        true_labels: sequence of label strings.
        labelnum_dic: mapping label string -> row index of ``conf``.
        class_dic_r: mapping class id -> label string; id 8 is used when class 4
            is drawn, id ``drawn + 4`` when the draw repeats the previous draw,
            and id ``drawn`` otherwise.
        conf: 2-D array whose rows are the sampling distributions.

    Returns:
        list of sampled label strings, one per entry of ``true_labels``.
    """
    rows = [labelnum_dic[label] for label in true_labels]
    sampled = []
    previous = -1
    for row in rows:
        drawn = np.argmax(np.random.multinomial(1, conf[row, :]))
        if drawn == 4:
            tag_id = 8
        elif drawn == previous:
            tag_id = drawn + 4
        else:
            tag_id = drawn
        sampled.append(class_dic_r[tag_id])
        previous = drawn
    return sampled

In [15]:
# Simulate crowd answers: for every task a random subset of workers answers it.
def generate_ans(X_true, y_true, labelnum_dic, class_dic_r, conf, repeat):
    """Generate simulated worker answers for every sentence.

    Args:
        X_true: list of word sequences.
        y_true: list of true label sequences, aligned with ``X_true``.
        labelnum_dic: mapping label string -> confusion-matrix row index.
        class_dic_r: mapping class id -> label string.
        conf: array of per-worker confusion matrices, indexed ``conf[worker]``.
        repeat: number of distinct workers answering each sentence.

    Returns:
        list of ``[words, sorted_worker_ids, answers]`` records, where
        ``answers[a]`` is the label sequence produced by ``worker_ans`` for
        ``sorted_worker_ids[a]``.
    """
    m = conf.shape[0]
    n = len(X_true)
    # `np.int` was removed in NumPy 1.24; the builtin int is the equivalent dtype.
    workers_this_example = np.zeros((n, repeat), dtype=int)
    ans_list = []
    for i in range(n):
        # Pick `repeat` of the m workers without replacement.
        workers_this_example[i] = np.sort(np.random.choice(m, repeat, replace=False))
        answers = []
        for j in workers_this_example[i]:
            answers.append(worker_ans(y_true[i], labelnum_dic, class_dic_r, conf[j]))
        ans_list.append([X_true[i], list(workers_this_example[i]), answers])
    return ans_list

In [37]:
# gamma=0.7  (this whole cell is disabled: the calls below are wrapped in a string literal)
"""
conf = generate_conf(class_label, 47, 0.7)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data07.txt')
ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data_single07.txt')

#gamma=0.8
conf = generate_conf(class_label, 47, 0.8)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data08.txt')
#ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
#write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data_single08.txt')

#gamma=0.8

conf = generate_conf(class_label, 47, 0.8, class_wise=0)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data_cw0_08.txt')
#ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
#write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data_single_cw0_08.txt')

#gamma=0.7
conf = generate_conf(class_label, 47, 0.7, class_wise=0)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data_cw0_07.txt')
#ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
#write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/sim_data_single_cw0_07.txt')

"""

In [16]:
def write_ans_list(ans_list, worker_num, w_path):
    """Write simulated answers to ``w_path``, one word per line.

    Every line is the word followed by one column per worker (built by
    ``fillin_ans``); each sentence is followed by an empty line.

    Args:
        ans_list: list of ``[words, worker_ids, answers]`` records.
        worker_num: total number of worker columns to emit.
        w_path: output file path (overwritten).
    """
    pieces = []
    for record in ans_list:
        for i in range(len(record[0])):
            pieces.append(record[0][i] + fillin_ans(record[1], record[2], i, worker_num) + "\n")
        pieces.append("\n")
    with open(w_path, mode='w') as f:
        f.write("".join(pieces))

In [17]:
def fillin_ans(workers, answers, i, worker_num):
    """Build the worker columns of one output line.

    Args:
        workers: sequence of worker ids that answered (must support ``index``).
        answers: ``answers[a][i]`` is the label given by ``workers[a]`` for word ``i``.
        i: word position inside the sentence.
        worker_num: number of columns to emit (worker ids ``0 .. worker_num-1``).

    Returns:
        string with `` <label>`` for workers that answered and `` ?`` for the rest.
    """
    columns = [
        " " + answers[workers.index(w)][i] if w in workers else " ?"
        for w in range(worker_num)
    ]
    return "".join(columns)

In [18]:
# Simulated workers: build one confusion matrix per worker.
# Args: labels, number of workers, probability of being exactly correct.
# Returns: the confusion matrices.
def generate_conf(class_label, m, gamma, class_wise=1):
    """Generate ``m`` random ``k x k`` confusion matrices (``k = len(class_label)``).

    A row is either exact (one-hot on the diagonal) or noisy (all ones with the
    diagonal raised by a draw from ``uniform(0.1, 0.11)``, then normalised).

    Args:
        class_label: sized collection of classes; only its length is used.
        m: number of workers.
        gamma: probability that a row (``class_wise=1``) or a whole worker
            (``class_wise=0``) is exact.
        class_wise: 1 decides per class row, 0 decides once per worker; any other
            value leaves every entry at ``1/k``.

    Returns:
        array of shape ``(m, k, k)``.
    """
    k = len(class_label)
    conf = (1/float(k))*np.ones((m, k, k))

    def set_exact(worker, cls):
        conf[worker, cls, :] = 0
        conf[worker, cls, cls] = 1

    def set_noisy(worker, cls):
        conf[worker, cls, :] = 1
        conf[worker, cls, cls] = 1 + np.random.uniform(0.1, 0.11)
        conf[worker, cls, :] = conf[worker, cls, :]/np.sum(conf[worker, cls, :])

    if class_wise == 1:
        for worker in range(m):
            for cls in range(k):
                # With probability gamma this class is answered correctly.
                fill = set_exact if np.random.uniform(0, 1) < gamma else set_noisy
                fill(worker, cls)
    if class_wise == 0:
        for worker in range(m):
            # With probability gamma this worker answers every class correctly.
            fill = set_exact if np.random.uniform(0, 1) < gamma else set_noisy
            for cls in range(k):
                fill(worker, cls)
    return conf

In [19]:
# Create the simulation data that is handed to the learner.
def makefile_ans(X_true, y_true, class_shuffle, precision, recall, w_path):
    """Write a degraded copy of the gold labels to ``w_path``.

    Sentences are written in order. While fewer than ``TP`` entities have been
    emitted the gold labels are kept; then, until ``TP + FN`` entities have been
    passed, every label is replaced by ``O``; the remaining sentences have their
    labels remapped through a randomly chosen entry of ``class_shuffle``.

    Args:
        X_true: list of word sequences.
        y_true: list of gold label sequences (lists, ``.count`` is used).
        class_shuffle: two label -> label mappings used for the remapping.
        precision: only used to compute the printed ``FP`` value.
        recall: fraction of entities (``B-*`` tags) kept correct.
        w_path: output file path (overwritten).
    """
    b_tag = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER']

    def count_entities(tags):
        # Number of entity starts (B-* tags) in one sentence.
        return sum(tags.count(tag) for tag in b_tag)

    loc_num = sum(ans.count('B-LOC') for ans in y_true)
    misc_num = sum(ans.count('B-MISC') for ans in y_true)
    org_num = sum(ans.count('B-ORG') for ans in y_true)
    per_num = sum(ans.count('B-PER') for ans in y_true)
    print(loc_num, misc_num, org_num, per_num)
    P = loc_num + misc_num + org_num + per_num
    TP = int(recall*P)  # entities kept correct
    FP = int(TP*((1/precision)-1))  # wrongly tagged entities (reported only)
    FN = int(P - TP)  # missed entities
    print("P", P, "TP", TP, "FP", FP, "FN", FN)
    lines = []
    z = 0
    for n in range(len(X_true)):
        if z < TP:
            labels = y_true[n]
            # Count the entities of THIS sentence; the original used the stale
            # loop variable `ans`, i.e. always the last sentence of y_true.
            z += count_entities(y_true[n])
        elif z < TP + FN:
            # Miss every entity of the sentence.
            labels = ["O"] * len(X_true[n])
            z += count_entities(y_true[n])
        else:
            # Entity-type mistake: remap the labels with one of the two shuffles.
            a = random.randint(0, 1)
            labels = [class_shuffle[a][w] for w in y_true[n]]
        for i in range(len(X_true[n])):
            lines.append(X_true[n][i] + " " + labels[i] + "\n")
        lines.append("\n")
    with open(w_path, mode='w') as f:
        f.write("".join(lines))

In [79]:
#makefile_ans(X_true, y_true, class_shuffle, precision=0.5, recall=0.8, w_path='../MBEM/ner-mturk/sim_p05_r08.txt')

2929 1343 2779 2808
P 9859 TP 7887 FP 7887 FN 1972


In [80]:
#makefile_ans(X_true, y_true, class_shuffle, precision=0.8, recall=0.5, w_path='../MBEM/ner-mturk/sim_p08_r05.txt')

2929 1343 2779 2808
P 9859 TP 4929 FP 1232 FN 4930


In [29]:
#makefile_ans(X_true[:5385], y_true[:5385], class_shuffle, precision=0.7, recall=0.7, w_path='../MBEM/ner-mturk/sim_p07_r07.txt')
#makefile_ans(X_true[:5385], y_true[:5385], class_shuffle, precision=0.8, recall=0.6, w_path='../MBEM/ner-mturk/sim_p08_r06.txt')
#makefile_ans(X_true[:5385], y_true[:5385], class_shuffle, precision=0.6, recall=0.8, w_path='../MBEM/ner-mturk/sim_p06_r08.txt')

2567 1183 2547 2476
P 8773 TP 6141 FP 2631 FN 2632
2567 1183 2547 2476
P 8773 TP 5263 FP 1315 FN 3510
2567 1183 2547 2476
P 8773 TP 7018 FP 4678 FN 1755


In [26]:
# Write two degraded label files (precision = recall = 0.5 and 0.6) from the first 5385 sentences.
# NOTE(review): X_true, y_true and class_shuffle are only defined in a later cell of
# this notebook, so this cell fails on a fresh kernel when run top to bottom.
makefile_ans(X_true[:5385], y_true[:5385], class_shuffle, precision=0.5, recall=0.5, w_path='../MBEM/ner-mturk/sim_p05_r05.txt')
makefile_ans(X_true[:5385], y_true[:5385], class_shuffle, precision=0.6, recall=0.6, w_path='../MBEM/ner-mturk/sim_p06_r06.txt')

2567 1183 2547 2476
P 8773 TP 4386 FP 4386 FN 4387
2567 1183 2547 2476
P 8773 TP 5263 FP 3508 FN 3510


In [144]:
# Simulated workers: build one "mild" confusion matrix per worker.
# Args: labels, number of workers, precision / recall reliability bounds.
# Returns: the confusion matrices.
def generate_mild_conf(class_label, m, eta_p, eta_r):
    """Generate ``m`` row-stochastic ``k x k`` confusion matrices.

    For every worker and class a reliability ``r`` is drawn from
    ``uniform(eta_r, 1.0)``; the row is ``(1 - r) / k`` everywhere plus ``r`` on
    the diagonal, so each row sums to 1.

    NOTE(review): the previous body did not compile (stray indentation, undefined
    ``a``, ``r`` and ``j``). This is a minimal reconstruction; the callers' ``a``
    comments match the 4th argument, so ``eta_r`` is assumed to be the lower
    bound. Confirm against the intended design, which mentions a Beta
    distribution and separate precision / recall parameters.

    Args:
        class_label: sized collection of classes; only its length is used.
        m: number of workers.
        eta_p: currently unused; kept for interface compatibility.
        eta_r: lower bound of the sampled per-class reliability.

    Returns:
        array of shape ``(m, k, k)``.
    """
    k = len(class_label)
    conf = (1/float(k))*np.ones((m, k, k))
    for i in range(m):  # workers
        # One reliability value per class for this worker.
        eta = np.random.uniform(eta_r, 1.0, k)
        for j in range(k):
            r = eta[j]
            conf[i, j, :] = (1 - r)/(k)
            conf[i, j, j] += r
    return conf

In [25]:
def count_ner(y_true, b_tag):
    """Count entity starts per type over all sentences.

    Args:
        y_true: iterable of label lists (``.count`` is used on each).
        b_tag: unused; kept for interface compatibility.

    Returns:
        tuple ``(loc_num, misc_num, org_num, per_num)`` with the number of
        ``B-LOC``, ``B-MISC``, ``B-ORG`` and ``B-PER`` labels.
    """
    loc_num, misc_num, org_num, per_num = 0, 0, 0, 0
    for ans in y_true:
        loc_num += ans.count('B-LOC')
        misc_num += ans.count('B-MISC')
        org_num += ans.count('B-ORG')
        per_num += ans.count('B-PER')
    # The unreachable P/TP/FP/FN statements that followed the return were removed;
    # they referenced `recall` and `precision`, which this function never receives.
    return loc_num, misc_num, org_num, per_num

In [24]:
# Entity counts per type over the full ground truth.
# NOTE(review): y_true and b_tag are only defined in a later cell of this notebook.
count_ner(y_true, b_tag)

2929 1343 2779 2808


In [None]:
# Simulated workers: build one "mild" confusion matrix per worker.
# Args: labels, number of workers, precision / recall reliability bounds.
# Returns: the confusion matrices.
# NOTE(review): this cell defines generate_mild_conf a second time; keep a single definition.
def generate_mild_conf(class_label, m, eta_p, eta_r):
    """Generate ``m`` row-stochastic ``k x k`` confusion matrices.

    For every worker and class a reliability ``r`` is drawn from
    ``uniform(eta_r, 1.0)``; the row is ``(1 - r) / k`` everywhere plus ``r`` on
    the diagonal, so each row sums to 1.

    NOTE(review): the previous body did not compile (stray indentation, undefined
    ``a``, ``r`` and ``j``). This is a minimal reconstruction; the callers' ``a``
    comments match the 4th argument, so ``eta_r`` is assumed to be the lower
    bound. Confirm against the intended design, which mentions a Beta
    distribution and separate precision / recall parameters.

    Args:
        class_label: sized collection of classes; only its length is used.
        m: number of workers.
        eta_p: currently unused; kept for interface compatibility.
        eta_r: lower bound of the sampled per-class reliability.

    Returns:
        array of shape ``(m, k, k)``.
    """
    k = len(class_label)
    conf = (1/float(k))*np.ones((m, k, k))
    for i in range(m):  # workers
        # One reliability value per class for this worker.
        eta = np.random.uniform(eta_r, 1.0, k)
        for j in range(k):
            r = eta[j]
            conf[i, j, :] = (1 - r)/(k)
            conf[i, j, j] += r
    return conf

In [149]:
# Generate simulated crowd answers with "mild" workers for a = 0.0, 0.3 and 0.5.
# For each setting: 47 workers, a 5-answer file from the first 1077 sentences of the
# 5-answer set and a single-answer file from the first 5385 sentences.
# NOTE(review): class_label, X_true5, y_true5, X_true, y_true, labelnum_dic and
# class_dic_r are only defined in a later cell of this notebook.
#gammma=0.0
#a = 0.0
conf = generate_mild_conf(class_label, 47, 0, 0.0)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_mv_00.txt')
ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_single_00.txt')

#a = 0.3
conf = generate_mild_conf(class_label, 47, 0, 0.3)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_mv_03.txt')
ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_single_03.txt')

#a = 0.5
conf = generate_mild_conf(class_label, 47, 0, 0.5)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_mv_05.txt')
ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_single_05.txt')

In [191]:
# a = 0.7
# Fix: the reliability bound passed here was 0.5 (copied from the previous cell)
# although the comment and the output file names (*_07) both say 0.7.
conf = generate_mild_conf(class_label, 47, 0, 0.7)
ans_list_temp = generate_ans(X_true5[:1077], y_true5[:1077], labelnum_dic, class_dic_r, conf, 5)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_mv_07.txt')
ans_list_temp = generate_ans(X_true[:5385], y_true[:5385], labelnum_dic, class_dic_r, conf, 1)
write_ans_list(ans_list_temp, 47, '../MBEM/ner-mturk/mild_single_07.txt')

In [20]:
# Build the label vocabularies from the ground-truth file (second column of every non-blank line).
# NOTE(review): the file handle opened inside this generator expression is never closed explicitly.
class_list = list(set(l.split()[1] for l in open(truth_file) if l != "\n"))
# label -> id in sorted label order, and the reverse mapping id -> label.
class_dic = {k:i for i, k in enumerate(sorted(class_list))}
class_dic_r = {class_dic[c]:c for c in class_dic}
# Entity-type ids used by the simulation helpers (B-/I- prefixes collapsed).
class_label = {'LOC': 0, 'MISC': 1, 'ORG': 2, 'PER': 3, 'O': 4}
labelnum_dic = {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 0, 'I-MISC': 1, 'I-ORG': 2, 'I-PER': 3, 'O': 4}
# Two entity-type permutations used by makefile_ans to simulate type mistakes.
s1 = {'B-LOC': 'B-MISC', 'B-MISC': 'B-ORG', 'B-ORG': 'B-PER', 'B-PER': 'B-LOC', 'I-LOC': 'I-MISC', 'I-MISC': 'I-ORG', 'I-ORG': 'I-PER', 'I-PER': 'I-LOC', 'O':'O'}
s2 = {'B-LOC': 'B-ORG', 'B-MISC': 'B-PER', 'B-ORG': 'B-LOC', 'B-PER':'B-MISC' , 'I-LOC': 'I-ORG', 'I-MISC': 'I-PER', 'I-ORG':  'I-LOC', 'I-PER':'I-MISC', 'O':'O'}
class_shuffle = [s1, s2]
b_tag = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER']
# Load the data.
ans_train, words_dic = load_crowd_data(ans_file, truth_file, train_num)
X_true, y_true = read_output('../MBEM/ner-mturk/ground_truth.txt', class_dic_r)
X_true5, y_true5 = read_output('../MBEM/ner-mturk/testset5.txt', class_dic_r)
# Count the distinct worker ids that appear in the loaded answers.
workers_count = []
for i in range(len(ans_train)):
    for w in ans_train[i][1]:
        workers_count.append(w)
worker_num = len(set(workers_count))

# Kept because we will probably want to re-check the classes whenever the data changes.
# Total number of words over all loaded sentences; read as a global by several functions.
num_words = 0
for l in ans_train:
    num_words += len(l[0])
print("worker_num", worker_num)
print("クラス：",class_dic)
print("タスク数",len(ans_train))
print("タスク数(単語):", num_words)
print("語彙数", len(words_dic))
# NOTE(review): this overrides the worker count computed above with a fixed value.
worker_num = 47

worker_num 47
クラス： {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}
タスク数 5385
タスク数(単語): 70985
語彙数 12378


In [194]:
# ans_m is answers (redundancy) x tasks (words) x number of labels:
# ans_matrix returns (words, redundancy, k) and the transpose swaps the first two axes.
ans_m = ans_matrix(ans_train, redundancy, k)
ans_m = ans_m.transpose(1, 0, 2) 

mv 5

In [184]:
# Algorithm 1
# Write the initial (majority-vote) labels that are handed to the deep-learning model.
w_list = []
# Running index into the flat per-word label array.
i = 0
simple_agg = init_posdis(redundancy, ans_m, k)
for task in ans_train:
    for word in task[0]:
        # One "word label" line; the class id is mapped back to its label string.
        w_list.append(word +" "+ str(class_dic_r[int(simple_agg[i])]) + "\n")
        i += 1
    # Blank line between sentences.
    w_list.append("\n")
w_file = path_w + "/agg_mild_mv_03.txt"
with open(w_file, mode='w') as f:
    f.writelines(w_list)

In [None]:
# Launch the external deep-learning model.
# NOTE(review): the executable here is './deep-learning-models/python' with 'main.py' as
# its argument, while the loop cell further down runs 'python ./deep-learning-models/main.py';
# one of the two is presumably wrong - confirm. The return code is not checked.
subprocess.run(['./deep-learning-models/python', 'main.py', 'dataset_reader=normal_bc5cdr'])

In [185]:
# Iteration 1 (multi-answer run): read the newest model predictions, convert the first
# train_num5 rows to class ids, re-estimate labels and confusion matrices, then write the
# labels back as strings for the next model run.
iter_time = 1
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
#if len(est_labels[0]) == 22:
#    est_labels[0].insert(0, '3')
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num5], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

1  iteration

labels_md [6.72143920e-02 2.52850768e-02 2.59225158e-02 2.04688717e-02
 9.20745095e-04 6.37438912e-04 4.53289893e-03 5.17033784e-03
 8.49847723e-01]


In [47]:
# Iteration 2: same update step as iteration 1.
# NOTE(review): this cell slices with train_num whereas iteration 1 uses train_num5.
iter_time = 2
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

2  iteration

labels_md [0.03208443 0.01643176 0.04369998 0.04681635 0.         0.
 0.         0.         0.86096749]




In [35]:
# Iteration 3: read predictions, re-estimate labels / confusion matrices, write labels back.
iter_time = 3
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

3  iteration

labels_md [0.0389546  0.01643176 0.0434875  0.04207097 0.         0.
 0.         0.         0.85905517]




In [36]:
# Iteration 4: read predictions, re-estimate labels / confusion matrices, write labels back.
iter_time = 4
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

4  iteration

labels_md [0.03498831 0.0160068  0.04957858 0.04780792 0.         0.
 0.         0.         0.85161839]




In [37]:
# Iteration 5: read predictions, re-estimate labels / confusion matrices, write labels back.
iter_time = 5
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

5  iteration

labels_md [0.03498831 0.0160068  0.04957858 0.04780792 0.         0.
 0.         0.         0.85161839]




single label

In [195]:
# Algorithm 1 (single-answer variant)
# Write the initial labels (first answer of every word, no voting) for the deep-learning model.
w_list = []
# Running index into the flat per-word label array.
i = 0
simple_agg = init_posdis_single(redundancy, ans_m, k)
for task in ans_train:
    for word in task[0]:
        # One "word label" line; the class id is mapped back to its label string.
        w_list.append(word +" "+ str(class_dic_r[int(simple_agg[i])]) + "\n")
        i += 1
    # Blank line between sentences.
    w_list.append("\n")
w_file = path_w + "/agg_mild_single_07.txt"
with open(w_file, mode='w') as f:
    f.writelines(w_list)

In [196]:
# Single-answer run, iteration 1: read predictions, re-estimate labels / confusion
# matrices, write labels back.
# NOTE(review): this calls post_prob_DS, not post_prob_DS_single - confirm that is intended.
iter_time = 1
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

1  iteration

labels_md [0.02856942 0.01611608 0.03659928 0.02496302 0.02411777 0.0101289
 0.01381982 0.01914489 0.82654082]


In [81]:
# Single-answer run, iteration 2: same update step as iteration 1.
iter_time = 2
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

2  iteration

labels_md [0.03588082 0.01949708 0.04646052 0.05171515 0.         0.
 0.         0.         0.84644643]




In [19]:
# Single-answer run, iteration 3: same update step as iteration 1.
iter_time = 3
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

3  iteration

labels_md [0.04268507 0.02275129 0.05595548 0.05718109 0.         0.
 0.         0.         0.82142706]




In [27]:
# Single-answer run, iteration 4: same update step as iteration 1.
# NOTE(review): the saved output of this cell is an IndexError (list index out of range).
iter_time = 4
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

4  iteration



IndexError: list index out of range

In [24]:
# Single-answer run, iteration 5: same update step as iteration 1.
iter_time = 5
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

5  iteration

labels_md [0.04055786 0.         0.01790519 0.02676622 0.00571952 0.
 0.01300275 0.02435726 0.8716912 ]




In [33]:
# Automated version of the manual iteration cells: update labels, write them, re-run the model.
# NOTE(review): unlike the manual cells, the predictions are passed to post_prob_DS without
# label2num, and the written labels are not converted back to strings - confirm.
for t in range(iteration_times):
    print(t+1, " iteration\n")
    est_labels = get_newest_prediction()
    # Workaround: pads the first prediction row when it has 22 entries.
    # NOTE(review): the reason for the missing entry is not visible here.
    if len(est_labels[0]) == 22:
        est_labels[0].insert(0, '3')
    est_labels, e_conf = post_prob_DS(est_labels[0:train_num], ans_train, k, worker_num, num_words)
    write_prediction(est_labels, ans_train, t+1)
    subprocess.run(['python', './deep-learning-models/main.py', 'dataset_reader=normal_bc5cdr2']) # this part does not work well

1  iteration

2  iteration

3  iteration

4  iteration

5  iteration



確率でやりたい時にはこちら(probの方がうまく動かせないので後で直す)

In [14]:
# Initialise the posterior over labels from the workers' answers.
# "Initialize posterior distribution using weighted mv" in Algorithm 1 (paper p.6).
def init_posdis_p(redundancy, ans_m, k):
    """Average the one-hot answers per word, without taking the argmax.

    Args:
        redundancy: number of answer slots to average over.
        ans_m: array of shape ``(redundancy, num_words, k)`` with one-hot answers.
        k: number of classes.

    Returns:
        float array of shape ``(num_words, k)`` with the answer frequencies.
    """
    ans_m = np.asarray(ans_m)
    # The number of words is taken from ans_m instead of a notebook global;
    # the unused mv_ans buffer was dropped.
    simple_agg = np.zeros((ans_m.shape[1], k))
    for r in range(redundancy):
        simple_agg += (1/redundancy)*ans_m[r]
    return simple_agg

In [68]:
# Algorithm 1 (probability variant)
# Write the per-word answer frequencies for the deep-learning model.
# The first line of the file is the id -> label mapping.
w_list = [str(class_dic_r)+"\n\n"]
# Running index into the per-word frequency matrix.
i = 0
simple_agg = init_posdis_p(redundancy, ans_m, k)
for task in ans_train:
    for word in task[0]:
        # str(array)[1:-1] strips the surrounding brackets of NumPy's text form.
        # NOTE(review): NumPy may wrap long rows over several lines - verify the reader copes.
        w_list.append(word +" "+ str(simple_agg[i])[1:-1] + "\n")
        i += 1
    # Blank line between sentences.
    w_list.append("\n")
w_file = path_w + "/simple_agg_p.txt"
with open(w_file, mode='w') as f:
    f.writelines(w_list)

In [26]:
# Iteration 5 on the first train_num5 predictions: read predictions, re-estimate labels /
# confusion matrices, write labels back as strings.
iter_time = 5
print(iter_time, " iteration\n")
est_labels = get_newest_prediction()
#if len(est_labels[0]) == 22:
#    est_labels[0].insert(0, '3')
est_labels, e_conf = post_prob_DS(label2num(est_labels[0:train_num5], class_dic, class_dic_r), ans_train, k, worker_num, num_words)
write_prediction(label2num(est_labels, class_dic, class_dic_r), ans_train, iter_time)

5  iteration

labels_md [0.04041827 0.01312667 0.02640166 0.03396618 0.00496885 0.00904776
 0.0120884  0.02921982 0.83076239]


In [27]:
# In-notebook training loop.
# NOTE(review): `simple_pred` is not defined anywhere in the visible notebook (the saved
# output is a NameError), and post_prob_DS is called with four arguments although its
# definition also requires num_words.
est_labels = simple_pred
for t in range(iteration_times):
    print(t, " iteration\n")
    # Train using simple_pred (weighted mv) and ans_train (answers and answering workers).
    # simple_pred is used for the prior distribution.
    # Train the learner and update the estimated labels.
    model, est_labels = train(words_dic, class_dic, ans_train, est_labels)
    # Update the confusion matrices with est_labels.
    est_labels, e_conf = post_prob_DS(est_labels, ans_train, k, worker_num)

NameError: name 'simple_pred' is not defined

### ニューラルネットワーク

In [20]:
# Sanity check: number of tokens in the second row of the newest prediction file.
print(len(get_newest_prediction()[1]))

29


In [96]:
# Train on the given labels and return the model and its predictions (one entry per task).
def train(words_dic, class_dic, ans_train, ans_labels):
    """Train a BiLSTM-CRF on ``ans_labels`` and predict every sentence of ``ans_train``.

    Args:
        words_dic: mapping word -> index; its size is the vocabulary size.
        class_dic: mapping label -> id, passed to ``BiLSTM_CRF``.
        ans_train: list of ``[words, worker_ids, answers]`` records.
        ans_labels: flat sequence of class ids, one per word, in the sentence
            order of ``ans_train``.

    Returns:
        ``(model, predicted_labels)`` where ``predicted_labels[i]`` is whatever
        ``model(...)`` returns for sentence ``i``.
    """
    EMBEDDING_DIM = 5
    HIDDEN_DIM = 4
    predicted_labels = []
    model = BiLSTM_CRF(len(words_dic), class_dic, EMBEDDING_DIM, HIDDEN_DIM)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
    # Model training.
    for epoch in range(2):
        # Offset of the current sentence inside the flat label sequence.
        m = 0
        for sentence, worker, classes in ans_train:
            # Reset the gradients.
            model.zero_grad()
            tags = ans_labels[m:m+len(sentence)]
            # Advance to the next sentence; the offset used to stay at 0, so every
            # sentence was trained against the labels of the first words.
            m += len(sentence)
            sentence_in = prepare_sequence(sentence, words_dic)
            targets = torch.tensor(tags, dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            # Compute the loss gradients and update the parameters.
            loss.backward()
            optimizer.step()
    # Label prediction.
    for i in range(len(ans_train)):
        with torch.no_grad():
            check_sent = prepare_sequence(ans_train[i][0], words_dic)
            predicted_labels.append(model(check_sent))
    return model, predicted_labels

In [77]:
# Spot-check the model on one sentence.
# NOTE(review): the sentence fed to the model is ans_train[6] but the one printed is
# ans_train[0] - presumably the same index was intended. `model` must already exist as a
# global; it is not assigned in any cell that runs successfully in the visible notebook.
with torch.no_grad():
    precheck_sent = prepare_sequence(ans_train[6][0], words_dic)
    print(ans_train[0][0])
    print(model(precheck_sent))

['A', 'newborn', 'with', 'massive', 'tricuspid', 'regurgitation', '&#44;', 'atrial', 'flutter', '&#44;', 'congestive', 'heart', 'failure', '&#44;', 'and', 'a', 'high', 'serum', 'lithium', 'level', 'is', 'described', '.']
(tensor(83.0933), [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
