In [2]:
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count, combinations
import pickle
import scipy
from scipy.special import softmax
import sys
from sklearn.feature_extraction.text import CountVectorizer as CV
import re
import copy
from tqdm import tqdm
from gensim.models import KeyedVectors
from sklearn_crfsuite import CRF
import nltk

In [3]:
# Sequence Loader
class BuildDataLoader:
    
    def __init__(self, folder):
        self.sequence = []
        self.word_dict = {}
        self.label_dict = {}
        self.folder = folder

        with open(folder + '_string.txt', 'r') as x_file, open(folder + '_label.txt', 'r') as y_file: 
            for x, y in zip(x_file, y_file):
                x = [char for char in x.lower().replace("\n",'')]
                y = y.lower().replace("\n",'').split(',')
                if(len(y) > 1):
                    if len(y[-1]) == 0:
                        y = y[:-1]
                    for i in range(len(x)):
                        if x[i].isdigit():
                            x[i] = 'NUM'
                    for char, label in zip(x, y):
                        if char not in self.word_dict:
                            self.word_dict[char] = len(self.word_dict)
                        if label not in self.label_dict:
                            self.label_dict[label] = len(self.label_dict)
                    self.sequence.append((x, y))
    
    def shuffle(self, seed = 4):
        random.Random(4).shuffle(self.sequence)
    
    def get_word_dict(self):
        return self.word_dict
    
    def get_label_dict(self):
        return self.label_dict

In [4]:
# CRF
class CrfModel(object):
    
    def __init__(self, data):
        self.label_dict = data.label_dict
        self.word_dict = data.word_dict
        
        self.crf = CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        
        self.X_train=[]
        self.Y_train=[]
    
        print ('label dict size: {}'.format(len(self.label_dict)))
        print ('word dict size: {}'.format(len(self.word_dict)))
        
    def reset(self):
        self.X_train=[]
        self.Y_train=[]
    
    def char2feature(self, sent, i):
        # for current character
        features = {'0:word': sent[i]}
        # for previous character
        if i > 0:
            features.update({'-1:word': sent[i-1]})
        # for next character
        if i < len(sent)-1:
            features.update({'+1:word': sent[i+1]})
        return features
    
    def add_instances(self, sequences):
        for seq in sequences:
            x = seq[0]
            y = seq[1]
            self.X_train.append([self.char2feature(x, i) for i in range(len(x))])
            self.Y_train.append(y)
    
    def compute_confidence(self, sequence):
        x = [self.char2feature(sequence[0], i) for i in range(len(sequence[0]))]
        y_pred = self.crf.tagger_.tag(x)
        prob_norm = math.exp(math.log(self.crf.tagger_.probability(y_pred)) / len(x))
        
        label_list = self.crf.tagger_.labels()
        prob_list = []
        for i in range(len(x)):
            marginal_prob = [self.crf.tagger_.marginal(k, i) for k in label_list]
            prob_list.append(max(marginal_prob))
        return (prob_list, sum(prob_list), prob_norm)
    
    def compute_entropy(self, sequence):
        x = [self.char2feature(sequence[0], i) for i in range(len(sequence[0]))]
        label_list = self.crf.tagger_.labels()
        self.crf.tagger_.set(x)
        entropy_seq = []
        for i in range(len(x)):
            marginal_prob = [self.crf.tagger_.marginal(k, i) for k in label_list]
            entropy_seq.append(scipy.stats.entropy(marginal_prob))
        return (entropy_seq, sum(entropy_seq))
    
    def train(self):
        self.crf.fit(self.X_train, self.Y_train) 
        return len(self.X_train)
    
    def predict(self, sequence):
        x = [self.char2feature(sequence[0], i) for i in range(len(sequence[0]))]
        return self.crf.tagger_.tag(x)    
    
    def evaluate_acc(self, sequences):
        # Calculate phrase-level accuracy and out-of-phrase accuracy
        X_test = [[self.char2feature(seq[0], i) for i in range(len(seq[0]))] for seq in sequences]
        Y_test = [seq[1] for seq in sequences]
        Y_pred = self.crf.predict(X_test)
        
        # Consider the accuracy in phrase level.
        in_cnt,  in_crt = 0, 0    # Total/correct number of phrases
        out_cnt, out_crt = 0, 0   # Total/correct number of "o"
        all_cnt, all_crt = 0, 0   # Total/correct number of all words

        for y_test, y_pred in zip(Y_test, Y_pred):
            correct_flag = False
            for j in range(len(y_test)):
                all_cnt += 1
                if y_test[j] == y_pred[j]:
                    all_crt += 1

                # If the character is a beginning-of-phrase.
                if y_test[j][0] == 'b':
                    in_cnt += 1
                    if y_test[j] == y_pred[j]:
                        if correct_flag:
                            in_crt += 1
                        correct_flag = True
                    else:
                        if correct_flag:
                            if y_pred[j][2:] != y_pred[j-1][2:]:  # special case
                                in_crt += 1
                        correct_flag = False

                # If the character is an inside-of-phrase.
                elif y_test[j][0] == 'i':
                    if y_test[j] != y_pred[j]:
                        correct_flag = False

                # If the character is an out-of-phrase.
                elif y_test[j][0] == 'o':
                    out_cnt += 1
                    if y_test[j] == y_pred[j]:
                        out_crt += 1
                        if correct_flag:
                            in_crt += 1
                            correct_flag = False
                    else:
                        if correct_flag:
                            if y_pred[j][2:] != y_pred[j-1][2:]:  # special case
                                in_crt += 1
                            correct_flag = False

            # For the case where the phrase is at the end of a string.
            if correct_flag:
                in_crt += 1
        in_acc = 0 if in_cnt == 0 else in_crt/in_cnt
        out_acc = 0 if out_cnt == 0 else out_crt/out_cnt
        all_acc = 0 if all_cnt == 0 else all_crt/all_cnt 
            
        return in_acc, out_acc, all_acc

In [5]:
# Vectorize a set of string by n-grams.
def string_vectorize(Xs_list):
    vc = CV(analyzer='char_wb', ngram_range=(3, 4), min_df=1, token_pattern='[a-z]{2,}')
    name = []
    for i in Xs_list:
        s = re.findall('(?i)[a-z]{2,}', "".join(str(x) for x in i))
        name.append(' '.join(s))
    vc.fit(name)
    vec = vc.transform(name).toarray()
    # print(name)
    # print(vec)
    dictionary = vc.get_feature_names()
    return vec, dictionary

# Experiment Setting

In [49]:
SOURCE = 'eub'
DATA_PATH = "./dataset/" + SOURCE

PRETRAIN_SIZE = 15
CANDIDATE_SIZE = 600
VALIDATE_SIZE = 200
TEST_SIZE = 200
BUDGET = 800

#inductive or transductive labeling
M = 20
BETA = 3.0
METHOD = 'none' #choice: none, selfSim, testSim
#fully or partial labeling
SUBSEQ_FLAG = True
SUBSEQ_SIZE = 11
STRATEGY = 'partial' #choice: fully, partial

# Load data
data = BuildDataLoader(DATA_PATH)
data.shuffle(8)
pretrain_list = data.sequence[:PRETRAIN_SIZE]
validation_list = data.sequence[-TEST_SIZE - VALIDATE_SIZE : -TEST_SIZE]
candidate_list  = data.sequence[PRETRAIN_SIZE : PRETRAIN_SIZE + CANDIDATE_SIZE]
# test_list = data.sequence[-TEST_SIZE:]
test_list = pretrain_list + validation_list + candidate_list
print ("=== data setup ===")
print ("pretrain  : {}".format(len(pretrain_list)))
print ("candidate : {}".format(len(candidate_list)))
print ("validation: {}".format(len(validation_list)))
print ("test      : {}".format(len(test_list)))

# initialize CRF with #CRF_PRETRAIN_SIZE instances
crf = CrfModel(data)
crf.add_instances(pretrain_list)
crf.train()

count = sum([len(seq[1]) for seq in pretrain_list]) 
cost_list = [count]

(in_acc, out_acc, all_acc) = crf.evaluate_acc(test_list)
in_acc_list = [in_acc]
out_acc_list = [out_acc]
all_acc_list = [all_acc]

=== data setup ===
pretrain  : 15
candidate : 600
validation: 200
test      : 815
label dict size: 242
word dict size: 32


# Active Learning

In [50]:
lens = 0
for seq in data.sequence:
    lens += len(seq[0])
print (lens/len(data.sequence))

22.095903165735567


In [51]:
# Vectorized and clustered test set.
Xs = [seq[0] for seq in test_list]
Xs.extend([seq[0] for seq in candidate_list])
vec, _ = string_vectorize(Xs)
validation_vec = vec[:len(test_list)].tolist()
candidate_vec = vec[len(test_list):].tolist()

# Pre-calculate similarity: both between validation-test and validation-validate
sim_matrix_test = np.zeros((len(candidate_vec), len(validation_vec)))
sim_matrix_self = np.zeros((len(candidate_vec), len(candidate_vec)))
if METHOD != 'none':
    iterator = tqdm(range(len(candidate_vec)))
    for i in iterator:
        for j in range(len(validation_vec)):
            sim_matrix_test[i, j] = 1 - scipy.spatial.distance.cosine(candidate_vec[i], validation_vec[j])
        for j in range(len(candidate_vec)):
            sim_matrix_self[i, j] = 1 - scipy.spatial.distance.cosine(candidate_vec[i], candidate_vec[j])
    iterator.close()
print ('Similarity done!')

Similarity done!


In [52]:
visited_candidate_idx = []
seqs_list = []
subs_list = []
preds_list = []
sort1_list = []
sort2_list = []
sort3_list = []
try:
    with tqdm(range(CANDIDATE_SIZE)) as iterator:
        for seqs_size in iterator:
            if cost_list[-1] > BUDGET:
                break

            # Sort the test set based on confidence.
            prob_test_list = []
            for i in range(len(test_list)):
                (prob_per_token, _, prob_sum) = crf.compute_confidence(test_list[i])
                prob_test_list.append(prob_sum)
            rank_idx_test = np.argsort(np.array(prob_test_list), kind='mergesort').tolist()[::-1]

            # Calculate the average similarity between the unlabeled samples and the selected test samples.
            distance = []
            if METHOD != 'none':
                if METHOD == 'testSim':
                    distance = np.sum(sim_matrix_test[:, rank_idx_test[:M]], axis=1) / M
                else:
                    distance = np.sum(sim_matrix_self, axis=1) / (len(candidate_vec)-1)
#                 mean_dist = np.mean(distance)
#                 std_dist = np.std(distance)
#                 distance = [(distance[i] - mean_dist) / std_dist for i in range(len(candidate_list))]


            ####
            # Compute the top-K tokens and its seq_idx: subsequence with or without SEBSEQ_FLAG
            prob_list = []
            subseq_idx_list = []
            for i in range(len(candidate_list)):
                (prob_per_token, prob_sum) = crf.compute_entropy(candidate_list[i])
                prob_sum /= len(candidate_list[i][1])
                if STRATEGY == 'partial':
                    subseq_idxs = []
                    subseq_prob_sum = -sys.maxsize
                    if SUBSEQ_FLAG:
                        end_p = len(prob_per_token) - SUBSEQ_SIZE + 1
                        for k in range(0, end_p): # the largest subsequence
                            prob_tmp = sum([prob_per_token[k+j] for j in range(SUBSEQ_SIZE)]) / SUBSEQ_SIZE
                            if prob_tmp > subseq_prob_sum:
                                subseq_prob_sum = prob_tmp
                                subseq_idxs = [k+j for j in range(SUBSEQ_SIZE)]
                        if end_p < 1: # if length is not longer than subseq_size
                            subseq_prob_sum = prob_sum / len(prob_per_token)
                            subseq_idxs = range(0, len(prob_per_token))
                    else:
                        token_sorted = np.argsort(np.array(prob_per_token), kind='mergesort').tolist()[::-1]
                        subseq_idxs = [token_sorted[k] for k in range(min(SUBSEQ_SIZE, len(prob_per_token)))]
                        subseq_prob_sum = sum([prob_per_token[k] for k in subseq_idxs]) / len(subseq_idxs)
                    prob_sum = subseq_prob_sum
                    subseq_idx_list.append(subseq_idxs)

                prob_list.append(prob_sum)

            # Entropy weighted with or without similarity
            mean_prob = np.mean(prob_list)
            std_prob = np.std(prob_list)
            prob_list = [(prob_list[i] - mean_prob) / std_prob for i in range(len(candidate_list))]

            # norm_dist = [1/(1+math.exp(x)) for x in norm_dist]
            score_list = []
            for i in range(len(candidate_list)):
                if METHOD == 'none':
                    score_list.append(prob_list[i])
                else:
                    score_list.append(prob_list[i] * math.pow(distance[i], BETA))

            # Locate the subseq_idx with largest score
            rank_idx = np.argsort(np.array(score_list), kind='mergesort').tolist()[::-1]
            for i in rank_idx:
                if i not in visited_candidate_idx:
                    seq_idx = i
                    visited_candidate_idx.append(seq_idx)
                    break
            query_seq = candidate_list[seq_idx]
            
            if STRATEGY == 'partial':
                subseq_idxs = subseq_idx_list[seq_idx]
                predict_y = crf.predict(query_seq)
                for i in range(len(query_seq[1])):
                    if i not in subseq_idxs:
                        query_seq[1][i] = predict_y[i]
                count += len(subseq_idxs)
                subs_list.append([query_seq[0][i] for i in subseq_idxs])
                seqs_list.append(query_seq)
            else:
                count += len(query_seq[1])
            cost_list.append(count)

            crf.add_instances([query_seq])
            crf.train()
            (in_acc, out_acc, all_acc) = crf.evaluate_acc(test_list)
            in_acc_list.append(in_acc)
            out_acc_list.append(out_acc)
            all_acc_list.append(all_acc)
            
            errs = []
            format_dict = {}
            for seq in test_list:
                x = ''.join(seq[0])
                if x not in format_dict:
                    format_dict[x] = 1
                else: format_dict[x] += 1
            for k,v in format_dict.items():
                format_dict[k] = v/len(test_list)
            
            formats = {}
            for seq in test_list:
                x = ''.join(seq[0])
                in_acc, _, all_acc = crf.evaluate_acc([seq])
                if x not in formats:
                    formats[x] = [format_dict[x], all_acc, format_dict[x] * (1-all_acc)]
            sort1_format = sorted(formats.items(), key=lambda kv: kv[1][0])[::-1]
            sort2_format = sorted(formats.items(), key=lambda kv: kv[1][1])
            sort3_format = sorted(formats.items(), key=lambda kv: kv[1][2])[::-1]
            sort1_list.append(sort1_format)
            sort2_list.append(sort2_format)
            sort3_list.append(sort3_format)
            
except KeyboardInterrupt:
    iterator.close()
    raise  
iterator.close()
print ('Done!') 

  7%|▋         | 42/600 [04:24<1:37:05, 10.44s/it]

Done!





In [56]:
for i in range(len(cost_list)):
    print ('{}: {}'.format(cost_list[i], all_acc_list[i]))

342: 0.8654219070308604
353: 0.8684137625353205
364: 0.8714610227713446
375: 0.8684691672668846
386: 0.8689678098509612
397: 0.8820987312316472
408: 0.8852013961992354
419: 0.8865865144883373
430: 0.8881932517036955
441: 0.8829852069366724
452: 0.8711285943819602
463: 0.8757825918333426
474: 0.8770569006593163
485: 0.8822095406947753
496: 0.895174247880769
507: 0.8924040113025652
518: 0.8927364396919497
529: 0.8965039614383068
540: 0.897390437143332
551: 0.898110698653665
562: 0.8994404122112029
573: 0.8996620311374591
584: 0.9001606737215359
595: 0.8976120560695884
606: 0.8991079838218183
617: 0.8993850074796388
628: 0.9114632389606072
639: 0.9116294531552994
650: 0.912737547786581
661: 0.9412155798105158
672: 0.9397750567898498
683: 0.9412155798105158
694: 0.9414926034683362
705: 0.9424344839049255
716: 0.9422682697102333
727: 0.9409939608842595
738: 0.9412709845420799
749: 0.9422682697102333
760: 0.9426561028311817
771: 0.9436533879993352
782: 0.94575876779877
793: 0.947088481356307

In [58]:
for i in range(len(seqs_list)):
    print (cost_list[i])
    print (all_acc_list[i])
    s = ''
    for char in seqs_list[i][0]:
        if char == 'NUM':
            char = '#'
        s += char
    print (s)
    print (seqs_list[i][1])
    print (subs_list[i])
    print ('------------')
#     print (sort1_list[i])
    print (sort2_list[i])
#     print (sort3_list[i])
    print ('\n')

342
0.8654219070308604
ebu#b.rm-#xxx..#nd flr avg clg-pid#
['b_building-ebu3b', 'i_building-ebu3b', 'i_building-ebu3b', 'b_leftidentifier', 'b_leftidentifier', 'o', 'b_room', 'i_room', 'o', 'b_leftidentifier', 'i_max', 'i_max', 'b_leftidentifier', 'o', 'o', 'b_rightidentifier', 'b_rightidentifier', 'i_rightidentifier', 'o', 'b_floor', 'i_floor', 'i_floor', 'o', 'b_average', 'i_average', 'i_average', 'o', 'b_cooling', 'i_cooling', 'i_cooling', 'o', 'o', 'o', 'o', 'b_leftidentifier']
['NUM', 'n', 'd', ' ', 'f', 'l', 'r', ' ', 'a', 'v', 'g']
------------
[('bsbm_enthalpy', [0.001226993865030675, 0.0, 0.001226993865030675]), ('bsbm_rh', [0.001226993865030675, 0.2857142857142857, 0.0008764241893076249]), ('ebuNUMb.server room.leak-shutdown', [0.001226993865030675, 0.2903225806451613, 0.0008707698396991887]), ('ebuNUM_enthalpy', [0.001226993865030675, 0.3076923076923077, 0.0008494572911750826]), ('ebuNUMb.chwpNUM-vfd.voltage', [0.001226993865030675, 0.34782608695652173, 0.0008002133902373967

In [None]:
filename = "./results/" + SOURCE + str(AGENT_PRETRAIN_SIZE) + "_" + str(VALIDATE_SIZE) + "_" + str(BUDGET) + "budget_" + STRATEGY + "_" + METHOD 
if STRATEGY == 'partial':
    filename += "_sub" + str(SUBSEQ_SIZE) + str(SUBSEQ_FLAG)
if METHOD != 'none':
    filename += "_beta" + str(BETA)
    if METHOD == 'testSim':
        filename += "_M" + str(M)
filename += ".bin"

with open(filename, "wb") as result:
    pickle.dump((cost_list, in_acc_list, out_acc_list, all_acc_list), result)

### sanity check

In [None]:
# sim beta
with open("./results/eub1000_fully_none_Falsenorm.bin", "rb") as in_file:
    (cost_1, in_acc_1, out_acc_1, all_acc_1) = pickle.load(in_file)
       
with open("./results/eub1000_partial_none_sub8False_Falsenorm.bin", "rb") as in_file:
    (cost_2, in_acc_2, out_acc_2, all_acc_2) = pickle.load(in_file)

with open("./results/eub1000_partial_none_sub11False_Falsenorm.bin", "rb") as in_file:
    (cost_3, in_acc_3, out_acc_3, all_acc_3) = pickle.load(in_file)

with open("./results/eub1000_partial_none_sub17False_Falsenorm.bin", "rb") as in_file:
    (cost_4, in_acc_4, out_acc_4, all_acc_4) = pickle.load(in_file)
    
# with open("./results/eub1000_partial_none_sub14False_Falsenorm.bin", "rb") as in_file:
#     (cost_5, in_acc_5, out_acc_5, all_acc_5) = pickle.load(in_file)

# with open("./results/eub1000_partial_none_sub17False_Falsenorm.bin", "rb") as in_file:
#     (cost_6, in_acc_6, out_acc_6, all_acc_6) = pickle.load(in_file)

# with open("./results/sod15num_200_testVRL_acc2_all_te_20step_5batch20decay_90loopdecay_64rnn_16filter_4size_2stride.bin", "rb") as in_file:
#     (cost_7, in_acc_7, out_acc_7, all_acc_7) = pickle.load(in_file)

# plt.rc('text', usetex=True)

plt.plot(cost_1, all_acc_1,
         cost_2, all_acc_2,
         cost_3, all_acc_3,
         cost_4, all_acc_4,)
#          cost_5, all_acc_5,
#          cost_6, all_acc_6,)
#          cost_7, acc_valid_7)
plt.legend(['TE', 'denTE', 'TE-part', 'transTE', 'denTE-part' ,'transTE-part'], loc='upper left', fancybox=True, fontsize = 9)
# plt.xlim(200, 600)

plt.title('CoNLL dataset with 15 pretraining samples', fontsize=22)
plt.xlabel("Number of training labels", fontsize=21)
plt.ylabel("Predictive accuracy", fontsize=21)
plt.grid()
plt.savefig('./results/sod.png', bbox_inches='tight')



In [None]:
# partial window size w
# sod: w=9
# sdh: w=8
# ibm: w=17

# SOD
with open("./results/eub1000_fully_none_Falsenorm.bin", "rb") as in_file:
    (cost_f, in_acc_f, out_acc_f, all_acc_f) = pickle.load(in_file)
       
with open("./results/eub1000_fully_selfSim_beta3.0_Falsenorm.bin", "rb") as in_file:
    (cost_8, in_acc_8, out_acc_8, all_acc_8) = pickle.load(in_file)
    
with open("./results/eub1000_partial_none_sub11False_Falsenorm.bin", "rb") as in_file:
    (cost_9, in_acc_9, out_acc_9, all_acc_9) = pickle.load(in_file)

with open("./results/eub1000_fully_testSim_beta1.0_M50_Falsenorm.bin", "rb") as in_file:
    (cost_11, in_acc_11, out_acc_11, all_acc_11) = pickle.load(in_file)

with open("./results/eub1000_partial_selfSim_sub8False_beta1.0_Truenorm.bin", "rb") as in_file:
    (cost_self_part, in_acc_self_part, out_acc_self_part, all_acc_self_part) = pickle.load(in_file)

with open("./results/eub1000_partial_testSim_sub8False_beta1.0_M50_Falsenorm.bin", "rb") as in_file:
    (cost_test_part, in_acc_test_part, out_acc_test_part, all_acc_test_part) = pickle.load(in_file)

# plt.rc('text', usetex=True)
# plt.rc('font', family='serif')
fig, axes = plt.subplots(ncols=3, nrows=1)
ax = axes.flatten()

ax[0].set_xlabel("Number of training labels", fontsize=21)
ax[0].set_ylabel("Predictive accuracy", fontsize=21)
ax[0].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[1].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[2].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[0].grid()
ax[0].plot(cost_f, all_acc_f, linestyle=':')
ax[0].plot(cost_8, all_acc_8, linestyle='--')
ax[0].plot(cost_9, all_acc_9)
ax[0].plot(cost_11, all_acc_11)
ax[0].plot(cost_self_part, all_acc_self_part)
ax[0].plot(cost_test_part, all_acc_test_part, c='black')
# ax[0].set_ylim([0.80, 0.96])
ax[0].set_title('Building EBU3B', fontsize=22)
leg = ax[0].legend(['TE', 'denTE', 'TE-part', 'transTE', 'denTE-part', 'transTE-part'], 
                   loc='lower right', fancybox=True, fontsize = 16)
leg.get_frame().set_alpha(0.7)
# plt.subplots_adjust(hspace=0.5)


# SDH
with open("./results/eub1000_fully_none_Falsenorm.bin", "rb") as in_file:
    (cost2_f, in2_acc_f, out_acc_f, all2_acc_f) = pickle.load(in_file)
       
with open("./results/eub1000_fully_selfSim_beta3.0_Falsenorm.bin", "rb") as in_file:
    (cost2_8, in2_acc_8, out_acc_8, all2_acc_8) = pickle.load(in_file)
    
with open("./results/eub1000_partial_none_sub8False_Falsenorm.bin", "rb") as in_file:
    (cost2_9, in2_acc_9, out_acc_9, all2_acc_9) = pickle.load(in_file)

with open("./results/eub1000_fully_testSim_beta1.0_M50_Falsenorm.bin", "rb") as in_file:
    (cost2_11, in2_acc_11, out_acc_11, all2_acc_11) = pickle.load(in_file)

with open("./results/eub1000_partial_selfSim_sub8False_beta1.0_Falsenorm.bin", "rb") as in_file:
    (cost2_self_part, in2_acc_self_part, out_acc_self_part, all2_acc_self_part) = pickle.load(in_file)

with open("./results/eub1000_partial_testSim_sub8False_beta1.0_M50_Falsenorm.bin", "rb") as in_file:
    (cost2_test_part, in2_acc_test_part, out_acc_test_part, all2_acc_test_part) = pickle.load(in_file)


ax[1].set_xlabel("Number of training labels", fontsize=21)
ax[1].set_ylabel("Predictive accuracy", fontsize=21)
ax[1].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[1].grid()
ax[1].plot(cost2_f, all2_acc_f, linestyle=":")
ax[1].plot(cost2_8, all2_acc_8, linestyle='--')
ax[1].plot(cost2_9, all2_acc_9)
ax[1].plot(cost2_11, all2_acc_11)
ax[1].plot(cost2_self_part, all2_acc_self_part)
ax[1].plot(cost2_test_part, all2_acc_test_part, c='black')
# ax[1].set_ylim([0.85, 0.978])
# ax[1].set_xlim([300,1000])
ax[1].set_title('Building SDH', fontsize=22)
leg = ax[1].legend(['TE', 'denTE', 'TE-part', 'transTE', 'denTE-part', 'transTE-part'], 
                   loc='lower right', fancybox=True, fontsize = 16)
leg.get_frame().set_alpha(0.7)

# IBM
with open("./results/eub1000_fully_none_Falsenorm.bin", "rb") as in_file:
    (cost3_f, in3_acc_f, out_acc_f, all3_acc_f) = pickle.load(in_file)
       
with open("./results/eub1000_fully_selfSim_beta3.0_Falsenorm.bin", "rb") as in_file:
    (cost3_8, in3_acc_8, out_acc_8, all3_acc_8) = pickle.load(in_file)
    
with open("./results/eub1000_partial_none_sub8False_Falsenorm.bin", "rb") as in_file:
    (cost3_9, in3_acc_9, out_acc_9, all3_acc_9) = pickle.load(in_file)

with open("./results/eub1000_fully_testSim_beta1.0_M50_Falsenorm.bin", "rb") as in_file:
    (cost3_11, in3_acc_11, out_acc_11, all3_acc_11) = pickle.load(in_file)

with open("./results/eub1000_partial_selfSim_sub8False_beta1.0_Truenorm.bin", "rb") as in_file:
    (cost3_self_part, in3_acc_self_part, out_acc_self_part, all3_acc_self_part) = pickle.load(in_file)

with open("./results/eub1000_partial_testSim_sub8False_beta1.0_M50_Falsenorm.bin", "rb") as in_file:
    (cost3_test_part, in3_acc_test_part, out_acc_test_part, all3_acc_test_part) = pickle.load(in_file)


ax[2].set_xlabel("Number of training labels", fontsize=21)
ax[2].set_ylabel("Predictive accuracy", fontsize=21)
ax[2].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[2].grid()
ax[2].plot(cost3_f, all3_acc_f, linestyle=":")
ax[2].plot(cost3_8, all3_acc_8, linestyle='--')
ax[2].plot(cost3_9, all3_acc_9)
ax[2].plot(cost3_11, all3_acc_11)
ax[2].plot(cost3_self_part, all3_acc_self_part)
ax[2].plot(cost3_test_part, all3_acc_test_part, c='black')
# ax[2].set_ylim([0.81, 0.95])
# ax[2].set_xlim([200,1000])
ax[2].set_title('Building IBM', fontsize=22)
leg = ax[2].legend(['TE', 'denTE', 'TE-part', 'transTE', 'denTE-part', 'transTE-part'], 
                   loc='lower right', fancybox=True, fontsize = 16)
leg.get_frame().set_alpha(0.7)

plt.subplots_adjust(wspace=0.3)
fig.set_size_inches(28,4)
plt.show()

In [None]:
# partial window size w
# sod: w=9
# sdh: w=8
# ibm: w=17

# SOD
with open("./results/eub1000_fully_none_Falsenorm.bin", "rb") as in_file:
    (cost_f, in_acc_f, out_acc_f, all_acc_f) = pickle.load(in_file)
       
with open("./results/eub1000_partial_none_sub8False_Falsenorm.bin", "rb") as in_file:
    (cost_8, in_acc_8, out_acc_8, all_acc_8) = pickle.load(in_file)
    
with open("./results/eub1000_partial_none_sub11False_Falsenorm.bin", "rb") as in_file:
    (cost_9, in_acc_9, out_acc_9, all_acc_9) = pickle.load(in_file)

with open("./results/eub1000_partial_none_sub17False_Falsenorm.bin", "rb") as in_file:
    (cost_11, in_acc_11, out_acc_11, all_acc_11) = pickle.load(in_file)

# plt.rc('text', usetex=True)
# plt.rc('font', family='arial')
fig, axes = plt.subplots(ncols=3, nrows=1)
ax = axes.flatten()

ax[0].set_xlabel("Number of training labels", fontsize=21)
ax[0].set_ylabel("Predictive accuracy", fontsize=21)
ax[0].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[1].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[2].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[0].grid()
ax[0].plot(cost_f, all_acc_f, linestyle=':')
ax[0].plot(cost_8, all_acc_8, c='orange')
ax[0].plot(cost_9, all_acc_9, c='green')
ax[0].plot(cost_11, all_acc_11, c='red')
# ax[0].set_xlim([200,600])
ax[0].set_title('Building EBU3B', fontsize=22)
leg = ax[0].legend(['Full', r'w=8', r'w=11', r'w=17'], loc='lower right', fancybox=True, fontsize = 16)
leg.get_frame().set_alpha(0.7)
# plt.subplots_adjust(hspace=0.5)


# SDH
with open("./results/eub1000_fully_none_Falsenorm.bin", "rb") as in_file:
    (cost2_f, in2_acc_f, out_acc_f, all2_acc_f) = pickle.load(in_file)
       
with open("./results/eub1000_partial_none_sub8False_Falsenorm.bin", "rb") as in_file:
    (cost2_8, in2_acc_8, out_acc_8, all2_acc_8) = pickle.load(in_file)
    
with open("./results/eub1000_partial_none_sub11False_Falsenorm.bin", "rb") as in_file:
    (cost2_9, in2_acc_9, out_acc_9, all2_acc_9) = pickle.load(in_file)

with open("./results/eub1000_partial_none_sub17False_Falsenorm.bin", "rb") as in_file:
    (cost2_11, in2_acc_11, out_acc_11, all2_acc_11) = pickle.load(in_file)

ax[1].set_xlabel("Number of training labels", fontsize=21)
ax[1].set_ylabel("Predictive accuracy", fontsize=21)
ax[1].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[1].grid()
ax[1].plot(cost2_f, all2_acc_f, linestyle=":")
ax[1].plot(cost2_8, all2_acc_8, c='orange')
ax[1].plot(cost2_9, all2_acc_9, c='green')
ax[1].plot(cost2_11, all2_acc_11, c='red')
# ax[1].set_ylim([0.83, 0.978])
# ax[1].set_xlim([300,1000])
ax[1].set_title('Building SDH', fontsize=22)
leg = ax[1].legend(['Full', 'w=5', 'w=8', 'w=14'],
                   loc='lower right', fancybox=True, fontsize = 16)
leg.get_frame().set_alpha(0.7)

# IBM
with open("./results/eub1000_fully_none_Falsenorm.bin", "rb") as in_file:
    (cost3_f, in3_acc_f, out_acc_f, all3_acc_f) = pickle.load(in_file)
       
with open("./results/eub1000_partial_none_sub8False_Falsenorm.bin", "rb") as in_file:
    (cost3_8, in3_acc_8, out_acc_8, all3_acc_8) = pickle.load(in_file)
    
with open("./results/eub1000_partial_none_sub11False_Falsenorm.bin", "rb") as in_file:
    (cost3_9, in3_acc_9, out_acc_9, all3_acc_9) = pickle.load(in_file)

with open("./results/eub1000_partial_none_sub17False_Falsenorm.bin", "rb") as in_file:
    (cost3_11, in3_acc_11, out_acc_11, all3_acc_11) = pickle.load(in_file)

ax[2].set_xlabel("Number of training labels", fontsize=21)
ax[2].set_ylabel("Predictive accuracy", fontsize=21)
ax[2].tick_params(axis = 'both', which = 'major', labelsize = 17)
ax[2].grid()
ax[2].plot(cost3_f, all3_acc_f, linestyle=':')
ax[2].plot(cost3_8, all3_acc_8, c='orange')
ax[2].plot(cost3_9, all3_acc_9, c='green')
ax[2].plot(cost3_11, all3_acc_11, c='red')
# ax[2].set_xlim([200,1000])
# ax[2].set_ylim([0.83, 0.978])
ax[2].set_title('Building IBM', fontsize=22)
leg = ax[2].legend(['Full', 'w=15', 'w=19', 'w=23'], 
                   loc='lower right', fancybox=True, fontsize = 16)
leg.get_frame().set_alpha(0.7)

plt.subplots_adjust(wspace=0.3)
fig.set_size_inches(28,4)
plt.show()
# plt.tick_params(labelsize=20)
# plt.savefig('perp1.png', bbox_inches='tight')
    
# plt.plot(cost_f, all_acc_f,
#          cost_8, all_acc_8,
#          cost_9, all_acc_9,
#          cost_11, all_acc_11,
#          cost_self_part, all_acc_self_part,
#          cost_test_part, all_acc_test_part)
# plt.legend(['TE', 'denTE', 'TE-part', 'transTE', 'denTE-part', 'transTE-part'], loc='lower right', fancybox=True, fontsize = 12)
# # plt.ylim(0.86, 0.97)

# plt.title('SOD dataset with 5 pretraining samples')
# plt.xlabel('Number of training labels')
# plt.ylabel('Prediction accuracy')
# plt.savefig('./results/sod.png', bbox_inches='tight')
# plt.show()

In [None]:
DATA_PATH = "./dataset/"

with open(DATA_PATH + 'ebu3b_full_parsing.txt','r') as load_f:
    seq_dict = json.load(load_f)

seq_strs = []
seq_lbls = []
for k,v in seq_dict.items():
    strs = ''
    lbls = ''
    for meta_pair in v['VendorGivenName']:
        strs += meta_pair[0]
        lbls += meta_pair[1] + ","
    seq_strs.append(strs)
    seq_lbls.append(lbls)

with open(DATA_PATH + 'eub_string.txt', 'w') as x_file, open(DATA_PATH + 'eub_label.txt', 'w') as y_file: 
    for i in range(len(seq_strs)):
        x_file.write(seq_strs[i] + '\n')
        y_file.write(seq_lbls[i] + '\n')

print (len(seq_strs))