In [1]:
import networkx as nx
import numpy as np
from tqdm import tqdm
from nltk.corpus import wordnet as wn
import os
from utils import prepare_dataset, rf_clf, write_to_file, eval_ensemble

# --- Experiment configuration ------------------------------------------------
min_depth = 4        # fills the "mindepth{}" slot of the concept/CODC file names
min_dist = 0         # fills the "mindis{}" slot of the CODC file name
filter_name = "EMNLP_filter"
graph_filter_weight = False
max_connect_dist = 4
graph_filter_name = "weightfilter" if graph_filter_weight else "nofilter"

proc_type = "train"

# --- Load pre-computed concepts and CODCs ------------------------------------
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
dial_path = "concepts_file/dial_mindepth{}_{}_{}.npy"
dial_concepts = np.load(
    dial_path.format(min_depth, filter_name, proc_type),
    allow_pickle=True)

# One description-concept file per description index 0..4.
desc_path = "concepts_file/desc_mindepth{}_{}_{}_{}.npy"
desc_concepts_list = [
    np.load(desc_path.format(min_depth, filter_name, proc_type, i),
            allow_pickle=True)
    for i in range(5)
]

# Use the configured `min_dist` rather than a literal 0 so that changing the
# configuration above cannot silently load the file of a different setting.
codc_path = "concepts_file/codc_mindepth{}_{}_mindis{}_{}.npy"
codc_allcaps = np.load(
    codc_path.format(min_depth, filter_name, min_dist, proc_type),
    allow_pickle=True)

# 1. Randomly select from the CODCs of all 5 caps

In [3]:
np.random.seed(229)
from itertools import chain
from collections import Counter
from utils import get_f1

# Frequency table of every CODC name over all entries of codc_allcaps
# (each CODC is unpacked as a (name, _) pair).
codc_vocab = Counter(
    codc for codcs in codc_allcaps for codc, _ in codcs
)
# Distinct CODC names, in first-seen order.
codc_vocab_list = list(codc_vocab)

# Distinct CODC names per entry. The names are sorted because the iteration
# order of a set of strings changes between interpreter runs (hash
# randomisation); an unsorted starting order would make the seeded shuffles
# applied to these lists later irreproducible.
# NOTE(review): assumes the names are mutually comparable (e.g. all str) — confirm.
codc_allcaps_name = [
    sorted({codc for codc, _ in codcs}) for codcs in codc_allcaps
]

In [48]:
# Retrieve top-k knowledge per example by random sampling.
# pos_prop is the probability that a slot is filled with one of the example's
# own CODCs; every other slot is filled with a random entry of the vocabulary.

def sample_random_knowledge(codc_names, vocab, topk, pos_prop):
    """Draw ``topk`` knowledge entries for one example.

    Args:
        codc_names: the example's own CODC names (the "positive" candidates).
            The sequence is copied before shuffling, so the caller's list is
            left untouched.
        vocab: list of names used for the remaining (random) slots.
        topk: number of entries to return.
        pos_prop: per-slot probability of using a positive candidate.

    Returns:
        A list of ``topk`` names. A slot selected as positive falls back to a
        random vocabulary entry once the positives are exhausted.
    """
    shuffled = list(codc_names)
    np.random.shuffle(shuffled)
    wants_positive = np.random.uniform(low=0.0, high=1.0, size=topk) < pos_prop

    know = []
    next_pos = 0  # index of the next unused positive candidate
    for is_pos in wants_positive:
        if is_pos and next_pos < len(shuffled):
            know.append(shuffled[next_pos])
            next_pos += 1
        else:
            know.append(vocab[np.random.randint(0, len(vocab))])
    return know


for pos_prop in [0.15]:
    for topk in [5]:
        knowledge = [
            sample_random_knowledge(codcs_name, codc_vocab_list, topk, pos_prop)
            for codcs_name in codc_allcaps_name
        ]
        print("positive proportion:", pos_prop, "topk", topk)
        # Fraction of sampled entries that are true CODCs of their example.
        # Random vocabulary fillers can also be hits, so this is not exactly
        # pos_prop.
        num_hits = sum(
            k in codcs_name
            for codcs_name, know in zip(codc_allcaps_name, knowledge)
            for k in know
        )
        print("num pos:", num_hits / (len(codc_allcaps_name) * topk))
        print(get_f1(knowledge, codc_allcaps_name))
        write_to_file(
            knowledge,
            "knowledge/train-knowledge/random-sample/top{}_prop{}_train.txt".format(topk, pos_prop))


positive proportion: 0.15 topk 5
num pos: 0.14974125439867522
number of zeros hits 42565
(0.1497210722417719, 0.16800070498271907, 0.1483812305423733)


# 2. For each cap, select several pieces of knowledge

In [4]:
graph_path = '/home/tfangaa/projects/Deprecate/see2017seq2seq/wn_data/wngraph/wngraph.pickle'
# nx.read_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.load, so reading the file directly works on old and new versions.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
import pickle
with open(graph_path, "rb") as graph_file:
    G = pickle.load(graph_file).to_undirected()
def get_graph_shortest_path(G, desc, dial):
    """Return the number of nodes on a shortest path between two graph nodes.

    Args:
        G: the graph to search.
        desc: source node.
        dial: target node.

    Returns:
        ``len(nx.shortest_path(G, desc, dial))``, or the sentinel ``1e3`` when
        no path exists or either node is missing from the graph.
    """
    try:
        return len(nx.shortest_path(G, desc, dial))
    # Only "unreachable" and "unknown node" mean "infinitely far". Anything
    # else (KeyboardInterrupt, typos, ...) should surface instead of being
    # silently turned into a distance.
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return 1e3
# Threshold read by is_codc. It is defined here so the helpers below work even
# when they are called before a later cell assigns `min_distance`.
# NOTE(review): presumably the same threshold as the config value `min_dist` — confirm.
min_distance = min_dist


def shortest_path(desc, dial):
    """Node count of the shortest path between two node names in G, minus 2."""
    return get_graph_shortest_path(G, desc, dial) - 2


def is_codc(desc_synset, dial_synsets):
    """Decide whether ``desc_synset`` qualifies against all ``dial_synsets``.

    Both conditions must hold for every dialogue synset:
      1. shouldn't be too close: the graph distance is at least the
         module-level ``min_distance``;
      2. cy shouldn't be one of the hypernyms of cxs: ``desc_synset`` is not a
         direct hypernym of the dialogue synset.
    """
    far_enough = all(
        shortest_path(desc_synset.name(), dial_synset.name()) >= min_distance
        for dial_synset in dial_synsets)
    return far_enough and all(
        desc_synset not in dial_synset.hypernyms()
        for dial_synset in dial_synsets)


def get_CODC(dial_concepts, desc_concepts):
    """Return the description concepts that qualify as CODCs for a dialogue.

    Each concept is indexed as ``concept[0]`` (compared for equality against
    the dialogue concepts) and ``concept[1]`` (a name passed to ``wn.synset``).

    A description concept is kept when its ``[0]`` entry does not occur among
    the dialogue concepts and ``is_codc`` accepts its synset.
    """
    # Loop-invariant: resolve the dialogue side once, not once per description
    # concept.
    dial_words = [dial_concept[0] for dial_concept in dial_concepts]
    dial_synsets = [wn.synset(dial_concept[1]) for dial_concept in dial_concepts]
    return [
        desc_concept for desc_concept in desc_concepts
        # Cheap membership test first, so the graph search is skipped for
        # concepts that are rejected anyway.
        if desc_concept[0] not in dial_words
        and is_codc(wn.synset(desc_concept[1]), dial_synsets)
    ]

In [None]:
# 1. get codcs individually
# Sequential variant: one full pass over the dialogue concepts for each of the
# five description-concept sets.

codc_by_desc = []
for desc_idx in range(5):
    concept_pairs = tqdm(zip(dial_concepts, desc_concepts_list[desc_idx]))
    codc_by_desc.append(
        [get_CODC(dial_side, desc_side) for dial_side, desc_side in concept_pairs]
    )

12815it [01:47, 121.26it/s]

In [5]:
from multiprocessing import Pool


def get_codc(dial_concepts, desc_concepts):
    """Run get_CODC over aligned (dialogue, description) concept pairs.

    Returns one get_CODC result per zipped pair, in order; progress is shown
    with tqdm.
    """
    return [
        get_CODC(dial_side, desc_side)
        for dial_side, desc_side in tqdm(zip(dial_concepts, desc_concepts))
    ]


# Assigned before the pool is created, as in the original statement order.
min_distance = 0
workers = Pool(5)

# Submit one asynchronous job per description-concept set.
all_results = [
    workers.apply_async(get_codc, args=(dial_concepts, desc_concepts_list[i]))
    for i in range(5)
]

# No more submissions; wait for every job to finish.
workers.close()
workers.join()

codc_by_desc = [pending.get() for pending in all_results]

96620it [15:42, 102.56it/s]
96620it [15:41, 102.60it/s]
96620it [15:47, 101.92it/s]
96620it [16:47, 95.89it/s] 
96620it [16:47, 95.87it/s] 


In [14]:
# Count the lines of the 5-reference dialogue file and description file
# (IPython shell escapes); a later cell pairs these two files line by line.
!wc -l /home/tfangaa/projects/OpenNMT-py-summ/haojie/data/dialogs/dialog.train.5ref.txt
!wc -l /home/tfangaa/projects/OpenNMT-py-summ/haojie/data/ground/desc.train.5ref.txt

483100 /home/tfangaa/projects/OpenNMT-py-summ/haojie/data/dialogs/dialog.train.5ref.txt
483100 /home/tfangaa/projects/OpenNMT-py-summ/haojie/data/ground/desc.train.5ref.txt


In [17]:
# map from this to the dialog.train.5ref.txt

dial_single_path = "/home/tfangaa/projects/OpenNMT-py-summ/haojie/data/dialogs/dialog.train.5ref.txt.single"
# NOTE: this rebinds `desc_path`, which an earlier cell used for the .npy template.
desc_path = "/home/tfangaa/projects/OpenNMT-py-summ/haojie/data/ground/desc.{}.5ref.txt.{}"
desc_5ref_path = "/home/tfangaa/projects/OpenNMT-py-summ/haojie/data/ground/desc.train.5ref.txt"
dial_5ref_path = "/home/tfangaa/projects/OpenNMT-py-summ/haojie/data/dialogs/dialog.train.5ref.txt"


def _read_lines(path):
    """Return every line of the text file at ``path`` (newlines kept).

    The file is closed on return instead of being left to the garbage
    collector.
    """
    with open(path) as handle:
        return handle.readlines()


dialogue_lines = _read_lines(dial_5ref_path)
dial_single = _read_lines(dial_single_path)
descs_seps = [_read_lines(desc_path.format("train", i)) for i in range(5)]
descs_5ref = _read_lines(desc_5ref_path)

# Map each (dialogue line, description line) pair to its line index in the
# 5ref files. A pair seen once maps to an int; a pair seen several times maps
# to the list of all its indices, in file order.
dial_desc_id_dict = dict()
for i, pair in enumerate(zip(dialogue_lines, descs_5ref)):
    seen = dial_desc_id_dict.get(pair)
    if seen is None:
        dial_desc_id_dict[pair] = i
    elif isinstance(seen, list):
        seen.append(i)
    else:
        dial_desc_id_dict[pair] = [seen, i]

# Number of distinct pairs vs. number of lines.
print(len(dial_desc_id_dict), len(dialogue_lines))
# Total number of stored indices; should equal the number of lines.
print( sum([len(val) if isinstance(val, list) else 1 for val in dial_desc_id_dict.values()]) )

482876 483100
483100


In [19]:
# Working copy of the lookup. The list values are copied as well: dict.copy()
# is shallow, so consuming ids from the copy would also empty the lists stored
# in dial_desc_id_dict and make this cell fail or change on a re-run.
dial_desc_id_dict_copy = {
    pair: list(ids) if isinstance(ids, list) else ids
    for pair, ids in dial_desc_id_dict.items()
}

# indexes[i][j] = line index in the 5ref files of the j-th line pair of
# (dial_single, descs_seps[i]).
indexes = []
for i in range(5):
    index = []
    for dial_line, desc_line in zip(dial_single, descs_seps[i]):
        ids = dial_desc_id_dict_copy[(dial_line, desc_line)]
        if isinstance(ids, list):
            # Duplicate pair: hand out its indices one by one, smallest first.
            index.append(ids.pop(0))
        else:
            index.append(ids)
    indexes.append(index)

# Sum of distinct indices per split; equals the number of 5ref lines when no
# index is assigned twice within a split.
print(sum([len(set(item)) for item in indexes]))

483100


In [34]:
def is_ascending(l):
    """Return True when each element of ``l`` is strictly greater than the one before it."""
    return all(following > current for current, following in zip(l, l[1:]))


# Every one of the five index lists must be strictly increasing.
all(map(is_ascending, (indexes[i] for i in range(5))))

True

In [36]:
# Flatten the five per-split index lists into a single list, split after split.
reverse_indexes = list(chain.from_iterable(indexes))


# Check that the concatenated split description files match the 5ref
# description file line by line (zip stops at the shorter input, so the
# lengths themselves are not compared here).
all(sep_line == ref_line
    for sep_line, ref_line in zip(chain.from_iterable(descs_seps), descs_5ref))

True

In [39]:
# Concatenate the per-description CODC lists into one list, set after set.
all_codcs_5ref = [
    codcs for per_desc in codc_by_desc for codcs in per_desc
]
# Keep only the first element of every CODC pair.
all_codcs_5ref_names = [
    [name for name, _ in codcs]
    for codcs in all_codcs_5ref
]

In [43]:
# Retrieve top-k knowledge for every (pos_prop, topk) combination.
# pos_prop is the per-slot probability of using one of the example's own CODCs.

sampling_grid = [
    (pos_prop, topk)
    for pos_prop in [0.2, 0.4]
    for topk in [3, 5, 6, 10]
]

for pos_prop, topk in sampling_grid:
    knowledge = []
    for codcs_name in all_codcs_5ref_names:
        # In-place shuffle: the order of the shared list changes as a side effect.
        np.random.shuffle(codcs_name)
        pos_mask = np.random.uniform(low=0.0, high=1.0, size=topk) < pos_prop

        used = 0  # how many of the example's own CODCs were consumed so far
        know = []
        for is_pos in pos_mask:
            take_positive = used < len(codcs_name) and is_pos
            if not take_positive:
                # Negative slot, or positives exhausted: random vocabulary entry.
                know.append(codc_vocab_list[np.random.randint(0, len(codc_vocab_list))])
                continue
            know.append(codcs_name[used])
            used += 1
        knowledge.append(know)

    print("positive proportion:", pos_prop, "topk", topk)

    # Count sampled entries that belong to their example's CODC list.
    hit_count = 0
    for codcs_name, know in zip(all_codcs_5ref_names, knowledge):
        hit_count += sum([k in codcs_name for k in know])
    print("proportion of pos:", hit_count / (len(all_codcs_5ref_names) * topk))

    print(get_f1(knowledge, all_codcs_5ref_names))
    output_path = "knowledge/train-knowledge/random-fivecap/fivecap_top{}_prop{}_train.txt".format(topk, pos_prop)
    write_to_file(knowledge, output_path)
    


positive proportion: 0.2 topk 3
proportion of pos: 0.15352239011936797
number of zeros hits 285377
(0.15350893534809906, 0.29780502718491747, 0.193357866270669)
positive proportion: 0.2 topk 5
proportion of pos: 0.14016518319188573
number of zeros hits 211075
(0.14014196508659352, 0.43757336319813034, 0.204118928141822)
positive proportion: 0.2 topk 6
proportion of pos: 0.13380908024563581
number of zeros hits 184394
(0.13378603463741115, 0.4937968466881217, 0.20304827252440208)
positive proportion: 0.2 topk 10
proportion of pos: 0.11103829434899606
number of zeros hits 121859
(0.11100063823914996, 0.652663659215046, 0.18460491300637888)
positive proportion: 0.4 topk 3
proportion of pos: 0.27958186710825916
number of zeros hits 165364
(0.27957082729593596, 0.5198546124233873, 0.34686448230579886)
positive proportion: 0.4 topk 5
proportion of pos: 0.22983937073069757
number of zeros hits 110025
(0.22980780376733598, 0.6758936485327501, 0.3293991755360591)
positive proportion: 0.4 topk 6