In [1]:
import os, sys
import shutil
import urllib.request
import zipfile
os.chdir("/home/ubuntu/MBE")

import argparse
import csv
import re

import pandas as pd
from dateutil import relativedelta
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from src.utils import parse_directed
from src.utils import parse_undirected
import src.pageRank
import operator

# Load Arguments

In [2]:
# Dataset folder name to process (alternative: "FB-MBE").
dataset = "BIMR"

# Display the directory that is used as the default data directory for this dataset.
os.path.join(os.getcwd(), "data", dataset)

'/home/ubuntu/MBE/data/BIMR'

In [3]:
def _str2bool(value):
    """Convert a command-line string such as 'true', 'False', '1' or 'no' into a real bool.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True`` because every
    non-empty string is truthy, so such an option could never be switched off explicitly.

    Raises
    ------
    argparse.ArgumentTypeError
        If ``value`` is not a recognised spelling of true/false.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string is kept as False, which is what bool('') returned before.
    if normalized in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


parser = argparse.ArgumentParser(description='MBE')

# Experiment control
parser.add_argument('--process_data', action='store_true',
                    help='process knowledge graph (default: False)')
parser.add_argument('--train', action='store_true',
                    help='run path selection set_policy training (default: False)')
parser.add_argument('--inference', action='store_true',
                    help='run knowledge graph inference (default: False)')

# NOTE(review): this default ('WN-MBE') differs from the `dataset` variable that the directory
# defaults below are built from - confirm which one downstream code is supposed to rely on.
parser.add_argument('--dataset', type=str, default='WN-MBE',
                    help='dataset (default: WN-MBE, FB-MBE, NELL-MBE)')
parser.add_argument('--data_dir', type=str, default=os.path.join(os.getcwd(), 'data', dataset),
                    help='directory where the knowledge graph data is stored (default: None)')
parser.add_argument('--model_root_dir', type=str, default=os.path.join(os.getcwd(), 'model', dataset),
                    help='root directory where the model parameters are stored (default: None)')
parser.add_argument('--model_dir', type=str, default=os.path.join(os.getcwd(), 'model', dataset),
                    help='directory where the model parameters are stored (default: None)')
parser.add_argument('--evaluation_dir', type=str, default=os.path.join(os.getcwd(), 'model', 'evaluation'),
                    help='root directory where the model parameters are stored (default: None)')
parser.add_argument('--gpu', dest='gpu', type=int, default=0,
                    help='gpu device (default: 0)')
parser.add_argument('--checkpoint_path', type=str, default=None,
                    help='path to a pretrained checkpoint')

# Network Architecture
parser.add_argument('--model', type=str, default='point',
                    help='knowledge graph QA model (default: point)')
parser.add_argument('--emb_dim', type=int, default=100, metavar='E',
                    help='embedding dimension (default: 100)')
parser.add_argument('--history_dim', type=int, default=100, metavar='H',
                    help='action history encoding LSTM hidden states dimension (default: 100)')
parser.add_argument('--history_num_layers', type=int, default=3, metavar='L',
                    help='action history encoding LSTM number of layers (default: 3)')
parser.add_argument('--use_action_space_bucketing', type=_str2bool, default=True,
                    help='bucket adjacency list by outgoing degree to avoid memory blow-up (default: True)')
parser.add_argument('--bucket_interval', type=int, default=10,
                    help='adjacency list bucket size (default: 10)')

# Optimization
parser.add_argument('--num_epochs', type=int, default=200,
                    help='maximum number of pass over the entire training set (default: 200)')
parser.add_argument('--num_wait_epochs', type=int, default=5,
                    help='number of epochs to wait before stopping training if dev set performance drops')
parser.add_argument('--num_peek_epochs', type=int, default=2,
                    help='number of epochs to wait for next dev set result check (default: 2)')
parser.add_argument('--start_epoch', type=int, default=0,
                    help='epoch from which the training should start (default: 0)')
parser.add_argument('--batch_size', type=int, default=256,
                    help='mini-batch size (default: 256)')
parser.add_argument('--train_batch_size', type=int, default=256,
                    help='mini-batch size during training (default: 256)')
parser.add_argument('--dev_batch_size', type=int, default=64,
                    help='mini-batch size during inference (default: 64)')
parser.add_argument('--margin', type=float, default=0,
                    help='margin used for base MAMES training (default: 0)')
parser.add_argument('--learning_rate', type=float, default=0.001,
                    help='learning rate (default: 0.001)')
parser.add_argument('--learning_rate_decay', type=float, default=1.0,
                    help='learning rate decay factor for the Adam optimizer (default: 1)')
parser.add_argument('--adam_beta1', type=float, default=0.9,
                    help='Adam: decay rates for the first movement estimate (default: 0.9)')
parser.add_argument('--adam_beta2', type=float, default=0.999,
                    help='Adam: decay rates for the second raw movement estimate (default: 0.999)')
parser.add_argument('--grad_norm', type=float, default=10000,
                    help='norm threshold for gradient clipping (default 10000)')
parser.add_argument('--xavier_initialization', type=_str2bool, default=True,
                    help='Initialize all model parameters using xavier initialization (default: True)')

# Graph Completion
parser.add_argument('--theta', type=float, default=0.2,
                    help='Threshold for sifting high-confidence facts (default: 0.2)')

# Reinforcement Learning
parser.add_argument('--num_rollouts', type=int, default=20,
                    help='number of rollouts (default: 20)')
parser.add_argument('--num_rollout_steps', type=int, default=3,
                    help='maximum path length (default: 3)')
parser.add_argument('--bandwidth', type=int, default=300,
                    help='maximum number of outgoing edges to explore at each step (default: 300)')
parser.add_argument('--beta', type=float, default=0.0,
                    help='entropy regularization weight (default: 0.0)')
parser.add_argument('--gamma', type=float, default=1,
                    help='moving average weight (default: 1)')

# Search Decoding
parser.add_argument('--beam_size', type=int, default=100,
                    help='size of beam used in beam search inference (default: 100)')
# Note that the embedding- and rule-based baselines all mask false negative facts in the
# dev/test set, so mask_test_false_negatives is also set to True here.
# Multi-Hop, GR and RuleGuider use the same setting.
parser.add_argument('--mask_test_false_negatives', type=_str2bool, default=True,
                    help='mask false negative examples in the dev/test set during decoding (default: True. '
                         'Use the same filter settings as other baseline methods.)')
parser.add_argument('--save_beam_search_paths', action='store_true',
                    help='save the decoded path into a CSV file (default: False)')

# MBE parameters
parser.add_argument('--batch_num', type=int, default=6,
                    help='the number of new batch and original KG (default: 5+1=6)')
parser.add_argument('--now_batch', type=int, default=0,
                    help='indicate the currently used data(train: 0, valid: 1; new batch: 2-6)')

# ablation study
parser.add_argument('--argcn', type=_str2bool, default=True,
                    help='If true, the model will use ARGCN to generate embeddings (default: True)')
parser.add_argument('--aug_link', type=_str2bool, default=True,
                    help='If true, the model will use augmentation links (default: True)')
parser.add_argument('--attn', type=_str2bool, default=True,
                    help='If true, the model will use feedback attention (default: True)')

# model details
# ARGCN
parser.add_argument('--rel_agg', type=str, default='sum',
                    help='The pooling function of the relational convolutional layer (the first layer) (default: sum)')
parser.add_argument('--ent_agg', type=str, default='sum',
                    help='The aggregation method of the stacked layers (default: sum)')
parser.add_argument('--neigh_dropout', type=float, default=0.3,
                    help='Dropout rate of neighboring entities (default: 0.3)')
parser.add_argument('--node_dropout', type=float, default=0.3,
                    help='Dropout rate of entity embeddings (default: 0.3)')
parser.add_argument('--gcn_layer', type=int, default=1,
                    help='GCN layer (default: 1)')
# Augmentation link
parser.add_argument('--aug_link_threshold', type=float, default=0.3,
                    help='Confidence threshold value (default: 0.3)')
parser.add_argument('--aug_link_support_threshold', type=float, default=1.0,
                    help='Support threshold value (default: 1.0). value = predict_pos / groundtruth_pos. '
                         'Note that the rollout num = 20, so the value is in [0,20].')

# Evaluation parameters
parser.add_argument('--vs100', type=_str2bool, default=False,
                    help='If true, the model will be evaluated with a 1vs100 setting (default: False)')
parser.add_argument('--run_analysis', action='store_true',
                    help='If true, the model will be evaluated on both validation and testing sets (default: False)')

# Knowledge Graph
parser.add_argument('--add_reverse_relations', type=_str2bool, default=True,
                    help='add reverse relations to KB (default: True)')
parser.add_argument('--add_reversed_training_edges', action='store_true',
                    help='add reversed edges to extend training set (default: False)')
parser.add_argument('--train_entire_graph', type=_str2bool, default=False,
                    help='add all edges in the graph to extend training set (default: False)')
parser.add_argument('--emb_dropout_rate', type=float, default=0.3,
                    help='Knowledge graph embedding dropout rate (default: 0.3)')
parser.add_argument('--zero_entity_initialization', type=_str2bool, default=False,
                    help='Initialize all entities to zero (default: False)')
parser.add_argument('--uniform_entity_initialization', type=_str2bool, default=False,
                    help='Initialize all entities with the same random embedding (default: False)')

# parse_known_args() tolerates arguments this parser does not define (collected in `unknown`)
# instead of exiting, which parse_args() would do.
args, unknown = parser.parse_known_args()

In [42]:
def load_data(path):
    """Load a tab-separated triples file and reduce every value to its bare identifier.

    The file is read without a header, with the column order ``head, relation, tail``.

    Parameters
    ----------
    path : str
        Path of the triples file.

    Returns
    -------
    pandas.DataFrame
        Columns in the order ``head, tail, relation``; each value is cut at its first ``_``
        (e.g. ``'Q21_England'`` becomes ``'Q21'``).
    """
    # dtype=str keeps purely numeric identifiers as text: without it pandas infers an integer
    # column, the .str accessor below raises AttributeError and leading zeros are lost.
    triples = pd.read_csv(path, sep='\t', names=["head", "relation", "tail"], dtype=str)
    triples = triples.loc[:, ['head', 'tail', 'relation']]

    # Drop everything after the identifier (after the first "_"), because the model uses the
    # strings as identifiers for some operations.
    for col in triples.columns:
        triples[col] = triples[col].str.split('_').str[0]

    return triples

In [25]:
def is_float(value):
    """Return True if the string ``value`` is a plain decimal number.

    Accepted form: optional leading minus sign, digits, and optionally a dot followed by more
    digits (so ``'1.'``, ``'.5'`` and exponent notation are rejected).
    """
    matched = re.match(r'^-?\d+(\.\d+)?$', value)
    return matched is not None

In [35]:
def generate_rank_scores(df, path, isDirected, split_char=':'):
    """Compute PageRank scores for the graph described by ``df`` and write them to ``path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table; its rows are passed to the ``src.utils`` graph parser as plain lists.
        (The row layout the parser expects is not visible here - verify against src.utils.)
    path : str
        Output file. It is opened in write mode, so existing content is replaced.
    isDirected : bool
        Build the graph with ``parse_directed`` when true, with ``parse_undirected`` otherwise.
        The flag is also forwarded to ``PageRank``.
    split_char : str
        Separator written in front of the score on every output line.

    One line per node is written, ordered by descending score.
    """
    edges = df.values.tolist()

    # Pick the parser that matches the flag handed to PageRank; previously the graph was
    # always built undirected, regardless of isDirected.
    build_graph = parse_directed if isDirected else parse_undirected
    graph = build_graph(edges)

    # `import src.pageRank` binds only the name `src`, so the class has to be reached through
    # the package; a bare `pageRank` raises NameError on a fresh kernel.
    ranker = src.pageRank.PageRank(graph, isDirected)
    ranker.rank()

    ranked = sorted(ranker.ranks.items(), key=operator.itemgetter(1), reverse=True)

    with open(path, mode="w") as file:
        for node, score in ranked:
            # Numeric node ids are written as integers. is_float() only accepts strings,
            # so other node types are passed through unchanged.
            if isinstance(node, float) or (isinstance(node, str) and is_float(node)):
                node = round(float(node))

            file.write(f'{node:<30} {split_char}{score:10}\n')

In [39]:
def create_dataset_partition(data_dir, number_partitions=2, isDirected=False, prksc_split_char=':'):
    """Write the per-partition triples, PageRank scores and support files below ``data_dir``.

    Partitions ``0 .. number_partitions`` are processed in order:

    * 0 reads ``train.txt`` and writes ``base_train.triples`` directly into ``data_dir``,
    * 1 reads ``valid.txt`` and writes ``add_1/valid.triples``,
    * every index >= 2 reads ``test.txt`` and writes ``add_<i>/test.triples``.

    Each partition folder also receives a ``node.pgrk`` file computed over all partitions
    loaded so far, and from index 2 on a ``support.triples`` file holding the partitions
    with index >= 2 accumulated up to that point.
    """
    pagerank_frames = []  # every partition loaded so far
    support_frames = []   # partitions with index >= 2

    for i in range(number_partitions + 1):
        print(f'working on partition: {i}')

        if i == 0:
            file, name, folder = 'train.txt', 'base_train.triples', data_dir
        elif i == 1:
            file, name, folder = 'valid.txt', 'valid.triples', f'{data_dir}/add_{i}'
        else:
            file, name, folder = 'test.txt', 'test.triples', f'{data_dir}/add_{i}'

        os.makedirs(folder, exist_ok=True)

        # Load the split and store it in the format used for the partition files.
        triples = load_data(f'{data_dir}/{file}')
        triples.to_csv(f'{folder}/{name}', sep='\t', index=False, header=False)

        # PageRank is computed on the union of all partitions seen up to now.
        pagerank_frames.append(triples)
        scores_path = f'{folder}/node.pgrk'
        if os.path.exists(scores_path):
            os.remove(scores_path)
        generate_rank_scores(pd.concat(pagerank_frames), scores_path, isDirected, split_char=prksc_split_char)

        # Support triples only exist for the partitions after the validation one.
        if i > 1:
            support_frames.append(triples)
            pd.concat(support_frames).to_csv(f'{data_dir}/add_{str(i)}/support.triples', sep='\t', index=False, header=False)

In [43]:
# Build the partition folders, triples files and PageRank score files with the default settings
# (number_partitions=2, undirected graph).
create_dataset_partition(args.data_dir)

working on partition: 0
working on partition: 1
working on partition: 2


In [None]:
# NOTE: duplicate of the load_data defined earlier in this notebook; kept behaviourally identical
# so the result does not depend on which of the two cells was executed last.
def load_data(path):
    """Load a tab-separated triples file and reduce every value to its bare identifier.

    The file is read without a header, with the column order ``head, relation, tail``.

    Parameters
    ----------
    path : str
        Path of the triples file.

    Returns
    -------
    pandas.DataFrame
        Columns in the order ``head, tail, relation``; each value is cut at its first ``_``
        (e.g. ``'Q21_England'`` becomes ``'Q21'``).
    """
    # dtype=str keeps purely numeric identifiers as text: without it pandas infers an integer
    # column, the .str accessor below raises AttributeError and leading zeros are lost.
    triples = pd.read_csv(path, sep='\t', names=["head", "relation", "tail"], dtype=str)
    triples = triples.loc[:, ['head', 'tail', 'relation']]

    # Drop everything after the identifier (after the first "_"), because the model uses the
    # strings as identifiers for some operations.
    for col in triples.columns:
        triples[col] = triples[col].str.split('_').str[0]

    return triples

In [9]:
# Load the raw test triples; the identifiers still carry their label suffix here.
df = pd.read_csv(f'{args.data_dir}/test.txt', sep='\t', names=["head", "relation", "tail"])
l = []
# list.append() mutates the list and returns None, so its result must not be assigned back
# to `l` (doing so left `l` as None and the cell displayed nothing).
l.append([df['head'].unique(), df['tail'].unique(), df['relation'].unique()])
l

In [10]:
# Distinct head values of the raw test triples (label suffix still attached).
df['head'].unique()

array(['Q107008_Count_Basie', 'Q1701293_John_P._Livadary',
       'Q2702789_Relativity_Media', ..., 'Q238877_Jenna_Fischer',
       'Q229050_Tori_Spelling', 'Q699224_The_Flowers_of_War'],
      dtype=object)

In [37]:
def get_BIMR_id_list(path, list_of_files):
    """Map every bare identifier found in the given split files to its full labelled name.

    Parameters
    ----------
    path : str
        Unused. The files are always read from ``args.data_dir`` (notebook-level variable);
        the parameter is kept so existing calls keep working.
    list_of_files : list of str
        File names inside ``args.data_dir``; each line holds tab-separated values.

    Returns
    -------
    dict
        ``{'Q21': 'Q21_England', ...}`` - the key is the part of a value before its first
        ``_``; the first full value seen for a key wins.
    """
    id_dict = {}

    for file in list_of_files:
        with open(f'{args.data_dir}/{file}') as f:
            for line in f:
                # Strip the line break before splitting: otherwise the last field of a line
                # yields a key ending in "\n" when it has no underscore, and a blank line
                # adds a "\n" -> "" entry.
                line = line.rstrip('\n')
                if not line:
                    continue
                for element in line.split('\t'):
                    if not element:
                        # Empty field: it carries no identifier to record.
                        continue
                    id_dict.setdefault(element.split('_')[0], element)
    return id_dict

In [38]:
# The three split files that are scanned for identifiers.
list_of_files = ['train.txt', 'test.txt', 'valid.txt']
# NOTE: get_BIMR_id_list does not use this argument; it reads the files from args.data_dir.
path = f'{args.data_dir}/test.txt'

# Map each bare identifier (e.g. 'Q21') to its full labelled form (e.g. 'Q21_England').
d = get_BIMR_id_list(path, list_of_files)

d

{'Q494722': 'Q494722_Stand_by_Me',
 'P136': 'P136_genre',
 'Q319221': 'Q319221_adventure_film',
 'Q218999': 'Q218999_A_Man_for_All_Seasons',
 'P1411': 'P1411_nominated_for',
 'Q106301': 'Q106301_Academy_Award_for_Best_Supporting_Actress',
 'Q21': 'Q21_England',
 'P1365': 'P1365_replaces',
 'Q179876': 'Q179876_England',
 'Q1196958': 'Q1196958_The_Soloist',
 'P364': 'P364_original_language_of_film_or_TV_show',
 'Q1860': 'Q1860_English',
 'Q55433': 'Q55433_Michelangelo_Antonioni',
 'P27': 'P27_country_of_citizenship',
 'Q38': 'Q38_Italy',
 'Q142369': 'Q142369_Lancaster_County',
 'P47': 'P47_shares_border_with',
 'Q488690': 'Q488690_Dauphin_County',
 'Q83273': 'Q83273_Ibaraki_Prefecture',
 'Q71707': 'Q71707_Fukushima_Prefecture',
 'Q2164531': 'Q2164531_Rounder_Records',
 'Q43343': 'Q43343_contemporary_folk_music',
 'Q130549': 'Q130549_Tracey_Ullman',
 'P31': 'P31_instance_of',
 'Q5': 'Q5_human',
 'Q974': 'Q974_Democratic_Republic_of_the_Congo',
 'Q929': 'Q929_Central_African_Republic',
 'Q

In [35]:
# Number of distinct identifiers collected from the split files.
len(d)

15991

In [36]:
# Shallow size of the dict object in bytes; the key and value strings it refers to are not included.
sys.getsizeof(d)

589920

In [None]:
# Add the dummy nodes and edges to the mapping (each one maps to itself); otherwise the
# translation from id2name will sometimes fail.
dummy_dict = {
    name: name
    for name in ('DUMMY_ENTITY', 'NO_OP_ENTITY', 'DUMMY_RELATION', 'START_RELATION', 'NO_OP_RELATION')
}
# Build a new dict rather than mutating the one created earlier.
d = dict(d)
d.update(dummy_dict)

In [41]:
# Persist the id -> name mapping, one tab-separated pair per line.
# NOTE(review): a later cell reads 'id2name.txt', not 'BIMR_id2name.txt' - confirm which file
# name is the intended one.
with open(f'{args.data_dir}/BIMR_id2name.txt', 'w') as file:
    for key, value in d.items():
        file.write(f'{key}\t{value}\n')

In [47]:
def load_dict(input_path):
    """Read a two-column text file into a dict.

    Each non-blank line is split at its first run of whitespace: the part before it becomes the
    key, the remainder becomes the value. Values may therefore contain spaces.

    Parameters
    ----------
    input_path : str
        Path of the mapping file.

    Returns
    -------
    dict
        Key -> value; for repeated keys the last line wins.

    Raises
    ------
    ValueError
        If a non-blank line contains a key but no value.
    """
    d = {}
    with open(input_path) as f:
        # Iterate over the file directly; readlines() would load it fully into memory first.
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no entry; unpacking them used to raise ValueError.
                continue
            # maxsplit=1 keeps values with embedded whitespace in one piece; lines with
            # exactly two tokens give the same result as an unrestricted split().
            k, v = line.split(None, 1)
            d[k] = v
    return d

In [48]:
# NOTE(review): the mapping is written to 'BIMR_id2name.txt' earlier in this notebook, while
# 'id2name.txt' is read here - confirm which file name is intended.
e = load_dict(f'{args.data_dir}/id2name.txt')
e

{'Q494722': 'Q494722_Stand_by_Me',
 'P136': 'P136_genre',
 'Q319221': 'Q319221_adventure_film',
 'Q218999': 'Q218999_A_Man_for_All_Seasons',
 'P1411': 'P1411_nominated_for',
 'Q106301': 'Q106301_Academy_Award_for_Best_Supporting_Actress',
 'Q21': 'Q21_England',
 'P1365': 'P1365_replaces',
 'Q179876': 'Q179876_England',
 'Q1196958': 'Q1196958_The_Soloist',
 'P364': 'P364_original_language_of_film_or_TV_show',
 'Q1860': 'Q1860_English',
 'Q55433': 'Q55433_Michelangelo_Antonioni',
 'P27': 'P27_country_of_citizenship',
 'Q38': 'Q38_Italy',
 'Q142369': 'Q142369_Lancaster_County',
 'P47': 'P47_shares_border_with',
 'Q488690': 'Q488690_Dauphin_County',
 'Q83273': 'Q83273_Ibaraki_Prefecture',
 'Q71707': 'Q71707_Fukushima_Prefecture',
 'Q2164531': 'Q2164531_Rounder_Records',
 'Q43343': 'Q43343_contemporary_folk_music',
 'Q130549': 'Q130549_Tracey_Ullman',
 'P31': 'P31_instance_of',
 'Q5': 'Q5_human',
 'Q974': 'Q974_Democratic_Republic_of_the_Congo',
 'Q929': 'Q929_Central_African_Republic',
 'Q

In [41]:
def load_dict_interpretability(input_path):
    """Parse a tab-separated rule file into ``{relation: {rule_path: score}}``.

    A usable line has the form ``score<TAB>relation<TAB>step_1<TAB>step_2...``. The steps are
    collected into a tuple after stripping surrounding ``,``, spaces and line breaks. Scores are
    kept as the strings found in the file (no numeric conversion).

    Skipped lines: blank or whitespace-only lines, lines without a second field, and lines whose
    second field starts with ``(``.

    Parameters
    ----------
    input_path : str
        Path of the rule file.

    Returns
    -------
    dict
        Relation -> {tuple of path steps -> score string}; for a repeated (relation, path)
        pair the last line wins.
    """
    d = {}
    with open(input_path) as f:
        for line in f:
            # Remove the line break first so it cannot end up inside the relation key.
            fields = line.rstrip('\n').split('\t')
            # A line without a tab has no relation field; indexing fields[1] on it used to
            # raise IndexError for anything other than a completely empty line.
            if len(fields) < 2 or fields[1].startswith('('):
                continue
            score, relation = fields[0], fields[1]
            path = tuple(step.strip('\n, ') for step in fields[2:])
            d.setdefault(relation, {})[path] = score
    return d

In [42]:
# Parse the rule file into {relation: {rule path tuple: score string}}.
path = f'{args.data_dir}/interpretability.txt'
k = load_dict_interpretability(path)
k

{'P47_shares_border_with': {('inv_P840_narrative_location',
   'P840_narrative_location',
   'inv_P47_shares_border_with'): '0.05',
  ('P361_part_of', 'inv_P463_member_of'): '0.08333333333333333',
  ('inv_P17_country',
   'inv_P37_official_language',
   'inv_P1889_different_from'): '0.25',
  ('P47_shares_border_with',
   'P206_located_in_or_next_to_body_of_water',
   'inv_P206_located_in_or_next_to_body_of_water'): '0.5833333333333334',
  ('inv_P530_diplomatic_relation',
   'inv_P150_contains_administrative_territorial_entity',
   'P47_shares_border_with'): '0.05',
  ('inv_P527_has_part',
   'inv_P361_part_of',
   'inv_P47_shares_border_with'): '0.4',
  ('P38_currency', 'P17_country', 'P530_diplomatic_relation'): '0.05',
  ('inv_P30_continent', 'P530_diplomatic_relation', 'P30_continent'): '0.05',
  ('P131_located_in_the_administrative_territorial_entity',
   'P150_contains_administrative_territorial_entity',
   'P47_shares_border_with'): '0.45',
  ('inv_P915_filming_location',
   'P91

In [43]:
hit = 1
test_r = 'P161_cast_member'
rule = ('P57_director', 'inv_P57_director', 'P161_cast_member')
# The score stays 0 unless there is a hit and the rule is listed for the tested relation.
interpretability_score = k.get(test_r, {}).get(rule, 0) if hit == 1 else 0
interpretability_score

'0.2'

In [40]:
# Collect the relations of a path in a list, then show the hashable tuple form that is used
# as the key in the rule dictionary.
rule = []
rule.extend(['P57_director', 'inv_P57_director', 'P161_cast_member'])

tuple(rule)

('P57_director', 'inv_P57_director', 'P161_cast_member')

In [30]:
# Number of relations that have at least one rule in the parsed file.
len(k)

162

In [31]:
# Shallow size of the outer dict in bytes; the nested per-relation dicts are not included.
sys.getsizeof(k)

4696