<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Load-w2v" data-toc-modified-id="Load-w2v-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load w2v</a></span></li><li><span><a href="#Read-entity2id.txt-and-create-the-similarity-data-frame." data-toc-modified-id="Read-entity2id.txt-and-create-the-similarity-data-frame.-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read entity2id.txt and create the similarity data frame.</a></span></li><li><span><a href="#Append-the-similarity-links" data-toc-modified-id="Append-the-similarity-links-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Append the similarity links</a></span></li><li><span><a href="#Apply-AMIE-and-Evaluate" data-toc-modified-id="Apply-AMIE-and-Evaluate-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Apply AMIE and Evaluate</a></span></li></ul></div>

# Setup

This is the model from https://code.google.com/archive/p/word2vec/.  
You can find it under __Pre-trained entity vectors with Freebase naming__, with the following name/link:  
[freebase-vectors-skipgram1000.bin.gz](https://docs.google.com/file/d/0B7XkCwpI5KDYaDBDQm1tZGNDRHc/edit?usp=sharing)

Download the bin file and and put it in a folder named __`w2v_data`__

# Load w2v


----

In [64]:
import gensim.models as w2v       # Word2Vec Library
import pandas as pd
import sys
import numpy as np
import os
import datetime
from itertools import combinations
from shutil import copyfile

W2V_THR = .85 # Threshold for Cosine similarity of word vectors

In [2]:
wiki_w2v = w2v.KeyedVectors.load_word2vec_format('./w2v_data/freebase-vectors-skipgram1000.bin', binary=True)

In [8]:
all_voc = wiki_w2v.index2entity # List of all words in w2v. They are in /mid format.

In [62]:
print("Total words: " + str(len(all_voc)))
print("example: " + all_voc[0]) 

Total words: 1422903
example: /m/0dgps15


--------

# Read entity2id.txt and create the similarity data frame. 

First we read "/OpenKE/benchmarks/FB15K/entity2id.txt" into a data frame.

In [11]:
# Get all the entities in FB15K
ents = pd.read_csv("./OpenKE/benchmarks/FB15K/entity2id.txt",
                   sep = '\t',header=None, names=['mid'], 
                   skiprows=[0],usecols=[0]) # first row is total line

ents_list = list(ents['mid'])

In [23]:
# find the intersection of w2v and FB15K
new_list = []
for w in ents_list:
    if w in all_voc:
        new_list.append(w)

In [63]:
tot = len(new_list) # number of FB15K words that are also in w2v.
print(tot)

13910


In [65]:
# Calculate the similarities and filter by W2V_THR
h = []
t = []

for i,j in combinations(range(tot),2):
    w1 = new_list[i]
    w2 = new_list[j]
    if wiki_w2v.similarity(w1,w2) > W2V_THR:
        h.append(w1)
        t.append(w2)
    

KeyboardInterrupt: 

In [66]:
d = {'head':h , 'tail':t}
w2v_df = pd.DataFrame(data=d) 

In [67]:
w2v_df

Unnamed: 0,head,tail
0,/m/01cwm1,/m/02029f
1,/m/01cwm1,/m/01n7rc
2,/m/019v9k,/m/016t_3
3,/m/071wvh,/m/02qvhbb
4,/m/071wvh,/m/02ql_ms
5,/m/071wvh,/m/047s_cr
6,/m/071wvh,/m/02qnyr7
7,/m/0f0y8,/m/053yx
8,/m/0f0y8,/m/0lgm5
9,/m/027nb,/m/06s0l


# Append the similarity links

In [69]:
def append_train(input_df,new_name):
    """ Appends the input data frame to a copy of train.txt.
    
    input_df: --pd.DataFrame: has two columns 'head', and 'tail' containing
    the integer ids for heads and tails of similar tuples.
    new_name: --str: name of the new file will be train_{new_name}.txt 
    """
    import os
    import datetime
    new_name = new_name + str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    dest = './FB15K/train_'+ new_name + '.txt'
    while os.path.isfile(dest):
        new_name = input("File already exists. Give another name: ")
        dest = './FB15K/train_'+ new_name + '.txt'
    
    heads = list(input_df['head']) 
    tails = list(input_df['tail']) 
    rels_mid = ['/similar_to']*len(heads)


    d = {'head': heads , 'relation': rels_mid, 'tail':tails}
    df = pd.DataFrame(data=d)

    from shutil import copyfile
    copyfile('./FB15K/train.txt', dest)
    df.to_csv(dest, mode='a', header=False,index=False, sep='\t')
    return dest

In [70]:
train_sub = append_train(w2v_df, 'w2v')

In [71]:
train_sub

'./FB15K/train_w2v2020-05-13_15-26-37.txt'

----

# Apply AMIE and Evaluate

In [72]:
def clean_amie_output(path):
    """
    Warning: this function overwrites the file in path
    """
    with open(path, 'r') as f:
        f_contents = f.readlines()
        
    f_contents = f_contents[13:-3]

    with open(path, 'w') as f:
        f.writelines(f_contents)
        
    print('Rules at %s file cleaned.' % path)
    
def eval_frame(file, test_len):
    
    # Open file
    f = open(file)
    
    # Hits counter
    hits = 0
    
    # Loop though all facts in KB
    for x in range(test_len):

        # Read line
        fact = f.readline()
        fact = fact.split(' ')
        if fact != ['']:
            # Get target head and tail
            head_target = fact[0]
            tail_target = fact[2][:-1]


            # Get head predictions
            headpreds = f.readline()
            headpreds = headpreds.split(' ')
            headpreds = headpreds[1].split('\t')
            headpreds.pop()

            # Get tail predictions
            tailpreds = f.readline()
            tailpreds = tailpreds.split(' ')
            tailpreds = tailpreds[1].split('\t')
            tailpreds.pop()


            if (head_target in headpreds) and (tail_target in tailpreds):
                if (len(headpreds) < 10) and (len(tailpreds) < 10):
                    hits+=1
        else:
            print('miss')
                
    return hits/(test_len)

In [73]:
name = train_sub
name[13:]

'_w2v2020-05-13_15-26-37.txt'

In [74]:
train_add = "FB15K/train" +  name[13:] # From append module
rules_add = "rules/Enriched_rules" + name[13:] # modify this name if you like
eval_add = "evaluation/Enriched_eval" + name[13:] # same here

test_add = "FB15K/test.txt"
valid_add = "FB15K/valid.txt"

import subprocess
test_len = subprocess.run(['wc', '-l', test_add], 
                          stdout=subprocess.PIPE).stdout.decode('utf-8')
test_len = int(test_len.split()[0])
test_len

print("The enriched tr file: " + train_add)
print("Rules will be saved at: "+ rules_add)
print("And rule evaluations at: " + eval_add)

# The texts of the commands for running AMIE
AMIE_plus = ("java -XX:-UseGCOverheadLimit -Xmx4g -jar AMIE/amie_plus.jar "
"-minhc 0.0 -mins 0 -minis 0 " 
f"{train_add} > {rules_add}")

Apply_AMIE_RULES = (f'java -jar AMIE/ApplyAMIERules.jar {rules_add}' 
                    f' {train_add} {test_add} {valid_add}'
                    f' {eval_add}')

x = os.system(AMIE_plus)
print("\n AMIE_plus output: " + str(x))
    
# trim `Enriched_rules{}.txt` again
clean_amie_output(rules_add)

y = os.system(Apply_AMIE_RULES) # if output is 256 then you forgot to trim
print("\n Apply_AMIE_Rules output: " + str(y))
    
print('\n Hits@10: ' + str(eval_frame(eval_add, test_len)))
print("\n")

The enriched tr file: FB15k/train_w2v2020-05-13_15-26-37.txt
Rules will be saved at: rules/Enriched_rules_w2v2020-05-13_15-26-37.txt
And rule evaluations at: evaluation/Enriched_eval_w2v2020-05-13_15-26-37.txt

 AMIE_plus output: 33280
Rules at rules/Enriched_rules_w2v2020-05-13_15-26-37.txt file cleaned.

 Apply_AMIE_Rules output: 0

 Hits@10: 0.12390174535728192


