In [1]:
## Add parent directory to system path so we can add modules from there
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder#11158224
import os,sys,inspect
# A notebook cell has no real source file behind it: inspect.getfile() on the
# current frame yields a pseudo-name on older IPython and a kernel temp-file
# path on ipykernel >= 6, so deriving the directory from it can end up pointing
# at the temp directory instead of the project. The working directory is used
# instead.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder (the Jupyter default) -- confirm if the kernel is launched differently.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Prepend so modules in the parent directory win over same-named installed ones.
sys.path.insert(0, parentdir)

In [2]:
## Auto reload changed modules
# IPython magics (not plain Python): load the autoreload extension and use
# mode 2, which re-imports modules before each cell runs so edits to local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import time

from evaluation import reading, evaluation

In [4]:
def is_negative(head, tail, cluster, all_pairs):
    """Decide whether ``(head, tail)`` counts as a negative pair.

    A pair that is contained in ``all_pairs`` is never negative. Any other
    pair is negative exactly when ``cluster[head]`` differs from
    ``cluster[tail]``.

    Args:
        head: First element of the pair; used as an index into ``cluster``.
        tail: Second element of the pair; used as an index into ``cluster``.
        cluster: Indexable object giving a cluster label per element.
        all_pairs: Container of ``(head, tail)`` tuples that supports ``in``.

    Returns:
        ``False`` for pairs found in ``all_pairs`` (``cluster`` is not
        consulted in that case); otherwise the result of comparing the two
        cluster labels with ``!=``.
    """
    already_known = (head, tail) in all_pairs
    # Short-circuit keeps the cluster lookup from running for known pairs.
    return not already_known and cluster[head] != cluster[tail]

In [6]:
# Load the two benchmark datasets through the project's `reading` module.
# NOTE(review): each result is later unpacked with `*` into `Evaluation(...)`
# and indexed as `fb15k_data[1]`, so it is presumably a tuple/list -- confirm
# against `reading.read_dataset`.
fb15k_data = reading.read_dataset('fb15k')
wn_data = reading.read_dataset('wn')

In [None]:
# Build one evaluator per dataset: the loaded data is unpacked into positional
# arguments and the notebook's `is_negative` function is supplied through the
# `is_negative` keyword.
fb15k = evaluation.Evaluation(*fb15k_data, is_negative=is_negative)
wn = evaluation.Evaluation(*wn_data, is_negative=is_negative)

In [None]:
# hits_n(10) for both evaluators, displayed as an (FB15K, WN) tuple.
tuple(evaluator.hits_n(10) for evaluator in (fb15k, wn))

In [None]:
# Threshold grid 0.00 .. 0.09 in steps of 0.01; also reused by later cells.
thresholds = np.arange(0, 0.1, 0.01)
# One hits_threshold() result per grid point, per dataset.
precision_fb = [fb15k.hits_threshold(t) for t in thresholds]
precision_wn = [wn.hits_threshold(t) for t in thresholds]

In [None]:
# Precision against threshold for both datasets, drawn on the current axes.
ax = plt.gca()
ax.plot(thresholds, precision_fb, color='red', label="FastText/FB15K")
ax.plot(thresholds, precision_wn, color='orange', label="FastText/WN")
ax.legend()
ax.set_xlabel("Threshold")
ax.set_ylabel("Precision")

In [None]:
# Sweep the threshold grid on FB15K with hits_neg_threshold(), recording the
# returned precision, the returned count value and the time each call took.
precisions_fb = []
precisions_wn = []  # initialised for symmetry; nothing in this cell fills it
counts = []
times = []

for t in thresholds:
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # high-resolution timer intended for measuring short durations.
    begin = time.perf_counter()
    precision, count = fb15k.hits_neg_threshold(t)
    end = time.perf_counter()
    precisions_fb.append(precision)
    # NOTE(review): a later cell reduces `counts` with axis=1, so each `count`
    # is presumably a sequence rather than a scalar -- confirm in `evaluation`.
    counts.append(count)
    times.append(end - begin)

In [None]:
# Average duration (seconds) of one timed call in the sweep above.
np.asarray(times).mean()

In [None]:
# Number of leading grid points to show; also read by the following plot cell.
dims = 10
ax = plt.gca()
ax.plot(thresholds[:dims], precisions_fb[:dims], color='red', label="FastText/FB15K")
ax.set_xlabel("Threshold")
ax.set_ylabel("Precision")
ax.legend()

In [None]:
# Skip the first grid point; the upper bound `dims` comes from the previous cell.
start = 1
ax = plt.gca()
# One curve per reduction of `counts` along axis 1, in the original draw order.
for reducer, color, label in (
    (np.min, 'yellow', "Minimum"),
    (np.median, 'orange', "Median"),
    (np.mean, 'green', "Mean"),
    (np.max, 'red', "Maximum"),
):
    ax.plot(thresholds[start:dims], reducer(counts, axis=1)[start:dims], color=color, label=label)
ax.legend()
ax.set_xlabel("Threshold")
ax.set_ylabel("Number of Predictions")

In [None]:
# Same sweep as before, but with hits_group(); the result lists are reset first
# so values from the previous sweep are discarded.
precisions_fb = []
counts = []
times = []

for t in thresholds:
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # high-resolution timer intended for measuring short durations.
    begin = time.perf_counter()
    precision, count = fb15k.hits_group(t)
    end = time.perf_counter()
    precisions_fb.append(precision)
    counts.append(count)
    times.append(end - begin)

In [None]:
# Average duration (seconds) of one hits_group() call in the sweep above.
np.asarray(times).mean()

In [None]:
# Best precision reached over the threshold grid.
np.asarray(precisions_fb).max()

In [None]:
# Precision of the grouped evaluation against threshold.
ax = plt.gca()
ax.plot(thresholds, precisions_fb, color='red', label="FastText/FB15K")
ax.set_xlabel("Threshold")
ax.set_ylabel("Precision")
ax.legend()

In [None]:
# Single hits_group() run at threshold 0.011. This rebinds `precision` and
# `counts`, replacing the per-threshold list that `counts` held before.
# NOTE(review): the meaning of the second positional argument (False) is not
# visible in this notebook -- check `Evaluation.hits_group`.
precision, counts = fb15k.hits_group(0.011, False)

In [None]:
# Counts returned by the single hits_group() run, plotted against their index.
ax = plt.gca()
ax.plot(counts, label="FastText/FB15K")
ax.legend()

In [None]:
## Benchmark with approx. maximum
# Time hits_group() and hits_neg_threshold() back to back, `iterations` times
# each, at one fixed threshold.
iterations = 10
# NOTE(review): an earlier cell probes hits_group at 0.011 and the sweep grid
# has step 0.01 -- confirm that 0.0011 (not 0.011) is the intended value.
threshold = 0.0011

times_group = []
times_neg = []

for i in range(iterations):
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    # `begin` is used for the timestamp so the plotting slice index `start`
    # defined in an earlier cell is not overwritten with a float.
    begin = time.perf_counter()
    fb15k.hits_group(threshold)
    end = time.perf_counter()
    times_group.append(end - begin)

    begin = time.perf_counter()
    fb15k.hits_neg_threshold(threshold)
    end = time.perf_counter()
    times_neg.append(end - begin)

print("Grouped:", np.mean(times_group), "Only Negatives:", np.mean(times_neg))

In [None]:
# Per-iteration run time of the two benchmarked methods.
plt.plot(times_group, color='green', label="FB15K/Hits_group")
plt.plot(times_neg, color='orange', label="FB15K/Hits_neg")
plt.xlabel("Iteration")
plt.ylabel("Time (s)")
# The legend must be attached before show(): once the figure has been
# rendered, a later legend() call no longer affects what was displayed.
plt.legend()
plt.show()

In [None]:
# TODO:
# Select negative samples from graph.
# Approaches:
# 1. Cluster the graph and select node pairs from two very different clusters – these should not appear in the data
# 2. Find nodes that never have an edge of some type

In [None]:
# Summary statistics (median, mean, max, min) over the first `top_n` columns
# of the second element of the FB15K data.
top_n = 10
top_prob = fb15k_data[1][:, :top_n]
tuple(statistic(top_prob) for statistic in (np.median, np.mean, np.max, np.min))

In [None]:
# Size of the second axis of fb15k_data[1], i.e. how many columns are
# available to the `[:, :top_n]` slice taken above.
fb15k_data[1].shape[1]

In [None]:
# Per-column statistics of fb15k_data[1] (reduced over axis 0) for the first
# `dims` columns. Each curve is labelled so the figure can be read on its own.
dims = 15
plt.plot(np.arange(dims), np.median(fb15k_data[1], axis=0)[:dims], color='blue', label="Median")
plt.plot(np.arange(dims), np.mean(fb15k_data[1], axis=0)[:dims], color='orange', label="Mean")
plt.plot(np.arange(dims), np.max(fb15k_data[1], axis=0)[:dims], color='red', label="Maximum")
plt.plot(np.arange(dims), np.min(fb15k_data[1], axis=0)[:dims], color='green', label="Minimum")
plt.xlabel("Column index")
# NOTE(review): the values are called probabilities elsewhere in this notebook
# (`top_prob`) -- confirm against the reader.
plt.ylabel("Probability")
plt.legend()
plt.show()

In [None]:
# Median over the top-n columns for every row, plotted against the row index.
x = np.arange(len(top_prob))
ax = plt.gca()
ax.plot(x, np.median(top_prob, axis=1), color='blue')
# The per-row mean (orange), max (red) and min (green) can be overlaid the
# same way with np.mean / np.max / np.min; they are left disabled here.
plt.show()