In [38]:
import pickle, json, requests, csv, copy
from json import JSONDecodeError
import numpy as np
import pprint as pp
import urllib.request, urllib.parse

In [39]:
# Relation vocabulary passed to the KG-vector functions; a relation's position in this
# list is what get_index_from_edges uses to pick a vector slot.
# A '/rr/' label is the reversed direction of the matching '/r/' relation (see
# process_edges); '/r/RelatedTo' is treated as bi-directional and has no reversed form.
rel_list = [
    '/r/IsA', '/rr/IsA',
    '/r/PartOf', '/rr/PartOf',
    '/r/AtLocation', '/rr/AtLocation',
    '/r/RelatedTo',
]

In [40]:
# ConceptNet concept URIs of the classes for which KG vectors are produced.
class_uri = [
    '/c/en/company',
    '/c/en/education',
    '/c/en/artist',
    '/c/en/athlete',
    '/c/en/officer',
    '/c/en/transport',
    '/c/en/building',
    '/c/en/nature',
    '/c/en/village',
    '/c/en/animal',
    '/c/en/plant',
    '/c/en/album',
    '/c/en/film',
    '/c/en/writing',
]

In [41]:
class Path:
    """A path through the knowledge graph, stored as parallel node/edge/weight lists.

    Attributes:
        path_uri: '$'-joined identifier of the form "node$rel$node$...$node".
        start: URI of the first node (None for an empty path).
        end: URI of the last node (None for an empty path).
        length: number of edges on the path.
        edges: relation labels, one per hop.
        weights: edge weights, parallel to ``edges``.
        nodes: visited node URIs in order (``len(edges) + 1`` entries once built).
    """

    def __init__(self): # Create an empty path
        self.path_uri = ''
        self.start = None
        self.end = None
        self.length = 0
        self.edges = []
        self.weights = []
        self.nodes = []

    def __repr__(self):
        return self.path_uri + ' (' + ','.join([str(x) for x in self.weights]) + ')'

    @staticmethod
    def create_unit_node_path(uri):
        """Return the zero-length path that starts and ends at ``uri``."""
        p = Path()
        p.path_uri = uri
        p.start = uri
        p.end = uri
        p.length = 0
        p.edges = []
        p.weights = []
        p.nodes = [uri]
        return p

    @staticmethod
    def create_path_from_edge(sub, rel, obj, weight):
        """Return the one-edge path ``sub --rel--> obj`` carrying ``weight``."""
        p = Path()
        p.path_uri = sub + '$' + rel + '$' + obj
        p.start = sub
        p.end = obj
        p.length = 1
        p.edges = [rel]
        p.weights = [weight]
        p.nodes = [sub, obj]
        return p

    @staticmethod
    def _clone(src):
        """Copy ``src`` so that the copy owns its own edge/weight/node lists."""
        dup = copy.copy(src)
        # copy.copy is shallow: without these the copy would share (and could
        # silently mutate) the lists of the path it was cloned from.
        dup.edges = list(src.edges)
        dup.weights = list(src.weights)
        dup.nodes = list(src.nodes)
        return dup

    @staticmethod
    def concatenate(pa, pb):
        """Return a new path made of ``pa`` followed by ``pb``.

        Raises:
            AssertionError: if ``pa`` does not end where ``pb`` starts.
        """
        assert pa.end == pb.start, "Cannot concatenate as the end of the first path is not the start of the second path"
        if pb.length == 0:
            return Path._clone(pa)
        if pa.length == 0:
            return Path._clone(pb)

        p = Path()
        # Drop pb's leading node (it is pa's last node) before gluing the URIs together.
        p.path_uri = pa.path_uri + pb.path_uri[pb.path_uri.find('$'):]
        p.start = pa.start
        p.end = pb.end
        p.length = pa.length + pb.length
        p.edges = list(pa.edges)
        p.edges.extend(pb.edges)
        p.weights = list(pa.weights)
        p.weights.extend(pb.weights)
        p.nodes = list(pa.nodes)
        p.nodes.extend(pb.nodes[1:])
        assert len(p.edges) + 1 == len(p.nodes), "Nodes and edges are inconsistent"

        return p

    def is_simple_path(self): # Visit each node only once
        return len(set(self.nodes)) == len(self.nodes)

    def form_single_path_with(self, p):
        """Return True when ``self`` followed by ``p`` would still be a simple path.

        Raises:
            AssertionError: if ``self`` does not end where ``p`` starts.
        """
        assert self.end == p.start
        if (not self.is_simple_path()) or (not p.is_simple_path()):
            return False
        nodes = list(self.nodes)
        # Fixed: this previously read the undefined name ``pb`` instead of the parameter ``p``.
        nodes.extend(p.nodes[1:])
        return len(set(nodes)) == len(nodes)

    def average_weight(self):
        """Return the geometric mean of the edge weights (1.0 for a path without edges)."""
        if not self.weights:
            return 1.0
        return Path.geo_mean(self.weights)

    @staticmethod
    def geo_mean(iterable):
        """Return the geometric mean of ``iterable``; the input must not be empty."""
        a = np.array(iterable)
        return a.prod()**(1.0/len(a))

In [10]:
# Memo cache used by get_all_paths_from, laid out as {start uri: {hops: [Path, ...]}}.
# NOTE: pickle.load can execute arbitrary code from the file; only load a cache that
# this notebook wrote itself.
try:
    with open("../wordEmbeddings/PATHS_FROM.pickle", "rb") as paths_file:
        PATHS_FROM = pickle.load(paths_file)
except FileNotFoundError:
    # No cache has been saved yet: start empty and let get_all_paths_from fill it in.
    PATHS_FROM = {}

In [42]:
def find_edges_of(uri, rel_list = None,
                  csv_path = '../wordEmbeddings/conceptnet-assertions-en-filter-5.6.0.csv'):
    """Scan the tab-separated assertions file for edges that touch ``uri``.

    A row matches when its subject (column 2) or object (column 3) is ``uri``
    itself or ``uri`` followed by '/...' (e.g. a word-sense suffix such as '/n').

    Args:
        uri: concept URI to look for, e.g. '/c/en/company'.
        rel_list: optional collection of relation labels; rows whose relation
            (column 1) is not in it are skipped. ``None`` keeps every relation.
        csv_path: location of the assertions file; defaults to the file this
            notebook has always read.

    Returns:
        A list of dicts with keys 'sub', 'rel', 'obj' and 'weight', in file order.
        The weight is read from the JSON object in column 4.
    """
    sense_prefix = uri + '/'

    def touches(term):
        # Prefix test, not a substring test: a term that merely contains
        # "<uri>/" somewhere in the middle belongs to a different concept.
        return term == uri or term.startswith(sense_prefix)

    edges = []
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(csv_path, 'r', encoding = "utf8", newline = '') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for line in reader:
            if not (touches(line[2]) or touches(line[3])):
                continue
            if rel_list is not None and line[1] not in rel_list:
                continue
            details = json.loads(line[4])
            edges.append({'sub': line[2],
                          'rel': line[1],
                          'obj': line[3],
                          'weight': details['weight']})
    return edges

In [43]:
def get_neighbors(uri, rel_list = None):
    """Return the neighbours of ``uri`` as ``{neighbour uri: {relation: weight}}``.

    The edges come from find_edges_of (optionally restricted to ``rel_list``)
    and are folded into a fresh dict by process_edges.
    """
    return process_edges({}, uri, find_edges_of(uri, rel_list))

In [44]:
def remove_word_sense(sub):
    """Strip a trailing fourth path segment (e.g. '/n') from a concept URI.

    URIs with at most three slashes are returned unchanged. A URI with more
    than four slashes is printed and rejected with an AssertionError.
    """
    slash_total = sub.count('/')
    if slash_total <= 3:
        return sub
    if slash_total > 4:
        print(sub)
        assert False, "URI error (with more than 4 slashes)"
    # Keep everything before the last slash.
    trimmed, _, _ = sub.rpartition('/')
    return trimmed

In [45]:
def process_edges(a_dict, uri, edge_data):
    """Fold ``edge_data`` into ``a_dict`` as ``{neighbour: {relation: weight}}``.

    Each edge is a dict with keys 'sub', 'rel', 'obj' and 'weight'. Word-sense
    suffixes are removed from both ends first. When ``uri`` is the object of the
    edge, the relation is recorded in its reversed '/rr/' form, except for
    '/r/RelatedTo', which keeps its label. If the same neighbour/relation pair
    occurs more than once, the largest weight is kept.

    ``a_dict`` is updated in place and also returned. An edge whose ends are
    both different from ``uri`` is printed and rejected with an AssertionError.
    """
    for edge in edge_data:
        head = remove_word_sense(edge['sub'])
        label = edge['rel']
        tail = remove_word_sense(edge['obj'])
        strength = edge['weight']

        if head == uri:
            neighbour = tail
        elif tail == uri:
            neighbour = head
            if label != '/r/RelatedTo': # Bi-directional
                label = label.replace('/r/', '/rr/', 1)
        else:
            print(edge)
            assert False, "This edge does not belong to the given uri"
            continue  # only reachable when assertions are disabled

        known = a_dict.setdefault(neighbour, {})
        known[label] = max(known[label], strength) if label in known else strength
    return a_dict

In [46]:
def get_all_paths_from(uri, hops):
    """Return every path of exactly ``hops`` edges starting at ``uri``, memoised in PATHS_FROM.

    * hops == 0: the single zero-length path at ``uri``.
    * hops == 1: one path per (neighbour, relation) pair reported by get_neighbors.
    * hops > 1: each (hops - 1)-edge path extended by every one-edge path from its
      end node, provided Path.form_single_path_with accepts the combination.

    Results are stored in ``PATHS_FROM[uri][hops]``; a progress line is printed
    whenever a new entry is computed. Raises AssertionError for negative ``hops``.
    """
    global PATHS_FROM
    assert hops >= 0, "Invalid hops (less than 0)"

    per_uri = PATHS_FROM.setdefault(uri, {})
    if hops in per_uri:
        return per_uri[hops]

    if hops == 0:
        found = [Path.create_unit_node_path(uri)]
    elif hops == 1:
        found = [Path.create_path_from_edge(uri, relation, target, weight)
                 for target, relations in get_neighbors(uri).items()
                 for relation, weight in relations.items()]
    else: # hops > 1
        found = []
        for prefix in get_all_paths_from(uri, hops - 1):
            for step in get_all_paths_from(prefix.end, 1):
                if prefix.form_single_path_with(step):
                    found.append(Path.concatenate(prefix, step))

    per_uri[hops] = found
    print('Memorise all paths from %s with hops = %d, number of paths = %d' % (uri, hops, len(per_uri[hops])))

    return per_uri[hops]

In [55]:
def get_kg_vectors_for_a_class(uri, max_hops, rel_list):
    """Build one vector per node reachable from ``uri`` within ``max_hops`` edges.

    Args:
        uri: concept URI of the class every path starts from.
        max_hops: maximum number of edges on a path.
        rel_list: ordered relation vocabulary; it determines the vector length
            and, through get_index_from_edges, the slot of each relation sequence.

    Returns:
        ``{end node uri: numpy vector}``. For every path from ``uri`` to the end
        node, the path's geometric-mean weight is added to the slot chosen by
        get_index_from_edges; the zero-length path adds 1 to the last slot.

    Raises:
        AssertionError: if a path returned by get_all_paths_from does not start at ``uri``.
    """
    total_rel = len(rel_list)
    # 1 + R + R^2 + ... + R^max_hops, computed exactly with integers. The closed
    # form (R^(h+1) - 1) / (R - 1) divides by zero when R == 1 and goes through floats.
    vector_size = sum(total_rel ** i for i in range(max_hops + 1))

    paths_by_end = {}
    for hops in range(max_hops + 1):
        for p in get_all_paths_from(uri, hops):
            assert p.start == uri, "The start node is not the given class"
            paths_by_end.setdefault(p.end, []).append(p)

    vectors = {}
    for end, paths in paths_by_end.items():
        v = np.zeros(vector_size)
        for p in paths:
            if not p.edges:
                v[-1] += 1  # the class itself occupies the last slot
            else:
                v[get_index_from_edges(p.edges, max_hops, rel_list)] += p.average_weight()
        vectors[end] = v

    return vectors

In [48]:
def get_index_from_edges(edges, max_hops, rel_list):
    """Map a sequence of relation labels to its slot in a KG vector.

    With R = len(rel_list) the vector is laid out as consecutive blocks:
    R slots for one-edge paths, R**2 slots for two-edge paths, ... up to
    ``max_hops`` edges, followed by one final slot for the empty sequence.
    Inside a block the sequence is read as a base-R number whose digits are the
    positions of the relations in ``rel_list``.

    Previously no block offset was added, so sequences of different lengths
    shared slots whenever ``max_hops`` was 2 or more. Indices of one-edge
    paths, and therefore all results for ``max_hops == 1``, are unchanged.

    Raises:
        AssertionError: if ``edges`` is longer than ``max_hops`` or contains a
            relation that is not in ``rel_list``.
    """
    assert len(edges) <= max_hops, "The path is longer than the given max_hops"
    total_rel = len(rel_list)

    if len(edges) == 0:
        return sum(total_rel ** i for i in range(1, max_hops + 1)) # The last index refers to itself

    # Skip the blocks that belong to all shorter sequences.
    offset = sum(total_rel ** i for i in range(1, len(edges)))

    within_block = 0
    for e in edges:
        assert e in rel_list, "Found an unsupported relation " + e
        within_block = within_block * total_rel + rel_list.index(e)
    return offset + within_block

In [59]:
def get_kg_vectors_for_classes(class_uri, max_hops, rel_list):
    """Run get_kg_vectors_for_a_class for every URI in ``class_uri``.

    Returns ``{class uri: result of get_kg_vectors_for_a_class}`` and prints a
    progress line after each class is finished.
    """
    vectors_by_class = {}
    for current_class in class_uri:
        vectors_by_class[current_class] = get_kg_vectors_for_a_class(current_class, max_hops, rel_list)
        print('Finish producing vectors of class', current_class, 'Max hops', max_hops)
    return vectors_by_class

In [60]:
# Build the one-hop KG vectors for every class and save them to disk.
KG_VECTORS_1 = get_kg_vectors_for_classes(class_uri, max_hops = 1, rel_list = rel_list)
# The context manager flushes and closes the file; a bare open() left the handle dangling.
with open("../wordEmbeddings/KG_VECTORS_1.pickle", "wb") as kg_vectors_file:
    pickle.dump(KG_VECTORS_1, kg_vectors_file)

Finish producing vectors of class /c/en/company Max hops 1
Memorise all paths from /c/en/education with hops = 0, number of paths = 1
Memorise all paths from /c/en/education with hops = 1, number of paths = 283
Finish producing vectors of class /c/en/education Max hops 1
Memorise all paths from /c/en/artist with hops = 0, number of paths = 1
Memorise all paths from /c/en/artist with hops = 1, number of paths = 216
Finish producing vectors of class /c/en/artist Max hops 1
Memorise all paths from /c/en/athlete with hops = 0, number of paths = 1
Memorise all paths from /c/en/athlete with hops = 1, number of paths = 190
Finish producing vectors of class /c/en/athlete Max hops 1
Memorise all paths from /c/en/officer with hops = 0, number of paths = 1
Memorise all paths from /c/en/officer with hops = 1, number of paths = 352
Finish producing vectors of class /c/en/officer Max hops 1
Memorise all paths from /c/en/transport with hops = 0, number of paths = 1
Memorise all paths from /c/en/trans

In [29]:
# Save the path cache so later sessions can reuse it.
# The context manager flushes and closes the file; a bare open() left the handle dangling.
with open("../wordEmbeddings/PATHS_FROM.pickle", "wb") as paths_cache_file:
    pickle.dump(PATHS_FROM, paths_cache_file)

In [63]:
# Disabled spot check: the code below is wrapped in a string literal, so it is not executed.
# When enabled it pretty-prints the one-hop KG vectors of '/c/en/company'.
"""
v = get_kg_vectors_for_a_class('/c/en/company', 1, rel_list)
pp.pprint(v)
"""

{'/c/en/3m': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/abandonware': array([0., 0., 0., 0., 0., 0., 1., 0.]),
 '/c/en/abb': array([0., 1., 0., 0., 0., 0., 0., 0.]),
 '/c/en/abbreviate': array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.216, 0.   ]),
 '/c/en/abbreviate_co': array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.216, 0.   ]),
 '/c/en/abbreviated': array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.73, 0.  ]),
 '/c/en/abbreviated_co': array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.73, 0.  ]),
 '/c/en/abc': array([0., 0., 0., 0., 0., 0., 1., 0.]),
 '/c/en/acc': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/access': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/accolade': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/accompany': array([0., 0., 0., 0., 0., 0., 1., 0.]),
 '/c/en/acquire': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/acquiree': array([0., 0., 0., 0., 0., 0., 1., 0.]),
 '/c/en/adidas': array([0. , 0.5, 0. , 0. , 

 '/c/en/golden_parachute': array([0., 0., 0., 0., 0., 0., 1., 0.]),
 '/c/en/goliath': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/good': array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.15, 0.  ]),
 '/c/en/good_to_go': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/goods': array([0., 0., 0., 0., 0., 0., 1., 0.]),
 '/c/en/google': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/gravity': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/group': array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 2.615, 0.   ]),
 '/c/en/group_people': array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.318, 0.   ]),
 '/c/en/gucci': array([0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ]),
 '/c/en/guest': array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.278, 0.   ]),
 '/c/en/guests': array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.806, 0.   ]),
 '/c/en/guests_visitors': array([0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. ]),
 '/c/en/guinea_pig_director': array([0., 0., 0., 0., 0., 0., 

In [8]:
# Disabled spot check: the code below is wrapped in a string literal, so it is not executed.
# When enabled it pretty-prints the neighbour dict of '/c/en/company'.
"""
pp.pprint(get_neighbors('/c/en/company'))
"""

{'/c/en/3m': {'/rr/IsA': 0.5},
 '/c/en/abandonware': {'/r/RelatedTo': 1.0},
 '/c/en/abb': {'/rr/IsA': 1.0},
 '/c/en/abbreviate': {'/r/RelatedTo': 0.216},
 '/c/en/abbreviate_co': {'/r/RelatedTo': 0.216},
 '/c/en/abbreviated': {'/r/RelatedTo': 0.73},
 '/c/en/abbreviated_co': {'/r/RelatedTo': 0.73},
 '/c/en/abc': {'/r/RelatedTo': 1.0},
 '/c/en/acc': {'/rr/IsA': 0.5},
 '/c/en/access': {'/rr/IsA': 0.5},
 '/c/en/accolade': {'/rr/IsA': 0.5},
 '/c/en/accompany': {'/r/RelatedTo': 1.0},
 '/c/en/acquire': {'/rr/IsA': 0.5},
 '/c/en/acquiree': {'/r/RelatedTo': 1.0},
 '/c/en/adidas': {'/rr/IsA': 0.5},
 '/c/en/administration': {'/r/RelatedTo': 1.0},
 '/c/en/aeon': {'/rr/IsA': 0.5},
 '/c/en/aeros': {'/rr/IsA': 0.5},
 '/c/en/aerostar': {'/rr/IsA': 0.5},
 '/c/en/aetna': {'/rr/IsA': 0.5},
 '/c/en/agco': {'/rr/IsA': 0.5},
 '/c/en/agency': {'/r/RelatedTo': 0.25},
 '/c/en/agent_provocateur': {'/rr/IsA': 0.5},
 '/c/en/ahold': {'/rr/IsA': 0.5},
 '/c/en/airbud': {'/rr/IsA': 1.0},
 '/c/en/airbus': {'/rr/IsA': 1

In [78]:
# Disabled spot check: the code below is wrapped in a string literal, so it is not executed.
# When enabled it pretty-prints the raw edge list of '/c/en/company'.
"""
pp.pprint(find_edges_of('/c/en/company'))
"""

[{'obj': '/c/en/armed_forces',
  'rel': '/r/AtLocation',
  'sub': '/c/en/company',
  'weight': 1.0},
 {'obj': '/c/en/city',
  'rel': '/r/AtLocation',
  'sub': '/c/en/company',
  'weight': 1.0},
 {'obj': '/c/en/country',
  'rel': '/r/AtLocation',
  'sub': '/c/en/company',
  'weight': 1.0},
 {'obj': '/c/en/market_place',
  'rel': '/r/AtLocation',
  'sub': '/c/en/company',
  'weight': 1.0},
 {'obj': '/c/en/phone_book',
  'rel': '/r/AtLocation',
  'sub': '/c/en/company',
  'weight': 1.0},
 {'obj': '/c/en/yellow_pages',
  'rel': '/r/AtLocation',
  'sub': '/c/en/company',
  'weight': 2.0},
 {'obj': '/c/en/company',
  'rel': '/r/AtLocation',
  'sub': '/c/en/connection',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/AtLocation',
  'sub': '/c/en/employees',
  'weight': 3.464},
 {'obj': '/c/en/company',
  'rel': '/r/AtLocation',
  'sub': '/c/en/humans',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/AtLocation',
  'sub': '/c/en/leader',
  'weight': 1.0},
 {'obj': '/c/en/comp

  'sub': '/c/en/jive/n',
  'weight': 0.5},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/c/en/jockey_club',
  'weight': 0.5},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/c/en/john_deere',
  'weight': 0.5},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/c/en/joint_stock_company/n',
  'weight': 2.0},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/c/en/judd/n',
  'weight': 0.5},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/c/en/junkers',
  'weight': 0.5},
 {'obj': '/c/en/company/n', 'rel': '/r/IsA', 'sub': '/c/en/jvc', 'weight': 0.5},
 {'obj': '/c/en/company/n', 'rel': '/r/IsA', 'sub': '/c/en/k_1', 'weight': 0.5},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/c/en/k_line',
  'weight': 0.5},
 {'obj': '/c/en/company/n', 'rel': '/r/IsA', 'sub': '/c/en/k_s', 'weight': 0.5},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/c/en/kappa/n',
  'weight': 0.5},
 {'obj': '/c/en/company/n',
  'rel': '/r/IsA',
  'sub': '/

  'sub': '/c/en/board',
  'weight': 0.437},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/board_meeting/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/board_member/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/body/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/body_corporate/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/booly/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/boss',
  'weight': 0.102},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/boss/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/brand_avatar/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/brandwidth/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/b

 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/intalk/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/intercompany/a',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/interoffice/a',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/intracompany/a',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/intrapreneur/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/introduce',
  'weight': 0.148},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/invoice',
  'weight': 0.101},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/ipcc',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/issue/n',
  'weight': 1.0},
 {'obj': '/c/en/company',
  'rel': '/r/RelatedTo',
  'sub': '/c/en/jeep',
  'weight': 0.227},
 {'obj': '/c/en/comp

In [18]:
# Disabled smoke test for Path: the code below is wrapped in a string literal, so it is not executed.
# It builds two one-edge paths that meet at '/c/en/company', concatenates them if they
# form a simple path, and prints the result.
"""
pa = Path.create_path_from_edge('/c/en/beavertails', '/r/IsA', '/c/en/company', 0.5)
pb = Path.create_path_from_edge('/c/en/company', '/r/AtLocation', '/c/en/city', 1.0)
print(pa)
print(pb)
if pa.form_single_path_with(pb):
    pc = Path.concatenate(pa, pb)
    print(pc)
    print(pc.is_simple_path(), pc.average_weight())
    print(pc.start, pc.end, pc.length)
"""

/c/en/beavertails$/r/IsA$/c/en/company (0.5)
/c/en/company$/r/AtLocation$/c/en/city (1.0)
/c/en/beavertails$/r/IsA$/c/en/company$/r/AtLocation$/c/en/city (0.5,1.0)
True 0.7071067811865476
/c/en/beavertails /c/en/city 2


In [2]:
# Disabled configuration: the code below is wrapped in a string literal, so it is not executed.
# It holds a class list and `rel_uri`, the relation list that the disabled
# API-based get_neighbors further down iterates over.
"""
class_uri = ['/c/en/company',
            '/c/en/education',
            '/c/en/artist',
            '/c/en/athlete',
            '/c/en/officer',
            '/c/en/transport',
            '/c/en/building',
            '/c/en/nature',
            '/c/en/village',
            '/c/en/animal',
            '/c/en/plant',
            '/c/en/album',
            '/c/en/film',
            '/c/en/writing']
rel_uri = ['/r/IsA', '/r/PartOf', '/r/AtLocation', '/r/RelatedTo']
"""

In [55]:
# Disabled web-API edge loader: the code below is wrapped in a string literal, so it is not executed.
# add_edges_of queries api.conceptnet.io for edges of `uri`, follows the 'nextPage'
# links of the response, and folds every page into `edges` through process_edges.
# NOTE(review) before re-enabling:
#   - it calls sys.exit, but `sys` is not imported in the visible import cell;
#   - `edges = {}` is a mutable default argument, so results would accumulate across calls.
"""
def add_edges_of(uri, edges = {}, rel = None):
    url_string = 'http://api.conceptnet.io/query?node=' + uri + '&other=/c/en'
    if rel is not None:
        url_string += '&rel=' + rel
    try:
        r = requests.get(url_string)
        json_data = r.json()
    except JSONDecodeError:
        print(r.text)
        print('Cannot decode the json')
        sys.exit(0)
    edges = process_edges(edges, uri, json_data['edges'])
    next_page = 'http://api.conceptnet.io' + json_data['view']['nextPage'] if 'view' in json_data else None # whether it has next page
    while next_page is not None:
        try:
            r = requests.get(next_page)
            json_data = r.json()
        except JSONDecodeError:
            print(r.text)
            print('Cannot decode the json')
            sys.exit(0)
        edges = process_edges(edges, uri, json_data['edges'])
        if 'view' in json_data and 'nextPage' in json_data['view']:
            next_page = 'http://api.conceptnet.io' + json_data['view']['nextPage']  
        else:
            next_page = None # whether it has next page
    return edges
"""

In [64]:
# Disabled variant of process_edges: the code below is wrapped in a string literal, so it is not executed.
# It reads each edge through e['start']['term'], e['rel']['@id'] and e['end']['term']
# (presumably the JSON shape returned by the web API; confirm before reuse) instead of
# the flat 'sub'/'rel'/'obj' keys. It has the same name as the live process_edges, so
# re-enabling it in a later cell would shadow that function.
"""
def process_edges(a_dict, uri, edge_data):
    for e in edge_data:
        sub = e['start']['term']
        rel = e['rel']['@id']
        obj = e['end']['term']
        weight = e['weight']
        if sub == uri:
            if obj in a_dict:
                if rel in a_dict[obj]:
                    a_dict[obj][rel] = max(a_dict[obj][rel], weight)
                else:
                    a_dict[obj][rel] = weight
            else:
                a_dict[obj] = { rel:weight }
        elif obj == uri:
            if rel != '/r/RelatedTo': # Bi-directional
                rel = rel.replace('/r/', '/rr/', 1)
            if sub in a_dict:
                if rel in a_dict[sub]:
                    a_dict[sub][rel] = max(a_dict[sub][rel], weight)
                else:
                    a_dict[sub][rel] = weight
            else:
                a_dict[sub] = { rel:weight }
        else:
            print(e)
            assert False, "This edge does not belong to the given uri"
    return a_dict  
"""

In [72]:
# Disabled variant of get_neighbors: the code below is wrapped in a string literal, so it is not executed.
# It collects neighbours one relation at a time through add_edges_of, iterating over
# `rel_uri`. It has the same name as the live get_neighbors, so re-enabling it in a
# later cell would shadow that function.
"""
def get_neighbors(uri):
    neighbors = {}
    for rel in rel_uri:
        neighbors = add_edges_of(uri, edges = neighbors, rel = rel)
        print('Finish running', uri, rel)
    return neighbors
"""