In [1]:
import pickle, json, requests, csv, copy
from json import JSONDecodeError
import numpy as np
import pprint as pp
import urllib.request, urllib.parse

In [2]:
rel_list = ['/r/IsA', '/rr/IsA', '/r/PartOf', '/rr/PartOf', '/r/AtLocation', '/rr/AtLocation', '/r/RelatedTo']

In [3]:
class_uri = ['/c/en/company',
            '/c/en/education',
            '/c/en/artist',
            '/c/en/athlete',
            '/c/en/officer',
            '/c/en/transport',
            '/c/en/building',
            '/c/en/nature',
            '/c/en/village',
            '/c/en/animal',
            '/c/en/plant',
            '/c/en/album',
            '/c/en/film',
            '/c/en/writing']

In [4]:
class Path:
    
    def __init__(self): # Create an empty path
        self.path_uri = ''
        self.start = None
        self.end = None
        self.length = 0
        self.edges = []
        self.weights = []
        self.nodes = []
        
    def __repr__(self):
        return self.path_uri + ' (' + ','.join([str(x) for x in self.weights]) + ')'
        
    def create_unit_node_path(uri):
        p = Path()
        p.path_uri = uri
        p.start = uri
        p.end = uri
        p.length = 0
        p.edges = []
        p.weights = []
        p.nodes = [uri]
        return p
    
    def create_path_from_edge(sub, rel, obj, weight):
        p = Path()
        p.path_uri = sub + '$' + rel + '$' + obj
        p.start = sub
        p.end = obj
        p.length = 1
        p.edges = [rel]
        p.weights = [weight]
        p.nodes = [sub, obj]
        return p
    
    def concatenate(pa, pb):
        assert pa.end == pb.start, "Cannot concatenate as the end of the first path is not the start of the second path"
        if pb.length == 0:
            return copy.copy(pa)
        if pa.length == 0:
            return copy.copy(pb)
        
        p = Path()
        p.path_uri = pa.path_uri + pb.path_uri[pb.path_uri.find('$'):]
        p.start = pa.start
        p.end = pb.end
        p.length = pa.length + pb.length
        p.edges = list(pa.edges)
        p.edges.extend(pb.edges)
        p.weights = list(pa.weights)
        p.weights.extend(pb.weights)
        p.nodes = list(pa.nodes)
        p.nodes.extend(pb.nodes[1:])
        assert len(p.edges) + 1 == len(p.nodes), "Nodes and edges are inconsistent"
        
        return p
    
    def is_simple_path(self): # Visit each node only once
        return len(set(self.nodes)) == len(self.nodes)
    
    def form_single_path_with(self, p):
        assert self.end == p.start
        if (not self.is_simple_path()) or (not p.is_simple_path()):
            return False
        nodes = list(self.nodes)
        nodes.extend(p.nodes[1:])
        return len(set(nodes)) == len(nodes) 
    
    def average_weight(self):
        if self.weights == []:
            return 1.0
        return Path.geo_mean(self.weights)
    
    def geo_mean(iterable):
        a = np.array(iterable)
        return a.prod()**(1.0/len(a))

In [5]:
PATHS_FROM = {}

In [5]:
PATHS_FROM = pickle.load(open("../wordEmbeddings/PATHS_FROM_01.pickle", "rb"))

In [6]:
def find_edges_of(uri, rel_list = None):
    edges = []
    with open('../wordEmbeddings/conceptnet-assertions-en-filter-5.6.0.csv', 'r', encoding = "utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for line in reader:
            if uri == line[2] or uri + '/' in line[2] or uri == line[3] or uri + '/' in line[3]:
                if rel_list is None or line[1] in rel_list:
                    details = json.loads(line[4])
                    w = details['weight']
                    edges.append({'sub': line[2],
                                 'rel': line[1],
                                 'obj': line[3],
                                 'weight': w})
    return edges

In [7]:
def get_neighbors(uri, rel_list = None):
    neighbors = {}
    edge_list = find_edges_of(uri, rel_list)
    neighbors = process_edges(neighbors, uri, edge_list)
    return neighbors

In [8]:
def remove_word_sense(sub):
    if sub.count('/') > 3:
        if sub.count('/') > 4:
            print(sub)
            assert False, "URI error (with more than 4 slashes)"
        sub = sub[:sub.rfind('/')]
    return sub

In [9]:
def process_edges(a_dict, uri, edge_data):
    for e in edge_data:
        sub = remove_word_sense(e['sub'])
        rel = e['rel']
        obj = remove_word_sense(e['obj'])
        weight = e['weight']
        if weight < 1:
            continue
        if sub == uri:
            if obj in a_dict:
                if rel in a_dict[obj]:
                    a_dict[obj][rel] = max(a_dict[obj][rel], weight)
                else:
                    a_dict[obj][rel] = weight
            else:
                a_dict[obj] = { rel:weight }
        elif obj == uri:
            if rel != '/r/RelatedTo': # Bi-directional
                rel = rel.replace('/r/', '/rr/', 1)
            if sub in a_dict:
                if rel in a_dict[sub]:
                    a_dict[sub][rel] = max(a_dict[sub][rel], weight)
                else:
                    a_dict[sub][rel] = weight
            else:
                a_dict[sub] = { rel:weight }
        else:
            print(e)
            assert False, "This edge does not belong to the given uri"
    return a_dict        

In [23]:
def get_all_paths_from(uri, hops):
    global PATHS_FROM
    assert hops >= 0, "Invalid hops (less than 0)"
    
    if uri in PATHS_FROM and hops in PATHS_FROM[uri]:
        return PATHS_FROM[uri][hops]
    
    if uri not in PATHS_FROM:
        PATHS_FROM[uri] = {}
        
    if hops == 0:
        PATHS_FROM[uri][hops] = [Path.create_unit_node_path(uri)]
    elif hops == 1:
        all_paths = []
        neighbors = get_neighbors(uri)
        for end, edges in neighbors.items():
            for r, w in edges.items():
                all_paths.append(Path.create_path_from_edge(uri, r, end, w))
        PATHS_FROM[uri][hops] = all_paths
    else: # hops > 1
        all_paths = []
        previous_paths = get_all_paths_from(uri, hops - 1)
        for p in previous_paths:
            one_hop_paths = get_all_paths_from(p.end, 1)
            for unit_path in one_hop_paths:
                if p.form_single_path_with(unit_path):
                    new_path = Path.concatenate(p, unit_path)
                    if new_path.average_weight() < 1:
                        continue
                    all_paths.append(new_path)
        if hops <= 2: 
            PATHS_FROM[uri][hops] = all_paths
        else:
            pickle.dump(all_paths, open("../wordEmbeddings/PATHS_FROM_"+uri[uri.rfind('/')+1:]+"_"+str(hops)+".pickle", "wb"))
            print('Finish dump all paths from %s with hops = %d, number of paths = %d' % (uri, hops, len(all_paths)))
            return all_paths
    print('Memorise all paths from %s with hops = %d, number of paths = %d' % (uri, hops, len(PATHS_FROM[uri][hops])))
                    
    return PATHS_FROM[uri][hops]

In [11]:
def get_kg_vectors_for_a_class(uri, max_hops, rel_list):
    total_rel = len(rel_list)
    vector_size = int(((total_rel ** (max_hops + 1)) - 1)/(total_rel - 1)) # Total size of vector
    all_paths = []
    for i in range(max_hops+1):
        all_paths.extend(get_all_paths_from(uri, i))
    
    end_dict = {}
    for p in all_paths:
        assert p.start == uri, "The start node is not the given class"
        if p.end in end_dict:
            end_dict[p.end].append(p)
        else:
            end_dict[p.end] = [p]
    
    vectors = {}
    for end, paths in end_dict.items():
        v = np.zeros(vector_size) 
        for p in paths:
            if p.edges == []:
                v[-1] += 1
            else:
                v[get_index_from_edges(p.edges, max_hops, rel_list)] += p.average_weight()
        vectors[end] = v
    
    return vectors

In [12]:
def get_index_from_edges(edges, max_hops, rel_list):
    assert len(edges) <= max_hops, "The path is longer than the given max_hops"
    total_rel = len(rel_list)
    
    if edges == []:
        return sum([total_rel ** i for i in range(1, max_hops+1)]) # The last index refers to itself
    
    index = 0
    for i, e in enumerate(reversed(edges)):
        if e not in rel_list:
            assert False, "Found an unsupported relation" + e
        index += (total_rel ** i) * rel_list.index(e)
    return index

In [13]:
def get_kg_vectors_for_classes(class_uri, max_hops, rel_list):
    V = {}
    for uri in class_uri:
        V[uri] = get_kg_vectors_for_a_class(uri, max_hops, rel_list)
        print('Finish producing vectors of class', uri, 'Max hops', max_hops)
    return V

In [14]:
def init_PATHS_FROM(rel_list):
    with open('../wordEmbeddings/conceptnet-assertions-en-filter-5.6.0.csv', 'r', encoding = "utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        i = 0
        for line in reader:
            if i%10000 == 0:
                print('Process line %d' % i)
            if rel_list is None or line[1] in rel_list:
                details = json.loads(line[4])
                weight = details['weight']
                sub = remove_word_sense(line[2])
                rel = line[1]
                obj = remove_word_sense(line[3])
                
                if sub not in PATHS_FROM:
                    PATHS_FROM[sub] = {0: [Path.create_unit_node_path(sub)], 1: []}
                if obj not in PATHS_FROM:
                    PATHS_FROM[obj] = {0: [Path.create_unit_node_path(obj)], 1: []}
                    
                # path from subject
                done_subject = False
                expected_path_uri = sub + '$' + rel + '$' + obj
                for p in PATHS_FROM[sub][1]:
                    if p.path_uri == expected_path_uri:
                        p.weights[0] = max(p.weights[0], weight)
                        done_subject = True
                        break
                if not done_subject:
                    PATHS_FROM[sub][1].append(Path.create_path_from_edge(sub, rel, obj, weight))
                
                # path from object
                if rel != '/r/RelatedTo': # Bi-directional
                    rel = rel.replace('/r/', '/rr/', 1)
                    
                done_object = False
                expected_path_uri = obj + '$' + rel + '$' + sub
                for p in PATHS_FROM[obj][1]:
                    if p.path_uri == expected_path_uri:
                        p.weights[0] = max(p.weights[0], weight)
                        done_object = True
                        break
                if not done_object:
                    PATHS_FROM[obj][1].append(Path.create_path_from_edge(obj, rel, sub, weight))
            i += 1   
                    

In [15]:
init_PATHS_FROM(rel_list)

Process line 0
Process line 10000
Process line 20000
Process line 30000
Process line 40000
Process line 50000
Process line 60000
Process line 70000
Process line 80000
Process line 90000
Process line 100000
Process line 110000
Process line 120000
Process line 130000
Process line 140000
Process line 150000
Process line 160000
Process line 170000
Process line 180000
Process line 190000
Process line 200000
Process line 210000
Process line 220000
Process line 230000
Process line 240000
Process line 250000
Process line 260000
Process line 270000
Process line 280000
Process line 290000
Process line 300000
Process line 310000
Process line 320000
Process line 330000
Process line 340000
Process line 350000
Process line 360000
Process line 370000
Process line 380000
Process line 390000
Process line 400000
Process line 410000
Process line 420000
Process line 430000
Process line 440000
Process line 450000
Process line 460000
Process line 470000
Process line 480000
Process line 490000
Process line 5

In [15]:
pp.pprint(PATHS_FROM['/c/en/unit'][1])

[/c/en/unit$/rr/IsA$/c/en/abcoulomb (1.0),
 /c/en/unit$/rr/IsA$/c/en/administrative_unit (2.0),
 /c/en/unit$/rr/IsA$/c/en/chain (2.0),
 /c/en/unit$/rr/IsA$/c/en/combination (2.0),
 /c/en/unit$/rr/IsA$/c/en/company (2.0),
 /c/en/unit$/rr/IsA$/c/en/couple (2.0),
 /c/en/unit$/rr/IsA$/c/en/crew (2.0),
 /c/en/unit$/rr/IsA$/c/en/den (2.0),
 /c/en/unit$/rr/IsA$/c/en/family (2.0),
 /c/en/unit$/rr/IsA$/c/en/farad (1.0),
 /c/en/unit$/rr/IsA$/c/en/fifth_column (2.0),
 /c/en/unit$/rr/IsA$/c/en/gang (2.0),
 /c/en/unit$/rr/IsA$/c/en/general_delivery (2.0),
 /c/en/unit$/rr/IsA$/c/en/group (2.0),
 /c/en/unit$/rr/IsA$/c/en/instrumentality (2.0),
 /c/en/unit$/rr/IsA$/c/en/intensive_care_unit (2.0),
 /c/en/unit$/rr/IsA$/c/en/klavern (2.0),
 /c/en/unit$/rr/IsA$/c/en/member (2.0),
 /c/en/unit$/rr/IsA$/c/en/military_unit (2.0),
 /c/en/unit$/rr/IsA$/c/en/molecule (2.0),
 /c/en/unit$/rr/IsA$/c/en/one (2.0),
 /c/en/unit$/rr/IsA$/c/en/outfit (2.0),
 /c/en/unit$/rr/IsA$/c/en/political_unit (2.0),
 /c/en/unit$/rr

In [25]:
for uri in class_uri: # KG_VECTORS_3
    V = get_kg_vectors_for_a_class(uri, max_hops = 3, rel_list = rel_list)
    print('Finish producing vectors of class', uri, 'Max hops', 3)
    pickle.dump(V, open("../wordEmbeddings/KG_VECTORS_3_"+ uri[uri.rfind('/')+1:]+".pickle", "wb"))

Finish dump all paths from /c/en/company with hops = 3, number of paths = 2368474
Finish producing vectors of class /c/en/company Max hops 3
Finish dump all paths from /c/en/education with hops = 3, number of paths = 808921
Finish producing vectors of class /c/en/education Max hops 3
Finish dump all paths from /c/en/artist with hops = 3, number of paths = 660473
Finish producing vectors of class /c/en/artist Max hops 3
Finish dump all paths from /c/en/athlete with hops = 3, number of paths = 534392
Finish producing vectors of class /c/en/athlete Max hops 3
Finish dump all paths from /c/en/officer with hops = 3, number of paths = 1266289
Finish producing vectors of class /c/en/officer Max hops 3
Finish dump all paths from /c/en/transport with hops = 3, number of paths = 1401887
Finish producing vectors of class /c/en/transport Max hops 3
Finish dump all paths from /c/en/building with hops = 3, number of paths = 5481409
Finish producing vectors of class /c/en/building Max hops 3
Finish d

In [18]:
KG_VECTORS_2 = get_kg_vectors_for_classes(class_uri, max_hops=2, rel_list=rel_list)

Memorise all paths from /c/en/company with hops = 2, number of paths = 28212
Finish producing vectors of class /c/en/company Max hops 2
Memorise all paths from /c/en/education with hops = 2, number of paths = 10189
Finish producing vectors of class /c/en/education Max hops 2
Memorise all paths from /c/en/artist with hops = 2, number of paths = 10435
Finish producing vectors of class /c/en/artist Max hops 2
Memorise all paths from /c/en/athlete with hops = 2, number of paths = 5899
Finish producing vectors of class /c/en/athlete Max hops 2
Memorise all paths from /c/en/officer with hops = 2, number of paths = 16487
Finish producing vectors of class /c/en/officer Max hops 2
Memorise all paths from /c/en/transport with hops = 2, number of paths = 17611
Finish producing vectors of class /c/en/transport Max hops 2
Memorise all paths from /c/en/building with hops = 2, number of paths = 61377
Finish producing vectors of class /c/en/building Max hops 2
Memorise all paths from /c/en/nature with

In [17]:
# pickle.dump(PATHS_FROM, open("../wordEmbeddings/PATHS_FROM_01.pickle", "wb"))

In [20]:
pickle.dump(KG_VECTORS_2, open("../wordEmbeddings/KG_VECTORS_2.pickle", "wb"))

In [23]:
pp.pprint(KG_VECTORS_2['/c/en/company']['/c/en/bank'])
# pp.pprint(len(KG_VECTORS_1['/c/en/company']))

array([ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  4.02429432,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.41421356,  1.10090872,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.66292573,  0.        , 18.00141376,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ])


In [None]:
"""
pp.pprint(find_edges_of('/c/en/company'))
"""

In [None]:
"""
pa = Path.create_path_from_edge('/c/en/beavertails', '/r/IsA', '/c/en/company', 0.5)
pb = Path.create_path_from_edge('/c/en/company', '/r/AtLocation', '/c/en/city', 1.0)
print(pa)
print(pb)
if pa.form_single_path_with(pb):
    pc = Path.concatenate(pa, pb)
    print(pc)
    print(pc.is_simple_path(), pc.average_weight())
    print(pc.start, pc.end, pc.length)
"""

In [None]:
"""
class_uri = ['/c/en/company',
            '/c/en/education',
            '/c/en/artist',
            '/c/en/athlete',
            '/c/en/officer',
            '/c/en/transport',
            '/c/en/building',
            '/c/en/nature',
            '/c/en/village',
            '/c/en/animal',
            '/c/en/plant',
            '/c/en/album',
            '/c/en/film',
            '/c/en/writing']
rel_uri = ['/r/IsA', '/r/PartOf', '/r/AtLocation', '/r/RelatedTo']
"""

In [None]:
"""
def add_edges_of(uri, edges = {}, rel = None):
    url_string = 'http://api.conceptnet.io/query?node=' + uri + '&other=/c/en'
    if rel is not None:
        url_string += '&rel=' + rel
    try:
        r = requests.get(url_string)
        json_data = r.json()
    except JSONDecodeError:
        print(r.text)
        print('Cannot decode the json')
        sys.exit(0)
    edges = process_edges(edges, uri, json_data['edges'])
    next_page = 'http://api.conceptnet.io' + json_data['view']['nextPage'] if 'view' in json_data else None # whether it has next page
    while next_page is not None:
        try:
            r = requests.get(next_page)
            json_data = r.json()
        except JSONDecodeError:
            print(r.text)
            print('Cannot decode the json')
            sys.exit(0)
        edges = process_edges(edges, uri, json_data['edges'])
        if 'view' in json_data and 'nextPage' in json_data['view']:
            next_page = 'http://api.conceptnet.io' + json_data['view']['nextPage']  
        else:
            next_page = None # whether it has next page
    return edges
"""

In [None]:
"""
def process_edges(a_dict, uri, edge_data):
    for e in edge_data:
        sub = e['start']['term']
        rel = e['rel']['@id']
        obj = e['end']['term']
        weight = e['weight']
        if sub == uri:
            if obj in a_dict:
                if rel in a_dict[obj]:
                    a_dict[obj][rel] = max(a_dict[obj][rel], weight)
                else:
                    a_dict[obj][rel] = weight
            else:
                a_dict[obj] = { rel:weight }
        elif obj == uri:
            if rel != '/r/RelatedTo': # Bi-directional
                rel = rel.replace('/r/', '/rr/', 1)
            if sub in a_dict:
                if rel in a_dict[sub]:
                    a_dict[sub][rel] = max(a_dict[sub][rel], weight)
                else:
                    a_dict[sub][rel] = weight
            else:
                a_dict[sub] = { rel:weight }
        else:
            print(e)
            assert False, "This edge does not belong to the given uri"
    return a_dict  
"""

In [None]:
"""
def get_neighbors(uri):
    neighbors = {}
    for rel in rel_uri:
        neighbors = add_edges_of(uri, edges = neighbors, rel = rel)
        print('Finish running', uri, rel)
    return neighbors
"""