In [1]:
# This notebook separates out all the functions for using the Freebase knowledge base
# extracted from the class website, in case we need to use it.

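# Conceptually each fact is <subject>  <predicate>  <object>;
# note that rows in kb.tsv.gz are parsed below as <relation> TAB <subject> TAB <object>.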

import gzip
import numpy as np
import random
import os

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Relative path of the directory that holds the relation-extraction data files.
rel_ext_data_home = '../cs224u/rel_ext_data'

# One KB fact: relation name, subject entity, object entity (all strings).
KBTriple = namedtuple('KBTriple', 'rel, sbj, obj')

def read_kb_triples(path=None):
    """Load KB triples from a gzipped, UTF-8, tab-separated file.

    Each non-empty line must hold exactly three tab-separated fields, in the
    order relation, subject, object.

    Args:
        path: Location of the gzipped TSV file. Defaults to ``kb.tsv.gz``
            inside ``rel_ext_data_home``.

    Returns:
        A list of ``KBTriple`` in file order.

    Raises:
        ValueError: If a non-empty line does not have exactly three fields.
    """
    if path is None:
        path = os.path.join(rel_ext_data_home, 'kb.tsv.gz')
    kb_triples = []
    print('Reading KB triples from {} ...'.format(path))
    with gzip.open(path) as f:
        for line_no, raw_line in enumerate(f, start=1):
            # Strip only real line terminators. Blindly dropping the last
            # character would truncate the object of a final line that has no
            # trailing newline, and would leave '\r' behind for CRLF files.
            line = raw_line.decode('utf-8').rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line at EOF).
                continue
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    'Expected 3 tab-separated fields on line {} of {}, got {}'.format(
                        line_no, path, len(fields)))
            kb_triples.append(KBTriple(*fields))
    print('Read {} KB triples'.format(len(kb_triples)))
    return kb_triples

# Load the full list of KB triples once; the KB index below is built from it.
kb_triples = read_kb_triples()

Reading KB triples from ../cs224u/rel_ext_data/kb.tsv.gz ...
Read 56575 KB triples


In [2]:
class KB:
    """In-memory, indexed view over a collection of KB triples.

    Each triple must expose ``rel``, ``sbj`` and ``obj`` attributes. On
    construction the triples are indexed by relation and by
    (subject, object) entity pair so that lookups do not rescan the data.
    """

    def __init__(self, kb_triples):
        """Build the indexes.

        Args:
            kb_triples: Any iterable of triples with ``rel``, ``sbj`` and
                ``obj`` attributes.
        """
        # Materialize a private copy. The indexing passes below walk the data
        # several times, so a one-shot iterator would be exhausted after the
        # first pass; copying also keeps the indexes consistent if the caller
        # later mutates the sequence that was passed in.
        self._kb_triples = list(kb_triples)
        self._all_relations = []
        self._all_entity_pairs = []
        self._kb_triples_by_relation = {}
        self._kb_triples_by_entities = {}
        self._collect_all_entity_pairs()
        self._index_kb_triples_by_relation()
        self._index_kb_triples_by_entities()

    def _collect_all_entity_pairs(self):
        """Store the sorted list of distinct (sbj, obj) pairs."""
        self._all_entity_pairs = sorted(
            {(kbt.sbj, kbt.obj) for kbt in self._kb_triples})

    def _index_kb_triples_by_relation(self):
        """Group triples by relation and store the sorted relation names."""
        for kbt in self._kb_triples:
            self._kb_triples_by_relation.setdefault(kbt.rel, []).append(kbt)
        self._all_relations = sorted(self._kb_triples_by_relation)

    def _index_kb_triples_by_entities(self):
        """Group triples in a two-level dict: subject -> object -> [triples]."""
        for kbt in self._kb_triples:
            by_obj = self._kb_triples_by_entities.setdefault(kbt.sbj, {})
            by_obj.setdefault(kbt.obj, []).append(kbt)

    def get_triples(self):
        """Return a fresh iterator over all triples, in input order."""
        return iter(self._kb_triples)

    def get_all_relations(self):
        """Return the sorted list of relation names seen in the KB."""
        return self._all_relations

    def get_all_entity_pairs(self):
        """Return the sorted list of distinct (sbj, obj) pairs in the KB."""
        return self._all_entity_pairs

    def get_triples_for_relation(self, rel):
        """Return the triples whose relation is ``rel`` ([] if there are none)."""
        return self._kb_triples_by_relation.get(rel, [])

    def get_triples_for_entities(self, e1, e2):
        """Return the triples with subject ``e1`` and object ``e2``.

        The lookup is directional: (e1, e2) and (e2, e1) are different keys.
        Returns [] when no such triple exists.
        """
        return self._kb_triples_by_entities.get(e1, {}).get(e2, [])

    def __repr__(self):
        return 'KB with {} triples'.format(len(self._kb_triples))

In [3]:
# Build the indexed KB from the loaded triples; the bare `kb` expression
# displays its __repr__ as the cell output.
kb = KB(kb_triples)
kb

KB with 56575 triples

In [4]:
# Fetch the sorted relation names and report how many distinct relations exist.
all_relations = kb.get_all_relations()
print(len(all_relations))

16


In [5]:
# Frequency table: number of triples (right-aligned, width 12) per relation.
for rel in all_relations:
    triples_for_rel = kb.get_triples_for_relation(rel)
    print('{:12d} {}'.format(len(triples_for_rel), rel))

        2140 adjoins
        3316 author
         637 capital
       22489 contains
        4958 film_performance
        2404 founders
        1012 genre
        3280 has_sibling
        3774 has_spouse
        3153 is_a
        1981 nationality
        2013 parents
        1388 place_of_birth
        1031 place_of_death
        1526 profession
        1473 worked_at


In [6]:
# Show one example per relation: the first triple stored for it, as a plain tuple.
for rel in all_relations:
    first_triple = kb.get_triples_for_relation(rel)[0]
    print(tuple(first_triple))

('adjoins', 'Siegburg', 'Bonn')
('author', 'Uncle_Silas', 'Sheridan_Le_Fanu')
('capital', 'Tunisia', 'Tunis')
('contains', 'Brickfields', 'Kuala_Lumpur_Sentral_railway_station')
('film_performance', 'Colin_Hanks', 'The_Great_Buck_Howard')
('founders', 'Bomis', 'Jimmy_Wales')
('genre', 'SPARQL', 'Semantic_Web')
('has_sibling', 'Ari_Emanuel', 'Rahm_Emanuel')
('has_spouse', 'Percy_Bysshe_Shelley', 'Mary_Shelley')
('is_a', 'Bhanu_Athaiya', 'Costume_designer')
('nationality', 'Ruben_Rausing', 'Sweden')
('parents', 'Prince_Arthur_of_Connaught', 'Prince_Arthur,_Duke_of_Connaught_and_Strathearn')
('place_of_birth', 'William_Penny_Brookes', 'Much_Wenlock')
('place_of_death', 'Jean_Drapeau', 'Montreal')
('profession', 'Rufus_Wainwright', 'Actor')
('worked_at', 'Ray_Jackendoff', 'Tufts_University')
