In [420]:
# freebase knowledge base

# <subject>  <predicate>  <object>

import gzip
import numpy as np
import random
import os

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from nltk import word_tokenize, pos_tag, ne_chunk

# Directory holding the relation-extraction data files; read_kb_triples()
# looks for 'kb.tsv.gz' inside it. The path is relative to the notebook's
# working directory.
rel_ext_data_home = '../cs224u/rel_ext_data'

In [101]:
# One knowledge-base fact: relation name, subject entity, object entity.
KBTriple = namedtuple('KBTriple', 'rel, sbj, obj')

def read_kb_triples(path=None):
    """Load knowledge-base triples from a gzipped, tab-separated file.

    Every line must hold exactly three tab-separated fields, in the order
    relation, subject, object.

    Args:
        path: Location of the gzipped TSV file. When omitted, 'kb.tsv.gz'
            inside `rel_ext_data_home` is read.

    Returns:
        A list of KBTriple(rel, sbj, obj) in file order.

    Raises:
        ValueError: If a line does not split into exactly three fields.
    """
    if path is None:
        path = os.path.join(rel_ext_data_home, 'kb.tsv.gz')
    kb_triples = []
    print('Reading KB triples from {} ...'.format(path))
    with gzip.open(path) as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the object of a final line that
            # has no trailing newline, and would leave '\r' on CRLF files.
            fields = line.decode('utf-8').rstrip('\r\n').split('\t')
            rel, sbj, obj = fields
            kb_triples.append(KBTriple(rel, sbj, obj))
    print('Read {} KB triples'.format(len(kb_triples)))
    return kb_triples

# Load every KB triple into memory once; the KB index below is built from this list.
kb_triples = read_kb_triples()

Reading KB triples from ../cs224u/rel_ext_data\kb.tsv.gz ...
Read 56575 KB triples


In [102]:
class KB():
    """In-memory lookup structure over a collection of KB triples.

    Each triple must expose `rel`, `sbj` and `obj` attributes. At construction
    time the triples are indexed by relation and by (subject, object), and the
    sorted lists of distinct relations and entity pairs are precomputed.
    """

    def __init__(self, kb_triples):
        self._kb_triples = kb_triples
        self._all_relations = []
        self._all_entity_pairs = []
        self._kb_triples_by_relation = {}
        self._kb_triples_by_entities = {}
        self._collect_all_entity_pairs()
        self._index_kb_triples_by_relation()
        self._index_kb_triples_by_entities()

    def _collect_all_entity_pairs(self):
        """Store the distinct (subject, object) pairs in sorted order."""
        unique_pairs = {(triple.sbj, triple.obj) for triple in self._kb_triples}
        self._all_entity_pairs = sorted(unique_pairs)

    def _index_kb_triples_by_relation(self):
        """Group triples by relation and record the sorted relation names."""
        by_relation = self._kb_triples_by_relation
        for triple in self._kb_triples:
            by_relation.setdefault(triple.rel, []).append(triple)
        self._all_relations = sorted(by_relation)

    def _index_kb_triples_by_entities(self):
        """Group triples in a two-level mapping: subject -> object -> triples."""
        by_subject = self._kb_triples_by_entities
        for triple in self._kb_triples:
            by_object = by_subject.setdefault(triple.sbj, {})
            by_object.setdefault(triple.obj, []).append(triple)

    def get_triples(self):
        """Return a fresh iterator over all triples, in their original order."""
        return iter(self._kb_triples)

    def get_all_relations(self):
        """Return the sorted list of relation names (the internal list, not a copy)."""
        return self._all_relations

    def get_all_entity_pairs(self):
        """Return the sorted list of (subject, object) pairs (the internal list, not a copy)."""
        return self._all_entity_pairs

    def get_triples_for_relation(self, rel):
        """Return the triples stored under `rel`, or a new empty list if unknown."""
        return self._kb_triples_by_relation.get(rel, [])

    def get_triples_for_entities(self, e1, e2):
        """Return the triples with subject `e1` and object `e2`, or [] if none.

        The lookup is directional: (e1, e2) and (e2, e1) are different keys.
        """
        by_object = self._kb_triples_by_entities.get(e1)
        if by_object is None:
            return []
        return by_object.get(e2, [])

    def __repr__(self):
        return 'KB with {} triples'.format(len(self._kb_triples))

In [103]:
# Build the relation and entity-pair indexes over the loaded triples.
kb = KB(kb_triples)

In [104]:
# Sorted relation names present in the KB, and how many there are.
all_relations = kb.get_all_relations()
print(len(all_relations))

16


In [105]:
# For each relation, print the number of KB triples that use it
# (count right-aligned in a 12-character column, then the relation name).
for rel in all_relations:
    print('{:12d} {}'.format(len(kb.get_triples_for_relation(rel)), rel))

        2140 adjoins
        3316 author
         637 capital
       22489 contains
        4958 film_performance
        2404 founders
        1012 genre
        3280 has_sibling
        3774 has_spouse
        3153 is_a
        1981 nationality
        2013 parents
        1388 place_of_birth
        1031 place_of_death
        1526 profession
        1473 worked_at


In [106]:
# Show the first stored triple of each relation as a plain (rel, sbj, obj) tuple.
for rel in all_relations:
    print(tuple(kb.get_triples_for_relation(rel)[0]))

('adjoins', 'Siegburg', 'Bonn')
('author', 'Uncle_Silas', 'Sheridan_Le_Fanu')
('capital', 'Tunisia', 'Tunis')
('contains', 'Brickfields', 'Kuala_Lumpur_Sentral_railway_station')
('film_performance', 'Colin_Hanks', 'The_Great_Buck_Howard')
('founders', 'Bomis', 'Jimmy_Wales')
('genre', 'SPARQL', 'Semantic_Web')
('has_sibling', 'Ari_Emanuel', 'Rahm_Emanuel')
('has_spouse', 'Percy_Bysshe_Shelley', 'Mary_Shelley')
('is_a', 'Bhanu_Athaiya', 'Costume_designer')
('nationality', 'Ruben_Rausing', 'Sweden')
('parents', 'Prince_Arthur_of_Connaught', 'Prince_Arthur,_Duke_of_Connaught_and_Strathearn')
('place_of_birth', 'William_Penny_Brookes', 'Much_Wenlock')
('place_of_death', 'Jean_Drapeau', 'Montreal')
('profession', 'Rufus_Wainwright', 'Actor')
('worked_at', 'Ray_Jackendoff', 'Tufts_University')


In [107]:
# Collect one example triple per relation: the first one stored for it.
triples = []
for relation in kb.get_all_relations():
    triples.append(kb.get_triples_for_relation(relation)[0])

In [108]:
# Display the example triples (one per relation).
triples

[KBTriple(rel='adjoins', sbj='Siegburg', obj='Bonn'),
 KBTriple(rel='author', sbj='Uncle_Silas', obj='Sheridan_Le_Fanu'),
 KBTriple(rel='capital', sbj='Tunisia', obj='Tunis'),
 KBTriple(rel='contains', sbj='Brickfields', obj='Kuala_Lumpur_Sentral_railway_station'),
 KBTriple(rel='film_performance', sbj='Colin_Hanks', obj='The_Great_Buck_Howard'),
 KBTriple(rel='founders', sbj='Bomis', obj='Jimmy_Wales'),
 KBTriple(rel='genre', sbj='SPARQL', obj='Semantic_Web'),
 KBTriple(rel='has_sibling', sbj='Ari_Emanuel', obj='Rahm_Emanuel'),
 KBTriple(rel='has_spouse', sbj='Percy_Bysshe_Shelley', obj='Mary_Shelley'),
 KBTriple(rel='is_a', sbj='Bhanu_Athaiya', obj='Costume_designer'),
 KBTriple(rel='nationality', sbj='Ruben_Rausing', obj='Sweden'),
 KBTriple(rel='parents', sbj='Prince_Arthur_of_Connaught', obj='Prince_Arthur,_Duke_of_Connaught_and_Strathearn'),
 KBTriple(rel='place_of_birth', sbj='William_Penny_Brookes', obj='Much_Wenlock'),
 KBTriple(rel='place_of_death', sbj='Jean_Drapeau', obj='M

In [279]:
# Hand-written sample questions, one per example triple.
# Each entry holds:
#   'question' - the lower-cased natural-language question,
#   'topic'    - the entity the question is about, lower-cased with underscores,
#   'answer'   - the expected answer as free text.
# NOTE(review): topics and answers are not exact KB entity strings (different
# casing, spaces vs. underscores) - confirm how they are meant to be matched.
questions = [
    {'question': 'what is the city adjoining siegburg?', 'topic': 'siegburg', 'answer':'Bonn'},
    {'question': 'who is the author of uncle silas?', 'topic': 'uncle_silas', 'answer':'Sheridan le Fanu'},
    {'question': 'what is the capital of tunisia?', 'topic': 'tunisia', 'answer':'Tunis'},
    {'question': 'where is the kuala lumpur central railway station?', 'topic': 'kuala_lumpur_sentral_railway_station', 'answer':'Brickfields'},
    {'question': 'who stars in the great buck howard?', 'topic': 'the_great_buck_howard', 'answer':'Colin Hanks'},
    {'question': 'who is the founder of bomis?', 'topic': 'bomis', 'answer':'Jimmy Wales'},
    {'question': 'what is sparql?', 'topic': 'sparql', 'answer':'semantic query language'},
    {'question': 'what is the name of ari emanuel\'s brother?', 'topic': 'ari_emanuel', 'answer':'Rahm Emanuel'},
    {'question': 'what is the name of percy bysshe shelley\'s wife?', 'topic': 'percy_bysshe_shelley', 'answer':'Mary Shelley'},
    {'question': 'what does bhanu athaiya do?', 'topic': 'bhanu_athaiya', 'answer':'costume designer'},
    {'question': 'where is ruben rausing from?', 'topic': 'ruben_rausing', 'answer':'Sweden'},
    {'question': 'who is prince arthur of connaught\'s father?', 'topic': 'prince_arthur_of_connaught', 'answer':'Prince Arthur, Duke of Connaught and Strathearn'},
    {'question': 'where was william penny brookes born?', 'topic': 'william_penny_brookes', 'answer':'Much Wenlock'},
    {'question': 'where did jean drapeau die?', 'topic': 'jean_drapeau', 'answer':'Montreal'},
    {'question': 'what does rufus wainwright do?', 'topic': 'rufus_wainwright', 'answer':'actor'},
    {'question': 'where did ray jackendoff work?', 'topic': 'ray_jackendoff', 'answer':'Tufts University'}
]

In [280]:
# Write the sample questions to train.txt. Each record is three lines
# (question, topic, answer) followed by one blank separator line.
# The `with` block guarantees the file is closed even if a write raises.
with open('train.txt', 'w') as f:
    for question in questions:
        f.write(question['question'] +'\n'+question['topic']+'\n'+question['answer']+'\n')
        f.write('\n')

In [419]:
def identify_entities(question):
    """Pick out the noun and question-word tokens of a question.

    Every whitespace-separated word is tokenized and POS-tagged on its own.
    Only the first token of each word is inspected, so punctuation attached
    to the end of a word (e.g. the '?' in 'bomis?') is ignored.

    Args:
        question: The question text.

    Returns:
        A list of (token, pos_tag) tuples, in question order, whose tag
        starts with 'NN' (nouns) or 'W' (question words).
    """
    entities = []
    # split() without an argument drops the empty strings that split(' ')
    # produces for leading, trailing or repeated spaces; an empty word
    # tokenizes to [] and indexing its first token would raise IndexError.
    pos_tags = [pos_tag(word_tokenize(word)) for word in question.split()]
    print(pos_tags)
    for tag in pos_tags:
        if not tag:
            # Nothing was tokenized for this word; there is no tag to inspect.
            continue
        first_token = tag[0]
        if first_token[1].startswith(('NN', 'W')):
            entities.append(first_token)
    return entities

In [411]:
# Named Entity Version, STILL WORKING ON THIS
def pos_tagging(question):
    """POS-tag every space-separated word of `question` independently.

    Returns a list with one entry per word, in order; each entry is the list
    of (token, tag) tuples produced for that single word.
    """
    tagged_words = []
    for word in question.split(' '):
        tokens = word_tokenize(word)
        tagged_words.append(pos_tag(tokens))
    return tagged_words

def entity_tagging(question):
    """Print the named-entity chunks found in a question (work in progress).

    Words whose first token is tagged as a noun are capitalized everywhere
    they occur in the text; any other word is removed where it stands alone
    between two spaces. The rewritten question is then passed through
    `ne_chunk` and every node of the resulting tree is printed.

    Args:
        question: The question text.

    Returns:
        None. The results are only printed.
    """
    pos_tags = pos_tagging(question)
    # Pad with spaces so the " word " removal pattern also matches the first
    # and last words of the question.
    question = " " + question + " "
    for tag in pos_tags:
        if not tag:
            # Repeated, leading or trailing spaces give an empty tag list;
            # indexing it would raise IndexError.
            continue
        token, pos = tag[0]
        if pos.startswith('NN'):
            # Presumably capitalized so the chunker treats the noun as a
            # proper noun - TODO confirm this is the intended effect.
            question = question.replace(token, token.capitalize())
        else:
            question = question.replace(" " + token + " ", " ")
    ne_tree = ne_chunk(pos_tag(word_tokenize(question)))
    # Debug output of the third node; shorter trees have no such node.
    if len(ne_tree) > 2:
        print(ne_tree[2])
    for entity in ne_tree:
        if isinstance(entity, tuple):
            # A plain (token, tag) pair that is not part of any chunk.
            print(entity)
        else:
            # A chunk subtree: print the (token, tag) pairs it groups.
            print(entity.leaves())
      
            
# Run the tagger on the last sample question, indexing `questions` explicitly
# rather than relying on a loop variable left behind by another cell.
# entity_tagging() only prints its findings and returns None, so this prints None last.
cap_question = entity_tagging(questions[-1]['question'])
print(cap_question)

('?', '.')
[('Ray', 'NNP')]
[('Jackendoff', 'NNP'), ('Work', 'NNP')]
('?', '.')
None


In [292]:
def create_entity_pairs(entity_list):
    """Build every pair of entity tokens, preserving their original order.

    Args:
        entity_list: A list of (token, pos_tag) tuples; only the token
            (element 0) of each entry is used.

    Returns:
        A list of (token_i, token_j) tuples for all positions i < j. The
        input list and the resulting pairs are also printed.
    """
    print(entity_list)
    pairs = [
        (first[0], second[0])
        for position, first in enumerate(entity_list)
        for second in entity_list[position + 1:]
    ]
    print('pairs:', pairs)
    # Return the pairs so callers can use them instead of only reading the printout.
    return pairs

In [418]:
# Example: given a question, find the entity pairs.
# identify_entities() prints the per-word POS tags and returns the noun and
# question-word tokens; create_entity_pairs() prints that list and the pairs.
entities = identify_entities(questions[5]['question'])
create_entity_pairs(entities)

[[('who', 'WP')], [('is', 'VBZ')], [('the', 'DT')], [('founder', 'NN')], [('of', 'IN')], [('bomis', 'NN'), ('?', '.')]]
[('who', 'WP'), ('founder', 'NN'), ('bomis', 'NN')]
pairs: [('who', 'founder'), ('who', 'bomis'), ('founder', 'bomis')]
