In [24]:
# freebase knowledge base

# <subject>  <predicate>  <object>

import gzip
import numpy as np
import random
import os

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

rel_ext_data_home = '../cs224u/rel_ext_data'

In [25]:
KBTriple = namedtuple('KBTriple', 'rel, sbj, obj')

def read_kb_triples():
    kb_triples = []
    path = os.path.join(rel_ext_data_home, 'kb.tsv.gz')
    print('Reading KB triples from {} ...'.format(path))
    with gzip.open(path) as f:
        for line in f:
            rel, sbj, obj = line[:-1].decode('utf-8').split('\t')
            kb_triples.append(KBTriple(rel, sbj, obj))
    print('Read {} KB triples'.format(len(kb_triples)))
    return kb_triples

kb_triples = read_kb_triples()

Reading KB triples from ../cs224u/rel_ext_data/kb.tsv.gz ...
Read 56575 KB triples


In [26]:
class KB():

    def __init__(self, kb_triples):
        self._kb_triples = kb_triples
        self._all_relations = []
        self._all_entity_pairs = []
        self._kb_triples_by_relation = {}
        self._kb_triples_by_entities = {}
        self._collect_all_entity_pairs()
        self._index_kb_triples_by_relation()
        self._index_kb_triples_by_entities()

    def _collect_all_entity_pairs(self):
        pairs = set()
        for kbt in self._kb_triples:
            pairs.add((kbt.sbj, kbt.obj))
        self._all_entity_pairs = sorted(list(pairs))
        
    def _index_kb_triples_by_relation(self):
        for kbt in self._kb_triples:
            if kbt.rel not in self._kb_triples_by_relation:
                self._kb_triples_by_relation[kbt.rel] = []
            self._kb_triples_by_relation[kbt.rel].append(kbt)
        self._all_relations = sorted(list(self._kb_triples_by_relation))
    
    def _index_kb_triples_by_entities(self):
        for kbt in self._kb_triples:
            if kbt.sbj not in self._kb_triples_by_entities:
                self._kb_triples_by_entities[kbt.sbj] = {}
            if kbt.obj not in self._kb_triples_by_entities[kbt.sbj]:
                self._kb_triples_by_entities[kbt.sbj][kbt.obj] = []
            self._kb_triples_by_entities[kbt.sbj][kbt.obj].append(kbt)

    def get_triples(self):
        return iter(self._kb_triples)
        
    def get_all_relations(self):
        return self._all_relations
            
    def get_all_entity_pairs(self):
        return self._all_entity_pairs
            
    def get_triples_for_relation(self, rel):
        try:
            return self._kb_triples_by_relation[rel]
        except KeyError:
            return []

    def get_triples_for_entities(self, e1, e2):
        try:
            return self._kb_triples_by_entities[e1][e2]
        except KeyError:
            return []

    def __repr__(self):
        return 'KB with {} triples'.format(len(self._kb_triples))

In [27]:
kb = KB(kb_triples)
kb

KB with 56575 triples

In [28]:
all_relations = kb.get_all_relations()
print(len(all_relations))

16


In [29]:
for rel in all_relations:
    print('{:12d} {}'.format(len(kb.get_triples_for_relation(rel)), rel))

        2140 adjoins
        3316 author
         637 capital
       22489 contains
        4958 film_performance
        2404 founders
        1012 genre
        3280 has_sibling
        3774 has_spouse
        3153 is_a
        1981 nationality
        2013 parents
        1388 place_of_birth
        1031 place_of_death
        1526 profession
        1473 worked_at


In [30]:
for rel in all_relations:
    print(tuple(kb.get_triples_for_relation(rel)[0]))

('adjoins', 'Siegburg', 'Bonn')
('author', 'Uncle_Silas', 'Sheridan_Le_Fanu')
('capital', 'Tunisia', 'Tunis')
('contains', 'Brickfields', 'Kuala_Lumpur_Sentral_railway_station')
('film_performance', 'Colin_Hanks', 'The_Great_Buck_Howard')
('founders', 'Bomis', 'Jimmy_Wales')
('genre', 'SPARQL', 'Semantic_Web')
('has_sibling', 'Ari_Emanuel', 'Rahm_Emanuel')
('has_spouse', 'Percy_Bysshe_Shelley', 'Mary_Shelley')
('is_a', 'Bhanu_Athaiya', 'Costume_designer')
('nationality', 'Ruben_Rausing', 'Sweden')
('parents', 'Prince_Arthur_of_Connaught', 'Prince_Arthur,_Duke_of_Connaught_and_Strathearn')
('place_of_birth', 'William_Penny_Brookes', 'Much_Wenlock')
('place_of_death', 'Jean_Drapeau', 'Montreal')
('profession', 'Rufus_Wainwright', 'Actor')
('worked_at', 'Ray_Jackendoff', 'Tufts_University')


In [59]:
# import tensorflow as tf

In [119]:
import nltk
from nltk.parse.stanford import StanfordDependencyParser

# need to download the following files yourself since they are too big to be uploaded
# download from: https://nlp.stanford.edu/software/lex-parser.shtml#Download
# please refer this for more details: https://stackoverflow.com/questions/7443330/how-do-i-do-dependency-parsing-in-nltk 

path_to_jar = '/Users/hujiayu/Downloads/stanford-parser-full-2014-08-27/stanford-parser.jar'
path_to_models_jar = '/Users/hujiayu/Downloads/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models.jar'

dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

In [109]:
# result = dependency_parser.raw_parse('I shot an elephant in my sleep')
# Example: [Who] did [shaq] first play for ?

result = dependency_parser.raw_parse('Who did shaq play for')
dep = result.__next__()
list(dep.triples())

[(('play', 'VB'), 'aux', ('did', 'VBD')),
 (('play', 'VB'), 'nsubj', ('shaq', 'NNP')),
 (('play', 'VB'), 'prep', ('for', 'IN')),
 (('for', 'IN'), 'pobj', ('Who', 'WP'))]

In [107]:
# list(dep.triples())
result = dependency_parser.raw_parse('Bell, based in Los Angeles, \
                                    makes and distributes electronic, \
                                    computer and building products')
dep = result.__next__()
list(dep.triples())

[(('makes', 'VBZ'), 'nsubj', ('Bell', 'NNP')),
 (('Bell', 'NNP'), 'vmod', ('based', 'VBN')),
 (('based', 'VBN'), 'prep', ('in', 'IN')),
 (('in', 'IN'), 'pobj', ('Angeles', 'NNP')),
 (('Angeles', 'NNP'), 'nn', ('Los', 'NNP')),
 (('makes', 'VBZ'), 'cc', ('and', 'CC')),
 (('makes', 'VBZ'), 'conj', ('distributes', 'VBZ')),
 (('makes', 'VBZ'), 'dobj', ('products', 'NNS')),
 (('products', 'NNS'), 'amod', ('electronic', 'JJ')),
 (('electronic', 'JJ'), 'conj', ('computer', 'NN')),
 (('electronic', 'JJ'), 'cc', ('and', 'CC')),
 (('electronic', 'JJ'), 'conj', ('building', 'NN'))]

In [115]:
# syntactic features

# pobj - prep - play - nsubj 


# to use: 
# word_q, label_path, edge_path = get_shortest_path(list(dep.triples()), mention1, mention2)

"""
parser example: 'Who did shaq play for'

[(('play', 'VB'), 'aux', ('did', 'VBD')),
 (('play', 'VB'), 'nsubj', ('shaq', 'NNP')),
 (('play', 'VB'), 'prep', ('for', 'IN')),
 (('for', 'IN'), 'pobj', ('Who', 'WP'))]
 
"""


def get_shortest_path(tree, mention, question_word):
    """
    returns word_path, label_path and edge_path from two mention words
    the entity mention and the question word are excluded from the dependency path 
   
    """
    label_path = []
    edge_path = []
    visited = [0]*len(tree)
    
    def find_word(curr_w):
        print("looking for word: ", curr_w)
        if curr_w == question_word: return True
        for idx, s in enumerate(tree):
            if visited[idx] == 0 and s[0][0] == curr_w:
                print('found word here: ', s)
                if s[0][0] not in word_q:
                    word_q.append(s[0][0])
                label_path.append(s[1])
                edge_path.append('left')
                word_q.append(s[2][0])
                visited[idx] = 1
                return True
            if visited[idx] == 0 and s[2][0] == curr_w:
                print('found word here: ', s)
                if s[2][0] not in word_q:
                    word_q.append(s[2][0])
                label_path.append(s[1])
                edge_path.append('right')
                word_q.append(s[0][0])
                visited[idx] = 1
                return True
        print("couldn't find word: ", curr_w)
        return False
    
    word_q = [mention]
    
    T = True
    while T:        
        k = word_q[-1]
        if k == question_word: 
            T = False
        if find_word(k) == False: 
            print('getting rid of word: ', k)
            word_q.pop()
            
    return word_q, label_path, edge_path
                                             
mention1 = 'shaq'
mention2 = 'Who'
        
word_q, label_path, edge_path = get_shortest_path(list(dep.triples()), mention1, mention2)

print(word_q)
print(label_path)
print(edge_path)

def extract_feature(word_q, label_path, edge_path):
    """
    returns list of features to be represented in word representation
    
    treats the path as a concatenation of vectors or words, dependency edge directions and dependency labels
    and feed it to the convolution layer
    
    extracting features from word_path, label_path, edge_path
    
    ['shaq', 'play', 'for', 'Who']
    ['nsubj', 'aux', 'prep', 'pobj']
    ['right', 'left', 'left', 'left']
    

    """
    
    
    
    
def get_word_repre(sentence):
    # vectors or words
    pass


def concat():
    
 
    pass
    


looking for word:  shaq
found word here:  (('play', 'VB'), 'nsubj', ('shaq', 'NNP'))
looking for word:  play
found word here:  (('play', 'VB'), 'aux', ('did', 'VBD'))
looking for word:  did
couldn't find word:  did
getting rid of word:  did
looking for word:  play
found word here:  (('play', 'VB'), 'prep', ('for', 'IN'))
looking for word:  for
found word here:  (('for', 'IN'), 'pobj', ('Who', 'WP'))
looking for word:  Who
['shaq', 'play', 'for', 'Who']
['nsubj', 'aux', 'prep', 'pobj']
['right', 'left', 'left', 'left']


In [77]:
# sentential features
# takes the words in the sentence as input excluding the question word and the entity mention

# did first play for
